Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F85143619
extractWordPosition.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Sep 27, 02:03
Size
32 KB
Mime Type
text/x-python
Expires
Sun, Sep 29, 02:03 (2 d)
Engine
blob
Format
Raw Data
Handle
21134418
Attached To
rNIETZSCHEPYTHON nietzsche-python
extractWordPosition.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import
inspect
import
getopt
from
lxml
import
etree
as
ET
from
os
import
sep
,
listdir
,
mkdir
,
path
from
os.path
import
exists
,
isfile
,
isdir
from
progress.bar
import
Bar
import
re
import
sys
from
svgpathtools
import
svg2paths2
import
warnings
from
myxmlwriter
import
write_pretty
from
datatypes.lineNumber
import
LineNumber
from
datatypes.matrix
import
Matrix
from
datatypes.page
import
Page
from
datatypes.pdf
import
PDFText
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.transkription_position
import
TranskriptionPosition
from
datatypes.word
import
Word
from
datatypes.word_insertion_mark
import
WordInsertionMark
# Module-level metadata used by packaging and documentation tools.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Extractor:
    """This class can be used to extract the word positions in a svg file and write it to a xml file.

    Args:
        [xml_dir (str): target directory]
        [title (str): title of document]
        [manuscript_file (str): xml file containing information about the archival unity to which the current page belongs]
        [extract_transkription_field_only (Boolean): if true extract_word_position will extract word positions only that
        are part of the transkription field.]
    """
    # Characters that mark a word insertion ("Sonderzeichen") when they occur
    # in a svg class listed in page.sonderzeichen_list (see add_word).
    SONDERZEICHEN_LIST = ['A', 'B', '{', '}']
    def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False, compare2pdf=False):
        """Initialize the extractor and resolve title/manuscript_file from each other.

        If only manuscript_file is given, the title is read from its root element;
        if only title is given, a manuscript_file path is derived from it and a
        skeleton manuscript xml is written when that file does not yet exist.

        :raises FileNotFoundError: when manuscript_file is given (and no title
            was read from it) but does not exist on disk.
        """
        if bool(xml_dir):
            self.xml_dir = xml_dir
            # short-circuit idiom: create the target directory if it is missing
            not isdir(self.xml_dir) and mkdir(self.xml_dir)
        else:
            # fall back to a local './xml' directory when it exists
            self.xml_dir = 'xml' if (isdir('xml')) else ''
        self.compare2pdf = compare2pdf
        # ensure the directory ends with the os separator so it can be prefixed to file names
        self.xml_dir = self.xml_dir + sep if (bool(self.xml_dir)) else ''
        self.title = title
        self.manuscript_file = manuscript_file
        self.extract_transkription_field_only = extract_transkription_field_only
        self.manuscript_tree = None
        if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
            # no title given: read it from the manuscript xml
            self.manuscript_tree = ET.parse(self.manuscript_file)
            self.title = self.manuscript_tree.getroot().get('title')
        elif bool(self.manuscript_file):
            raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
        elif bool(self.title):
            if not bool(self.manuscript_file):
                # derive the manuscript file name from the title
                self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml'
            if not isfile(self.manuscript_file):
                # create a skeleton manuscript xml holding only the title
                self.manuscript_tree = ET.ElementTree(ET.Element('page', attrib={"title": self.title}))
                write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile')
def
get_page_number
(
self
,
file_name
,
page_number
=
None
):
""" Returns page number as a string (with leading zero(s) if len(page_number) < 3).
"""
if
not
bool
(
page_number
)
and
bool
(
re
.
search
(
r'\d'
,
file_name
)):
"""if page_number=None and filename contains digits,
then split filename into its parts that contain only digits, remove empty strings
and return the last part containing only digits.
"""
page_number
=
list
(
filter
(
lambda
x
:
x
!=
''
,
re
.
split
(
r'\D+'
,
file_name
)))
.
pop
()
if
bool
(
page_number
):
leading_zeros
=
'00'
if
(
len
(
page_number
)
==
1
)
else
'0'
if
(
len
(
page_number
)
==
2
)
else
''
return
leading_zeros
+
str
(
page_number
)
else
:
return
''
def
get_file_name
(
self
,
file_name
,
page_number
=
None
):
"""Returns the file_name of the target xml file.
"""
dir_name
=
path
.
dirname
(
self
.
xml_dir
)
+
sep
if
(
bool
(
self
.
xml_dir
))
else
''
if
bool
(
self
.
title
):
return
dir_name
+
self
.
title
.
replace
(
' '
,
'_'
)
+
'_page'
+
self
.
get_page_number
(
file_name
,
page_number
=
page_number
)
+
'.xml'
else
:
return
'{}{}'
.
format
(
dir_name
,
path
.
basename
(
file_name
)
.
replace
(
'.svg'
,
'.xml'
))
    def get_style(self, etree_root):
        """Returns the style specification as a dictionary.

        Parses the inline CSS of the svg's <style> element.

        :returns:
            sonderzeichen_list: list of keys for classes that are 'Sonderzeichen'
            letterspacing_list: list of keys for classes that declare 'letter-spacing'
            style_dict: dictionary: key = class name (str), value = style specification (dictionary)
        """
        style_dict = {}
        sonderzeichen_list = []
        letterspacing_list = []
        style = etree_root.find('style', etree_root.nsmap)
        if style is not None:
            # each rule looks like '.classname{prop:value;prop:value}'
            # NOTE(review): the separator below assumes rules are delimited by
            # newline+tab in the svg source — confirm against a real input file.
            for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))):
                # class selector without the leading dot
                style_key = style_item.split('{')[0].replace('.', '')
                # body between the braces, split into 'prop:value' pairs
                style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','')\
                        for item in list(filter(lambda x: x != '', style_item.split('{')[1].replace('}', '').replace('\n', '').split(';')))}
                style_dict[style_key] = style_value_dict
                if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'):
                    sonderzeichen_list.append(style_key)
                if bool(style_value_dict.get('letter-spacing')):
                    letterspacing_list.append(style_key)
        return sonderzeichen_list, letterspacing_list, style_dict
def
get_word_from_part_obj
(
self
,
word_part_obj
):
"""Extracts all 'text' from a list of dicitonaries and concats it to a string.
"""
return
''
.
join
([
dict
[
'text'
]
for
dict
in
word_part_obj
])
    def find_inserted_words_by_position(self, target_tree, x, y):
        """Returns an Array with the words that are inserted above the x, y position or [] if not found.

        NOTE(review): the emitted warning states this function does not work;
        the code is preserved as-is pending a decision on whether it is needed.
        """
        warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.')
        MINY = 31.0  # upper bound for the widening vertical search window
        MAXY = 10.0  # fixed minimal vertical distance above y
        DIFFX = 9.0  # horizontal tolerance
        if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
            result_list = []
            minus2left = 20.0
            minus2top = 19.0
            # widen the search box until something is found or the limits are hit
            while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX:
                result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                        '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ]
                minus2left -= 1
                minus2top += 1
            if len(result_list) > 0:
                # collect further words to the right of the last hit on the same bottom line
                result_bottom = result_list[len(result_list)-1].bottom
                result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
                for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)):
                    # NOTE(review): this recomputes result_left_min from the same
                    # last element each iteration — presumably part of why the
                    # function is flagged as not working.
                    result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
                    result_left_max = result_left_min + DIFFX
                    if float(item.get('left')) - result_left_max < DIFFX:
                        result_list.append(Word.CREATE_WORD(item))
                    else:
                        break
            return result_list
        else:
            return []
    def find_inserted_words(self, target_tree, word_insertion_mark):
        """Returns an Array with the words that are inserted above/underneath the word_insertion_mark.

        NOTE(review): the emitted warning states this function does not work;
        the code is preserved as-is pending a decision on whether it is needed.
        """
        warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.')
        # marks on line 1 or on odd line numbers are handled by pure position search
        if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1:
            return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y)
        if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
            MINY = 31.0   # upper bound for the widening vertical search window
            MAXY = 10.0   # unused in this branch
            DIFFX = 9.0   # horizontal tolerance
            result_list = []
            x = word_insertion_mark.x
            y = word_insertion_mark.y
            if word_insertion_mark.mark_type != 'B':
                # all insertions that are above the current line
                line_number = word_insertion_mark.line_number - 1
                words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                        '//word[@line-number={0}]'.format(line_number)) ]
                if len(words_on_line) > 0:
                    minus2top = 1.0
                    # widen the vertical window until a word is found
                    while len(result_list) == 0 and minus2top < MINY:
                        for word in words_on_line:
                            for transkription_position in word.transkription_positions:
                                if transkription_position.top > y - minus2top\
                                and transkription_position.left > x - DIFFX\
                                and transkription_position.left < x + DIFFX:
                                    result_list.append(word)
                                    break
                        minus2top += 1
            elif word_insertion_mark.mark_type == 'B':
                # B means insertion is underneath the current line
                line_number = word_insertion_mark.line_number + 1
                words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                        '//word[@line-number={0}]'.format(line_number)) ]
                if len(words_on_line) > 0:
                    plus2top = 1.0
                    while len(result_list) == 0 and plus2top < MINY:
                        for word in words_on_line:
                            for transkription_position in word.transkription_positions:
                                if transkription_position.top > y + plus2top\
                                and transkription_position.left > x - DIFFX\
                                and transkription_position.left < x + DIFFX:
                                    result_list.append(word)
                                    break
                        plus2top += 1
            if len(result_list) > 0:
                # now, collect more words that are right of already collected words
                result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom
                result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
                        + result_list[len(result_list)-1].transkription_positions[0].width
                for item in target_tree.getroot().xpath(\
                        '//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)):
                    result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
                            + result_list[len(result_list)-1].transkription_positions[0].width
                    result_left_max = result_left_min + DIFFX
                    if float(item.get('left')) - result_left_max < DIFFX:
                        result_list.append(Word.CREATE_WORD(item))
                    else:
                        break
            return result_list
        else:
            return []
    def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None):
        """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).

        If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created
        (recursively, via self.add_word on each slice).

        :returns: the new word counter (int)
        """
        break_points = []
        if(len(page.sonderzeichen_list) > 0):
            # check for Sonderzeichen and special chars -> mark for word insertion, create break points
            for Sonderzeichen in self.SONDERZEICHEN_LIST:
                # NOTE: `dict` here shadows the builtin of the same name
                contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
                if True in contains_Sonderzeichen:
                    # break at each Sonderzeichen position; (endPoint, endPoint+1) skips the mark itself
                    break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True]]
                    # iterate over the first element of each break point (the mark's index)
                    for sz_point in [i for i, e in break_points]:
                        wim_index = len(page.word_insertion_marks)
                        x = float(word_part_objs[sz_point]['x'])
                        y = float(word_part_objs[sz_point]['y'])
                        if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None:
                            svg_path_tree = ET.parse(page.svg_file)
                            # replace the anonymous default namespace key (None) with 'ns' for xpath use
                            namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
                            xmin = transkription_field.xmin
                            ymin = transkription_field.ymin
                            wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
                                    line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
                            page.word_insertion_marks.append(wim)
        if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))):
            # case: digits from line number and chars from words -> create break points
            THRESHOLDX = 20 # Threshold between line number and text
            last_x = -1
            for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
                if(last_x > -1 and (x - last_x > THRESHOLDX)):
                    break_points.append((i, i))
                last_x = x
        if(len(break_points) > 0):
            # if there are break points -> split word_part_obj and add the corresponding words
            from_index = 0
            for end_point, next_from_index in break_points:
                new_word_part_objs = word_part_objs[from_index:end_point]
                new_endX = word_part_objs[end_point]['x']
                from_index = next_from_index
                index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
            if from_index > 0 and from_index < len(word_part_objs):
                # trailing slice after the last break point
                new_word_part_objs = word_part_objs[from_index:]
                index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
            return index
        else:
            if len(word_part_objs) > 0:
                transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
                        debug_msg_string=debug_msg, transkription_field=transkription_field)
                text = self.get_word_from_part_obj(word_part_objs)
                # line number is looked up at the word's vertical center
                line_number = page.get_line_number((transkription_positions[0].bottom + transkription_positions[0].top) / 2)
                newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
                #newWord = Word.CREATE_WORD(page=page, word_part_objs=word_part_objs, id=index, endX=endX, endSign=endSign, matrix=matrix, debug_msg=debug_msg)
                #newWord.attach_word_to_tree(page.page_tree) -> now we attach all words with update_and_attach_words2tree()
                page.words.append(newWord)
                return int(index) + 1
            else:
                return int(index)
def
get_bottoms
(
self
,
tree_root
,
from_position
=-
1.0
,
to_position
=-
1.0
,
transkription_field
=
None
):
"""Returns all unique bottom values (Float) as a sorted list.
"""
bottom_list
=
sorted
(
set
(
item
.
get
(
'transform'
)
.
split
(
' '
)[
5
]
.
replace
(
')'
,
''
)
for
item
in
tree_root
.
findall
(
".//text"
,
tree_root
.
nsmap
)),
key
=
float
)
if
transkription_field
is
not
None
:
from_position
=
transkription_field
.
ymin
to_position
=
transkription_field
.
ymax
if
(
from_position
>
0.0
and
to_position
>
0.0
):
return
[
item
for
item
in
filter
(
lambda
x
:
float
(
x
)
>
from_position
and
float
(
x
)
<
to_position
,
bottom_list
)
]
else
:
return
bottom_list
def
get_text_items
(
self
,
tree_root
,
transkription_field
=
None
):
"""Returns all text elements with a matrix or (if transkription_field is specified)
all text elements that are located inside the transkription field.
"""
if
transkription_field
is
not
None
:
return
filter
(
lambda
x
:
Matrix
.
IS_PART_OF_TRANSKRIPTION_FIELD
(
x
.
get
(
'transform'
),
transkription_field
),
\
tree_root
.
iterfind
(
".//text"
,
tree_root
.
nsmap
))
else
:
return
tree_root
.
iterfind
(
".//text"
,
tree_root
.
nsmap
)
    def extract_line_numbers(self, svg_tree, transkription_field):
        """Extract line numbers from text nodes near the transkription field.

        For each detected LineNumber, its top is derived from the last text
        bottom found between the previous line and the current one.

        :returns: list of LineNumber objects (may be empty).
        """
        # text nodes horizontally close to the transkription field are line-number candidates
        nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\
                svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
        line_numbers = [ LineNumber(raw_text_node=item, transkription_field=transkription_field)\
                for item in filter(lambda x: LineNumber.IS_A_LINE_NUMBER(x), nodes_near_tf)]
        if len(line_numbers) > 0:
            MINABOVE = 3  # minimal gap kept above a line's bottom
            last_to_position = transkription_field.ymin
            for line_number in line_numbers:
                above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE
                # all text bottoms between the previous line and this one
                bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom)
                last_to_position = above_current_line_bottom
                if len(bottoms) > 0:
                    # the last (lowest) bottom defines the top of the current line
                    current_line_top = float(bottoms[len(bottoms)-1]) - transkription_field.ymin + MINABOVE
                    line_number.setTop(current_line_top)
        return line_numbers
def
get_word_object_multi_char_x
(
self
,
word_part_obj_dict
):
"""Returns the x of the last char of word_part_object.
TODO: get real widths from svg_file!!!
"""
WIDTHFACTOR
=
2.6
return
word_part_obj_dict
[
'x'
]
if
len
(
word_part_obj_dict
[
'text'
])
<
2
else
word_part_obj_dict
[
'x'
]
+
len
(
word_part_obj_dict
[
'text'
])
*
WIDTHFACTOR
    def extract_word_position(self, svg_tree, page, transkription_field=None):
        """Extract word positions from the svg tree and append Words to page.words.

        Text parts are accumulated in word_part_obj until a word boundary is
        detected (whitespace text, a line break heuristic, or letterspacing),
        then flushed via self.add_word.
        """
        counter = 0          # running word id passed through add_word
        word_part_obj = []   # parts of the word currently being assembled
        endSign = '%'        # sign that ended the previous word ('%' = none seen)
        last_matrix = None
        MAXBOTTOMDIFF = 5    # max y difference before assuming a new line
        MAXXDIFF = 6         # max x gap before assuming a word break
        bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
        for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
            current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
            # check for line breaks
            if (last_matrix is not None and len(word_part_obj) > 0 and (\
                    Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
                    (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
                    (abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
                    or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
                endSign = '%'
                if(self.get_word_from_part_obj(word_part_obj) != ''):
                    debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
                            round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
                            str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
                    # NOTE(review): endX here carries the value from the previous
                    # iteration; reachable only after at least one full iteration.
                    counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
                word_part_obj = []
            endX = current_matrix.getX()
            if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1):
                # case: <svg><text>TEXT
                if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))):
                    word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} )
                else:
                    # whitespace/empty text ends the current word
                    endSign = text_item.text
                    if(self.get_word_from_part_obj(word_part_obj) != ''):
                        counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field)
                    word_part_obj = []
                    endSign = '%'
            for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap):
                # case: <svg><text><tspan>TEXT
                endX = current_matrix.add2X(tspan_item.get('x'))
                if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))):
                    y = current_matrix.add2Y(tspan_item.get('y'))
                    word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix })
                    if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0:
                        """text_item has letterspacing class
                        (set s & set t = new set with elements common to s and t)
                        """
                        endSign = '%'
                        if(self.get_word_from_part_obj(word_part_obj) != ''):
                            counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
                                    debug_msg='tspan with letterspacing', transkription_field=transkription_field)
                        word_part_obj = []
                else:
                    # whitespace/empty tspan ends the current word
                    endSign = tspan_item.text
                    if(self.get_word_from_part_obj(word_part_obj) != ''):
                        counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
                                debug_msg='svg/text/tspan/\s', transkription_field=transkription_field)
                    word_part_obj = []
                    endSign = '%'
            last_matrix = current_matrix
            bar.next()
        # flush a possibly unfinished word at the end of the document
        # NOTE(review): current_matrix/endX are unbound when the svg has no
        # text items at all; presumably never the case in practice — confirm.
        if(self.get_word_from_part_obj(word_part_obj) != ''):
            counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\
                    transkription_field=transkription_field)
            word_part_obj = []
            endSign = '%'
        print()
def
update_and_attach_words2tree
(
self
,
page
):
"""Update word ids and attach them to page.page_tree.
"""
for
node
in
page
.
page_tree
.
xpath
(
'//word'
):
node
.
getparent
()
.
remove
(
node
)
for
index
,
word
in
enumerate
(
page
.
words
):
word
.
id
=
index
word
.
attach_word_to_tree
(
page
.
page_tree
)
    def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None):
        """Extracts information about positions of text elements.

        Parses the svg, builds a Page, extracts styles, line numbers and word
        positions, optionally corrects words against a pdf, and attaches all
        results to the page tree.

        :returns: (datatypes.page) the Page containing all information.
        :raises FileNotFoundError: when file_name does not exist.
        """
        if isfile(file_name):
            if not bool(xml_target_file):
                xml_target_file = self.get_file_name(file_name, page_number)
            # prepend the xml directory when the target is a bare file name
            if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
                xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
            transkription_field = TranskriptionField(file_name) if bool(self.extract_transkription_field_only) else None
            svg_tree = ET.parse(file_name)
            page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\
                    svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only)
            sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot())
            page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
            if transkription_field is not None:
                page.init_line_numbers(self.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax)
            self.extract_word_position(svg_tree, page, transkription_field=transkription_field)
            if page.pdfFile is not None and isfile(page.pdfFile):
                # compare/correct the extracted words against the pdf text
                pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST)
                pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field, split_wrongly_concatenated_words=self.compare2pdf)
            page.create_writing_processes_and_attach2tree()
            self.update_and_attach_words2tree(page)
            for word_insertion_mark in page.word_insertion_marks:
                # it is not clear if we really need to know this alternative word ordering. See 'TODO.md'
                #word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark)
                word_insertion_mark.attach_object_to_tree(page.page_tree)
            return page
        else:
            raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def
extractAndWriteInformation
(
self
,
file_name
,
page_number
=
None
,
xml_target_file
=
None
,
svg_file
=
None
,
pdfFile
=
None
):
"""Extracts information about positions of text elements and writes them to a xml file.
"""
if
isfile
(
file_name
):
if
not
bool
(
xml_target_file
):
xml_target_file
=
self
.
get_file_name
(
file_name
,
page_number
)
if
bool
(
self
.
xml_dir
)
and
not
bool
(
path
.
dirname
(
xml_target_file
)):
xml_target_file
=
path
.
dirname
(
self
.
xml_dir
)
+
sep
+
xml_target_file
page
=
self
.
extract_information
(
file_name
,
page_number
=
page_number
,
xml_target_file
=
xml_target_file
,
svg_file
=
svg_file
,
pdfFile
=
pdfFile
)
write_pretty
(
xml_element_tree
=
page
.
page_tree
,
file_name
=
xml_target_file
,
script_name
=
__file__
,
file_type
=
'svgWordPosition'
)
return
0
else
:
raise
FileNotFoundError
(
'
\"
{}
\"
is not an existing file!'
.
format
(
file_name
))
def usage():
    """Print information on how to use the script (main's docstring)."""
    print(main.__doc__)
def main(argv):
    """This program can be used to extract the position of the words in a svg file and write them to a xml file.

    svgscripts/extractWordPosition.py [OPTIONS] <file|dir>

        <file>      svg file OR xml target file containing file name of svg file as "/page/@source".
        <dir>       directory containing svg files

    OPTIONS:
        -h|--help: show help
        -c|--compare-to-pdf             compare words to pdf and autocorrect
        -d|--xml-dir=xmlDir:            target directory for the xml output file(s)
        -m|--manuscript-file:           xml file containing information about the archival order to which the current page(s) belong(s)
        -o|--only-transkription-field:  extract only words that are part of the transkription field.
        -p|--page=pageNumber:           page number of the current page. For use with _one_ file only.
        -P|--PDF=pdfFile:               pdf file - used for word correction
        -s|--svg=svgFile:               svg web file
        -t|--title=title:               title of the manuscript to which the current page(s) belong(s)
        -x|--xml-target-file=xmlOutputFile: xml target file

    :return: exit code (int)
    """
    compare2pdf = False
    # NOTE(review): already True by default, so -o/--only-transkription-field is
    # currently a no-op — confirm whether the default should be False.
    extract_transkription_field_only = True
    manuscript_file = None
    page_number = None
    pdfFile = None
    svg_file = None
    title = None
    xml_target_file = None
    xml_dir = ".{}xml".format(sep)
    try:
        opts, args = getopt.getopt(argv, "hocd:m:t:p:s:x:P:", ["help", "only-transkription-field",\
                "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-c', '--compare-to-pdf'):
            compare2pdf = True
        elif opt in ('-o', '--only-transkription-field'):
            extract_transkription_field_only = True
        elif opt in ('-d', '--xml-dir'):
            xml_dir = arg
        elif opt in ('-m', '--manuscript-file'):
            manuscript_file = arg
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-P', '--PDF'):
            pdfFile = arg
        elif opt in ('-x', '--xml-target-file'):
            xml_target_file = str(arg)
    files_to_process = list()
    for arg in args:
        if isfile(arg):
            files_to_process.append(arg)
        elif isdir(arg):
            # FIX: join the directory with the bare names returned by listdir();
            # previously bare file names were appended, so svg files in any
            # directory other than the cwd failed isfile() later and raised.
            files_to_process = files_to_process\
                    + [arg + sep + svg_name for svg_name in filter(lambda file: '.svg' in file, listdir(arg))]
        else:
            print("'{}' does not exist!".format(arg))
            return 2
    if len(files_to_process) < 1 or args[0].endswith('xml'):
        # no svg input: treat the first argument as an xml target file and
        # read the svg source plus metadata from it.
        if xml_target_file is None:
            xml_target_file = args[0] if len(args) > 0 else None
        if xml_target_file is not None and isfile(xml_target_file):
            target_file_tree = ET.parse(xml_target_file)
            file_name = target_file_tree.getroot().get('source')
            title = target_file_tree.getroot().get('title') if title is None else title
            page_number = target_file_tree.getroot().get('number') if page_number is None else page_number
            extract_transkription_field_only = (target_file_tree.getroot().get('transkription-field-only') == 'true')\
                    if target_file_tree.getroot().get('transkription-field-only') is not None else False
            if svg_file is None:
                svg_file = target_file_tree.xpath('.//svg/@file')[0]\
                        if len(target_file_tree.xpath('.//svg/@file')) > 0 else None
            files_to_process.insert(0, file_name)
            if xml_target_file in files_to_process:
                files_to_process.remove(xml_target_file)
        else:
            usage()
            return 2
    if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)):
        print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!")
        usage()
        return 2
    extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file,\
            extract_transkription_field_only=extract_transkription_field_only, compare2pdf=compare2pdf)
    for file in files_to_process:
        extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file,\
                pdfFile=pdfFile, svg_file=svg_file)
    return 0
# Script entry point: forward the command-line arguments (without the program
# name) to main() and use its return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Event Timeline
Log In to Comment