Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F62023776
compare_faksimile_words_line_wise.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, May 10, 10:53
Size
32 KB
Mime Type
text/x-python
Expires
Sun, May 12, 10:53 (2 d)
Engine
blob
Format
Raw Data
Handle
17592037
Attached To
rNIETZSCHEPYTHON nietzsche-python
compare_faksimile_words_line_wise.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of a faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
from
deprecated
import
deprecated
from
functools
import
cmp_to_key
import
getopt
import
inspect
import
lxml.etree
as
ET
import
re
import
shutil
import
string
import
sys
import
tempfile
from
operator
import
attrgetter
import
os
from
os
import
listdir
,
sep
,
path
,
setpgrp
,
devnull
from
os.path
import
exists
,
isfile
,
isdir
,
dirname
,
basename
from
pathlib
import
Path
from
progress.bar
import
Bar
import
warnings
if
dirname
(
__file__
)
not
in
sys
.
path
:
sys
.
path
.
append
(
dirname
(
__file__
))
from
datatypes.faksimile
import
FaksimilePage
,
get_paths_inside_rect
from
datatypes.word_position
import
WordPosition
from
datatypes.faksimile_position
import
FaksimilePosition
from
datatypes.word
import
Word
from
datatypes.lineNumber
import
LineNumber
from
datatypes.page
import
Page
,
STATUS_MERGED_OK
from
datatypes.transkriptionField
import
TranskriptionField
from
interactive_merger
import
LineComposer
,
InteractiveMergerShell
,
ManualMergerShell
from
join_faksimileAndTranskription
import
get_filelist_and_manuscript_file
,
sort_faksimile_positions
,
sort_words
from
process_files
import
update_svgposfile_status
from
process_words_post_merging
import
post_merging_processing_and_saving
from
util
import
ExternalViewer
,
create_highlighted_svg_file
,
get_empty_node_ids
,
record_changes
,
\
record_changes_on_svg_file_to_page
,
record_changes_on_xml_file_to_page
,
get_mismatching_ids
,
\
replace_chars
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# Switched on by the unit tests; the functions below check it before printing progress messages.
UNITTESTING = False
# string.punctuation contains regex metacharacters (backslash, ']', '^', '-').
# They have to be escaped before they are placed inside a character class;
# unescaped, the backslash is swallowed as an escape for the following ']'
# and is therefore never matched by the class.
PUNCTUATION_PATTERN = r"[{}]".format(re.escape(string.punctuation))
# One or more word characters followed by a closing double quote at the end of the text.
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
# Exactly one punctuation character (ASCII punctuation or an en dash).
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(re.escape(string.punctuation))
# Exactly one character that is either a word character or ASCII punctuation.
SINGLE_WORD_PATTERN = r"^[\w{}]$".format(re.escape(string.punctuation))
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
# Lower bound for the line threshold that FaksimileLineComposer reduces step by step.
MIN_THRESHOLD = 2
DO_DEBUG = False
class FaksimileLineComposer(LineComposer):
    """This class arranges the faksimile positions to lines.

    The faksimile positions are sorted, grouped into lines by the vertical
    distance of their centers and can then be merged line by line with the
    words of a page.
    """
    DEBUG = False

    def __init__(self, faksimile_positions, threshold=10, num_lines_with_words=-1, page=None):
        """Sort the faksimile positions, arrange them in lines and create the interactive shell.

        :param faksimile_positions: the faksimile positions of one faksimile page
        :param threshold: vertical distance between the centers of two consecutive
            positions above which a new line is started
        :param num_lines_with_words: expected number of lines, -1 if unknown
        :param page: optional page; the first faksimile position of each of its
            verified words is passed to sort_faksimile_positions as reference list
        """
        self.current_line_index = 0
        self.current_faksimile_index = 0
        reference_list = None
        if page is not None:
            reference_list = [word.faksimile_positions[0] for word in page.words
                              if len(word.faksimile_positions) > 0 and word.verified]
            # NOTE(review): unconditional console output, kept as it is in the original code.
            print([fp.text for fp in reference_list])
        faksimile_positions = sort_faksimile_positions(faksimile_positions, reference_list=reference_list)
        self.lines_of_faksimile_positions = self._init_faksimile_positions_per_line(
            faksimile_positions, threshold=threshold, num_lines_with_words=num_lines_with_words)
        self.interactive_shell = InteractiveMergerShell(self, page=page)

    def _init_faksimile_positions_per_line(self, faksimile_positions, threshold=10, num_lines_with_words=-1) -> list:
        """Return a list containing for each line a list of faksimile positions.

        A new line is started whenever the vertical center of a position lies
        more than threshold below the center of its predecessor. If
        num_lines_with_words is given (> -1) and the number of lines found
        differs from it, the grouping is repeated with a threshold reduced by
        one until the numbers agree or MIN_THRESHOLD is reached.
        """
        if len(faksimile_positions) == 0:
            return [[]]
        lines_of_faksimile_positions = [[]]
        last_wp = faksimile_positions[0]
        for wp in faksimile_positions:
            if (wp.top + wp.bottom) / 2 - (last_wp.top + last_wp.bottom) / 2 > threshold:
                lines_of_faksimile_positions.append([])
            lines_of_faksimile_positions[-1].append(wp)
            last_wp = wp
        if num_lines_with_words > -1 \
           and num_lines_with_words != len(lines_of_faksimile_positions) \
           and threshold > MIN_THRESHOLD:
            return self._init_faksimile_positions_per_line(
                faksimile_positions, threshold=threshold - 1, num_lines_with_words=num_lines_with_words)
        return lines_of_faksimile_positions

    def create_faksimile_dictionary(self, line_of_faksimile_positions, mergeables_only=False) -> dict:
        """Create a faksimile_dictionary with fp.text as key and a list of fp as value.

        A position with the text '-' is additionally listed under the key '–'
        (en dash), i.e. the same position object appears in both lists.

        NOTE(review): if mergeables_only is true, the positions that are kept
        are those whose mergeable flag is false. This contradicts the parameter
        name; the behaviour is left unchanged here and should be confirmed
        against the callers.
        """
        faksimile_text_dictionary = {}
        for faksimile_position in line_of_faksimile_positions:
            if mergeables_only and faksimile_position.mergeable:
                continue
            faksimile_text_dictionary.setdefault(faksimile_position.text, []).append(faksimile_position)
            if faksimile_position.text == '-':
                faksimile_text_dictionary.setdefault('–', []).append(faksimile_position)
        return faksimile_text_dictionary

    def fix_for_unmereged_items_if_two_left(self, new_words, unmerged_words, unmerged_faksimile_positions) -> int:
        """Merge if there are only two left, i.e. exactly one word and one faksimile position.

        The lists passed in are not modified; the merged items can be
        recognized by their joined flag.

        [:return:] number of unmerged items
        """
        if len(unmerged_words) == 1 and len(unmerged_faksimile_positions) == 1:
            self.merge_word_with_fp(unmerged_words[0], unmerged_faksimile_positions[0], new_words)
            return 0
        return len(unmerged_words + unmerged_faksimile_positions)

    def fix_for_unmereged_items_split_words(self, new_words, unmerged_words, unmerged_faksimile_positions) -> int:
        """Split already merged words in order to merge the remaining faksimile positions.

        If there are more unmerged faksimile positions than unmerged words,
        each position is compared with the merged words of its line: if the
        text of a word without the text of its first faksimile position equals
        the text of the position, the word is split and the right part is
        merged with the position. Merged positions are removed from
        unmerged_faksimile_positions.

        [:return:] number of unmerged items
        """
        if len(unmerged_words) < len(unmerged_faksimile_positions):
            # Iterate over a copy, because merged positions are removed from the list itself.
            for faksimile_position in list(unmerged_faksimile_positions):
                line_number = self.get_line_number(faksimile_position, new_words)
                words_on_line = [word for word in new_words
                                 if word.line_number == line_number and len(word.faksimile_positions) > 0]
                for word in words_on_line:
                    if word.text.replace(word.faksimile_positions[0].text, '') == faksimile_position.text:
                        left_word, right_word, _ = word.split(faksimile_position.text)
                        new_words.remove(word)
                        self.merge_word_with_fp(left_word, word.faksimile_positions[0], new_words)
                        self.merge_word_with_fp(right_word, faksimile_position, new_words)
                        unmerged_faksimile_positions.remove(faksimile_position)
                        # The position is merged now: a second match would merge it twice
                        # and fail when removing it from the list again.
                        break
        return len(unmerged_words + unmerged_faksimile_positions)

    def fix_for_unmereged_items_startswith(self, new_words, unmerged_words, unmerged_faksimile_positions, ignoreCase=False) -> int:
        """Do a final attempt at fixing unmerged words and faksimile_positions.

        The words are processed from the longest to the shortest text. A word
        is merged with the longest not yet joined faksimile position whose text
        is a prefix of the word's text or, if there is none, whose text starts
        with the word's text. Merged items are removed from both lists.

        [:return:] number of unmerged items
        """
        for word in sorted(unmerged_words, key=lambda word: len(word.text), reverse=True):
            matches = [fp for fp in unmerged_faksimile_positions
                       if text_starts_with(word.text, fp.text, ignoreCase=ignoreCase) and not fp.joined]
            if len(matches) == 0:
                matches = [fp for fp in unmerged_faksimile_positions
                           if text_starts_with(fp.text, word.text, ignoreCase=ignoreCase) and not fp.joined]
            if len(matches) > 0:
                # max() returns the first of several equally long candidates.
                faksimile_position = max(matches, key=lambda fp: len(fp.text))
                self.merge_word_with_fp(word, faksimile_position, new_words)
                unmerged_words.remove(word)
                unmerged_faksimile_positions.remove(faksimile_position)
        return len(unmerged_words + unmerged_faksimile_positions)

    def final_fix_for_unmereged_items(self, new_words, unmerged_words, unmerged_faksimile_positions) -> int:
        """Do a final attempt at fixing unmerged words and faksimile_positions.

        The strategies are applied in this order: merge a single remaining
        pair, split merged words, merge by common prefix, merge by line
        number, merge by common prefix ignoring case, merge a single
        remaining pair.

        [:return:] number of unmerged items
        """
        self.fix_for_unmereged_items_if_two_left(new_words, unmerged_words, unmerged_faksimile_positions)
        self.fix_for_unmereged_items_split_words(new_words, unmerged_words, unmerged_faksimile_positions)
        self.fix_for_unmereged_items_startswith(new_words, unmerged_words, unmerged_faksimile_positions)
        latest_unmerged_words = [word for word in unmerged_words if not word.joined]
        latest_unmerged_fps = [fp for fp in unmerged_faksimile_positions if not fp.joined]
        if len(latest_unmerged_fps) > 0:
            # Group the remaining positions by the line number of the words already merged on their line.
            fp_ln_dict = {}
            for fp in latest_unmerged_fps:
                line_number = self.get_line_number(fp, new_words)
                if line_number > -1:
                    fp_ln_dict.setdefault(line_number, []).append(fp)
            for word in latest_unmerged_words:
                matches = fp_ln_dict.get(word.line_number)
                if matches is not None and len(matches) > 0:
                    self.merge_word_with_fp(word, matches.pop(0), new_words)
        latest_unmerged_words = [word for word in unmerged_words if not word.joined]
        latest_unmerged_fps = [fp for fp in unmerged_faksimile_positions if not fp.joined]
        if len(latest_unmerged_words + latest_unmerged_fps) > 0:
            if self.fix_for_unmereged_items_startswith(new_words, latest_unmerged_words, latest_unmerged_fps, ignoreCase=True) == 0:
                return 0
        return self.fix_for_unmereged_items_if_two_left(new_words, latest_unmerged_words, latest_unmerged_fps)

    def get_lines_of_faksimile_positions(self) -> list:
        """Return lines_of_faksimile_positions.
        """
        return self.lines_of_faksimile_positions

    def get_next_faksimile(self) -> WordPosition:
        """Return next faksimile position.

        The positions are returned line by line; after the last position the
        iteration starts again with the first one. Returns None if there is no
        faksimile position at all.
        """
        lines = self.lines_of_faksimile_positions
        # any() is false for no lines as well as for empty lines only (e.g. [[]]);
        # without this guard the search below would never terminate.
        if not any(lines):
            return None
        while True:
            if self.current_line_index >= len(lines):
                self.current_line_index = 0
                self.current_faksimile_index = 0
            current_line = lines[self.current_line_index]
            if self.current_faksimile_index < len(current_line):
                self.current_faksimile_index += 1
                return current_line[self.current_faksimile_index - 1]
            # End of line reached: continue with the first position of the next line.
            self.current_line_index += 1
            self.current_faksimile_index = 0

    def get_line_number(self, faksimile_position, new_words) -> int:
        """Return line_number of line containing faksimile_position.

        The line number is taken from the first word in new_words that has been
        merged with one of the joined faksimile positions of the same line.
        Returns -1 if there is no such word.
        """
        line_number = -1
        for line in self.lines_of_faksimile_positions:
            if faksimile_position not in line:
                continue
            joined_fps = [fp for fp in line if fp.joined]
            if len(joined_fps) > 0:
                joined_words = [word for word in new_words
                                if len(word.faksimile_positions) > 0
                                and any(fp in word.faksimile_positions for fp in joined_fps)]
                if len(joined_words) > 0:
                    line_number = joined_words[0].line_number
        return line_number

    def get_line(self, line_of_words, index=-1, offset=2, interactive=False) -> list:
        """Return the line that corresponds to the line_of_words.

        The concatenated text of the mergeable words is compared with the
        concatenated text of the mergeable, not yet joined faksimile positions
        of each candidate line. Candidates are the lines index-offset up to
        index+offset, or all lines if index is -1. If the text of a line is
        only a part of the text of the words, neighbouring lines are added
        until the combined text is at least as long as the text of the words.

        :return: if not interactive, the faksimile positions of the first
            matching line (an empty list if no line matches); if interactive,
            a list of (line index, faksimile positions) tuples containing the
            matching lines or, if no line matches, all candidate lines.
        """
        number_of_lines = len(self.lines_of_faksimile_positions)
        if index > -1:
            start_index = max(index - offset, 0)
            end_index = min(index + offset + 1, number_of_lines)
        else:
            start_index = 0
            end_index = number_of_lines
        matched_line = []
        word_text = ''.join(word.text for word in line_of_words if word.mergeable)
        interactive_list = []
        for i in range(start_index, end_index):
            current_line = [fp for fp in self.lines_of_faksimile_positions[i] if not fp.joined]
            current_text = ''.join(fp.text for fp in current_line if fp.mergeable)
            if word_text in current_text:
                is_match = True
            elif current_text and current_text in word_text:
                # The line contains only a part of the words: extend it downwards if the
                # part is the beginning of the words' text, upwards otherwise.
                step = 1 if word_text.find(current_text) == 0 else -1
                next_i = i + step
                while len(word_text) > len(current_text) and -1 < next_i < number_of_lines:
                    current_line += [fp for fp in self.lines_of_faksimile_positions[next_i] if not fp.joined]
                    current_text = ''.join(fp.text for fp in current_line if fp.mergeable)
                    next_i += step
                is_match = True
            else:
                is_match = False
            if is_match:
                if interactive:
                    interactive_list.append((i, current_line))
                else:
                    matched_line = current_line
                    break
        if interactive:
            if len(interactive_list) > 0:
                return interactive_list
            for i in range(start_index, end_index):
                current_line = [fp for fp in self.lines_of_faksimile_positions[i] if not fp.joined]
                matched_line.append((i, current_line))
        return matched_line

    def get_new_index(self, word, line_of_words, new_list_of_words, old_word_new_word_mapping):
        """Return index of word in new_list_of_words such that it can be inserted before this index.

        The index is the position after the word that precedes word in
        line_of_words (or after the word the predecessor has been joined to);
        0 if word is the first word of line_of_words.
        """
        old_index = line_of_words.index(word)
        if old_index == 0:
            return 0
        previous_word = line_of_words[old_index - 1]
        mapped_word = old_word_new_word_mapping.get(previous_word)
        new_previous_word = mapped_word if mapped_word is not None else previous_word
        if new_previous_word in new_list_of_words:
            return new_list_of_words.index(new_previous_word) + 1
        # The predecessor has not been inserted yet: use the index it would get.
        return self.get_new_index(new_previous_word, line_of_words,
                                  new_list_of_words, old_word_new_word_mapping) + 1

    def join_unmergeable_words(self, words, old_word_new_word_mapping) -> Word:
        """Join all words and return new word.

        All words are joined to the first word, which is returned. Each word
        (including the first one) is mapped to the first word in
        old_word_new_word_mapping.
        """
        first_word = words[0]
        for word2join in words[1:]:
            first_word.join(word2join)
            old_word_new_word_mapping[word2join] = first_word
        old_word_new_word_mapping[first_word] = first_word
        return first_word

    def join_unmergeable_words_with_punctuation(self, line_of_words, old_word_new_word_mapping):
        """Join unmergeable words on line with punctuation words.

        An unmergeable word that is followed by an unmergeable '.' or ','
        absorbs it: the punctuation word is removed from line_of_words and
        mapped to the absorbing word in old_word_new_word_mapping.
        """
        index = 0
        while index < len(line_of_words):
            if not line_of_words[index].mergeable \
               and index + 1 < len(line_of_words) \
               and not line_of_words[index + 1].mergeable \
               and re.match('^[.,]$', line_of_words[index + 1].text):
                punctuation_word = line_of_words[index + 1]
                line_of_words[index].join(punctuation_word)
                old_word_new_word_mapping[punctuation_word] = line_of_words[index]
                line_of_words.remove(punctuation_word)
                # As in the original code the index advances by two after a join.
                index += 1
            index += 1

    def merge_lines(self, line_of_words, new_words, index=-1, offset=2, interactive=False) -> bool:
        """Merge a line of words with the corresponding line of faksimile positions.

        If no corresponding line is found, the interactive shell is used
        (interactive mode), a single '–' is moved to the previous line or the
        search is repeated with a bigger offset (up to an offset of 10).

        [:return:] interactive
        """
        if len([word for word in line_of_words if not word.joined]) == 0:
            # Nothing to do. Only the flag is returned: the caller stores the result
            # as its new interactive flag, a tuple would always count as true.
            return interactive
        line_of_faksimile_positions = self.get_line(line_of_words, index, offset=offset)
        if len(line_of_faksimile_positions) > 0:
            faksimile_text_dictionary = self.create_faksimile_dictionary(line_of_faksimile_positions)
            self.merge_mergeables(line_of_words, faksimile_text_dictionary, new_words)
            self.merge_unmergeables(line_of_words, line_of_faksimile_positions, new_words)
        elif interactive:
            interactive = self.interactive_shell.interactive_merge_lines(line_of_words, new_words, index, offset + 4)
        elif len(line_of_words) == 1 and line_of_words[0].text == '–':
            line_of_words[0].line_number -= 1
        elif offset < 10:
            interactive = self.merge_lines(line_of_words, new_words, index, offset=offset + 1)
        return interactive

    def merge_mergeables(self, line_of_words, faksimile_text_dictionary, new_words):
        """Merge words with faksimile positions for which there are keys in faksimile_text_dictionary.

        Each word is merged with the first remaining faksimile position that is
        listed under the word's text; the position is removed from that list.
        """
        for word in line_of_words:
            fp_list = faksimile_text_dictionary.get(word.text)
            if fp_list is not None and len(fp_list) > 0:
                self.merge_word_with_fp(word, fp_list.pop(0), new_words)

    def merge_unmergeables(self, line_of_words, line_of_faksimile_positions, new_words):
        """Merge unmergeable words and faksimile_positions.

        If the line contains as many unmerged, unmergeable words as faksimile
        positions, they are merged pairwise in their order. Otherwise words
        that are adjacent in line_of_words are joined to one word, and these
        joined words are merged with the faksimile positions in their order.

        :return: new_words
        """
        old_word_new_word_mapping = {}
        self.join_unmergeable_words_with_punctuation(line_of_words, old_word_new_word_mapping)
        unmerged_words = [word for word in line_of_words if not word.joined and not word.mergeable]
        unmerged_fps = [fp for fp in line_of_faksimile_positions if not fp.joined and not fp.mergeable]
        if len(unmerged_words) > 0:
            if len(unmerged_words) == len(unmerged_fps):
                for word, faksimile_position in zip(unmerged_words, unmerged_fps):
                    new_index = self.get_new_index(word, line_of_words, new_words, old_word_new_word_mapping)
                    self.merge_word_with_fp(word, faksimile_position, new_words, new_index)
            else:
                fp_index = 0
                unmerged_unity = []
                for word in unmerged_words:
                    if len(unmerged_unity) > 0 and fp_index < len(unmerged_fps):
                        previous_index = line_of_words.index(unmerged_unity[-1])
                        # A gap between the words closes the current unity.
                        if line_of_words.index(word) - previous_index > 1:
                            self._merge_unity_with_fp(unmerged_unity, unmerged_fps[fp_index],
                                                      line_of_words, new_words, old_word_new_word_mapping)
                            fp_index += 1
                            unmerged_unity = []
                    unmerged_unity.append(word)
                if len(unmerged_unity) > 0 and fp_index < len(unmerged_fps):
                    self._merge_unity_with_fp(unmerged_unity, unmerged_fps[fp_index],
                                              line_of_words, new_words, old_word_new_word_mapping)
        # Words that have been absorbed by another word take over its joined flag.
        # NOTE(review): the nesting of this loop could not be recovered from the
        # source; it is placed at function level so that it covers every mapping.
        for old_word, new_word in old_word_new_word_mapping.items():
            old_word.joined = new_word.joined
        return new_words

    def _merge_unity_with_fp(self, unmerged_unity, faksimile_position, line_of_words, new_words, old_word_new_word_mapping):
        """Join the words of unmerged_unity and merge the joined word with faksimile_position.
        """
        new_word = self.join_unmergeable_words(unmerged_unity, old_word_new_word_mapping)
        new_index = self.get_new_index(unmerged_unity[0], line_of_words, new_words, old_word_new_word_mapping)
        self.merge_word_with_fp(new_word, faksimile_position, new_words, new_index)

    def merge_word_with_fp(self, word, faksimile_position, list_of_new_words, index=-1):
        """Merge word with faksimile position.

        Both are marked as joined, the position is appended to the word's
        faksimile positions and the word is inserted into list_of_new_words at
        index (appended if index is -1).
        """
        word.joined, faksimile_position.joined = True, True
        word.faksimile_positions.append(faksimile_position)
        if index == -1:
            list_of_new_words.append(word)
        else:
            list_of_new_words.insert(index, word)
def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False):
    """Return svg_pos_file and manuscript_file if they are ready for processing.

    The given manuscript_file is used if its basename starts with the title of
    the faksimile page (blanks replaced by underscores). Otherwise the file
    '<title>.xml' is looked up in the directory './xml', or in the current
    directory if './xml' does not exist.

    :param faksimile_page: page providing title and page_number
    :param manuscript_file: optional path of the manuscript xml file
    :param redo_ok: also accept pages whose status merely contains "OK"
    :return: tuple (svg_pos_file, manuscript_file); svg_pos_file is None if the
        manuscript file is missing or lists no page that is ready
    """
    title_string = faksimile_page.title.replace(' ', '_')
    manuscript_tree = None
    if manuscript_file is not None and basename(manuscript_file).startswith(title_string):
        manuscript_tree = ET.parse(manuscript_file)
    else:
        xml_dir = '.{}xml'.format(sep)
        if isdir(xml_dir):
            manuscript_file = xml_dir + sep + title_string + '.xml'
        else:
            manuscript_file = title_string + '.xml'
        if isfile(manuscript_file):
            manuscript_tree = ET.parse(manuscript_file)

    svg_pos_file = None
    if manuscript_tree is None:
        return svg_pos_file, manuscript_file

    root = manuscript_tree.getroot()
    page_number = faksimile_page.page_number
    output_of_page = '//page[@number="%s"]/@output' % page_number
    output_if_status_contains_ok = '//page[@number="%s" and contains(@status,"OK")]/@output' % page_number
    output_if_status_is_ok = '//page[@number="%s" and @status="OK"]/@output' % page_number

    if redo_ok and len(root.xpath(output_if_status_contains_ok)) > 0:
        svg_pos_file = root.xpath(output_of_page)[0]
    if len(root.xpath(output_if_status_is_ok)) > 0:
        svg_pos_file = root.xpath(output_of_page)[0]
    elif not UNITTESTING:
        # Tell the user why the page is not processed (or that it has been merged before).
        already_merged = root.xpath(output_if_status_contains_ok)
        if len(already_merged) > 0:
            msg = Fore.LIGHTBLUE_EX + '->' + Fore.CYAN \
                + 'Data from page {0} already merged with {1}!'.format(page_number, already_merged[0])
        else:
            msg = Fore.MAGENTA \
                + 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, page_number)
        print(msg, end='')
        print(Style.RESET_ALL)
    return svg_pos_file, manuscript_file
def merge_faksimile_file_and_pages(faksimile_file, manuscript_file=None, page=None) ->int:
    """Merge the data of a faksimile file with the data of svgposfile.

    Every faksimile page found in faksimile_file is looked up with
    get_svgPosFile_and_manuscriptFile. Pages for which a svg_pos_file is
    returned are merged with merge_faksimile_positions_and_words; if items
    remain unmerged, the page is loaded again and the merge is repeated in
    interactive mode.

    NOTE(review): the nesting of the reporting branches below has been
    reconstructed from a copy of this file without indentation -- confirm
    against the repository.

    :param faksimile_file: svg file that is parsed for faksimile pages
    :param manuscript_file: optional manuscript xml file, passed on to
        get_svgPosFile_and_manuscriptFile
    :param page: optional page; if given, only faksimile pages whose
        svg_pos_file equals page.page_tree.docinfo.URL are processed

    [:return:] exit status (the result of the last merge that has been run)
    """
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
        print(Style.RESET_ALL)
    faksimile_tree = ET.parse(faksimile_file)
    # The default namespace has the key None in nsmap; it is made available as 'ns'.
    namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
    if page is not None:
        # Restrict the faksimile pages to those that belong to the given page.
        faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\
                if get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)[0]\
                == page.page_tree.docinfo.URL ]
    exit_status = 0
    for faksimile_page in faksimile_pages:
        # manuscript_file is replaced by the file that has actually been used for the lookup.
        svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
        if svg_pos_file is not None:
            image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field)
            if page is None:
                page = Page(svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file)
            write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\
                    file_type=FILE_TYPE_SVG_WORD_POSITION)
            if not UNITTESTING:
                print(Fore.LIGHTBLUE_EX + '->', end='')
                print(Fore.CYAN + 'Merging faksimile positions from page {0} with words from file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='')
            # First attempt: automatic merge.
            exit_status = num_unmerged = merge_faksimile_positions_and_words(page, faksimile_page.word_positions)
            if num_unmerged > 0:
                # Second attempt: load the page again from its file, reset the joined
                # flags of the faksimile positions and merge interactively.
                page = Page(page.page_tree.docinfo.URL)
                for carrier in faksimile_page.word_positions:
                    carrier.joined = False
                exit_status = num_unmerged = merge_faksimile_positions_and_words(page, faksimile_page.word_positions, interactive=True)
            if not UNITTESTING:
                if num_unmerged == 0:
                    print(Fore.GREEN + '[OK]')
                    new_words = sort_words(page)
                    # Words without faksimile position or with a differing text are marked as not verified.
                    for word in new_words:
                        if len(word.faksimile_positions) == 0 or word.text != word.faksimile_positions[0].text:
                            word.verified = False
                    if page.is_locked():
                        page.unlock()
                    post_merging_processing_and_saving(svg_pos_file=page.page_tree.docinfo.URL, new_words=new_words, page=page, manuscript_file=manuscript_file)
                else:
                    # Report the ids and texts of everything that could not be merged.
                    print(Fore.RED + f'[ERROR: {num_unmerged} not joined!]\n')
                    print([ (word.id, word.text, word.line_number) for word in page.words if not word.joined])
                    print([ (fp.id, fp.text) for fp in faksimile_page.word_positions if not fp.joined])
                print(Fore.RESET)
            else:
                # Unit test mode: nothing is saved, the result is only printed.
                if num_unmerged > 0:
                    unmerged_words = [ word for word in page.words if not word.joined]
                    unmerged_fps = [ fp for fp in faksimile_page.word_positions if not fp.joined]
                    print([ (word.id, word.text, word.line_number) for word in unmerged_words ])
                    print([ (fp.id, fp.text) for fp in unmerged_fps])
                    if len(unmerged_fps) == 0:
                        for word in page.words:
                            if len(word.faksimile_positions) < 1:
                                print(f'{word.line_number}: {word.id} {word.text}')
                            elif word.text != word.faksimile_positions[0].text:
                                print(f'{word.line_number}: {word.id} {word.text} {[(fp.id,fp.text) for fp in word.faksimile_positions]}')
                else:
                    words = sort_words(page)
                    for word in words:
                        if len(word.faksimile_positions) < 1:
                            print(f'{word.line_number}: {word.id} {word.text}')
                        elif not word.verified and word.text != word.faksimile_positions[0].text:
                            print(f'{word.line_number}: {word.id} {word.text} {[(fp.id,fp.text) for fp in word.faksimile_positions]}')
        # The page belongs to one faksimile page only: the next one gets a new page.
        page = None
    return exit_status
def merge_faksimile_positions_and_words(page, faksimile_positions, interactive=False) -> int:
    """Merge words with faksimile positions.

    The sorted words of the page are merged line by line with the lines of
    faksimile positions that a FaksimileLineComposer creates. Items that are
    still unmerged afterwards are handed to the composer's final fixes and,
    if these do not succeed, to a ManualMergerShell.

    :param page: page whose words are merged; page.words is replaced by the
        merged words if nothing remains unmerged after the final fixes
    :param faksimile_positions: faksimile positions of the corresponding faksimile page
    :param interactive: start the line wise merge in interactive mode

    [:return:] exit code
    """
    sorted_words = sort_words(page)
    mark_unmergeable_words_and_faksimile_positions(sorted_words, faksimile_positions)
    line_numbers = {word.line_number for word in sorted_words}
    composer = FaksimileLineComposer(faksimile_positions, page=page)
    merged_words = []
    if interactive:
        composer.interactive_shell.set_command_history()
    for index, line_number in enumerate(sorted(line_numbers)):
        line_of_words = [word for word in sorted_words if word.line_number == line_number]
        # merge_lines reports whether the following lines are to be merged interactively.
        interactive = composer.merge_lines(line_of_words, merged_words, index, interactive=interactive)
    leftover_words = [word for word in page.words if not word.joined]
    leftover_fps = [fp for fp in faksimile_positions if not fp.joined]
    exit_code = composer.final_fix_for_unmereged_items(merged_words, leftover_words, leftover_fps)
    if exit_code == 0:
        page.words = merged_words
        return exit_code
    # Automatic merging failed: show what has been done so far and let the user merge the rest.
    composer.interactive_shell.print_command_history()
    manual_merger = ManualMergerShell(leftover_words, leftover_fps, merged_words, page=page)
    try:
        exit_code = manual_merger.run()
    except Exception as e:
        exit_code = 666
        print(e)
        manual_merger.print_history()
    return exit_code
def mark_unmergeable_words_and_faksimile_positions(words, faksimile_positions):
    """Mark all words and faksimile_positions for which the number of text instances does not accord.

    Every item gets the attribute mergeable: True if its text occurs equally
    often among the words and among the faksimile positions, False otherwise.
    Items without an instance attribute joined additionally get joined = False.

    :param words: sequence of words (objects with a text attribute)
    :param faksimile_positions: sequence of faksimile positions (objects with
        a text attribute); both sequences are iterated more than once
    """
    from collections import Counter
    from itertools import chain

    # Count each text once per side instead of rescanning both lists for every unique text.
    word_counts = Counter(word.text for word in words)
    fp_counts = Counter(fp.text for fp in faksimile_positions)
    # chain() accepts any two sequences, their types do not have to match.
    for text_carrier in chain(words, faksimile_positions):
        if 'joined' not in text_carrier.__dict__:
            text_carrier.joined = False
        text_carrier.mergeable = word_counts[text_carrier.text] == fp_counts[text_carrier.text]
def text_starts_with(text1, text2, ignoreCase=False) -> bool:
    """Return text1.startswith(text2)

    :param text1: the text that is examined
    :param text2: the expected beginning of text1
    :param ignoreCase: compare the lower case versions of both texts
    """
    text, prefix = (text1.lower(), text2.lower()) if ignoreCase else (text1, text2)
    return text.startswith(prefix)
def usage():
    """prints information on how to use the script, i.e. the docstring of main
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION line wise.

    svgscripts/compare_faksimile_words_line_wise.py [OPTIONS] <FAKSIMILE_DIR|faksimile_svg_file> [xmlManuscriptFile]

        <FAKSIMILE_DIR>         a directory containing <faksimile_svg_file>
        <faksimile_svg_file>    a svg file containing information about the word positions on the faksimile.
        <xmlManuscriptFile>     a xml file about a manuscript, containing information about its pages.

        OPTIONS:
        -h|--help: show help
        -c|--correct-words=DIR: directory used as correction_dir when collecting the faksimile files

        :return: exit code (int): 0 on success, 1 if items remained unmerged, 2 on usage errors
    """
    correct_words_dir = None
    try:
        opts, args = getopt.getopt(argv, "hc:", ["help", "correct-words=", ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-c', '--correct-words'):
            correct_words_dir = arg

    if len(args) < 1:
        usage()
        return 2

    file_a = args[0]
    if not exists(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # The second argument is used as manuscript file only if it exists.
    file_b = args[1] if len(args) > 1 and exists(args[1]) else None
    file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b, correction_dir=correct_words_dir)
    exit_status = 0
    for faksimile_file in file_list:
        # The merge returns a non-zero status if items remained unmerged. It is
        # reported as 1, because the raw status can exceed the range of a process
        # exit code.
        if merge_faksimile_file_and_pages(faksimile_file, manuscript_file=manuscript_file) != 0:
            exit_status = 1
    return exit_status
if __name__ == "__main__":
    # Run the script with the command line arguments (without the program name)
    # and use the result of main as exit status of the process.
    cli_arguments = sys.argv[1:]
    sys.exit(main(cli_arguments))
Event Timeline
Log In to Comment