Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91235631
fix_old_data.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Nov 9, 05:53
Size
28 KB
Mime Type
text/x-python
Expires
Mon, Nov 11, 05:53 (2 d)
Engine
blob
Format
Raw Data
Handle
22166010
Attached To
rNIETZSCHEPYTHON nietzsche-python
fix_old_data.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix old data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
from
deprecated
import
deprecated
from
functools
import
cmp_to_key
import
getopt
import
inspect
import
lxml.etree
as
ET
import
re
import
shutil
import
string
from
svgpathtools
import
svg2paths2
,
svg_to_paths
from
svgpathtools.path
import
Path
as
SVGPath
from
svgpathtools.path
import
Line
import
sys
import
tempfile
from
operator
import
attrgetter
import
os
from
os
import
listdir
,
sep
,
path
,
setpgrp
,
devnull
from
os.path
import
exists
,
isfile
,
isdir
,
dirname
,
basename
from
progress.bar
import
Bar
import
warnings
sys
.
path
.
append
(
'svgscripts'
)
from
convert_wordPositions
import
HTMLConverter
from
datatypes.box
import
Box
from
datatypes.faksimile
import
FaksimilePage
from
datatypes.imprint
import
Imprint
from
datatypes.archival_manuscript
import
ArchivalManuscriptUnity
from
datatypes.mark_foreign_hands
import
MarkForeignHands
from
datatypes.matrix
import
Matrix
from
datatypes.page
import
Page
,
STATUS_MERGED_OK
,
STATUS_POSTMERGED_OK
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
from
datatypes.positional_word_part
import
PositionalWordPart
from
datatypes.path
import
Path
from
datatypes.word
import
Word
from
datatypes.text_connection_mark
import
TextConnectionMark
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.transkription_position
import
TranskriptionPosition
from
datatypes.word
import
Word
,
update_transkription_position_ids
from
join_faksimileAndTranskription
import
sort_words
from
util
import
back_up
,
back_up_svg_file
,
copy_faksimile_svg_file
,
reset_tp_with_matrix
from
process_files
import
update_svgposfile_status
,
get_extended_text_field
from
process_footnotes
import
save_imprints
from
process_words_post_merging
import
update_faksimile_line_positions
,
MERGED_DIR
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
,
xml_has_type
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
from
main_util
import
create_function_dictionary
,
get_manuscript_files
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# Set to True by the test suite so that the fixer functions skip writing
# files and printing progress messages.
UNITTESTING = False
# Upper bound (in svg user units) for the search-window half-width used by
# _get_nodes_with_symbol_id when locating glyph nodes by their x/y position.
MAX_SVG_XY_THRESHOLD = 10

#TODO: fix all svg graphical files: change xlink:href to href!!!!
def convert_old_matrix(tp, xmin, ymin) -> (Matrix, float, float):
    """Return a new transformation matrix plus x/y offsets for an old transkription_position.

    The clone of tp.transform gets xmin/ymin added to its translation
    entries; x is tp.left re-expressed relative to the shifted matrix
    (0 when tp.left is not positive), and y is the negated height offset.
    """
    new_matrix = tp.transform.clone_transformation_matrix()
    # shift the translation entries of the cloned matrix by xmin/ymin
    for index, offset in ((Matrix.XINDEX, xmin), (Matrix.YINDEX, ymin)):
        new_matrix.matrix[index] = round(tp.transform.matrix[index] + offset, 3)
    new_x = 0
    if tp.left > 0:
        new_x = round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)
    new_y = round(-(tp.height - 1.5), 3)
    return new_matrix, new_x, new_y
def extend_text_field(page, redo=False) -> bool:
    """Extend the text_field of page.svg_image and shrink its svg file to it.

    [:return:] True when the page was updated, False when it had already
    been changed by this script and redo is not set.
    """
    if not redo and page_already_changed(page):
        return False
    extended_field = get_extended_text_field(page.svg_image.file_name,
            multipage_index=page.multipage_index)
    # mirror the extended field onto the svg image and re-attach it to the tree
    page.svg_image.width = extended_field.width
    page.svg_image.height = extended_field.height
    page.svg_image.text_field = extended_field
    page.svg_image.attach_object_to_tree(page.page_tree)
    # update the transkription field geometry and crop the svg file to it
    field = TranskriptionField(page.svg_image.file_name, multipage_index=page.multipage_index)
    field.xmin, field.ymin = extended_field.left, extended_field.top
    field.width, field.height = extended_field.width, extended_field.height
    field.shrink_svg_to_transkription_field(redo=True)
    if not UNITTESTING:
        save_page(page)
    return True
def save_page(page, attach_first=False, backup=False, script_name=None):
    """Write the page back to its xml file.

    backup       -- back up page.xml_file before writing
    attach_first -- attach the words to the tree before writing
    script_name  -- recorded as the modifying script; defaults to
                    "<this file>:<name of the calling function>"
    """
    if backup:
        back_up(page, page.xml_file)
    if attach_first:
        page.update_and_attach_words2tree()
    if script_name is None:
        # identify the function that requested the save (one frame up)
        caller_name = inspect.currentframe().f_back.f_code.co_name
        script_name = f'{__file__}:{caller_name}'
    write_pretty(xml_element_tree=page.page_tree,
                 file_name=page.page_tree.docinfo.URL,
                 script_name=script_name,
                 file_type=FILE_TYPE_SVG_WORD_POSITION)
def page_already_changed(page) -> bool:
    """Return True if the calling function has already modified this page.

    Looks for a //metadata/modifiedBy entry whose @script is
    "<this file>:<name of the calling function>".
    """
    script_id = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}'
    matches = page.page_tree.xpath(f'//metadata/modifiedBy[@script="{script_id}"]')
    return len(matches) > 0
def fix_faksimile_line_position(page, redo=False) -> bool:
    """Create faksimile line positions for the page.

    [:return:] True when the line positions were (re)created, False when
    the page was already changed by this script and redo is not set.
    """
    if not redo and page_already_changed(page):
        # fixed: dropped the stray trailing semicolon after this return
        return False
    update_faksimile_line_positions(page)
    if not UNITTESTING:
        save_page(page)
    return True
def check_faksimile_positions(page, redo=False) -> bool:
    """Check the faksimile word positions of the page against its svg data source.

    Reads the svg file referenced by //data-source/@file, and for each word
    position that differs from the corresponding rect in the faksimile svg,
    copies left/top (and a recomputed bottom) from the svg rect back onto
    the word position.

    [:return:] True when at least one position was changed.
        NOTE(review): returns None implicitly when the page has no
        //data-source/@file entry.
    """
    if len(page.page_tree.xpath('//data-source/@file')) > 0:
        svg_file = page.page_tree.xpath('//data-source/@file')[0]
        svg_tree = ET.parse(svg_file)
        positions_are_equal_counter = 0
        page_changed = False
        for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree):
            # only the faksimile page matching this page's title and number
            if page.title == faksimile_page.title\
               and page.number == faksimile_page.page_number:
                #print([fp.id for fp in faksimile_page.word_positions ])
                for word in page.words:
                    for fp in word.faksimile_positions:
                        # rects in the faksimile svg sharing this position's id
                        rect_fps = [ rfp for rfp in faksimile_page.word_positions if rfp.id == fp.id ]
                        if len(rect_fps) > 0:
                            rfp = rect_fps[0]
                            if fp.left != rfp.left or fp.top != rfp.top:
                                #print(f'{fp.id}: {fp.left}/{rfp.left} {fp.top}/{rfp.top}')
                                fp.left = rfp.left
                                fp.top = rfp.top
                                # bottom is derived from the new top plus the rect height
                                fp.bottom = fp.top + rfp.height
                                word.attach_word_to_tree(page.page_tree)
                                page_changed = True
                            else:
                                positions_are_equal_counter += 1
                print(f'{positions_are_equal_counter}/{len(page.words)} are equal')
        if page_changed and not UNITTESTING:
            save_page(page)
        return page_changed
def fix_faksimile_positions(page, redo=False) -> bool:
    """Convert the faksimile positions of all words to absolute values.

    Adds the page text field's xmin/ymin to every faksimile position.
    [:return:] fixed
    """
    if not redo:
        modified = page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}"]')
        if len(modified) > 0:
            return False
    offset_x = page.text_field.xmin
    offset_y = page.text_field.ymin
    for word in page.words:
        for fp in word.faksimile_positions:
            fp.left += offset_x
            fp.top += offset_y
            fp.bottom += offset_y
        word.attach_word_to_tree(page.page_tree)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,
                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return True
def _fix_tp_of_word(page, word, text_field):
    """Shift the transkription positions of *word* from text_field-relative
    coordinates to absolute coordinates, then recurse into its sub-words.
    """
    for tp in word.transkription_positions:
        tp.left += text_field.left
        tp.top += text_field.top
    reset_tp_with_matrix(word.transkription_positions)
    if type(word) == Word:
        # BUG FIX: the original iterated word.__dict__.items(), whose
        # (key, value) tuples can never be of type Word, so Word-typed
        # attributes were silently skipped; iterate the values instead.
        words_in_word = word.word_parts\
                + [ attr for attr in word.__dict__.values() if type(attr) == Word ]
        for wp in words_in_word:
            _fix_tp_of_word(page, wp, text_field)
def fix_tp_with_matrix(page, redo=False) -> bool:
    """Fix transkription positions that carry a rotation matrix
    ->set left to 0 and top to -5.

    [:return:] fixed
    """
    if page.svg_image is None or page.svg_image.text_field is None:
        x_offset = y_offset = 0
    else:
        x_offset = page.svg_image.text_field.left
        y_offset = page.svg_image.text_field.top
    for word in page.words:
        reset_tp_with_matrix(word.transkription_positions, tr_xmin=x_offset, tr_ymin=y_offset)
        for part in word.word_parts:
            reset_tp_with_matrix(part.transkription_positions, tr_xmin=x_offset, tr_ymin=y_offset)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page, attach_first=True)
    return True
def _fix_old_transkription_positions(page, redo=False) -> bool:
    """Fix transkription positions ->set relative to 0,0 instead of
    text_field.left,text_field.top.

    Ensures page.svg_image has a text_field, then shifts line numbers and
    the transkription positions of all words, marks of foreign hands and
    text connection marks by the text field's left/top offsets.
    [:return:] fixed
    """
    if page.svg_image is not None\
       and page.svg_image.text_field is None:
        # NOTE(review): this inner branch is dead code — the enclosing
        # condition already guarantees page.svg_image is not None. It also
        # references undefined names (tf, svg_file) and SVGImage, which is
        # not imported; it would raise NameError if it ever ran.
        if page.svg_image is None:
            if page.svg_file is not None:
                transkription_field = TranskriptionField(page.svg_file)
                width = round(tf.documentWidth, 3)
                height = round(tf.documentHeight, 3)
                page.svg_image = SVGImage(file_name=svg_file, width=width,\
                        height=height, text_field=transkription_field.convert_to_text_field())
                page.svg_image.attach_object_to_tree(page.page_tree)
            else:
                raise Exception(f'ERROR page {page.page_tree.docinfo.URL} does not have a svg_file!')
        elif page.svg_image.text_field is None:
            # derive the missing text field from the svg file itself
            page.svg_image.text_field = TranskriptionField(page.svg_image.file_name).convert_to_text_field()
            page.svg_image.attach_object_to_tree(page.page_tree)
        # shift line numbers by the text field's vertical offset
        for line_number in page.line_numbers:
            line_number.top += page.svg_image.text_field.top
            line_number.bottom += page.svg_image.text_field.top
            line_number.attach_object_to_tree(page.page_tree)
        # shift the transkription positions of every word-like object
        for word in page.words:
            _fix_tp_of_word(page, word, page.svg_image.text_field)
        for mark in page.mark_foreign_hands:
            _fix_tp_of_word(page, mark, page.svg_image.text_field)
        for tcm in page.text_connection_marks:
            _fix_tp_of_word(page, tcm, page.svg_image.text_field)
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, attach_first=True)
        return True
    return False
def _fix_old_pwps(page, old_tps):
    """Shift the positional_word_parts below each given transkription
    position node by the svg image's text field offsets (left for 'left',
    top for both 'top' and 'bottom')."""
    x_offset = page.svg_image.text_field.left
    y_offset = page.svg_image.text_field.top
    for tp in old_tps:
        for pwp in tp.xpath(f'./{PositionalWordPart.XML_TAG}'):
            for attribute, offset in (('left', x_offset), ('top', y_offset), ('bottom', y_offset)):
                pwp.set(attribute, str(float(pwp.get(attribute)) + offset))
def _fix_quotation_mark_tps(page, old_tps):
    """Enlarge the height of transkription positions containing quotation marks.

    For each position node, the height is increased by the vertical distance
    between its tallest positional word part and its topmost one.
    """
    for tp in old_tps:
        parts = tp.xpath(f'./{PositionalWordPart.XML_TAG}')
        tallest = max(parts, key=lambda pwp: float(pwp.get('height')))
        topmost = min(parts, key=lambda pwp: float(pwp.get('top')))
        delta = abs(float(tallest.get('top')) - float(topmost.get('top')))
        tp.set('height', str(float(tp.get('height')) + delta))
def fix_transkription_positions(page, redo=False) -> bool:
    """Fix transkription positions ->set relative to 0,0 instead of
    text_field.left,text_field.top.

    First migrates old pages (svg_image without text_field) via
    _fix_old_transkription_positions, then repairs positional word parts
    that drifted more than THRESHOLD from their parent, and finally fixes
    the heights of positions whose tallest part is not the topmost one
    (quotation marks).
    [:return:] fixed
    """
    # maximum allowed horizontal drift between a positional word part
    # and its parent transkription position
    THRESHOLD = 10
    if page.svg_image is not None\
       and page.svg_image.text_field is None:
        if not _fix_old_transkription_positions(page):
            return False
    # parents of first (@id="0") positional word parts that drifted too far left/right
    _fix_old_pwps(page, [ pwp.getparent() for pwp in page.page_tree.xpath(f'//{PositionalWordPart.XML_TAG}[@id="0"]')\
            if abs(float(pwp.get('left')) - float(pwp.getparent().get('left'))) > THRESHOLD ])
    # transkription positions whose tallest part is not also the topmost part
    _fix_quotation_mark_tps(page, [ tp for tp in page.page_tree.xpath(f'//{TranskriptionPosition.XML_TAG}')\
            if len(tp.xpath(f'./{PositionalWordPart.XML_TAG}')) > 0\
            and sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('height')), reverse=True)[0]\
            != sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('top')))[0] ])
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page)
    return True
def fix_styles(page, redo=False):
    """Remove all but the first //style node from the page tree."""
    style_nodes = page.page_tree.xpath('//style')
    if len(style_nodes) > 1:
        for extra_node in style_nodes[1:]:
            extra_node.getparent().remove(extra_node)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page)
    return True
def fix_imprints(page, redo=False):
    """Create imprints for the page if its tree does not contain any yet.

    NOTE(review): the original docstring ("Remove unused styles from
    tree.") was copy-pasted from fix_styles and did not match this code.
    """
    if len(page.page_tree.xpath('//' + Imprint.XML_TAG)) == 0:
        save_imprints(page)
    return True
def merge_transkription_positions(page, redo=False) -> bool:
    """Fix transkription positions of merged words.

    Loads the pre-merge version of the page from the MERGED_DIR sibling
    directory, syncs its words line by line with the current page's words,
    and adopts the synced transkription positions. When a word's text does
    not match the text of its synced positions, the user is asked
    interactively whether to re-sync that word's line with force_sync.
    [:return:] fixed (False when no merged copy of the page exists)
    """
    if not isdir(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR)\
       or not isfile(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)):
        return False
    merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL))
    sync_dictionary = sync_words_linewise(merged_page.words, page.words, merged_page.line_numbers)
    words = []
    for source_word in merged_page.words:
        words.append(source_word)
        if bool(sync_dictionary.get(source_word)):
            _sync_transkriptions_with_words(source_word, sync_dictionary)
        if source_word.text != ''.join([ t.get_text() for t in source_word.transkription_positions ]):
            # the synced positions do not spell the word's text -> ask the user
            text = ''.join([ t.get_text() for t in source_word.transkription_positions ])
            print(f'{source_word.line_number}: {source_word.text} has transkription_positions with text "{text}".')
            response = input('Change? [Y/n]>')
            if not response.startswith('n'):
                # re-sync only this word's line, forcing a match for source_word
                new_sync_dictionary = sync_words_linewise(merged_page.words, page.words,\
                        [ line for line in merged_page.line_numbers if line.id == source_word.line_number ],\
                        force_sync_on_word=source_word)
                if bool(new_sync_dictionary.get(source_word)):
                    _sync_transkriptions_with_words(source_word, new_sync_dictionary)
                else:
                    raise Exception(f'Could not find sourc_word {source_word.text} in {new_sync_dictionary}!')
    page.words = words
    page.update_and_attach_words2tree()
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page)
    return True
def fix_graphical_svg_file(page, redo=False) -> bool:
    """Fix the glyphs of words for which there is a /changed-word or
    /deleted-word entry in page.page_tree.

    Deleted words get their svg use-nodes hidden; changed words get their
    x attribute shifted by the difference between the current and the
    recorded position. The svg file is backed up first and then rewritten.
    [:return:] fixed
    """
    svg_tree = ET.parse(page.svg_file)
    transkription_field = TranskriptionField(page.source)
    # lxml maps the default namespace to None; rename it 'ns' for xpath use
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    back_up_svg_file(svg_tree, namespaces=namespaces)
    # old pages (no text_field on the svg image) still store coordinates
    # relative to the transkription field, so add its offsets back
    tr_xmin = transkription_field.xmin if (page.svg_image is None or page.svg_image.text_field is None) else 0
    tr_ymin = transkription_field.ymin if (page.svg_image is None or page.svg_image.text_field is None) else 0
    for deleted_word_node in page.page_tree.xpath('//deleted-word'):
        deleted_word = Word.create_cls(deleted_word_node)
        _run_function_on_nodes_for_word(svg_tree, namespaces, deleted_word, tr_xmin, tr_ymin,\
                _set_node_attribute_to, 'visibility', 'hidden')
    for changed_word_node in page.page_tree.xpath('//changed-word'):
        changed_word = Word.create_cls(changed_word_node)
        try:
            word = [ word for word in page.words if word.id == changed_word.id and word.text == changed_word.text ][0]
            left_difference = word.transkription_positions[0].left - changed_word.transkription_positions[0].left
            _run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin,\
                    _add_value2attribute, 'x', left_difference)
        except IndexError:
            warnings.warn(f'There is no word for changed_word {changed_word.id}: "{changed_word.text}" in {page.page_tree.docinfo.URL}!')
    copy_faksimile_svg_file(target_file=page.svg_file, faksimile_tree=svg_tree, namespaces=namespaces)
    # fixed: the function is annotated -> bool but used to fall off the end
    # returning None, so main() never counted pages processed by it
    return True
def
_add_value2attribute
(
node
,
attribute
,
value
):
"""Add left_difference to x of node.
"""
node
.
set
(
attribute
,
str
(
float
(
node
.
get
(
attribute
))
+
value
))
node
.
set
(
'changed'
,
'true'
)
def _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=0.1) -> list:
    """Return un-'changed' //ns:use nodes referencing #symbol_id near (svg_x, svg_y).

    The search window is widened by 1 unit at a time (while threshold stays
    below MAX_SVG_XY_THRESHOLD) until at least one node is found.
    """
    # iterative version of the original's tail recursion
    while True:
        candidates = svg_tree.xpath(\
                f'//ns:use[@xlink:href="#{symbol_id}" and @x > {svg_x-threshold} and @x < {svg_x+threshold} and @y > {svg_y-threshold} and @y < {svg_y+threshold} ]',\
                namespaces=namespaces)
        nodes = [ node for node in candidates if not bool(node.get('changed')) ]
        if len(nodes) > 0 or threshold >= MAX_SVG_XY_THRESHOLD:
            return nodes
        threshold += 1
def
_run_function_on_nodes_for_word
(
svg_tree
,
namespaces
,
word
,
tr_xmin
,
tr_ymin
,
function_on_node
,
attribute
,
value
):
"""Run function on nodes for words.
"""
for
tp
in
word
.
transkription_positions
:
for
pwp
in
tp
.
positional_word_parts
:
symbol_id
=
pwp
.
symbol_id
svg_x
=
pwp
.
left
+
tr_xmin
svg_y
=
pwp
.
bottom
+
tr_ymin
nodes
=
_get_nodes_with_symbol_id
(
svg_tree
,
namespaces
,
symbol_id
,
svg_x
,
svg_y
)
if
len
(
nodes
)
>
0
:
node
=
nodes
[
0
]
function_on_node
(
node
,
attribute
,
value
)
def
_set_node_attribute_to
(
node
,
attribute
,
value
):
"""Set attribute of node to value.
"""
node
.
set
(
attribute
,
str
(
value
))
node
.
set
(
'changed'
,
'true'
)
def sync_words_linewise(source_words, target_words, lines, force_sync_on_word=None) -> dict:
    """Sync words line by line and return a dictionary mapping each source
    word to the list of corresponding target words.

    Every word first gets a processed=False flag; per line, words are
    sorted by the left edge of their first transkription position and
    matched by _sync_same_length or _sync_more_target_words.
    """
    result_dict = {}
    for word in target_words + source_words:
        word.processed = False
    left_edge = lambda word: word.transkription_positions[0].left
    for line in lines:
        on_line_src = sorted((w for w in source_words if w.line_number == line.id), key=left_edge)
        on_line_tgt = sorted((w for w in target_words if w.line_number == line.id), key=left_edge)
        if len(on_line_tgt) == len(on_line_src):
            _sync_same_length(result_dict, on_line_src, on_line_tgt, force_sync_on_word=force_sync_on_word)
        elif len(on_line_src) < len(on_line_tgt):
            _sync_more_target_words(result_dict, on_line_src, on_line_tgt, force_sync_on_word=force_sync_on_word)
        else:
            print('okey dokey')
    return result_dict
def _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict):
    """Interactively force a sync between *force_sync_on_word* and a
    user-chosen subset of the still unprocessed target words on the line.

    The user may answer with a range "a-b", space-separated indices, or
    nothing (= all unprocessed words). Raises Exception when there are no
    unprocessed target words left on the line.
    """
    unprocessed_target_words = [t_word for t_word in target_words_on_line if not t_word.processed]
    if len(unprocessed_target_words) > 0:
        print([ (i, t_word.text) for i, t_word in enumerate(unprocessed_target_words) ])
        response = input(f'Please specify indices of words to sync {force_sync_on_word.text} with: [default:0-{len(unprocessed_target_words)-1}]>')
        # default: use every unprocessed target word
        indices = [ i for i in range(0, len(unprocessed_target_words)) ]
        if re.match(r'\d+-\d+', response):
            # inclusive range "a-b"
            index_strings = response.split('-')
            indices = [ i for i in range(int(index_strings[0]), int(index_strings[1])+1) ]
        elif response != '':
            # space-separated list of indices
            indices = [ int(i) for i in response.split(' ') ]
        target_words = []
        for i in indices:
            target_words.append(unprocessed_target_words[i])
        result_dict.update({ force_sync_on_word: target_words })
    else:
        raise Exception(f'There are no unprocessed target_words for {force_sync_on_word.text} on line {force_sync_on_word.line_number}!')
def
_sync_transkriptions_with_words
(
word
,
sync_dictionary
):
"""Sync transkription_positions of word with syncronized words.
"""
word
.
transkription_positions
=
[]
for
target_word
in
sync_dictionary
[
word
]:
word
.
transkription_positions
+=
target_word
.
transkription_positions
def _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None):
    """Sync a line on which there are more target words than source words.

    Walks the target words left to right; a source word whose text equals
    a target word's text is matched 1:1, a source word whose text starts
    with a target word's text becomes the "current" word and collects
    subsequent target words until their concatenated text equals its text.
    Unmatched target words trigger a warning; force_sync_on_word is
    resolved interactively at the end.
    """
    current_source_word = None
    for target_word in target_words_on_line:
        if current_source_word is not None\
           and current_source_word.text.startswith(''.join([ w.text for w in result_dict[current_source_word] ]) + target_word.text):
            # target_word continues the text of the currently collected source word
            result_dict[current_source_word].append(target_word)
            target_word.processed = True
            if current_source_word.text == ''.join([ w.text for w in result_dict[current_source_word] ]):
                # collection complete
                current_source_word = None
        elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ]) > 0:
            # exact 1:1 match with an unprocessed source word
            source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ][0]
            target_word.processed = True
            source_word.processed = True
            result_dict.update({ source_word: [ target_word ] })
        elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ]) > 0:
            # target_word is a prefix of an unprocessed source word -> start collecting
            current_source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ][0]
            current_source_word.processed = True
            target_word.processed = True
            result_dict.update({ current_source_word: [ target_word ] })
        else:
            msg = f'On line {target_word.line_number}: target_word "{target_word.text}" does not have a sibling in {[ s.text for s in source_words_on_line if not s.processed ]}'
            warnings.warn(msg)
    if force_sync_on_word is not None:
        _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict)
def _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None):
    """Sync a line on which source and target have the same number of words.

    Matches each source word with the target word at the same index when
    their texts agree, otherwise with the first unprocessed target word of
    equal text. Unmatched source words trigger a warning;
    force_sync_on_word is resolved interactively at the end.
    """
    for i, word in enumerate(source_words_on_line):
        if word.text == target_words_on_line[i].text:
            # positional match
            word.processed = True
            target_words_on_line[i].processed = True
            result_dict.update({ word: [ target_words_on_line[i] ] })
        elif len([ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ]) > 0:
            # same text elsewhere on the line
            target_word = [ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ][0]
            word.processed = True
            target_word.processed = True
            result_dict.update({ word: [ target_word ] })
        else:
            msg = f'On line {word.line_number}: source_word "{word.text}" does not have a sibling in {[ s.text for s in target_words_on_line]}'
            warnings.warn(msg)
    if force_sync_on_word is not None:
        _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict)
def usage():
    """Print information on how to use this script (main's docstring)."""
    print(main.__doc__)
def main(argv):
    """This program can be used to fix old data.
    svgscripts/fix_old_data.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

        <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
        <svg_pos_file> a xml file about a page, containing information about svg word positions.

        OPTIONS:
        -h|--help                          show help
        -c|--check-faksimile-positions     check whether faksimile positions have been updated
        -e|--update-extended-textfield     update extended textfield to svg_image
        -i|--fix-imprints                  add imprints to page
        -l|--faksimile-line-position       create faksimile line positions
        -p|--faksimile-positions           fix old faksimile positions
        -r|--redo                          rerun
        -s|--fix-graphical-svg             fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file.
        -S|--fix-styles                    fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file.
        -t|--transkription-positions      fix old transkription positions
        -M|--matrix                        fix old transkription positions with transform matrix

        :return: exit code (int)
    """
    # map command line options to fixer functions; 'default' is used when
    # no function option is given
    function_list = []
    function_dict = create_function_dictionary(['-c', '--check-faksimile-positions'], check_faksimile_positions)
    function_dict = create_function_dictionary(['-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-m', '--merge-positions'], merge_transkription_positions, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-s', '--fix-graphical-svg'], fix_graphical_svg_file, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-M', '--matrix'], fix_tp_with_matrix, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-t', '--transkription-positions'], fix_transkription_positions, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-S', '--fix-styles'], fix_styles, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-i', '--fix-imprints'], fix_imprints, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['default', '-e', '--update-extended-textfield'], extend_text_field, function_dictionary=function_dict)
    redo = False;
    try:
        opts, args = getopt.getopt(argv, "hcplrmsStMie", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position",\
                "redo", "merge-positions", "fix-graphical-svg", "fix-styles", "transkription-positions", 'matrix', 'fix-imprints', 'update-extended-textfield'])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-r', '--redo'):
            redo = True;
        elif opt in function_dict.keys():
            function_list.append(function_dict[opt])
    if len(function_list) == 0:
        function_list.append(function_dict['default'])
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    for xml_file in get_manuscript_files(args):
        if isfile(xml_file):
            # count, per function, how many pages were actually changed
            counters = { f.__name__: 0 for f in function_list }
            for current_function in function_list:
                # select pages by status: faksimile fixers need merged pages,
                # extend_text_field needs 'blank' pages, everything else 'OK'
                status_contains = STATUS_MERGED_OK if 'faksimile' in current_function.__name__ else 'OK'
                if 'extend_text_field' in current_function.__name__:
                    status_contains = 'blank'
                for page in Page.get_pages_from_xml_file(xml_file, status_contains=status_contains):
                    if not UNITTESTING:
                        print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' + Style.RESET_ALL)
                        back_up(page, page.xml_file)
                    # a fixer returns a truthy value when it changed the page
                    counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0
            if not UNITTESTING:
                for function_name, counter in counters.items():
                    print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]')
        else:
            raise FileNotFoundError('File {} does not exist!'.format(xml_file))
    return exit_status
if __name__ == "__main__":
    # run the command line interface and propagate its exit code to the shell
    sys.exit(main(sys.argv[1:]))
Event Timeline
Log In to Comment