Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F85139693
util.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Sep 27, 01:24
Size
29 KB
Mime Type
text/x-python
Expires
Sun, Sep 29, 01:24 (2 d)
Engine
blob
Format
Raw Data
Handle
21133869
Attached To
rNIETZSCHEPYTHON nietzsche-python
util.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to copy a faksimile svg file with the option of highlighting some word boxes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
from
datetime
import
datetime
from
functools
import
cmp_to_key
import
getopt
import
inspect
import
itertools
import
lxml.etree
as
ET
import
re
import
shutil
import
signal
import
string
import
subprocess
from
svgpathtools
import
svg_to_paths
import
sys
import
tempfile
import
os
from
os
import
listdir
,
sep
,
path
,
setpgrp
,
devnull
,
makedirs
from
os.path
import
basename
,
commonpath
,
dirname
,
exists
,
isfile
,
isdir
,
realpath
,
splitext
import
warnings
import
wget
import
xml.etree.ElementTree
as
XET
if
dirname
(
__file__
)
not
in
sys
.
path
:
sys
.
path
.
append
(
dirname
(
__file__
))
from
datatypes.faksimile
import
FaksimilePage
,
get_paths_inside_rect
from
datatypes.faksimile_image
import
FaksimileImage
from
datatypes.lineNumber
import
LineNumber
from
datatypes.mark_foreign_hands
import
MarkForeignHands
from
datatypes.matrix
import
Matrix
from
datatypes.page
import
Page
from
datatypes.page_creator
import
PageCreator
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.transkription_position
import
TranskriptionPosition
from
datatypes.word
import
Word
,
update_transkription_position_ids
from
local_config
import
FAKSIMILE_LOCATION
,
PDF_READER
,
SVG_EDITOR
,
USER_ROOT_LOCATION_DICT
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# When True, the record_changes_on_*_to_page functions below do not write
# the updated page back to disk.
UNITTESTING = False
# Default fill colour and opacity used by create_highlighted_svg_file.
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
class ExternalViewer:
    """Show files with the external viewer registered for their extension.

    Viewers are looked up in ``file_format_viewer_dict`` by file extension;
    files whose extension has no registered viewer are skipped.
    """
    file_format_viewer_dict = {'.pdf': PDF_READER, '.svg': SVG_EDITOR}

    @classmethod
    def show_files(cls, single_file=None, list_of_files=None):
        """Open file(s) with the corresponding external viewer(s).

        Every file but the last is opened in a background process; the last
        file is opened with ``subprocess.run``, so this call blocks until that
        viewer exits. Afterwards the background viewers are terminated.

        The lists passed in are never modified.

        :param single_file: a file name, or a list of file names (in which
            case ``list_of_files`` is ignored).
        :param list_of_files: file names to open; a non-list ``single_file``
            is shown after them.
        """
        if isinstance(single_file, list):
            files = list(single_file)
        else:
            files = [] if list_of_files is None else list(list_of_files)
            if single_file is not None:
                files.append(single_file)
        background_viewers = []
        try:
            for index, file2open in enumerate(files):
                viewer = cls.file_format_viewer_dict.get(splitext(file2open)[1])
                if viewer is None:
                    continue
                if index < len(files) - 1:
                    # Own session => own process group, so the viewer and its
                    # children can be terminated together below.
                    background_viewers.append(subprocess.Popen(
                        [viewer, file2open], stdout=subprocess.DEVNULL,
                        stderr=subprocess.DEVNULL, preexec_fn=os.setsid))
                else:
                    subprocess.run([viewer, file2open])
        finally:
            for process in background_viewers:
                try:
                    os.killpg(os.getpgid(process.pid), signal.SIGTERM)
                except ProcessLookupError:
                    # The viewer is already gone; nothing left to terminate.
                    pass
def back_up(page: Page, reference_file, bak_dir='./bak') -> str:
    """Write a timestamped copy of the page's xml tree into the backup directory.

    The backup directory is created if it does not exist. The path of the
    backup is stored on ``page.bak_file`` as well.

    :param page: page whose ``page_tree`` is written to the backup file.
    :param reference_file: file name recorded in the script name handed to the writer.
    :param bak_dir: directory that receives the backup file.
    :return: target_file_name
    """
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    backup_name = basename(page.page_tree.docinfo.URL) + '_' + timestamp
    page.bak_file = sep.join((bak_dir, backup_name))
    function_name = inspect.currentframe().f_code.co_name
    write_pretty(
        xml_element_tree=page.page_tree,
        file_name=page.bak_file,
        script_name=__file__ + f'({function_name},{reference_file})',
        file_type=FILE_TYPE_SVG_WORD_POSITION,
    )
    return page.bak_file
def back_up_svg_file(svg_tree: ET.ElementTree, namespaces=None, bak_dir='./bak') -> str:
    """Write a timestamped copy of a svg tree into the backup directory.

    The backup directory is created if it does not exist.

    :param svg_tree: parsed svg tree; its ``docinfo.URL`` supplies the file name.
    :param namespaces: prefix -> uri map; derived from the root's nsmap when
        omitted (the default namespace is mapped to the prefix ``ns``).
    :param bak_dir: directory that receives the backup file.
    :return: target_file_name
    """
    if namespaces is None:
        namespaces = {}
        for prefix, uri in svg_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    backup_name = timestamp + '_' + basename(svg_tree.docinfo.URL)
    bak_file = sep.join((bak_dir, backup_name))
    copy_faksimile_svg_file(target_file=bak_file, faksimile_tree=svg_tree, namespaces=namespaces)
    return bak_file
def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, abs_image_path=None, local_image_path=None, namespaces=None):
    """Copy a faksimile svg file to target_file, optionally rewriting its image references.

    :param target_file: name of the file to write; joined onto
        ``target_directory`` when both are given.
    :param faksimile_source_file: svg file to copy; defaults to the URL of
        ``faksimile_tree``.
    :param faksimile_tree: already parsed tree to serialize instead of
        re-reading ``faksimile_source_file``.
    :param target_directory: directory for the copy; without ``target_file``
        the source file's base name is used.
    :param abs_image_path: if set, written to the first image node's
        ``sodipodi:absref`` attribute.
    :param local_image_path: if set, written to the first image node's
        ``xlink:href`` attribute.
    :param namespaces: prefix -> uri map (keys ``ns``, ``xlink``,
        ``sodipodi``); derived from the svg root attributes when omitted.
        The mapping passed in is not modified.
    :raises Exception: if neither a source file nor a tree is given, or
        neither a target file nor a target directory.
    """
    if faksimile_source_file is None:
        if faksimile_tree is None:
            raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
        faksimile_source_file = faksimile_tree.docinfo.URL
    if target_directory is not None:
        target_name = target_file if target_file is not None else basename(faksimile_source_file)
        target_file = target_directory + sep + target_name
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    _, _, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True)
    # Register the source's prefixes so the written file keeps them.
    for key, uri in svg_attributes.items():
        if not key.startswith('xmlns:'):
            continue
        try:
            XET.register_namespace(key.replace('xmlns:', ''), uri)
        except ValueError:
            # register_namespace rejects reserved/invalid prefixes; skip those.
            pass
    XET.register_namespace('', 'http://www.w3.org/2000/svg')
    if namespaces is None:
        namespaces = {
            'ns': svg_attributes['xmlns'],
            'xlink': svg_attributes['xmlns:xlink'],
            'sodipodi': svg_attributes.get('xmlns:sodipodi') or svg_attributes.get('sodipodi'),
        }
    else:
        # Work on a copy: the fallback below must not leak into the caller's dict.
        namespaces = dict(namespaces)
    if not namespaces.get('sodipodi'):
        namespaces['sodipodi'] = 'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd'
    if faksimile_tree is not None:
        # Round-trip through a string to obtain a stdlib ElementTree.
        if isinstance(faksimile_tree, ET._ElementTree):
            serialized = ET.tostring(faksimile_tree)
        else:
            serialized = XET.tostring(faksimile_tree.getroot())
        target_tree = XET.ElementTree(XET.fromstring(serialized))
    else:
        target_tree = XET.parse(faksimile_source_file)
    if local_image_path is not None or abs_image_path is not None:
        image_node = target_tree.find('.//ns:image', namespaces=namespaces)
        if image_node is not None:
            if local_image_path is not None:
                image_node.set('{%s}href' % namespaces['xlink'], local_image_path)
            if abs_image_path is not None:
                image_node.set('{%s}absref' % namespaces['sodipodi'], abs_image_path)
    target_tree.write(target_file)
def copy_faksimile_update_image_location(faksimile_source_file=None, faksimile_tree=None, target_file=None, target_directory=None, overwrite=False):
    """Copy a faksimile_svg_file to target_file and update image location.

    The first image node of the source is inspected to work out the image
    paths that are handed to copy_faksimile_svg_file. If the target file
    already exists and ``overwrite`` is false, nothing is copied and a
    warning is issued instead.

    :param faksimile_source_file: svg file to copy; defaults to the URL of
        ``faksimile_tree``.
    :param faksimile_tree: parsed tree to use instead of parsing the source file.
    :param target_file: file to write; joined onto ``target_directory`` when
        both are given.
    :param target_directory: directory for the copy; defaults to the
        directory of ``target_file``.
    :param overwrite: copy even if ``target_file`` already exists.
    :raises Exception: if neither a source file nor a tree is given, or
        neither a target file nor a target directory.
    """
    # NOTE(review): both error messages below name copy_faksimile_svg_file
    # although they are raised by this function.
    if faksimile_source_file is None and faksimile_tree is not None:
        faksimile_source_file = faksimile_tree.docinfo.URL
    elif faksimile_source_file is None:
        raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
    if target_file is not None and target_directory is not None:
        target_file = target_directory + sep + target_file
    elif target_file is None and target_directory is not None:
        target_file = target_directory + sep + basename(faksimile_source_file)
    elif target_directory is None and target_file is not None:
        target_directory = dirname(target_file)
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    source_tree = ET.parse(faksimile_source_file) if faksimile_tree is None else faksimile_tree
    # Map the default namespace (key None) to the prefix 'ns' for xpath use.
    namespaces = { k if k is not None else 'ns': v for k, v in source_tree.getroot().nsmap.items() }
    image_nodes = source_tree.xpath('//ns:image', namespaces=namespaces)
    local_image_path = None
    abs_image_path = None
    user_abs_image_path = None
    if len(image_nodes) > 0:
        image = FaksimileImage.CREATE_IMAGE(image_nodes[0], source_file=faksimile_source_file)
        abs_image_path = image.local_path
        # First user name contained in target_directory wins: the image path
        # is re-rooted from FAKSIMILE_LOCATION to that user's root location.
        for user_name in USER_ROOT_LOCATION_DICT.keys():
            if user_name in target_directory:
                user_abs_image_path = abs_image_path.replace(FAKSIMILE_LOCATION, USER_ROOT_LOCATION_DICT[user_name]).replace('//', '/')
                break
        # if target_directory is subdir of FAKSIMILE_LOCATION
        if realpath(target_directory).startswith(realpath(FAKSIMILE_LOCATION)):
            common_path = commonpath([ realpath(target_directory), realpath(dirname(image.local_path)) ])
            # One '..' per path component of target_directory below common_path.
            relative_directory = '/'.join(\
                    [ '..' for d in realpath(target_directory).replace(common_path + '/', '').split('/') ])
            local_image_path = relative_directory + realpath(image.local_path).replace(common_path, '')
            # Drop the relative path again if it does not lead to an existing file.
            if not isfile(target_directory + sep + local_image_path):
                local_image_path = None
        elif abs_image_path is not None:
            local_image_path = abs_image_path
    # Fetch the image into its expected directory if it is not there yet.
    if abs_image_path is not None and not isfile(abs_image_path):
        wget.download(image.URL, out=dirname(abs_image_path))
    if not isfile(target_file) or overwrite:
        # Prefer the user specific absolute path when one was derived above.
        abs_image_path = user_abs_image_path if user_abs_image_path is not None else abs_image_path
        copy_faksimile_svg_file(target_file=target_file, faksimile_source_file=faksimile_source_file,\
                faksimile_tree=faksimile_tree, abs_image_path=abs_image_path,\
                local_image_path=local_image_path, namespaces=namespaces)
    else:
        msg = 'File {0} not copied to directory {1}, it already contains a file {2}.'.format(faksimile_source_file, target_directory, target_file)
        warnings.warn(msg)
def copy_xml_file_word_pos_only(xml_source_file, target_directory):
    """Create a new page file in target_directory that carries the words of xml_source_file.

    The new page takes title, number and orientation from the source page and
    is written under the source file's base name.

    :param xml_source_file: xml file of the page to copy the words from.
    :param target_directory: directory that receives the new file.
    :return: (str) xml_target_file
    """
    xml_target_file = sep.join((target_directory, basename(xml_source_file)))
    source_page = Page(xml_source_file)
    creator_settings = {
        'title': source_page.title,
        'page_number': source_page.number,
        'orientation': source_page.orientation,
    }
    target_page = PageCreator(xml_target_file, **creator_settings)
    target_page.words = source_page.words
    target_page.update_and_attach_words2tree()
    function_name = inspect.currentframe().f_code.co_name
    write_pretty(
        xml_element_tree=target_page.page_tree,
        file_name=xml_target_file,
        script_name=__file__ + f'({function_name})',
        file_type=FILE_TYPE_SVG_WORD_POSITION,
    )
    return xml_target_file
def create_highlighted_svg_file(faksimile_tree, node_ids, nodes_color_dict=None, target_file=None, target_directory=None, local_image_path=None, namespaces=None, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY):
    """Highlight the rect/path nodes with the given ids and write the tree to a file.

    Each matching node gets its ``fill`` and ``opacity`` set and its ``style``
    attribute emptied; the tree itself is modified in place. The result is
    written via copy_faksimile_update_image_location.

    :param faksimile_tree: parsed svg tree.
    :param node_ids: ids of the rect/path nodes to highlight.
    :param nodes_color_dict: not used by this function.
    :param target_file: passed on to copy_faksimile_update_image_location.
    :param target_directory: passed on to copy_faksimile_update_image_location.
    :param local_image_path: not used by this function.
    :param namespaces: prefix -> uri map; derived from the root's nsmap when omitted.
    :param highlight_color: value for the ``fill`` attribute.
    :param opacity: value for the ``opacity`` attribute.
    """
    if namespaces is None:
        namespaces = {}
        for prefix, uri in faksimile_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    node_query = '//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'
    # Collect all matches first, then modify them.
    matched_nodes = []
    for node_id in node_ids:
        matched_nodes.extend(faksimile_tree.xpath(node_query.format(node_id), namespaces=namespaces))
    for node in matched_nodes:
        node.set('fill', highlight_color)
        node.set('opacity', opacity)
        node.set('style', '')
    copy_faksimile_update_image_location(target_file=target_file, faksimile_tree=faksimile_tree, target_directory=target_directory)
def get_empty_node_ids(faksimile_tree, x_min=0.0, x_max=0.0, y_min=0.0, y_max=0.0, text_field_id=None, faksimile_page=None, namespaces={}):
    """Return the ids of rect and path nodes inside the given area that have no title element.

    :param faksimile_tree: parsed svg tree.
    :param x_min, x_max, y_min, y_max: bounds of the search area; replaced by
        the text field of ``faksimile_page`` (shifted by the image position)
        when a page is given.
    :param text_field_id: id of the rect that is excluded from the result.
    :param faksimile_page: optional page that supplies bounds and text field id.
    :param namespaces: prefix -> uri map; derived from the root's nsmap when empty.
    :return: list of node ids
    """
    THRESHOLD_X = 10
    if faksimile_page is not None:
        text_field = faksimile_page.text_field
        image = faksimile_page.faksimile_image
        x_min = text_field.xmin + image.x
        x_max = text_field.xmax + image.x - THRESHOLD_X
        y_min = text_field.ymin + image.y
        y_max = text_field.ymax + image.y
        text_field_id = text_field.id
    if len(namespaces) == 0:
        namespaces = {}
        for prefix, uri in faksimile_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    rect_query = '//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(
        x_min, x_max, y_min, y_max, text_field_id)
    untitled_nodes = faksimile_tree.xpath(rect_query, namespaces=namespaces)
    untitled_nodes.extend(get_paths_inside_rect(
        faksimile_tree, '//ns:path[not(./ns:title)]', x_min, x_max, y_min, y_max,
        text_field_id, namespaces=namespaces))
    return [untitled_node.get('id') for untitled_node in untitled_nodes]
def get_mismatching_ids(words, faksimile_positions):
    """Return the words and the faksimile positions whose texts have no counterpart.

    Texts are compared after replace_chars has adapted the faksimile texts to
    the transcription texts. Words with the text ``'.'`` are ignored.

    Mismatching words are grouped by text, in order of the first occurrence of
    each text in ``words``; the result therefore no longer depends on set
    iteration order.

    :param words: objects with a ``text`` attribute.
    :param faksimile_positions: objects with a ``text`` attribute.
    :return: 2-tuple (mismatching_words, mismatching_faksimile_positions)
    """
    faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions)
    word_texts = [word.text for word in words if word.text != '.']
    known_word_texts = set(word_texts)
    known_faksimile_texts = set(unique_faksimile_words)
    mismatching_words = []
    # dict.fromkeys removes duplicates while keeping first-occurrence order.
    for word_text in dict.fromkeys(word_texts):
        if word_text not in known_faksimile_texts:
            mismatching_words += [word for word in words if word.text == word_text]
    mismatching_faksimile_positions = []
    for faksimile_position_text in unique_faksimile_words:
        if faksimile_position_text not in known_word_texts:
            mismatching_faksimile_positions += [
                faksimile_position for faksimile_position in faksimile_positions
                if faksimile_position.text == faksimile_position_text
            ]
    return mismatching_words, mismatching_faksimile_positions
def process_warnings4status(warnings, warning_messages, current_status, ok_status, status_prefix='') -> str:
    """Turn the caught warnings into a status string.

    For every entry of ``warning_messages`` that is the prefix of at least one
    caught warning's message, ``:<entry>:`` is appended to ``status_prefix``.
    If nothing was appended (or there are no warnings at all) the result is
    ``<current_status>:<ok_status>:``.

    :param warnings: caught warnings (objects with a ``message`` attribute) or None.
    :param warning_messages: message prefixes to look for.
    :param current_status: status used when no known warning was raised.
    :param ok_status: suffix marking the status as ok.
    :param status_prefix: start of the status string in the warning case.
    :return: the resulting status
    """
    if warnings is None or len(warnings) == 0:
        return f'{current_status}:{ok_status}:'
    status = status_prefix
    for warning_message in warning_messages:
        if any(str(warn.message).startswith(warning_message) for warn in warnings):
            status += f':{warning_message}:'
    if status == status_prefix:
        return f'{current_status}:{ok_status}:'
    return status
def change_title_of_svg(svg_file, node_id, text):
    """Set the title text of the rect/path node with the given id and rewrite the svg file.

    Nothing is written if no such node with a title element exists.

    :param svg_file: svg file to modify in place.
    :param node_id: id of the rect or path node whose title is changed.
    :param text: new title text.
    """
    svg_tree = ET.parse(svg_file)
    namespaces = {}
    for prefix, uri in svg_tree.getroot().nsmap.items():
        namespaces['ns' if prefix is None else prefix] = uri
    title_query = '//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id)
    title_nodes = svg_tree.xpath(title_query, namespaces=namespaces)
    if len(title_nodes) == 0:
        return
    title_nodes[0].text = text
    copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
def change_id_of_textfield(svg_file, manuscript_title, page_number, faksimie_page_number):
    """Rename the text field rect of a svg file to ``<manuscript-title>_<page_number>``.

    The text field is the first rect whose id does not start with ``rect`` and
    ends with ``faksimie_page_number``. Spaces in the manuscript title are
    replaced by hyphens. Nothing is written if no such rect exists.

    :param svg_file: svg file to modify in place.
    :param manuscript_title: title used for the first part of the new id.
    :param page_number: (str) page number used for the second part of the new id.
    :param faksimie_page_number: (str) suffix identifying the current text field id.
    """
    svg_tree = ET.parse(svg_file)
    namespaces = {}
    for prefix, uri in svg_tree.getroot().nsmap.items():
        namespaces['ns' if prefix is None else prefix] = uri
    aliases = []
    for rect_id in svg_tree.xpath('//ns:rect/@id', namespaces=namespaces):
        if not rect_id.startswith('rect') and rect_id.endswith(faksimie_page_number):
            aliases.append(rect_id)
    if len(aliases) == 0:
        return
    alias = aliases[0]
    new_id = manuscript_title.replace(' ', '-') + '_' + page_number
    text_fields = svg_tree.xpath(f'//ns:rect[@id="{alias}"]', namespaces=namespaces)
    if len(text_fields) == 0:
        return
    text_fields[0].set('id', new_id)
    copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
def record_changes(original_svg_file, changed_svg_file, node_ids, namespaces={}):
    """Transfer the titles of the given nodes from changed_svg_file to original_svg_file.

    For each id: if the changed file has a title for the node, the title text
    is copied to the first matching node of the original file (a title element
    is created if missing). If the changed file has no title for the node, all
    matching nodes are removed from the original file. The original file is
    rewritten afterwards.

    :param original_svg_file: svg file that receives the changes.
    :param changed_svg_file: svg file that contains the changes.
    :param node_ids: ids of the rect/path nodes to process.
    :param namespaces: prefix -> uri map; derived from the changed file's root
        nsmap when empty.
    """
    old_tree = ET.parse(original_svg_file)
    new_tree = ET.parse(changed_svg_file)
    if len(namespaces) == 0:
        namespaces = {}
        for prefix, uri in new_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    title_query = '//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'
    node_query = '//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'
    for node_id in node_ids:
        new_titles = new_tree.xpath(title_query.format(node_id), namespaces=namespaces)
        old_nodes = old_tree.xpath(node_query.format(node_id), namespaces=namespaces)
        if len(old_nodes) == 0:
            continue
        if len(new_titles) == 0:
            # No title in the changed file: drop the nodes from the original.
            for old_node in old_nodes:
                old_node.getparent().remove(old_node)
            continue
        target_node = old_nodes[0]
        title_node = target_node.find('ns:title', namespaces=namespaces)
        if title_node is None:
            title_node = ET.SubElement(target_node, 'title', attrib={'id': new_titles[0].get('id')})
        title_node.text = new_titles[0].text
    copy_faksimile_svg_file(target_file=original_svg_file, faksimile_tree=old_tree)
def record_changes_on_svg_file_to_page(xml_source_file, svg_file, word_ids=None):
    """Copy changes made to svg_file to xml_source_file.

    Rects below the svg group with id "Transkription" are matched to the
    page's transkription positions by the id ``word_<word id>_<position id>``
    and their geometry/title is recorded on the positions. Afterwards the
    page's words are rebuilt and, unless UNITTESTING is set, the page is
    written back to xml_source_file.

    :param xml_source_file: xml file of the page to update.
    :param svg_file: svg file that contains the changes.
    :param word_ids: optional ids restricting which words are processed.
    :return: datatypes.page.Page
    """
    svg_tree = ET.parse(svg_file)
    # Map the default namespace (key None) to the prefix 'ns' for xpath use.
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    transkription_field = TranskriptionField(svg_file)
    page = Page(xml_source_file)
    words = [ word for word in page.words if word.id in word_ids ]\
            if word_ids is not None else page.words
    new_page_words = []
    for word in words:
        word_id = 'word_' + str(word.id) + '_'
        recorded_ids = []
        # Record the rects that still carry the id of one of this word's positions.
        for transkription_position in word.transkription_positions:
            transkription_position_id = word_id + str(transkription_position.id)
            tp_nodes = svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]'.format(transkription_position_id), namespaces=namespaces)
            if len(tp_nodes) > 0:
                record_changes_to_transkription_position(tp_nodes[0], transkription_position,\
                        transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                recorded_ids.append(transkription_position_id)
        # Rects whose id contains this word's prefix but matched no position above.
        # NOTE(review): contains() also matches ids of other words that merely
        # include the prefix (e.g. 'word_1_' inside 'word_11_...') -- confirm intended.
        extra_nodes = [ node for node in\
                svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[contains(@id, "{0}")]'.format(word_id), namespaces=namespaces)\
                if node.get('id') not in recorded_ids ]
        if len(extra_nodes) > 0:
            for extra_node in extra_nodes:
                # The inkscape:label (minus '#') is read as the id the rect had before.
                old_ids = [ inkscape_id.replace('#', '') for inkscape_id in\
                        svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]/@inkscape:label'.format(extra_node.get('id')),\
                        namespaces=namespaces) ]
                if len(old_ids) > 0 and re.match(r'word_[0-9]+_[0-9]+', old_ids[0]):
                    old_id_list = old_ids[0].split('_')
                    ref_word_id = int(old_id_list[1])
                    # NOTE(review): ref_tp_id stays a str while ref_word_id is
                    # converted to int; the comparison with tp.id below only
                    # matches if tp.id is a str -- confirm.
                    ref_tp_id = old_id_list[2]
                    ref_words = [ word for word in page.words if word.id == ref_word_id ]
                    if len(ref_words) > 0:
                        ref_tps = [ tp for tp in ref_words[0].transkription_positions\
                                if tp.id == ref_tp_id ]
                        if len(ref_tps) > 0:
                            # Move the position from the word it came from to the current word.
                            ref_words[0].transkription_positions.remove(ref_tps[0])
                            record_changes_to_transkription_position(extra_node,\
                                    ref_tps[0], transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                            word.transkription_positions.append(ref_tps[0])
    # Rebuild the word list: split words with mixed 'text' status, refresh the
    # text of the others from their positions; words without positions are dropped.
    for word in page.words:
        if word.has_mixed_status('text'):
            new_page_words += [ word for word in word.split_according_to_status('text') if word.text is not None and word.text != '' ]
        elif len(word.transkription_positions) > 0:
            new_text = [ tp.text for tp in word.transkription_positions if tp.text is not None and tp.text != '' ]
            if len(new_text) > 0:
                word.text = new_text[0]
            new_page_words.append(word)
    page.words = new_page_words
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    page.unlock()
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + ' -> ' + inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_on_xml_file_to_page(xml_source_file, xml_file) -> Page:
    """Copy changes made to xml_file to xml_source_file.

    The source page is backed up first. Its words are then replaced by the
    words of xml_file, where words carrying ``split_strings`` are split and
    words carrying a ``join_string`` are joined with their neighbour.
    Unless UNITTESTING is set, the page is written back to xml_source_file.

    :param xml_source_file: xml file of the page to update.
    :param xml_file: xml file that contains the changes.
    :return: datatypes.page.Page
    """
    copy_page = Page(xml_file)
    page = Page(xml_source_file)
    page.unlock()
    back_up(page, xml_file)
    page.words = []
    for word in copy_page.words:
        if word.split_strings is None or len(word.split_strings) == 0:
            page.words.append(word)
            continue
        next_word = word
        for split_string in word.split_strings:
            _, new_word, next_word = next_word.split(split_string)
            page.words.append(new_word)
        if next_word is not None:
            page.words.append(next_word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    remove_words_if_done = []
    for word in page.words:
        if 'join_string' not in word.__dict__ or word.join_string is None:
            continue
        # word.id is used as the index of the word in page.words.
        if word.id > 0\
        and page.words[word.id-1].text + word.text == word.join_string:
            page.words[word.id-1].join(word)
            remove_words_if_done.append(word)
        # A successor exists only if word.id + 1 is a valid index; the former
        # check (word.id < len) raised IndexError for the last word.
        elif word.id + 1 < len(page.words)\
        and word.text + page.words[word.id+1].text == word.join_string:
            word.join(page.words[word.id+1])
            remove_words_if_done.append(page.words[word.id+1])
    for word in remove_words_if_done:
        page.words.remove(word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, xml_file), file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_to_transkription_position(node, transkription_position, xmin=0.0, ymin=0.0, namespaces=None):
    """Copy geometry and title text of a svg node onto a transkription position.

    Only attributes that are present (and non-empty) on the node are copied:
    ``x``/``y`` become ``left``/``top`` relative to ``xmin``/``ymin``,
    ``width``/``height`` are taken as they are, and the text of the node's
    title child becomes the position's text.

    :param node: svg node that was changed.
    :param transkription_position: object that receives the values.
    :param xmin: value subtracted from the node's x.
    :param ymin: value subtracted from the node's y.
    :param namespaces: prefix -> uri map; derived from the node's nsmap when omitted.
    """
    if namespaces is None:
        namespaces = {}
        for prefix, uri in node.nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    x_value = node.get('x')
    if x_value:
        transkription_position.left = float(x_value) - xmin
    y_value = node.get('y')
    if y_value:
        transkription_position.top = float(y_value) - ymin
    width_value = node.get('width')
    if width_value:
        transkription_position.width = float(width_value)
    height_value = node.get('height')
    if height_value:
        transkription_position.height = float(height_value)
    title_texts = node.xpath('./ns:title/text()', namespaces=namespaces)
    if len(title_texts) > 0:
        transkription_position.text = title_texts[0]
def replace_chars(words, faksimile_positions, unique_faksimile_words=None):
    """Adapt faksimile texts to the spelling used by the transcription words.

    For every faksimile text that matches no word text, the first of the
    replacements ``"`` -> ``“``, ``ss`` -> ``ß``, ``-`` -> ``–`` that yields an
    existing word text is applied, both in ``unique_faksimile_words`` and on
    the ``text`` of the faksimile positions that carry it.

    :param words: transcription words (objects with a ``text`` attribute).
    :param faksimile_positions: objects with a ``text`` attribute; modified in place.
    :param unique_faksimile_words: texts to process (modified in place); by
        default the distinct texts of ``faksimile_positions``, ordered by
        length and then alphabetically so that the order is deterministic.
    :return: 2-tuple (faksimile_positions, unique_faksimile_words)
    """
    if unique_faksimile_words is None:
        unique_faksimile_words = sorted(
            {faksimile_position.text for faksimile_position in faksimile_positions},
            key=lambda text: (len(text), text))
    # Build the lookup once instead of scanning `words` for every candidate.
    word_texts = {word.text for word in words}
    replacements = (('"', '“'), ('ss', 'ß'), ('-', '–'))
    for index, word_text in enumerate(unique_faksimile_words):
        if word_text in word_texts:
            if word_text == '-' and '–' in word_texts:
                # NOTE(review): these prints look like leftover debug output;
                # kept so that the console output stays unchanged.
                print([ word.text for word in words if word.text == word_text ])
                print([ word.text for word in words if word.text == '–' ])
            continue
        for old_chars, new_chars in replacements:
            candidate = word_text.replace(old_chars, new_chars)
            if old_chars in word_text and candidate in word_texts:
                unique_faksimile_words[index] = candidate
                for faksimile_position in faksimile_positions:
                    if faksimile_position.text == word_text:
                        faksimile_position.text = candidate
                break
    return faksimile_positions, unique_faksimile_words
def reset_tp_with_matrix(transkription_positions, new_left=0, new_top=-5, tr_xmin=0.0, tr_ymin=0.0):
    """Re-base positions that carry a rotation matrix onto that matrix.

    Positions without a transform, or whose transform is not a rotation
    matrix, are left untouched. For the others the matrix translation is
    shifted by ``tr_xmin``/``tr_ymin`` and ``left``, ``bottom`` and ``top``
    are recomputed relative to the matrix translation.

    :param transkription_positions: positions to fix in place.
    :param new_left: not used by this function.
    :param new_top: not used by this function.
    :param tr_xmin: value added to the matrix' x translation.
    :param tr_ymin: value added to the matrix' y translation.
    """
    if len(transkription_positions) == 0:
        return
    for tp in transkription_positions:
        if tp.transform is None or not tp.transform.isRotationMatrix():
            continue
        # x: the matrix is shifted first, left is then taken relative to the shifted value.
        tp.transform.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + tr_xmin, 3)
        horizontal_offset = round(tp.left, 3) - tp.transform.matrix[Matrix.XINDEX]
        if abs(horizontal_offset) > 1:
            tp.left = horizontal_offset
        else:
            tp.left = 0
        # y: bottom is taken relative to the matrix value before it is shifted.
        tp.bottom = round(tp.bottom, 3) - tp.transform.matrix[Matrix.YINDEX]
        tp.transform.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + tr_ymin, 3)
        tp.top = tp.bottom - tp.height + 2
def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True):
    """Update the status attribute of a svg position file.

    The status is replaced by ``status`` if the old status does not contain
    'OK' (or if ``append`` is false). Otherwise ``status`` is appended to the
    old status, separated by ':', unless it is already part of it. Nothing
    happens if ``file_name`` is not an existing file.

    :param file_name: svg position file to update.
    :param manuscript_file: optional manuscript file that gets the same status
        recorded for this page.
    :param status: status to set or append.
    :param append: append to an old status containing 'OK' instead of replacing it.
    """
    if not isfile(file_name):
        return
    parser = ET.XMLParser(remove_blank_text=True)
    file_tree = ET.parse(file_name, parser)
    root = file_tree.getroot()
    old_status = root.get('status')
    old_parts = [] if old_status is None else old_status.split(':')
    if 'OK' not in old_parts or not append:
        # The former code read an unbound `new_status` when it reached the
        # non-appending branch; replacing the status is the non-append behaviour.
        root.set('status', status)
    elif status not in old_parts:
        root.set('status', old_status + ':' + status)
    write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    if manuscript_file is not None and isfile(manuscript_file):
        page_number = root.get('number')
        update_manuscript_file(manuscript_file, page_number, file_name, status=status)
def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True):
    """Record status information about a page in the manuscript file.

    If the manuscript already has a page node with this number, its status is
    replaced by ``status`` when the old status does not contain 'OK' (or when
    ``append`` is false); otherwise ``status`` is appended with ':' unless it
    is already part of the old status. A missing ``output`` attribute is set
    to ``file_name``. If there is no such page node, a new one is added below
    the ``pages`` node (which is created if missing). Nothing happens if
    ``manuscript_file`` is not an existing file.

    :param manuscript_file: manuscript xml file to update.
    :param page_number: number of the page.
    :param file_name: output file of the page.
    :param status: status to set or append.
    :param append: append to an old status containing 'OK' instead of replacing it.
    """
    if not isfile(manuscript_file):
        return
    parser = ET.XMLParser(remove_blank_text=True)
    manuscript_tree = ET.parse(manuscript_file, parser)
    root = manuscript_tree.getroot()
    page_nodes = root.xpath('//page[@number="%s"]' % page_number)
    if len(page_nodes) > 0:
        node = page_nodes[0]
        old_status = node.get('status')
        old_parts = [] if old_status is None else old_status.split(':')
        if 'OK' not in old_parts or not append:
            # The former code read an unbound `new_status` when it reached the
            # non-appending branch; replacing the status is the non-append behaviour.
            node.set('status', status)
        elif status not in old_parts:
            node.set('status', old_status + ':' + status)
        if not node.get('output'):
            node.set('output', file_name)
    else:
        pages_node = root.find('pages')
        if pages_node is None:
            pages_node = ET.SubElement(root, 'pages')
        new_id = len(pages_node.findall('page')) + 1
        ET.SubElement(pages_node, 'page', attrib={
            'id': str(new_id),
            'number': str(page_number),
            'status': status,
            'output': file_name,
        })
    write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT)
Event Timeline
Log In to Comment