Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F59961992
extract_line_continuation.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Apr 26, 09:26
Size
10 KB
Mime Type
text/x-python
Expires
Sun, Apr 28, 09:26 (2 d)
Engine
blob
Format
Raw Data
Handle
17234073
Attached To
rNIETZSCHEPYTHON nietzsche-python
extract_line_continuation.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract line continuations.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
import
getopt
import
lxml.etree
as
ET
import
re
import
sys
from
os
import
listdir
,
sep
,
path
from
os.path
import
isfile
,
isdir
,
dirname
import
warnings
__author__
=
"Christian Steiner"
__maintainer__
=
__author__
__copyright__
=
'University of Basel'
__email__
=
"christian.steiner@unibas.ch"
__status__
=
"Development"
__license__
=
"GPL v3"
__version__
=
"0.0.1"
from
datatypes.box
import
text_node_is_inside_match_box
,
tspan_node_is_inside_match_box
from
datatypes.line
import
Line
from
datatypes.line_continuation
import
LineContinuation
from
datatypes.matrix
import
Matrix
from
datatypes.page
import
Page
,
STATUS_MERGED_OK
,
STATUS_POSTMERGED_OK
from
datatypes.reference
import
Reference
from
datatypes.transkriptionField
import
TranskriptionField
from
util
import
back_up
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
,
xml_has_type
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
# When True, console messages, page back-ups and writing of the result file
# are skipped by the functions of this module.
UNITTESTING = False
# When True, the style key used for detecting arrows is printed.
DEBUG = False
def extract_line_continuations(page: Page, svg_file=None, warning_message='WARNING'):
    """Extract line continuations and attach them to the lines of page.

    Arrow nodes (see _extract_arrow_nodes) are searched in the svg file using
    the style whose 'font-family' is 'Frutiger-Europeen'. For every arrow that
    can be assigned to a line of the page, a LineContinuation is appended to
    the editor_comments of that line. Finally all lines are attached to
    page.page_tree and, unless UNITTESTING is set, the tree is written to the
    file it was loaded from.

    :param page: the page whose lines are updated.
    :param svg_file: the svg file that is parsed; defaults to page.source.
    :param warning_message: prefix of the warning that is emitted if there is
        no line for an arrow.
    :raises Exception: if svg_file is None and page.source is None or does
        not point to an existing file.
    """
    if svg_file is None:
        if page.source is None or not isfile(page.source):
            raise Exception('Function "extract_line_continuations" needs a page with a valid source or a svg_file!')
        svg_file = page.source
    if not UNITTESTING:
        print(Fore.CYAN + f'Extracting line continuations on {page.title}, {page.number} ...' + Style.RESET_ALL)
    svg_tree = ET.parse(svg_file)
    transkription_field = TranskriptionField(svg_file, multipage_index=page.multipage_index)
    set_to_text_field_zero = (page.svg_image is None or page.svg_image.text_field is None)
    tr_ymin = transkription_field.ymin if set_to_text_field_zero else 0
    page.update_line_number_area(transkription_field, svg_tree=svg_tree,
                                 set_to_text_field_zero=set_to_text_field_zero)
    # Start from a clean state: previously attached editor comments are dropped.
    for line in page.lines:
        line.editor_comments = []
    # XPath needs a prefix for the default namespace, hence None is mapped to 'ns'.
    namespaces = {k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items()}
    # First style key that uses the arrow font, or None if the page has no such style.
    arrow_style_key = next((key for key, value in page.style_dict.items()
                            if value.get('font-family') == 'Frutiger-Europeen'), None)
    if arrow_style_key is not None:
        if DEBUG:
            print(arrow_style_key)
        for arrow in _extract_arrow_nodes(svg_tree, arrow_style_key, transkription_field, namespaces):
            # A tspan carries no transform of its own, its parent text node does.
            transform_node = arrow.getparent() if arrow.tag.endswith('tspan') else arrow
            matrix = Matrix(transform_matrix_string=transform_node.get('transform'))
            line = _get_line_of_arrow(arrow, page, tr_ymin)
            if line is not None:
                # First pass looks for a reference with is_from_reference=True,
                # the second pass (only if the first found nothing) with False.
                reference_counter = 0
                reference = None
                while reference is None and reference_counter < 2:
                    reference = _get_reference(svg_tree, arrow, matrix, transkription_field, namespaces,
                                               is_from_reference=(reference_counter == 0))
                    reference_counter += 1
                if reference is not None:
                    # A counter > 1 means the reference was found in the second pass.
                    line.editor_comments.append(
                        LineContinuation(reference=reference, to_reference=(reference_counter > 1)))
                else:
                    # No reference text found: store an empty reference and derive
                    # the direction from the arrow's position relative to xmax.
                    to_reference = (matrix.getX() > transkription_field.xmax)
                    line.editor_comments.append(
                        LineContinuation(reference=Reference(), to_reference=to_reference))
            else:
                # NOTE(review): this reports the matrix' y only, while the line lookup
                # above also takes a tspan's own 'y' attribute into account -- confirm
                # which value should be shown.
                y = round(matrix.getY() - tr_ymin, 2)
                warnings.warn(f'{warning_message}: There is no line for {y}')
    for line in page.lines:
        line.attach_object_to_tree(page.page_tree)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,
                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def _extract_arrow_nodes(svg_tree: ET.ElementTree, arrow_style_key: str, transkription_field=None, namespaces=None) -> list:
    """Collect the arrow nodes of svg_tree.

    A node is returned if it is a text or tspan element whose class attribute
    contains arrow_style_key, whose text is ')' and which is on the marginalia
    of transkription_field (see node_is_on_marginalia).

    If transkription_field is None it is created from the tree's URL; if
    namespaces is None it is built from the root's nsmap, with the default
    namespace mapped to the prefix 'ns'.
    """
    if transkription_field is None:
        transkription_field = TranskriptionField(svg_tree.docinfo.URL)
    if namespaces is None:
        namespaces = {}
        for prefix, uri in svg_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    text_query = '//ns:text[contains(@class, "{0}")]'.format(arrow_style_key)
    tspan_query = '|//ns:tspan[contains(@class, "{0}")]'.format(arrow_style_key)
    arrow_nodes = []
    for candidate in svg_tree.xpath(text_query + tspan_query, namespaces=namespaces):
        if candidate.text == ')' and node_is_on_marginalia(candidate, transkription_field):
            arrow_nodes.append(candidate)
    return arrow_nodes
def _get_arrow_y(arrow: ET.Element, matrix=None) -> float:
    """Determine the y value of an arrow node.

    If no matrix is passed, one is created from the 'transform' attribute of
    the node, or of its parent if the node is a tspan. For a tspan the result
    of matrix.add2Y with the node's own 'y' attribute is returned, otherwise
    matrix.getY().
    """
    is_tspan = arrow.tag.endswith('tspan')
    if matrix is None:
        transform_node = arrow.getparent() if is_tspan else arrow
        matrix = Matrix(transform_matrix_string=transform_node.get('transform'))
    if not is_tspan:
        return matrix.getY()
    return matrix.add2Y(add_to_y=arrow.get('y'))
def _get_line_of_arrow(arrow: ET.Element, page: Page, tr_ymin: float, matrix=None) -> Line:
    """Find the line of page that is next to arrow.

    The arrow's y value is reduced by tr_ymin, rounded to two decimals and
    lowered by 0.5 before the line number is looked up on page. Returns the
    first line whose id equals that line number, or None if there is none.
    """
    lookup_y = round(_get_arrow_y(arrow, matrix=matrix) - tr_ymin, 2) - .5
    target_id = page.get_line_number(lookup_y)
    for candidate in page.lines:
        if candidate.id == target_id:
            return candidate
    return None
def _get_reference(svg_tree: ET.ElementTree, arrow: ET.Element, arrow_matrix: Matrix, transkription_field: TranskriptionField, namespaces: dict, is_from_reference=True) -> Reference:
    """Return the reference that is written on the line of arrow, or None.

    All text nodes (except arrow itself) inside a match box around the arrow
    are collected, ordered by the x value of their transform matrix, and
    their text is concatenated. The resulting string is passed to
    Reference.create_cls.

    :param svg_tree: the parsed svg file.
    :param arrow: the arrow node (text or tspan).
    :param arrow_matrix: the transform matrix that belongs to arrow.
    :param transkription_field: provides the borders used for the match box.
    :param namespaces: prefix map for xpath; must contain the prefix 'ns'.
    :param is_from_reference: if True the box ends at the arrow's x value,
        if False the box starts there.
    :return: the created reference, or None if no text was found or the text
        could not be turned into a reference.
    """
    reference = None
    is_tspan = arrow.tag.endswith('tspan')
    arrow_left = arrow_matrix.add2X(add_to_x=arrow.get('x')) if is_tspan else arrow_matrix.getX()
    arrow_y = _get_arrow_y(arrow, matrix=arrow_matrix)
    arrow_is_left_of_field = arrow_left < transkription_field.xmin
    if is_from_reference:
        # Box from the outer border of the arrow's margin up to the arrow.
        xmin = 0 if arrow_is_left_of_field\
            else transkription_field.xmax + transkription_field.line_number_area_width
        xmax = arrow_left
    else:
        # Box from the arrow to the inner/outer limit on the other side of it.
        xmin = arrow_left
        xmax = transkription_field.xmin - transkription_field.line_number_area_width if arrow_is_left_of_field\
            else transkription_field.documentWidth + transkription_field.line_number_area_width
    # Vertical tolerance of 5 units above and below the arrow.
    ymin = arrow_y - 5
    ymax = arrow_y + 5
    text_nodes_on_arrow_line = sorted(
        [text_node for text_node in svg_tree.xpath('//ns:text', namespaces=namespaces)
         if text_node != arrow and text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax)],
        key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX())
    text_parts = []
    for text_node in text_nodes_on_arrow_line:
        # Iterating an element yields its children (replaces the deprecated getchildren()).
        children = list(text_node)
        if children:
            # Nodes with children contribute the text of their children only.
            text_parts.extend(child.text or '' for child in children)
        else:
            # A node without text has text None; treat it as empty instead of failing.
            text_parts.append(text_node.text or '')
    reference_string = ''.join(text_parts)
    if reference_string != '':
        try:
            reference = Reference.create_cls(reference_string=reference_string)
        except Exception:
            # Best effort: a string that cannot be turned into a reference is
            # printed and None is returned.
            print(reference_string)
    return reference
def node_is_on_marginalia(node: ET.Element, transkription_field: TranskriptionField) -> bool:
    """Tell whether node is on the marginalia.

    The node is tested against two match boxes that share the vertical range
    ymin..ymax of transkription_field: one from x = 0 to xmin, the other from
    xmax to documentWidth. tspan nodes are tested with
    tspan_node_is_inside_match_box, all other nodes with
    text_node_is_inside_match_box.
    """
    if node.tag.endswith('tspan'):
        is_inside_match_box = tspan_node_is_inside_match_box
    else:
        is_inside_match_box = text_node_is_inside_match_box
    return (
        is_inside_match_box(node, 0, transkription_field.xmin,
                            transkription_field.ymin, transkription_field.ymax)
        or is_inside_match_box(node, transkription_field.xmax, transkription_field.documentWidth,
                               transkription_field.ymin, transkription_field.ymax)
    )
def usage():
    """Print the docstring of main as usage information.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to extract the line continuations.

    svgscripts/extract_line_continuation.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

        <xmlManuscriptFile>     a xml file about a manuscript, containing information about its pages.
        <svg_pos_file>          a xml file about a page, containing information about svg word positions.

        OPTIONS:
        -h|--help               show help

        :return: exit code (int)
    """
    # NOTE: the docstring above is printed by usage(), it is user-facing text.
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
    if not args:
        usage()
        return 2
    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    counter = 0
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
        if not UNITTESTING:
            # Keep a copy of the page file before it is overwritten.
            back_up(page, page.xml_file)
        extract_line_continuations(page)
        counter += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages processed]')
    return 0
# Script entry point: run main with the command line arguments and use its
# return value as the exit code of the process.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment