Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F65733417
page.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Jun 5, 20:51
Size
21 KB
Mime Type
text/x-python
Expires
Fri, Jun 7, 20:51 (2 d)
Engine
blob
Format
Raw Data
Handle
18119145
Attached To
rNIETZSCHEPYTHON nietzsche-python
page.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata (authorship, licensing and version information).
__author__ = "Christian Steiner"
__maintainer__ = __author__  # the maintainer is the same person as the author
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from
lxml
import
etree
as
ET
from
os.path
import
isfile
from
progress.bar
import
Bar
from
svgpathtools
import
svg2paths2
,
svg_to_paths
from
svgpathtools.parser
import
parse_path
from
.class_spec
import
SemanticClass
from
.image
import
Image
from
.lineNumber
import
LineNumber
from
.path
import
Path
from
.positional_word_part
import
PositionalWordPart
from
.transkriptionField
import
TranskriptionField
from
.writing_process
import
WritingProcess
from
.word
import
Word
from
.word_insertion_mark
import
WordInsertionMark
class Page(SemanticClass):
    """
    This class represents a page.

        Args:
            xml_source_file (str): name of the xml file to be instantiated.
            xml_target_file (str): name of the xml file to which page info will be written.
            title (str): title of the page (used if the xml file does not provide one).
            page_number: number of the page; written to a newly created target tree.
            pdfFile (str): name of the pdf file; attached to the tree if the tree has no pdf node.
            svg_file (str): name of the svg file; attached to the tree if the tree has no svg node.
            orientation (str): orientation of the page (default 'North').
            extract_transkription_field_only (bool): written as attribute
                'transkription-field-only' to a newly created target tree.

        Raises:
            FileNotFoundError: if xml_source_file is given but does not exist.
    """
    UNITTESTING = False
    WARNING_MISSING_USE_NODE4PWP = PositionalWordPart.WARN_NO_USE_NODE_FOUND
    WARNING_MISSING_GLYPH_ID4WIM = WordInsertionMark.WARN_NO_GLYPH_ID

    def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, pdfFile=None,
                 svg_file=None, orientation='North', extract_transkription_field_only=False):
        self.title = title
        self.line_numbers = []
        self.style_dict = {}
        self.sonderzeichen_list = []
        # The following attributes are initialized up front so that every
        # construction path leaves the instance in a usable state.
        self.letterspacing_list = []
        self.fontsizekey2stage_mapping = {}
        self.word_insertion_marks = []
        self.words = []
        self.writing_processes = []
        self.word_deletion_paths = []
        self.svg_file = None
        self.pdfFile = None
        self.source = None
        self.number = page_number if page_number is not None else -1
        self.orientation = orientation
        self.width = 0.0
        self.height = 0.0
        if xml_source_file is not None:
            self._init_from_source_file(xml_source_file, pdfFile, svg_file)
        elif xml_target_file is not None:
            self.svg_file = svg_file
            self.pdfFile = pdfFile
            if isfile(xml_target_file):
                self._init_from_existing_target_file(xml_target_file, title, orientation)
            else:
                self._init_new_target_tree(title, page_number, orientation, extract_transkription_field_only)
        self.svg_image = Image(file_name=self.svg_file, height=self.height, width=self.width)\
            if self.svg_file is not None\
            else None

    def _first_xpath_match(self, xpath_expression, default=None):
        """Returns the first result of xpath_expression on page_tree or default if there is none.
        """
        matches = self.page_tree.xpath(xpath_expression)
        return matches[0] if len(matches) > 0 else default

    def _read_svg_dimensions(self):
        """Reads width and height from the svg node of page_tree (0.0 if missing).
        """
        self.width = float(self._first_xpath_match('.//svg/@width', 0.0))
        self.height = float(self._first_xpath_match('.//svg/@height', 0.0))

    def _measure_and_attach_svg(self, svg_file):
        """Sets svg_file, gets its dimensions from a TranskriptionField and attaches a svg node to page_tree.
        """
        self.svg_file = svg_file
        tf = TranskriptionField(svg_file)
        self.width = round(tf.documentWidth, 3)
        self.height = round(tf.documentHeight, 3)
        ET.SubElement(self.page_tree.getroot(), 'svg',
                      attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})

    def _init_from_source_file(self, xml_source_file, pdfFile, svg_file):
        """Instantiates the page from an existing xml source file.

        [:raises:] FileNotFoundError if xml_source_file does not exist.
        """
        if not isfile(xml_source_file):
            # FileNotFoundError is a subclass of Exception: callers catching Exception still work.
            raise FileNotFoundError('File "{}" does not exist!'.format(xml_source_file))
        parser = ET.XMLParser(remove_blank_text=True)
        self.page_tree = ET.parse(xml_source_file, parser)
        root = self.page_tree.getroot()
        self.title = root.get('title')
        self.number = root.get('number')
        self.source = root.get('source')
        self.orientation = root.get('orientation')
        self.init_words()
        self.add_style(style_node=root.find('.//style'))
        self.svg_file = self._first_xpath_match('.//svg/@file')
        self._read_svg_dimensions()
        self.pdfFile = self._first_xpath_match('.//pdf/@file')
        # Arguments are only used as a fallback if the tree does not know the files yet.
        if pdfFile is not None and self.pdfFile is None:
            self.pdfFile = pdfFile
            ET.SubElement(root, 'pdf', attrib={'file': self.pdfFile})
        if svg_file is not None and self.svg_file is None:
            self._measure_and_attach_svg(svg_file)

    def _init_from_existing_target_file(self, xml_target_file, title, orientation):
        """Loads an existing target file, completes its meta data and removes the nodes that will be rewritten.
        """
        parser = ET.XMLParser(remove_blank_text=True)
        self.page_tree = ET.parse(xml_target_file, parser)
        root = self.page_tree.getroot()
        self.source = root.get('source')
        # Values of the tree take precedence over the arguments.
        if bool(root.get('orientation')):
            self.orientation = root.get('orientation')
        elif orientation is not None:
            root.set('orientation', orientation)
        if bool(root.get('title')):
            self.title = root.get('title')
        elif title is not None:
            root.set('title', title)
        if self.svg_file is None:
            self.svg_file = self._first_xpath_match('.//svg/@file')
            self._read_svg_dimensions()
        elif len(self.page_tree.xpath('.//svg/@file')) == 0:
            self._measure_and_attach_svg(self.svg_file)
        else:
            self._read_svg_dimensions()
        if self.pdfFile is None:
            self.pdfFile = self._first_xpath_match('.//pdf/@file')
        elif len(self.page_tree.xpath('.//pdf/@file')) == 0:
            ET.SubElement(root, 'pdf', attrib={'file': self.pdfFile})
        tags2remove = ['word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG,
                       WritingProcess.XML_TAG, Path.WORD_DELETION_PATH_TAG]
        for xpath2remove in tags2remove:
            for node in self.page_tree.xpath('//' + xpath2remove):
                node.getparent().remove(node)

    def _init_new_target_tree(self, title, page_number, orientation, extract_transkription_field_only):
        """Creates a new page tree for a target file that does not exist yet.
        """
        self.page_tree = ET.ElementTree(ET.Element('page'))
        root = self.page_tree.getroot()
        if title is not None:
            root.set('title', title)
        if orientation is not None:
            root.set('orientation', orientation)
        root.set('transkription-field-only', str(extract_transkription_field_only).lower())
        if page_number is not None:
            root.set('number', str(page_number))
        if self.pdfFile is not None:
            ET.SubElement(root, 'pdf', attrib={'file': self.pdfFile})
        if self.svg_file is not None:
            self._measure_and_attach_svg(self.svg_file)

    def categorize_paths(self, transkription_field=None):
        """Categorize all paths that are part of the transkription field.

        Paths that are categorized as deletion or underline paths are passed on to
        mark_words_intersecting_with_paths_as_deleted.

        [:raises:] FileNotFoundError if the page has no existing svg source file
                   (not raised if Page.UNITTESTING is set).
        """
        if self.source is None or not isfile(self.source):
            if not Page.UNITTESTING:
                error_msg = 'Svg source file {} does not exist!'.format(self.source)\
                    if self.source is not None else 'Page does not contain a source file!'
                raise FileNotFoundError(error_msg)
            return
        MAX_HEIGHT_LINES = 1
        even_line_heights = [line_number.bottom - line_number.top
                             for line_number in self.line_numbers if line_number.id % 2 == 0]
        # Fall back to 17 if there is no line number with an even id to measure.
        max_line = max(even_line_heights) + 2 if len(even_line_heights) > 0 else 17
        tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
        tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
        paths, attributes = svg_to_paths.svg2paths(self.source)
        allpaths_on_tf = []
        if transkription_field is not None:
            for index, (path, attribute) in enumerate(zip(paths, attributes)):
                if len(path) > 0\
                   and path != transkription_field.path\
                   and path.start.real > tr_xmin\
                   and path.end.real < transkription_field.xmax:
                    allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class')))
        categorized_paths = {}
        for mypath in allpaths_on_tf:
            category = self._categorize_path(mypath.path, max_line, tr_ymin, MAX_HEIGHT_LINES)
            categorized_paths.setdefault(category, []).append(mypath)
        self.mark_words_intersecting_with_paths_as_deleted(
            categorized_paths.get('deletion_or_underline', []), tr_xmin, tr_ymin)

    def _categorize_path(self, path, max_line, tr_ymin, max_height_lines=1):
        """Returns the name of the category of a svg path.

        [:return:] (str) one of 'dots', 'deletion_or_underline', 'box', 'word_connector',
                   'text_area_deletion', 'uncategorized'
        """
        xmin, xmax, ymin, ymax = path.bbox()
        width = abs(xmax - xmin)
        height = abs(ymax - ymin)
        if width < 1 and height < 1:
            return 'dots'
        # isclosed() is only asked for continuous paths.
        if height > max_height_lines and height != max_line and path.iscontinuous():
            is_closed = path.isclosed()
            if height < max_line:
                return 'box' if is_closed else 'deletion_or_underline'
            if not is_closed:
                return 'word_connector'
        if height < max_height_lines:
            return 'deletion_or_underline'
        start_line_number = self.get_line_number(path.start.imag - tr_ymin)
        if start_line_number != -1\
           and start_line_number != self.get_line_number(path.end.imag - tr_ymin):
            return 'text_area_deletion'
        return 'uncategorized'

    def init_line_numbers(self, line_numbers, document_bottom):
        """Init line numbers.

        For each of the given line numbers an additional line number is created that fills
        the gap above it; a final one reaches from the last given line number to document_bottom.
        All resulting line numbers are attached to page_tree.
        """
        MINABOVE = 1
        self.line_numbers = []
        if len(line_numbers) > 0:
            first_line_bottom = line_numbers[0].top - MINABOVE
            self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom))
            self.line_numbers.append(line_numbers[0])
            for index in range(1, len(line_numbers)):
                self.line_numbers.append(LineNumber(id=line_numbers[index].id - 1,
                                                    top=line_numbers[index - 1].bottom + MINABOVE,
                                                    bottom=line_numbers[index].top - MINABOVE))
                self.line_numbers.append(line_numbers[index])
            last_line_number = line_numbers[len(line_numbers) - 1]
            self.line_numbers.append(LineNumber(id=last_line_number.id + 1,
                                                top=last_line_number.bottom + MINABOVE,
                                                bottom=document_bottom))
        for line_number in self.line_numbers:
            line_number.attach_object_to_tree(self.page_tree)

    def init_words(self):
        """Instantiates word insertion marks, words, line numbers, writing processes
        and word deletion paths from the corresponding nodes of page_tree.
        """
        root = self.page_tree.getroot()
        self.word_insertion_marks = [WordInsertionMark(wim_node=wim_node)
                                     for wim_node in root.xpath('//' + WordInsertionMark.XML_TAG)]
        self.words = [Word.CREATE_WORD(word_node=word_node) for word_node in root.xpath('//word')]
        self.line_numbers = [LineNumber(xml_text_node=line_number_node)
                             for line_number_node in root.xpath('//' + LineNumber.XML_TAG)]
        self.writing_processes = [WritingProcess.create_writing_process_from_xml(node, self.words)
                                  for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG)]
        self.word_deletion_paths = [Path(node=node)
                                    for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG)]
        # NOTE: words are not attached to word insertion marks here.

    def create_writing_processes_and_attach2tree(self):
        """Creates three stages of Nietzsche's process of writing.

        The stages are attached to page_tree and each transkription position gets the id of
        the stage that corresponds to the font size keys of its first positional word part.
        """
        self.writing_processes = [WritingProcess(version=WritingProcess.FIRST_VERSION),
                                  WritingProcess(version=WritingProcess.INSERTION_AND_ADDITION),
                                  WritingProcess(version=WritingProcess.LATER_INSERTION_AND_ADDITION)]
        for writing_process in self.writing_processes:
            writing_process.attach_object_to_tree(self.page_tree)
        for word in self.words:
            for transkription_position in word.transkription_positions:
                if len(transkription_position.positional_word_parts) == 0:
                    # Nothing to derive a font size key from.
                    continue
                style_class = transkription_position.positional_word_parts[0].style_class
                for font_key in style_class.split(' '):
                    if font_key in self.fontsizekey2stage_mapping:
                        transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key)

    def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
        """Adds a list of classes that are sonderzeichen and a style dictionary to page.

        If style_node is given, the style information is read from this node. Otherwise a
        non empty style_dict is written to a new style node of page_tree. In both cases
        fontsizekey2stage_mapping is (re)created from the font sizes of style_dict.
        """
        # None replaces the former mutable default arguments, which were shared between calls.
        self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else []
        self.letterspacing_list = letterspacing_list if letterspacing_list is not None else []
        self.style_dict = style_dict if style_dict is not None else {}
        if style_node is not None:
            class_nodes = style_node.findall('.//class')
            self.style_dict = {item.get('name'): {key: value for key, value in item.attrib.items() if key != 'name'}
                               for item in class_nodes}
            self.sonderzeichen_list = [item.get('name') for item in class_nodes
                                       if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family')]
            self.letterspacing_list = [item.get('name') for item in class_nodes
                                       if bool(item.get('letterspacing-list'))]
        elif bool(self.style_dict):
            style_node = ET.SubElement(self.page_tree.getroot(), 'style')
            if len(self.sonderzeichen_list) > 0:
                style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
            if len(self.letterspacing_list) > 0:
                style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
            for key, class_attributes in self.style_dict.items():
                # The class name becomes part of the attributes of the class node.
                class_attributes['name'] = key
                ET.SubElement(style_node, 'class', attrib=class_attributes)
        fontsize_dict = {key: float(value.get('font-size').replace('px', ''))
                         for key, value in self.style_dict.items() if 'font-size' in value}
        # create a mapping between fontsizes and word stages
        self.fontsizekey2stage_mapping = {}
        if len(fontsize_dict) > 0:
            biggest_fontsize = max(fontsize_dict.values())
            smallest_fontsize = min(fontsize_dict.values())
            for fontsize_key, value in fontsize_dict.items():
                if value >= biggest_fontsize - 1:
                    stage = WritingProcess.FIRST_VERSION
                elif value <= smallest_fontsize + 1:
                    stage = WritingProcess.LATER_INSERTION_AND_ADDITION
                else:
                    stage = WritingProcess.INSERTION_AND_ADDITION
                self.fontsizekey2stage_mapping[fontsize_key] = stage

    def add_source(self, source):
        """Adds a source to page and attaches it to page_tree.
        """
        self.source = source
        self.page_tree.getroot().set('source', self.source)

    def get_biggest_fontSize4styles(self, style_set=None):
        """Returns biggest font size from style_dict for a set of style class names.

        Class names that are unknown to style_dict or that have no font size are ignored.

        [:returns:] (float) biggest font size OR 1 if style_dict is empty
        """
        if not self.style_dict or not style_set:
            return 1
        font_sizes = [float(self.style_dict[key]['font-size'].replace('px', ''))
                      for key in style_set
                      if bool(self.style_dict.get(key, {}).get('font-size'))]
        return max(font_sizes) if len(font_sizes) > 0 else 1

    def get_line_number(self, y):
        """Returns line number id for element at y.

        [:return:] (int) line number id or -1
        """
        for line_number in self.line_numbers:
            if y >= line_number.top and y <= line_number.bottom:
                return line_number.id
        return -1

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        properties = {'title': (str, 1, '/page/@title'), 'number': (str, 1, '/page/@number'),
                      'line_numbers': (LineNumber, SemanticClass.LIST, '/page/@number|/page/@title'),
                      'orientation': {'class': str, 'cardinality': 1, 'xpath': '/page/@orientation'},
                      'words': (Word, SemanticClass.LIST, '/page/@number|/page/@title'),
                      'svg_image': (Image, 1, '/page/svg'),
                      'writing_processes': (WritingProcess, SemanticClass.LIST, '/page/@number|/page/@title'),
                      'word_deletion_paths': (Path, SemanticClass.LIST, '/page/@number|/page/@title'),
                      'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST, '/page/@number|/page/@title')}
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary

    def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
        """Marks all words that intersect with deletion paths as deleted
        and adds these paths to word_deletion_paths.

        [:return:] list of .path.Path that might be word_underline_paths
        """
        # The progress bar is only shown outside of unit tests.
        bar = None
        if not Page.UNITTESTING:
            bar = Bar('mark words that intersect with deletion paths', max=len(self.words))
        for word in self.words:
            if bar is not None:
                bar.next()
            for transkription_position in word.transkription_positions:
                positional_word_parts = transkription_position.positional_word_parts
                if len(positional_word_parts) == 0:
                    # Without word parts there is no area that could intersect with a path.
                    continue
                first_pwp = positional_word_parts[0]
                last_pwp = positional_word_parts[len(positional_word_parts) - 1]
                xmin = tr_xmin + first_pwp.left
                xmax = tr_xmin + last_pwp.left + last_pwp.width
                ymin = tr_ymin + min(pwp.top for pwp in positional_word_parts)
                ymax = tr_ymin + max(pwp.bottom for pwp in positional_word_parts)
                # Closed rectangle around the transkription position.
                word_path = parse_path('M {}, {} L {}, {} L {}, {} L {}, {} z'.format(
                    xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax))
                intersecting_paths = [deletion_path for deletion_path in deletion_paths
                                      if do_paths_intersect_saveMode(deletion_path.path, word_path)]
                if len(intersecting_paths) > 0:
                    word.deleted = True
                    for deletion_path in intersecting_paths:
                        if deletion_path not in self.word_deletion_paths:
                            deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                            deletion_path.attach_object_to_tree(self.page_tree)
                            self.word_deletion_paths.append(deletion_path)
        if bar is not None:
            bar.finish()
        # return those paths in deletion_paths that are not in self.word_deletion_paths
        return list(set(deletion_paths) - set(self.word_deletion_paths))
def do_paths_intersect_saveMode(path1, path2):
    """Returns true if paths intersect, false if not or if there was an exception.

    Args:
        path1: path object providing intersect(other, justonemode=...).
        path2: the path that is tested against path1.

    [:return:] (bool) True if path1.intersect reports an intersection, False if it reports
               none or if it raises an AssertionError.
    """
    try:
        # justonemode=True: one intersection is sufficient to answer the question.
        # The raw result is coerced to bool so that the function returns what it documents.
        return bool(path1.intersect(path2, justonemode=True))
    except AssertionError:
        return False
Event Timeline
Log In to Comment