Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F84253571
page.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Sep 21, 16:01
Size
36 KB
Mime Type
text/x-python
Expires
Mon, Sep 23, 16:01 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
20970973
Attached To
rNIETZSCHEPYTHON nietzsche-python
page.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__  # the maintainer is the author
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from
lxml
import
etree
as
ET
from
os.path
import
isfile
from
progress.bar
import
Bar
from
svgpathtools
import
svg2paths2
,
svg_to_paths
from
svgpathtools.parser
import
parse_path
import
sys
from
.box
import
Box
from
.image
import
Image
,
SVGImage
from
.faksimile_image
import
FaksimileImage
from
.lineNumber
import
LineNumber
from
.mark_foreign_hands
import
MarkForeignHands
from
.matrix
import
Matrix
from
.path
import
Path
from
.positional_word_part
import
PositionalWordPart
from
.text_connection_mark
import
TextConnectionMark
from
.transkriptionField
import
TranskriptionField
from
.writing_process
import
WritingProcess
from
.word
import
Word
from
.word_insertion_mark
import
WordInsertionMark
sys
.
path
.
append
(
'py2ttl'
)
from
class_spec
import
SemanticClass
# Values of the 'metadata/type' node that Page.get_pages_from_xml_file
# uses to distinguish a single page file from a manuscript file.
FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
class Page(SemanticClass):
    """
    This class represents a page.

    Args:
        xml_source_file (str): name of the xml file to be instantiated.
        xml_target_file (str): name of the xml file to which page info will be written.
        title (str): title of the page, written to the 'title' attribute of the page node.
        page_number: number of the page, written to the 'number' attribute of a new page node.
        faksimile_image: image object that will be attached to the page tree (source file only).
        faksimile_svgFile (str): file name stored in a 'faksimile-svg' node (source file only).
        pdfFile (str): file name stored in a 'pdf' node.
        svg_file (str): name of the svg file from which width and height are read.
        orientation (str): value of the 'orientation' attribute of the page node.
        page_type (str): Page.PAGE_RECTO or Page.PAGE_VERSO.
        extract_transkription_field_only (bool): written in lower case to the
            'transkription-field-only' attribute of a newly created page node.
    """
    UNITTESTING = False
    WARNING_MISSING_USE_NODE4PWP = PositionalWordPart.WARN_NO_USE_NODE_FOUND
    WARNING_MISSING_GLYPH_ID4WIM = WordInsertionMark.WARN_NO_GLYPH_ID
    PAGE_RECTO = 'recto'
    PAGE_VERSO = 'verso'

    def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None,
                 faksimile_image=None, faksimile_svgFile=None, pdfFile=None, svg_file=None,
                 orientation='North', page_type=PAGE_VERSO, extract_transkription_field_only=False):
        """Instantiate a page from xml_source_file or prepare a page tree for xml_target_file.

        If xml_source_file is given, the page is read from that file.
        Otherwise, if xml_target_file is given, the existing file is parsed and
        stripped of its word/style/line-number nodes, or a new 'page' tree is created.

        [:raises:] FileNotFoundError if xml_source_file is given but does not exist.
        """
        self.title = title
        self.mark_foreign_hands = []
        self.text_connection_marks = []
        self.line_numbers = []
        self.style_dict = {}
        self.sonderzeichen_list = []
        self.svg_file = None
        self.svg_image = None
        self.pdfFile = None
        self.faksimile_svgFile = None
        self.source = None
        self.number = page_number if page_number is not None else -1
        self.orientation = orientation
        self.page_type = page_type
        self.word_deletion_paths = []
        self.faksimile_image = faksimile_image
        # Defaults for attributes that the branches below set only conditionally,
        # so that every instance has them.
        self.words = []
        self.word_insertion_marks = []
        self.writing_processes = []
        self.letterspacing_list = []
        self.fontsizekey2stage_mapping = {}
        self.width = 0.0
        self.height = 0.0

        if xml_source_file is not None:
            if not isfile(xml_source_file):
                # FileNotFoundError is a subclass of Exception, so existing handlers still work.
                raise FileNotFoundError('File "{}" does not exist!'.format(xml_source_file))
            self._init_from_source_file(xml_source_file, faksimile_image, faksimile_svgFile, pdfFile, svg_file)
        elif xml_target_file is not None:
            self.svg_file = svg_file
            self.pdfFile = pdfFile
            self.faksimile_svgFile = faksimile_svgFile
            if isfile(xml_target_file):
                self._init_from_existing_target_file(xml_target_file, title, orientation)
            else:
                self._init_new_page_tree(title, orientation, page_number, extract_transkription_field_only)
        # NOTE(review): the nesting of this statement was reconstructed; it is a no-op
        # when neither file is given, because self.svg_file is None in that case.
        if self.svg_image is None and self.svg_file is not None:
            self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
            self.svg_image.attach_object_to_tree(self.page_tree)

    def _xpath_first(self, xpath, default=None):
        """Return the first result of xpath on page_tree, or default if there is no result.
        """
        results = self.page_tree.xpath(xpath)
        return results[0] if len(results) > 0 else default

    def _read_svg_size_from_tree(self):
        """Set width and height from the 'svg' node of page_tree (0.0 if the attribute is missing).
        """
        self.width = float(self._xpath_first('.//svg/@width', 0.0))
        self.height = float(self._xpath_first('.//svg/@height', 0.0))

    def _create_svg_image_from_svg_file(self):
        """Read width and height from self.svg_file, create a SVGImage and attach it to page_tree.
        """
        tf = TranskriptionField(self.svg_file)
        self.width = round(tf.documentWidth, 3)
        self.height = round(tf.documentHeight, 3)
        self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
        self.svg_image.attach_object_to_tree(self.page_tree)
        #ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})

    def _init_from_source_file(self, xml_source_file, faksimile_image, faksimile_svgFile, pdfFile, svg_file):
        """Initialize the page from an existing xml_source_file.

        Values found in the file take precedence over the pdfFile, faksimile_svgFile
        and svg_file arguments; an argument is used (and written to the tree)
        only if the file has no corresponding value.
        """
        parser = ET.XMLParser(remove_blank_text=True)
        self.page_tree = ET.parse(xml_source_file, parser)
        root = self.page_tree.getroot()
        self.title = root.get('title')
        self.number = root.get('number')
        self.source = root.get('source')
        self.orientation = root.get('orientation')
        self.page_type = root.get('pageType')
        self.init_words()
        self.add_style(style_node=root.find('.//style'))
        self.pdfFile = self._xpath_first('.//pdf/@file')
        self.faksimile_svgFile = self._xpath_first('.//faksimile-svg/@file')
        svg_image_node = self._xpath_first('.//' + SVGImage.XML_TAG)
        self.svg_image = SVGImage(node=svg_image_node) if svg_image_node is not None else None
        faksimile_image_node = self._xpath_first('.//' + FaksimileImage.XML_TAG)
        self.faksimile_image = FaksimileImage(node=faksimile_image_node)\
                if faksimile_image_node is not None else None
        self.svg_file = self._xpath_first('.//svg/@file')
        self._read_svg_size_from_tree()
        if pdfFile is not None and self.pdfFile is None:
            self.pdfFile = pdfFile
            ET.SubElement(root, 'pdf', attrib={'file': self.pdfFile})
        if faksimile_svgFile is not None and self.faksimile_svgFile is None:
            self.faksimile_svgFile = faksimile_svgFile
            ET.SubElement(root, 'faksimile-svg', attrib={'file': self.faksimile_svgFile})
        if faksimile_image is not None:
            # an explicitly passed faksimile image replaces the one read from the file
            self.faksimile_image = faksimile_image
            self.faksimile_image.attach_object_to_tree(self.page_tree)
        if svg_file is not None and self.svg_file is None:
            self.svg_file = svg_file
            self._create_svg_image_from_svg_file()
        if self.svg_image is not None:
            # fill in whatever the (old style) 'svg' node did not provide
            if self.svg_file is None:
                self.svg_file = self.svg_image.file_name
            if self.width == 0.0:
                self.width = self.svg_image.width
            if self.height == 0.0:
                self.height = self.svg_image.height

    def _init_from_existing_target_file(self, xml_target_file, title, orientation):
        """Initialize the page from an existing xml_target_file and remove the nodes that will be rewritten.
        """
        parser = ET.XMLParser(remove_blank_text=True)
        self.page_tree = ET.parse(xml_target_file, parser)
        root = self.page_tree.getroot()
        self.source = root.get('source')
        if bool(root.get('orientation')):
            self.orientation = root.get('orientation')
        elif orientation is not None:
            root.set('orientation', orientation)
        if bool(root.get('title')):
            self.title = root.get('title')
        elif title is not None:
            root.set('title', title)
        if self.svg_file is None:
            self.svg_file = self._xpath_first('.//svg/@file')
            self._read_svg_size_from_tree()
        elif len(self.page_tree.xpath('.//svg/@file')) == 0:
            self._create_svg_image_from_svg_file()
        else:
            self._read_svg_size_from_tree()
        if self.pdfFile is None:
            self.pdfFile = self._xpath_first('.//pdf/@file')
        elif len(self.page_tree.xpath('.//pdf/@file')) == 0:
            ET.SubElement(root, 'pdf', attrib={'file': self.pdfFile})
        for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG,\
                WritingProcess.XML_TAG, Path.WORD_DELETION_PATH_TAG ]:
            for node in self.page_tree.xpath('//' + xpath2remove):
                node.getparent().remove(node)

    def _init_new_page_tree(self, title, orientation, page_number, extract_transkription_field_only):
        """Create a new page tree with a 'page' root node and write the given values to it.
        """
        self.page_tree = ET.ElementTree(ET.Element('page'))
        root = self.page_tree.getroot()
        if title is not None:
            root.set('title', title)
        if orientation is not None:
            root.set('orientation', orientation)
        root.set('transkription-field-only', str(extract_transkription_field_only).lower())
        if page_number is not None:
            root.set('number', str(page_number))
        if self.pdfFile is not None:
            ET.SubElement(root, 'pdf', attrib={'file': self.pdfFile})
        if self.svg_file is not None:
            self._create_svg_image_from_svg_file()

    def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
        """Adds a list of classes that are sonderzeichen and a style dictionary to page.

        If style_node is given, the three collections are read from its 'class' nodes.
        Otherwise a non-empty style_dict is written to a new 'style' node of page_tree.
        In both cases self.fontsizekey2stage_mapping is rebuilt: it maps each
        style class that has a 'font-size' to a WritingProcess stage.
        """
        # None defaults instead of shared mutable default arguments
        self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else []
        self.letterspacing_list = letterspacing_list if letterspacing_list is not None else []
        self.style_dict = style_dict if style_dict is not None else {}
        if style_node is not None:
            class_nodes = style_node.findall('.//class')
            self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' }\
                    for item in class_nodes }
            self.sonderzeichen_list = [ item.get('name') for item in class_nodes\
                    if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ]
            self.letterspacing_list = [ item.get('name') for item in class_nodes\
                    if bool(item.get('letterspacing-list')) ]
        elif bool(self.style_dict):
            style_node = ET.SubElement(self.page_tree.getroot(), 'style')
            if len(self.sonderzeichen_list) > 0:
                style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
            if len(self.letterspacing_list) > 0:
                style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
            for key, style in self.style_dict.items():
                # the class name is stored in the style entry itself, as before
                style['name'] = key
                ET.SubElement(style_node, 'class', attrib=style)
        fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items()\
                if 'font-size' in value }
        # create a mapping between fontsizes and word stages
        self.fontsizekey2stage_mapping = {}
        if len(fontsize_dict) > 0:
            biggest = max(fontsize_dict.values())
            smallest = min(fontsize_dict.values())
            for fontsize_key, value in fontsize_dict.items():
                if value >= biggest - 1:
                    stage = WritingProcess.FIRST_VERSION
                elif value <= smallest + 1:
                    stage = WritingProcess.LATER_INSERTION_AND_ADDITION
                else:
                    stage = WritingProcess.INSERTION_AND_ADDITION
                self.fontsizekey2stage_mapping[fontsize_key] = stage

    def add_source(self, source):
        """Adds a source to page and attaches it to page_tree.
        """
        self.source = source
        self.page_tree.getroot().set('source', self.source)

    def categorize_paths(self, transkription_field=None):
        """Categorize all paths that are part of the transkription field.

        Words intersecting with deletion paths are marked as deleted and
        word boxes are processed as a side effect.

        :return: a dictionary containing a list for each category of path.
            An empty dictionary is returned if there is no source file and Page.UNITTESTING is set.
        [:raises:] FileNotFoundError if there is no source file and Page.UNITTESTING is not set.
        """
        if self.source is None or not isfile(self.source):
            if not Page.UNITTESTING:
                error_msg = 'Svg source file {} does not exist!'.format(self.source)\
                        if self.source is not None else 'Page does not contain a source file!'
                raise FileNotFoundError(error_msg)
            return {}
        MAX_HEIGHT_LINES = 1
        # height of the highest line with an even id (+ 2); 17 if there is no such line
        even_line_heights = [ line_number.bottom - line_number.top\
                for line_number in self.line_numbers if line_number.id % 2 == 0 ]
        max_line = max(even_line_heights) + 2 if len(even_line_heights) > 0 else 17
        tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
        tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
        paths, attributes = svg_to_paths.svg2paths(self.source)
        allpaths_on_tf = []
        allpaths_outside_tf = []
        attributes_outside_tf = []
        if transkription_field is not None:
            for index, (path, attribute) in enumerate(zip(paths, attributes)):
                if len(path) == 0 or path == transkription_field.path:
                    continue
                xmin, xmax, _, _ = path.bbox()
                if xmin > tr_xmin and xmax < transkription_field.xmax:
                    allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class')))
                else:
                    allpaths_outside_tf.append(path)
                    attributes_outside_tf.append(attribute)
        path_dict = { 'text_area_deletion_paths': [],\
                    'deletion_or_underline_paths': [],\
                    'box_paths': [],\
                    'dots_paths': [],\
                    'word_connector_paths': [],\
                    'uncategorized_paths': [] }
        for mypath in allpaths_on_tf:
            xmin, xmax, ymin, ymax = mypath.path.bbox()
            width = abs(xmax - xmin)
            height = abs(ymax - ymin)
            start_line_number = self.get_line_number(mypath.path.start.imag - tr_ymin)
            if width < 1 and height < 1:
                category = 'dots_paths'
            elif MAX_HEIGHT_LINES < height < max_line\
                    and mypath.path.iscontinuous() and mypath.path.isclosed():
                category = 'box_paths'
            elif height > MAX_HEIGHT_LINES and height > max_line\
                    and mypath.path.iscontinuous() and not mypath.path.isclosed():
                category = 'word_connector_paths'
            elif height < MAX_HEIGHT_LINES:
                category = 'deletion_or_underline_paths'
            elif start_line_number != -1\
                    and start_line_number != self.get_line_number(mypath.path.end.imag - tr_ymin):
                # the path starts and ends on different lines
                category = 'text_area_deletion_paths'
            else:
                category = 'uncategorized_paths'
            path_dict[category].append(mypath)
        underline_path = self.mark_words_intersecting_with_paths_as_deleted(\
                path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin)
        path_dict.update({'underline_path': underline_path})
        self.process_word_boxes(path_dict.get('box_paths'), transkription_field,\
                paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)
        return path_dict

    def create_writing_processes_and_attach2tree(self):
        """Creates three stages of Nietzsche's process of writing.

        The stages are attached to page_tree, and each transkription position gets
        the stage id that fontsizekey2stage_mapping assigns to one of the style
        classes of its first positional word part.
        """
        self.writing_processes = [ WritingProcess(version=WritingProcess.FIRST_VERSION),\
                WritingProcess(version=WritingProcess.INSERTION_AND_ADDITION),\
                WritingProcess(version=WritingProcess.LATER_INSERTION_AND_ADDITION) ]
        for writing_process in self.writing_processes:
            writing_process.attach_object_to_tree(self.page_tree)
        for word in self.words:
            for transkription_position in word.transkription_positions:
                if len(transkription_position.positional_word_parts) == 0:
                    # nothing to derive a stage from
                    continue
                for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
                    if font_key in self.fontsizekey2stage_mapping:
                        transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key)

    def find_special_words(self, transkription_field=None):
        """Find special words, remove them from words, process their content.

        [:raises:] FileNotFoundError if the page has no existing source file.
        """
        if self.source is None or not isfile(self.source):
            raise FileNotFoundError('Page does not have a source!')
        if transkription_field is None:
            transkription_field = TranskriptionField(self.source)
        # build a new list: '+=' would extend the list returned by the first call in place
        special_char_list = list(MarkForeignHands.get_special_char_list())\
                + list(TextConnectionMark.get_special_char_list())
        single_char_words = [ word for word in self.words if len(word.text) == 1 and word.text in special_char_list ]
        for word in single_char_words:
            if word.text == MarkForeignHands.CLASS_MARK:
                mark_id = len(self.mark_foreign_hands)
                self.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=mark_id))
                self.words.remove(word)
            elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\
                    or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\
                        and any(style in self.sonderzeichen_list for style\
                            in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))):
                mark_id = len(self.text_connection_marks)
                self.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=mark_id))
                self.words.remove(word)
        svg_tree = ET.parse(self.source)
        self.update_page_type(transkription_field=transkription_field)
        self.update_line_number_area(transkription_field, svg_tree=svg_tree)
        italic_classes = [ key for key in self.style_dict\
                if bool(self.style_dict[key].get('font-family')) and self.style_dict[key]['font-family'].endswith('Italic') ]
        if len(self.mark_foreign_hands) > 0:
            MarkForeignHands.find_content(self.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\
                    SonderzeichenList=self.sonderzeichen_list)
        if len(self.text_connection_marks) > 0:
            TextConnectionMark.find_content_in_footnotes(self.text_connection_marks, transkription_field, svg_tree,\
                    title=self.title, page_number=self.number)

    def get_biggest_fontSize4styles(self, style_set=None):
        """Returns biggest font size from style_dict for a set of style class names.

        Style class names that are not in style_dict or that have no 'font-size' are ignored.

        [:returns:] (float) biggest font size OR 1 if style_dict is empty or no font size is found
        """
        if style_set is None or not bool(self.style_dict):
            return 1
        font_sizes = [ float(self.style_dict[key]['font-size'].replace('px',''))\
                for key in style_set\
                if bool(self.style_dict.get(key, {}).get('font-size')) ]
        return max(font_sizes) if len(font_sizes) > 0 else 1

    def get_line_number(self, y):
        """Returns line number id for element at y.

        [:return:] (int) line number id or -1
        """
        for line_number in self.line_numbers:
            if line_number.top <= y <= line_number.bottom:
                return line_number.id
        return -1

    @classmethod
    def get_pages_from_xml_file(cls, xml_file, status_contains='', word_selection_function=None):
        """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
        or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
        [optional: instantiation depends on the fulfilment of a status_contains
        and/or on the selection of some words by a word_selection_function].

        An empty list is returned if the file has no 'metadata/type' node or an unknown type.
        """
        source_tree = ET.parse(xml_file)
        type_node = source_tree.getroot().find('metadata/type')
        file_type = type_node.text if type_node is not None else None
        if file_type == FILE_TYPE_SVG_WORD_POSITION:
            page = cls(xml_source_file=xml_file)
            if word_selection_function is None or len(word_selection_function(page.words)) > 0:
                return [ page ]
            return []
        if file_type == FILE_TYPE_XML_MANUSCRIPT:
            if status_contains == '':
                output_files = source_tree.xpath('//page/@output')
            else:
                # pass the status as an XPath variable so that quotes in it cannot break the expression
                output_files = source_tree.xpath('//page[contains(@status, $status)]/@output', status=status_contains)
            pages = []
            for xml_source_file in output_files:
                if isfile(xml_source_file):
                    pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
            return pages
        return []

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.

        [:return:] a dictionary with the keys 'class' and 'properties'
        """
        page_id_xpath = '/page/@number|/page/@title'
        properties = { 'number': (str, 1, '/page/@number'),\
                'faksimile_image': { 'class': FaksimileImage, 'cardinality': 1, 'xpath': '/page/{}'.format(FaksimileImage.XML_TAG)},\
                'line_numbers': (LineNumber, SemanticClass.LIST, page_id_xpath),\
                'orientation': { 'class': str, 'cardinality': 1, 'xpath': '/page/@orientation'},\
                'words': (Word, SemanticClass.LIST, page_id_xpath),\
                'svg_image': { 'class': SVGImage, 'cardinality': 1, 'xpath': '/page/{}'.format(SVGImage.XML_TAG)},\
                'writing_processes': (WritingProcess, SemanticClass.LIST, page_id_xpath),\
                'word_deletion_paths': (Path, SemanticClass.LIST, page_id_xpath),\
                'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST, page_id_xpath)}
        return { 'class': cls.get_class_dictionary(), 'properties': properties }

    def init_line_numbers(self, line_numbers, document_bottom):
        """Init line numbers.

        The given line_numbers are interleaved with new LineNumber objects that
        cover the areas above, between and below them (with a margin of MINABOVE).
        All resulting line numbers are attached to page_tree.
        """
        MINABOVE = 1
        self.line_numbers = []
        if len(line_numbers) > 0:
            # area from the top of the document to the first given line
            self.line_numbers.append(LineNumber(id=1, top=0, bottom=line_numbers[0].top - MINABOVE))
            self.line_numbers.append(line_numbers[0])
            for previous_line, current_line in zip(line_numbers, line_numbers[1:]):
                # area between two given lines
                self.line_numbers.append(LineNumber(id=current_line.id - 1,\
                        top=previous_line.bottom + MINABOVE,\
                        bottom=current_line.top - MINABOVE))
                self.line_numbers.append(current_line)
            # area from the last given line to the bottom of the document
            self.line_numbers.append(LineNumber(id=line_numbers[-1].id + 1,\
                    top=line_numbers[-1].bottom + MINABOVE,\
                    bottom=document_bottom))
        for line_number in self.line_numbers:
            line_number.attach_object_to_tree(self.page_tree)

    def init_words(self):
        """Instantiate words, marks, line numbers, writing processes and deletion paths from page_tree.
        """
        root = self.page_tree.getroot()
        self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node)\
                for wim_node in root.xpath('//' + WordInsertionMark.XML_TAG) ]
        # only direct 'word' children of the root node become words of the page
        self.words = [ Word.create_cls(word_node) for word_node in root.xpath('./word') ]
        self.mark_foreign_hands = [ MarkForeignHands.create_cls(node)\
                for node in root.xpath('//' + MarkForeignHands.XML_TAG) ]
        self.text_connection_marks = [ TextConnectionMark.create_cls(node)\
                for node in root.xpath('//' + TextConnectionMark.XML_TAG) ]
        self.line_numbers = [ LineNumber(xml_text_node=line_number_node)\
                for line_number_node in root.xpath('//' + LineNumber.XML_TAG) ]
        self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words)\
                for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
        self.word_deletion_paths = [ Path(node=node)\
                for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG) ]

    def is_locked(self):
        """Return true if page is locked.
        """
        return len(self.page_tree.xpath('//metadata/lock')) > 0

    def lock(self, reference_file, message=''):
        """Lock tree such that ids of words etc. correspond to ids
        in reference_file, optionally add a message that will be shown.
        """
        if self.is_locked():
            return
        metadata_nodes = self.page_tree.xpath('./metadata')
        metadata = metadata_nodes[0] if len(metadata_nodes) > 0\
                else ET.SubElement(self.page_tree.getroot(), 'metadata')
        lock = ET.SubElement(metadata, 'lock')
        ET.SubElement(lock, 'reference-file').text = reference_file
        if message != '':
            ET.SubElement(lock, 'message').text = message

    def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
        """Marks all words that intersect with deletion paths as deleted
        and adds these paths to word_deletion_paths.

        [:return:] list of .path.Path that might be word_underline_paths
        """
        # the progress bar is only shown outside of unit tests
        bar = None if Page.UNITTESTING else Bar('mark words that intersect with deletion paths', max=len(self.words))
        for word in self.words:
            if bar is not None:
                bar.next()
            word.deleted = False
            for transkription_position in word.transkription_positions:
                word_path = Path.create_path_from_transkription_position(transkription_position,\
                        tr_xmin=tr_xmin, tr_ymin=tr_ymin)
                intersecting_paths = [ deletion_path for deletion_path in deletion_paths\
                        if do_paths_intersect_saveMode(deletion_path.path, word_path.path) ]
                if len(intersecting_paths) > 0:
                    transkription_position.deleted = True
                    for deletion_path in intersecting_paths:
                        if deletion_path not in self.word_deletion_paths:
                            deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                            deletion_path.attach_object_to_tree(self.page_tree)
                            self.word_deletion_paths.append(deletion_path)
            # NOTE(review): nesting reconstructed -- these calls are assumed to run once per word
            word.partition_according_to_writing_process_id()
            word.partition_according_to_deletion()
        if bar is not None:
            bar.finish()
        # return those paths in deletion_paths that are not in self.word_deletion_paths
        return [ word_underline_path for word_underline_path in set(deletion_paths) - set(self.word_deletion_paths) ]

    def process_word_boxes(self, box_paths, transkription_field, paths=None, attributes=None, max_line=17):
        """Process word boxes: partition words according to word boxes.

        If paths or attributes is None, both are read from the source file.
        Paths beside the transkription field (left of it on verso pages, right of it
        otherwise) that are lower than max_line are passed to Box.create_box as
        margin boxes of the line on which they are located.
        """
        bar = None if Page.UNITTESTING else Bar('process word boxes', max=len(self.words))
        svg_tree = ET.parse(self.source)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        allpaths_on_margin_field = []
        if paths is None or attributes is None:
            paths, attributes = svg_to_paths.svg2paths(self.source)
        for index, (path, attribute) in enumerate(zip(paths, attributes)):
            # check for empty paths first: the bounding box is only requested for non-empty paths
            if len(path) == 0 or path == transkription_field.path:
                continue
            xmin, xmax, ymin, ymax = path.bbox()
            if transkription_field.is_page_verso():
                is_on_margin = xmax < transkription_field.xmin
            else:
                is_on_margin = xmin > transkription_field.xmax
            if is_on_margin and abs(ymax - ymin) < max_line:
                allpaths_on_margin_field.append(Path(id=index, path=path, style_class=attribute.get('class')))
        # group the box paths by the line on which they are located
        box_line_number_dict = {}
        for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
            line_number = self.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin))
            box_line_number_dict.setdefault(line_number, []).append(box_path)
        boxes = []
        for line_number, box_paths_of_line in box_line_number_dict.items():
            box_paths_on_line = sorted(box_paths_of_line, key=lambda path: path.get_x())
            margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
                    if self.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\
                    key=lambda path: path.get_x())
            threshold = 3 if line_number % 2 == 0 else 1.5
            for box_path in box_paths_on_line:
                box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
                        transkription_field=transkription_field, namespaces=namespaces, threshold=threshold)
                if box is not None:
                    boxes.append(box)
        for word in self.words:
            if bar is not None:
                bar.next()
            word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
        if bar is not None:
            bar.finish()

    def unlock(self):
        """Unlock tree: remove the lock node from the metadata of page_tree if the page is locked.
        """
        if self.is_locked():
            lock = self.page_tree.xpath('//metadata/lock')[0]
            lock.getparent().remove(lock)

    def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=None):
        """Update word ids and attach them to page.page_tree.

        Nothing is done if the page is locked.

        Args:
            update_function_on_word: a callable or a list of callables; each one is called
                with every word before the word is attached to the tree.
            include_special_words_of_type: list of classes (MarkForeignHands, TextConnectionMark)
                on whose instances the update functions are called as well.
        """
        if self.is_locked():
            return
        if include_special_words_of_type is None:
            include_special_words_of_type = []
        update_functions = list(update_function_on_word)\
                if isinstance(update_function_on_word, (list, tuple))\
                else [ update_function_on_word ]
        # keep only what can be called (this drops the default None);
        # the original code tested the list itself and therefore never called any function
        update_functions = [ func for func in update_functions if callable(func) ]
        for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
            node.getparent().remove(node)
        for index, word in enumerate(self.words):
            word.id = index
            for func in update_functions:
                func(word)
            word.attach_word_to_tree(self.page_tree)
        for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
            mark_foreign_hands.id = index
            if MarkForeignHands in include_special_words_of_type:
                for func in update_functions:
                    func(mark_foreign_hands)
            mark_foreign_hands.attach_word_to_tree(self.page_tree)
        for index, text_connection_mark in enumerate(self.text_connection_marks):
            text_connection_mark.id = index
            if TextConnectionMark in include_special_words_of_type:
                for func in update_functions:
                    func(text_connection_mark)
            text_connection_mark.attach_word_to_tree(self.page_tree)

    def update_line_number_area(self, transkription_field, svg_tree=None):
        """Determines the width of the area where the line numbers are written in the page.source file.

        The result is passed to transkription_field.add_line_number_area_width.
        Nothing is done if the page has less than two line numbers or
        if no text node of the reference line number is found in svg_tree.
        """
        THRESHOLD = 0.4
        if svg_tree is None:
            svg_tree = ET.parse(self.source)
        if len(self.line_numbers) <= 1:
            return
        # index 9 exists only if there are more than 9 line numbers
        line_number = self.line_numbers[9]\
                if transkription_field.is_page_verso() and len(self.line_numbers) > 9\
                else self.line_numbers[1]
        ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
                if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
                and LineNumber.IS_A_LINE_NUMBER(item)\
                and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ]
        if len(ln_nodes) == 0:
            return
        matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
        if transkription_field.is_page_verso():
            transkription_field.add_line_number_area_width(matrix.getX())
        elif self.svg_file is not None and isfile(self.svg_file):
            # add the width of the glyph found at the position of the line number
            svg_path_tree = ET.parse(self.svg_file)
            namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
            svg_x = matrix.getX()
            svg_y = self.line_numbers[1].bottom + transkription_field.ymin
            use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
                    .format(svg_x-THRESHOLD, svg_x+THRESHOLD, svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
            if len(use_nodes) > 0:
                symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
                d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
                if len(d_strings) > 0 and d_strings[0] != '':
                    path = parse_path(d_strings[0])
                    xmin, xmax, ymin, ymax = path.bbox()
                    width = xmax - xmin
                    transkription_field.add_line_number_area_width(matrix.getX() + width)

    def update_page_type(self, transkription_field=None):
        """Sets page_type to verso or recto according to transkription_field
        and writes it to the 'pageType' attribute of page_tree.

        If transkription_field is None, it is created from the source file of the page.

        [:raises:] FileNotFoundError if transkription_field is None and the page has no existing source file.
        """
        if transkription_field is None:
            if self.source is None or not isfile(self.source):
                raise FileNotFoundError('Page does not have a source!')
            transkription_field = TranskriptionField(self.source)
        self.page_type = Page.PAGE_VERSO\
                if transkription_field.is_page_verso()\
                else Page.PAGE_RECTO
        self.page_tree.getroot().set('pageType', self.page_type)
def do_paths_intersect_saveMode(path1, path2):
    """Safely check whether path1 intersects with path2.

    [:return:] the result of path1.intersect(path2, justonemode=True),
        or False if that call raises an AssertionError.
    """
    try:
        intersection = path1.intersect(path2, justonemode=True)
    except AssertionError:
        return False
    return intersection
Event Timeline
Log In to Comment