Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F62203825
super_page.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, May 11, 14:49
Size
13 KB
Mime Type
text/x-python
Expires
Mon, May 13, 14:49 (2 d)
Engine
blob
Format
Raw Data
Handle
17618569
Attached To
rNIETZSCHEPYTHON nietzsche-python
super_page.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a super page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata (authorship, licence and version information).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from
lxml
import
etree
as
ET
from
os.path
import
isfile
,
basename
,
dirname
from
progress.bar
import
Bar
from
svgpathtools
import
svg2paths2
,
svg_to_paths
from
svgpathtools.parser
import
parse_path
import
sys
import
warnings
from
.image
import
Image
,
SVGImage
from
.faksimile_image
import
FaksimileImage
from
.mark_foreign_hands
import
MarkForeignHands
from
.text_connection_mark
import
TextConnectionMark
from
.text_field
import
TextField
from
.writing_process
import
WritingProcess
class SuperPage:
    """
    This super class represents a page.

    Args:
        xml_file (str): name of the xml file that holds the page data. If the
            file exists it must be of type FILE_TYPE_SVG_WORD_POSITION.
        title (str): default value for the page title.
        page_number (str): default value for the page number (converted with ``str``).
        orientation (str): default value for the page orientation.
        page_type (str): default value for the page type (e.g. PAGE_RECTO, PAGE_VERSO).
        should_xml_file_exist (bool): if True, a missing xml_file raises
            a FileNotFoundError instead of creating an empty page tree.
    """
    FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
    FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
    PAGE_RECTO = 'recto'
    PAGE_VERSO = 'verso'
    STATUS_MERGED_OK = 'faksimile merged'
    STATUS_POSTMERGED_OK = 'words processed'
    UNITTESTING = False
    XML_TAG = 'page'

    def __init__(self, xml_file, title=None, page_number='', orientation='North', page_type=PAGE_VERSO, should_xml_file_exist=False):
        # Maps a property key to a tuple (xpath, default value, class); see
        # init_property for how the three parts are used.
        self.properties_dictionary = {
            'faksimile_image': (FaksimileImage.XML_TAG, None, FaksimileImage),
            'faksimile_svgFile': ('data-source/@file', None, str),
            'number': ('page/@number', str(page_number), str),
            'orientation': ('page/@orientation', orientation, str),
            'page_type': ('page/@pageType', page_type, str),
            'pdfFile': ('pdf/@file', None, str),
            'source': ('page/@source', None, str),
            'svg_file': ('svg/@file', None, str),
            'svg_image': (SVGImage.XML_TAG, None, SVGImage),
            'text_field': (FaksimileImage.XML_TAG + '/' + TextField.XML_TAG, None, TextField),
            'title': ('page/@title', title, str),
        }
        # Keys of properties that have already been initialized on this instance.
        self.online_properties = []
        self.fontsizekey2stage_mapping = {}
        self.letterspacing_list = []
        self.line_numbers = []
        self.lines = []
        self.mark_foreign_hands = []
        self.page_tree = None
        self.sonderzeichen_list = []
        self.style_dict = {}
        self.text_connection_marks = []
        self.word_deletion_paths = []
        self.word_insertion_marks = []
        self.words = []
        self.writing_processes = []
        self.xml_file = xml_file
        if not self.is_page_source_xml_file():
            # The class constant has to be accessed via self: a bare class
            # attribute name is not in scope inside a method.
            msg = f'ERROR: xml_source_file {self.xml_file} is not of type "{self.FILE_TYPE_SVG_WORD_POSITION}"'
            raise Exception(msg)
        self._init_tree(should_xml_file_exist=should_xml_file_exist)

    def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
        """Adds a list of classes that are sonderzeichen and a style dictionary to page.

        If style_node is given, the style information is read from its 'class'
        descendants and the other arguments are ignored. Otherwise, if
        style_dict is not empty, a new 'style' node is written to the page tree.
        Finally the mapping fontsizekey2stage_mapping is (re)created.

        Args:
            sonderzeichen_list (list): names of style classes that are sonderzeichen.
            letterspacing_list (list): names of style classes with letterspacing.
            style_dict (dict): maps style class names to dictionaries of attributes.
            style_node: node with 'class' descendants from which styles are read.
        """
        # Use fresh containers per call: mutable defaults would be shared
        # between all pages that call add_style without arguments.
        self.sonderzeichen_list = [] if sonderzeichen_list is None else sonderzeichen_list
        self.letterspacing_list = [] if letterspacing_list is None else letterspacing_list
        self.style_dict = {} if style_dict is None else style_dict
        if style_node is not None:
            class_nodes = style_node.findall('.//class')
            self.style_dict = {
                item.get('name'): {key: value for key, value in item.attrib.items() if key != 'name'}
                for item in class_nodes
            }
            self.sonderzeichen_list = [
                item.get('name') for item in class_nodes
                if 'Sonderzeichen' in (item.get('font-family') or '')
            ]
            self.letterspacing_list = [
                item.get('name') for item in class_nodes
                if item.get('letterspacing-list')
            ]
        elif self.style_dict:
            style_node = ET.SubElement(self.page_tree.getroot(), 'style')
            if self.sonderzeichen_list:
                style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
            if self.letterspacing_list:
                style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
            for key, attributes in self.style_dict.items():
                # NOTE: the class name is stored inside the attribute dictionary
                # itself, i.e. style_dict entries gain a 'name' key here.
                attributes['name'] = key
                ET.SubElement(style_node, 'class', attrib=attributes)
        fontsize_dict = {
            key: float(value.get('font-size').replace('px', ''))
            for key, value in self.style_dict.items() if 'font-size' in value
        }
        # create a mapping between fontsizes and word stages
        self.fontsizekey2stage_mapping = {}
        if fontsize_dict:
            biggest_fontsize = max(fontsize_dict.values())
            smallest_fontsize = min(fontsize_dict.values())
            for fontsize_key, fontsize in fontsize_dict.items():
                # Sizes within 1 of the biggest size count as first version,
                # sizes within 1 of the smallest size as later insertion.
                if fontsize >= biggest_fontsize - 1:
                    stage = WritingProcess.FIRST_VERSION
                elif fontsize <= smallest_fontsize + 1:
                    stage = WritingProcess.LATER_INSERTION_AND_ADDITION
                else:
                    stage = WritingProcess.INSERTION_AND_ADDITION
                self.fontsizekey2stage_mapping[fontsize_key] = stage

    def get_biggest_fontSize4styles(self, style_set=None):
        """Returns biggest font size from style_dict for a set of style class names.

        Class names that are missing from style_dict or that have no
        'font-size' are skipped.

        [:returns:] (float) biggest font size OR 1 if style_dict is empty
                    or no font size has been found
        """
        if not self.style_dict or not style_set:
            return 1
        font_sizes = [
            float(self.style_dict[key]['font-size'].replace('px', ''))
            for key in style_set
            if self.style_dict.get(key, {}).get('font-size')
        ]
        return max(font_sizes) if font_sizes else 1

    def get_line_number(self, y):
        """Returns line number id for element at y.

        The first line number whose top/bottom range contains y is used.

        [:return:] (int) line number id or -1
        """
        for line_number in self.line_numbers:
            if line_number.top <= y <= line_number.bottom:
                return line_number.id
        return -1

    def init_all_properties(self, overwrite=False):
        """Initialize all properties that have not been initialized yet.
        """
        for property_key in self.properties_dictionary:
            if property_key not in self.online_properties:
                self.init_property(property_key, overwrite=overwrite)

    def init_property(self, property_key, value=None, overwrite=False):
        """Initialize a single property.

        If no value is passed, the property is read from the page tree (first
        xpath match) or, failing that, from the default value in
        properties_dictionary. If a value is passed, it is set on the instance
        and written to the page tree, provided the property has not been
        initialized yet or overwrite is True.

        Args:
            property_key: key of property in self.__dict__
            value: new value to set to property
            overwrite: whether or not to update values from xml_file (default: read only)
        """
        if value is None:
            if property_key in self.online_properties:
                return
            xpath, value, cls = self.properties_dictionary.get(property_key)
            matches = self.page_tree.xpath('//' + xpath)
            if len(matches) > 0:
                value = matches[0]
            if value is None:
                self.__dict__[property_key] = None
            elif cls.__module__ == 'builtins':
                self.update_tree(value, xpath)
                self.__dict__[property_key] = cls(value)
            else:
                # Wrap tree nodes in the property's class, keep ready-made objects.
                if not isinstance(value, cls):
                    value = cls(node=value)
                self.__dict__[property_key] = value
                value.attach_object_to_tree(self.page_tree)
            self.online_properties.append(property_key)
        elif overwrite or property_key not in self.online_properties:
            xpath, default_value, cls = self.properties_dictionary.get(property_key)
            if cls.__module__ == 'builtins':
                self.__dict__[property_key] = cls(value)
                self.update_tree(value, xpath)
            else:
                self.__dict__[property_key] = value
                value.attach_object_to_tree(self.page_tree)
            # Overwriting must not register the key a second time.
            if property_key not in self.online_properties:
                self.online_properties.append(property_key)

    def is_locked(self):
        """Return true if page is locked, i.e. if the tree contains a 'metadata/lock' node.
        """
        return len(self.page_tree.xpath('//metadata/lock')) > 0

    def is_page_source_xml_file(self, source_tree=None):
        """Return true if xml_file is of type FILE_TYPE_SVG_WORD_POSITION.

        A file that does not exist (yet) is accepted as well. A file without
        a 'metadata/type' node is not of the expected type.

        Args:
            source_tree: already parsed tree of xml_file (default: parse xml_file).
        """
        if not isfile(self.xml_file):
            return True
        if source_tree is None:
            source_tree = ET.parse(self.xml_file)
        type_node = source_tree.getroot().find('metadata/type')
        return type_node is not None and type_node.text == self.FILE_TYPE_SVG_WORD_POSITION

    def lock(self, reference_file, message=''):
        """Lock tree such that ids of words etc. correspond to ids
        in reference_file, optionally add a message that will be shown.

        Adds a 'lock' node with a 'reference-file' child (and a 'message'
        child if message is not empty) to the 'metadata' node, creating the
        latter if necessary. Does nothing if the page is already locked.
        """
        if self.is_locked():
            return
        metadata_nodes = self.page_tree.xpath('./metadata')
        metadata = metadata_nodes[0]\
                if len(metadata_nodes) > 0\
                else ET.SubElement(self.page_tree.getroot(), 'metadata')
        lock = ET.SubElement(metadata, 'lock')
        ET.SubElement(lock, 'reference-file').text = reference_file
        if message != '':
            ET.SubElement(lock, 'message').text = message

    def unlock(self):
        """Unlock tree by removing the first 'metadata/lock' node, if the page is locked.
        """
        if self.is_locked():
            lock = self.page_tree.xpath('//metadata/lock')[0]
            lock.getparent().remove(lock)

    def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=None):
        """Update word ids and attach them to page.page_tree.

        All word, mark-foreign-hands and text-connection-mark nodes are
        removed from the tree and attached again with ids that correspond to
        their index. Prints 'locked' and leaves the tree untouched if the
        page is locked.

        Args:
            update_function_on_word: a function or a list of functions that is
                called on each word before it is attached to the tree.
            include_special_words_of_type (list): classes (MarkForeignHands,
                TextConnectionMark) on whose instances the update functions
                should be called as well.
        """
        if self.is_locked():
            print('locked')
            return
        special_word_types = [] if include_special_words_of_type is None else include_special_words_of_type
        update_functions = update_function_on_word\
                if isinstance(update_function_on_word, list)\
                else [update_function_on_word]
        # Drop everything that cannot be called (e.g. the default None).
        update_functions = [func for func in update_functions if callable(func)]
        for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
            node.getparent().remove(node)
        for index, word in enumerate(self.words):
            word.id = index
            for func in update_functions:
                func(word)
            word.attach_word_to_tree(self.page_tree)
        for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
            mark_foreign_hands.id = index
            if MarkForeignHands in special_word_types:
                for func in update_functions:
                    func(mark_foreign_hands)
            mark_foreign_hands.attach_word_to_tree(self.page_tree)
        for index, text_connection_mark in enumerate(self.text_connection_marks):
            text_connection_mark.id = index
            if TextConnectionMark in special_word_types:
                for func in update_functions:
                    func(text_connection_mark)
            text_connection_mark.attach_word_to_tree(self.page_tree)

    def update_property_dictionary(self, property_key, default_value):
        """Update the default value of an entry of properties_dictionary.

        Raises:
            Exception: if properties_dictionary does not contain property_key.
        """
        content = self.properties_dictionary.get(property_key)
        if content is None:
            msg = f'ERROR: properties_dictionary does not contain a key {property_key}!'
            raise Exception(msg)
        self.properties_dictionary[property_key] = (content[0], default_value, content[2])

    def update_tree(self, value, xpath):
        """Update tree: write value to the attribute that xpath refers to.

        Args:
            value: new attribute value (converted with ``str``).
            xpath: path of the form 'node/@attribute'; the node is created
                as a child of the root if it does not exist.
        """
        # dirname/basename split 'node/@attribute' into its node and attribute part.
        node_name = dirname(xpath)
        attribute_name = basename(xpath).replace('@', '')
        nodes = self.page_tree.xpath('//' + node_name)
        node = nodes[0]\
                if len(nodes) > 0\
                else ET.SubElement(self.page_tree.getroot(), node_name)
        node.set(attribute_name, str(value))

    def _init_tree(self, should_xml_file_exist=False):
        """Initialize page_tree from xml_file if it exists.

        If xml_file does not exist, an empty tree with root 'page' is created,
        unless should_xml_file_exist is True.

        Raises:
            FileNotFoundError: if xml_file is missing and should_xml_file_exist is True.
        """
        if isfile(self.xml_file):
            parser = ET.XMLParser(remove_blank_text=True)
            self.page_tree = ET.parse(self.xml_file, parser)
        elif not should_xml_file_exist:
            self.page_tree = ET.ElementTree(ET.Element('page'))
            self.page_tree.docinfo.URL = self.xml_file
        else:
            msg = f'ERROR: xml_source_file {self.xml_file} does not exist!'
            raise FileNotFoundError(msg)
Event Timeline
Log In to Comment