Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F60568416
page.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, May 1, 03:59
Size
16 KB
Mime Type
text/x-python
Expires
Fri, May 3, 03:59 (2 d)
Engine
blob
Format
Raw Data
Handle
17376427
Attached To
rNIETZSCHEPYTHON nietzsche-python
page.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata: authorship, licensing and version information.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from
lxml
import
etree
as
ET
from
os.path
import
isfile
from
progress.bar
import
Bar
from
svgpathtools
import
svg2paths2
,
svg_to_paths
from
svgpathtools.parser
import
parse_path
import
sys
import
warnings
from
.box
import
Box
from
.color
import
Color
from
.image
import
Image
,
SVGImage
from
.faksimile_image
import
FaksimileImage
from
.faksimile_position
import
FaksimilePosition
from
.lineNumber
import
LineNumber
from
.line
import
Line
from
.mark_foreign_hands
import
MarkForeignHands
from
.matrix
import
Matrix
from
.path
import
Path
from
.positional_word_part
import
PositionalWordPart
from
.super_page
import
SuperPage
from
.style
import
Style
from
.text_connection_mark
import
TextConnectionMark
from
.text_field
import
TextField
from
.transkriptionField
import
TranskriptionField
from
.writing_process
import
WritingProcess
from
.word
import
Word
from
.word_deletion_path
import
WordDeletionPath
from
.word_insertion_mark
import
WordInsertionMark
sys
.
path
.
append
(
'py2ttl'
)
from
class_spec
import
SemanticClass
# Module-level aliases for the constants defined on SuperPage, so that they
# can be referenced directly from this module.
FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT
STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK
STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK
class
Page
(
SemanticClass
,
SuperPage
):
"""
This class represents a page.
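Args:
xml_source_file (str): name of the xml file to be instantiated.
faksimile_image: (optional) value stored as the 'faksimile_image' property of the page.
faksimile_svgFile: (optional) value stored as the 'faksimile_svgFile' property of the page.
add_deletion_paths_to_words (bool): if True, deletion paths are added to the deleted words after instantiation.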
"""
UNITTESTING
=
False
def __init__(self, xml_source_file, faksimile_image=None, faksimile_svgFile=None, add_deletion_paths_to_words=True):
    """Instantiate a page from its xml source file.

    Args:
        xml_source_file (str): name of the xml file to be instantiated.
        faksimile_image: value registered for the 'faksimile_image' property.
        faksimile_svgFile: value registered for the 'faksimile_svgFile' property.
        add_deletion_paths_to_words (bool): if True, self.add_deletion_paths_to_words()
            is called once all node objects have been initialized.
    """
    super(Page, self).__init__(xml_source_file)
    # Register the optional faksimile information before the properties are initialized.
    optional_properties = (
        ('faksimile_image', faksimile_image),
        ('faksimile_svgFile', faksimile_svgFile),
    )
    for property_key, property_value in optional_properties:
        self.update_property_dictionary(property_key, property_value)
    self.init_all_properties()
    style_node = self.page_tree.getroot().find('.//style')
    self.add_style(style_node=style_node)
    self.init_node_objects()
    # Inside this method the parameter is a plain flag; the method of the same
    # name is reached through the instance.
    if add_deletion_paths_to_words:
        self.add_deletion_paths_to_words()
def add_deletion_paths_to_words(self):
    """Add deletion paths to the deleted words of this page.

    The transkription field is read from the first existing file among
    self.svg_file and self.source (in this order). If neither of them
    exists, nothing is done.

    A word is processed if it is marked as deleted itself or if at least
    one of its word parts is marked as deleted.
    """
    # Use the first candidate that really exists: a svg_file that is set but
    # missing must not shadow an existing source file.
    svg_file = next((candidate for candidate in (self.svg_file, self.source)
                     if candidate is not None and isfile(candidate)), None)
    if svg_file is None:
        return
    transkription_field = TranskriptionField(svg_file)
    deleted_words = [word for word in self.words
                     if word.deleted or True in [part.deleted for part in word.word_parts]]
    for word in deleted_words:
        word.add_deletion_paths(self.word_deletion_paths,
                                tr_xmin=transkription_field.xmin,
                                tr_ymin=transkription_field.ymin)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
    """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
    or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
    [optional: instantiation depends on the fulfilment of a status_contains
    and/or on the selection of some words by a word_selection_function].

    Args:
        xml_file (str): name of the xml file (a page file or a manuscript file).
        status_contains (str): if not empty, only pages of a manuscript file
            whose @status contains this string are instantiated.
        status_not_contain (str): if not empty, only pages of a manuscript file
            whose @status does not contain this string are instantiated.
        word_selection_function: (optional) function that is called with the words
            of a page; the page is only returned if the result is not empty.

    Returns:
        list: the instantiated pages; an empty list if the file has no
        'metadata/type' entry or its type is neither of the two known types.
    """
    source_tree = ET.parse(xml_file)
    type_node = source_tree.getroot().find('metadata/type')
    # find() returns None if there is no such node: treat this like an unknown type.
    file_type = type_node.text if type_node is not None else None
    if file_type == FILE_TYPE_SVG_WORD_POSITION:
        page = cls(xml_file)
        if word_selection_function is None or len(word_selection_function(page.words)) > 0:
            return [page]
        return []
    if file_type == FILE_TYPE_XML_MANUSCRIPT:
        status_conditions = []
        if status_contains != '':
            status_conditions.append('contains(@status, "{0}")'.format(status_contains))
        if status_not_contain != '':
            status_conditions.append('not(contains(@status, "{0}"))'.format(status_not_contain))
        xpath = '//page/@output'
        if len(status_conditions) > 0:
            xpath = '//page[' + ' and '.join(status_conditions) + ']/@output'
        pages = []
        for xml_source_file in source_tree.xpath(xpath):
            # Output files that do not exist are skipped. The status filters
            # apply to the manuscript file only and are not passed on.
            if isfile(xml_source_file):
                pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
        return pages
    return []
@classmethod
def get_semantic_dictionary(cls):
    """Create the semantic dictionary of this class as specified by SemanticClass.

    Returns:
        the result of cls.return_dictionary_after_updating_super_classes for a
        dictionary that maps cls.CLASS_KEY to the class dictionary and
        cls.PROPERTIES_KEY to the property descriptions.
    """
    class_dict = cls.get_class_dictionary()
    # Properties that are described by their class and a cardinality of 1.
    single_valued_properties = (
        ('number', str),
        ('faksimile_image', FaksimileImage),
        ('orientation', str),
        ('svg_image', SVGImage),
    )
    properties = {}
    for property_key, property_class in single_valued_properties:
        properties[property_key] = {'class': property_class, 'cardinality': 1}
    text_field_property = cls.create_semantic_property_dictionary(
        'text_field', TextField, cardinality=1,
        name='pageIsOnTextField', label='page is on text field',
        comment='Relates a page to the text field on a faksimile image.')
    properties.update(text_field_property)
    # Properties that are described as lists.
    for list_key in ('lines', 'words', 'word_deletion_paths', 'word_insertion_marks'):
        properties.update(cls.create_semantic_property_dictionary(list_key, list))
    dictionary = {}
    dictionary[cls.CLASS_KEY] = class_dict
    dictionary[cls.PROPERTIES_KEY] = properties
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def init_node_objects(self):
    """Instantiate the objects of this page from the nodes of its page_tree.

    Sets word_insertion_marks, words, mark_foreign_hands, text_connection_marks,
    line_numbers, lines, writing_processes and word_deletion_paths. If the page
    has a faksimile image and a text field, init_word(self) is called on all
    words, marks of foreign hands and text connection marks. Finally, each word
    insertion mark with a line number greater than -1 gets its line assigned.
    """
    tree = self.page_tree
    root = tree.getroot()
    self.word_insertion_marks = [WordInsertionMark(wim_node=wim_node)
                                 for wim_node in root.xpath('//' + WordInsertionMark.XML_TAG)]
    # Only the 'word' children of the root node are instantiated here.
    self.words = [Word.create_cls(word_node) for word_node in root.xpath('./word')]
    self.mark_foreign_hands = [MarkForeignHands.create_cls(node)
                               for node in root.xpath('//' + MarkForeignHands.XML_TAG)]
    self.text_connection_marks = [TextConnectionMark.create_cls(node)
                                  for node in root.xpath('//' + TextConnectionMark.XML_TAG)]
    self.line_numbers = [LineNumber(xml_text_node=line_number_node)
                         for line_number_node in root.xpath('//' + LineNumber.XML_TAG)]
    # The lines are created from the same nodes as the line numbers.
    self.lines = [Line.create_cls_from_node(node=line_number_node)
                  for line_number_node in root.xpath('//' + LineNumber.XML_TAG)]
    # The writing processes need the words, which therefore have to be set first.
    self.writing_processes = [WritingProcess.create_writing_process_from_xml(node, self.words)
                              for node in tree.xpath('//' + WritingProcess.XML_TAG)]
    self.word_deletion_paths = [WordDeletionPath.create_cls(node, self)
                                for node in tree.xpath('//' + WordDeletionPath.XML_TAG)]
    has_faksimile_data = self.faksimile_image is not None and self.text_field is not None
    if has_faksimile_data:
        word_like_objects = self.words + self.mark_foreign_hands + self.text_connection_marks
        for simple_word in word_like_objects:
            simple_word.init_word(self)
    for insertion_mark in self.word_insertion_marks:
        if insertion_mark.line_number > -1:
            lines_with_matching_id = [line for line in self.lines
                                      if line.id == insertion_mark.line_number]
            insertion_mark.line = lines_with_matching_id[0]
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=None):
    """Update word ids and attach them to page.page_tree.

    If the page is locked, 'locked' is printed and nothing is changed. Otherwise
    all word, mark-foreign-hands and text-connection-mark nodes are removed from
    page_tree; then the words, marks of foreign hands and text connection marks
    of the page get new ids (their index in the corresponding list) and are
    attached to page_tree again.

    Args:
        update_function_on_word: (optional) a function or a list/tuple of functions
            that are called with each word before it is attached. Entries that
            are not callable are ignored.
        include_special_words_of_type: (optional) list of classes (MarkForeignHands,
            TextConnectionMark): the update functions are also applied to the
            objects of the classes listed here.
    """
    if self.is_locked():
        print('locked')
        return
    if include_special_words_of_type is None:
        include_special_words_of_type = []
    if isinstance(update_function_on_word, (list, tuple)):
        candidate_functions = list(update_function_on_word)
    else:
        candidate_functions = [update_function_on_word]
    # Check each function on its own: checking the list as a whole is never
    # true and would silently skip every update.
    update_functions = [func for func in candidate_functions if callable(func)]

    def apply_update_functions(item):
        for func in update_functions:
            func(item)

    outdated_nodes = self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG
                                          + '|.//' + TextConnectionMark.XML_TAG)
    for node in outdated_nodes:
        node.getparent().remove(node)
    for index, word in enumerate(self.words):
        word.id = index
        apply_update_functions(word)
        word.attach_word_to_tree(self.page_tree)
    special_words = (
        (self.mark_foreign_hands, MarkForeignHands),
        (self.text_connection_marks, TextConnectionMark),
    )
    for items, item_type in special_words:
        is_included = item_type in include_special_words_of_type
        for index, item in enumerate(items):
            item.id = index
            if is_included:
                apply_update_functions(item)
            item.attach_word_to_tree(self.page_tree)
def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
    """Update the data source of page.

    Writes self.faksimile_svgFile to the 'file' attribute of the first
    'data-source' node of page_tree; the node is created as a child of the
    root node if there is none.

    Args:
        faksimile_svgFile: (optional) if not None, replaces self.faksimile_svgFile
            before the node is updated.
        xml_correction_file: (optional) if not None, it is written to the
            'xml-corrected-words' attribute of the node.
    """
    if faksimile_svgFile is not None:
        self.faksimile_svgFile = faksimile_svgFile
    existing_nodes = self.page_tree.xpath('.//data-source')
    if len(existing_nodes) > 0:
        data_node = existing_nodes[0]
    else:
        data_node = ET.SubElement(self.page_tree.getroot(), 'data-source')
    data_node.set('file', self.faksimile_svgFile)
    if xml_correction_file is not None:
        data_node.set('xml-corrected-words', xml_correction_file)
def update_line_number_area(self, transkription_field, svg_tree=None):
    """Determines the width of the area where the line numbers are written in the page.source file.

    A reference line number is looked up among the text nodes of svg_tree. On a
    verso page its x position is passed to
    transkription_field.add_line_number_area_width. Otherwise, if self.svg_file
    exists, the width of the path symbol found at the position of the line
    number in that file is added to the x position first.

    Nothing is done if the page has less than two line numbers or if no
    matching text node is found.

    Args:
        transkription_field: the transkription field that receives the width.
        svg_tree: (optional) parsed svg tree; if None, self.source is parsed.
    """
    THRESHOLD = 0.4
    if svg_tree is None:
        svg_tree = ET.parse(self.source)
    if len(self.line_numbers) <= 1:
        return
    is_verso = transkription_field.is_page_verso()
    # On verso pages the tenth line number is the reference, but only if it
    # exists: index 9 requires at least 10 line numbers.
    if is_verso and len(self.line_numbers) > 9:
        line_number = self.line_numbers[9]
    else:
        line_number = self.line_numbers[1]
    ln_nodes = [item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)
                if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)
                and LineNumber.IS_A_LINE_NUMBER(item)
                and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id]
    if len(ln_nodes) == 0:
        return
    matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
    if is_verso:
        transkription_field.add_line_number_area_width(matrix.getX())
        return
    if self.svg_file is None or not isfile(self.svg_file):
        return
    svg_path_tree = ET.parse(self.svg_file)
    # XPath needs a prefix for the default namespace: it is mapped to 'ns'.
    namespaces = {k if k is not None else 'ns': v
                  for k, v in svg_path_tree.getroot().nsmap.items()}
    svg_x = matrix.getX()
    svg_y = self.line_numbers[1].bottom + transkription_field.ymin
    use_nodes = svg_path_tree.xpath(
        '//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'
        .format(svg_x-THRESHOLD, svg_x+THRESHOLD, svg_y-THRESHOLD, svg_y+THRESHOLD),
        namespaces=namespaces)
    if len(use_nodes) == 0:
        return
    symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
    d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id),
                                   namespaces=namespaces)
    if len(d_strings) > 0 and d_strings[0] != '':
        path = parse_path(d_strings[0])
        xmin, xmax, _, _ = path.bbox()
        width = xmax - xmin
        transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
    """Set the page type (verso or recto) of the page and write it to page_tree.

    The type is stored in self.page_type and in the 'pageType' attribute of
    the root node of page_tree.

    Args:
        transkription_field: (optional) the transkription field that decides
            whether the page is verso; if None, it is created from self.source.

    Raises:
        FileNotFoundError: if no transkription_field is passed and self.source
            is None or not an existing file.
    """
    if transkription_field is None:
        has_source = self.source is not None and isfile(self.source)
        if not has_source:
            raise FileNotFoundError('Page does not have a source!')
        transkription_field = TranskriptionField(self.source)
    if transkription_field.is_page_verso():
        self.page_type = Page.PAGE_VERSO
    else:
        self.page_type = Page.PAGE_RECTO
    root_node = self.page_tree.getroot()
    root_node.set('pageType', self.page_type)
def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False):
    """Update styles of words and add them to their transkription_positions.

    Word parts are processed recursively. The style of a transkription position
    is determined by the style class of its first positional word part.

    Args:
        words: (optional) the words to update; if None, self.words are used.
        manuscript: (optional) passed on to Style.create_cls; if add_to_parents
            is set, the styles are also passed to manuscript.update_styles.
        add_to_parents: Add styles also to word (and if not None to manuscript).
        partition_according_to_styles: Partition word if its transkription_positions have different styles.
        create_css: passed on to Style.remove_irrelevant_style_keys (as
            extended_styles) and to Style.create_cls; if set, styles are
            additionally distinguished by the deletion status of the word.
    """
    style_dictionary = {}
    if words is None:
        words = self.words
    for word in words:
        if len(word.word_parts) > 0:
            self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,
                               add_to_parents=add_to_parents,
                               partition_according_to_styles=partition_according_to_styles)
        for transkription_position in word.transkription_positions:
            if len(transkription_position.positional_word_parts) == 0:
                continue
            style_class = transkription_position.positional_word_parts[0].style_class
            # The last font key with a stage mapping determines the writing process.
            writing_process_id = -1
            for font_key in style_class.split(' '):
                if font_key in self.fontsizekey2stage_mapping.keys():
                    writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
            relevant_style_class = Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css)
            style_class_key = (relevant_style_class, writing_process_id)
            if create_css:
                # Deleted and undeleted words get different styles.
                css_key = (style_class_key, word.deleted)
                if style_dictionary.get(css_key) is None:
                    if len(word.deletion_paths) > 0:
                        color = word.deletion_paths[0].style.color
                    else:
                        color = None
                    style_dictionary[css_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,
                                                                 create_css=create_css, deletion_color=color,
                                                                 writing_process_id=style_class_key[1])
                transkription_position.style = style_dictionary[css_key]
            else:
                if style_dictionary.get(style_class_key) is None:
                    new_style = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css)
                    style_dictionary[style_class_key] = new_style
                    style_dictionary[style_class_key].writing_process_id = style_class_key[1]
                transkription_position.style = style_dictionary[style_class_key]
            if add_to_parents and transkription_position.style not in word.styles:
                word.styles.append(transkription_position.style)
        if partition_according_to_styles:
            word.split_according_to_status('style', splits_are_parts=True)
    if add_to_parents and manuscript is not None:
        manuscript.update_styles(*style_dictionary.values())
Event Timeline
Log In to Comment