Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F62393216
page.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, May 12, 21:30
Size
13 KB
Mime Type
text/x-python
Expires
Tue, May 14, 21:30 (2 d)
Engine
blob
Format
Raw Data
Handle
17642043
Attached To
rNIETZSCHEPYTHON nietzsche-python
page.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata: authorship and contact first, then release information.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__email__ = "christian.steiner@unibas.ch"
__copyright__ = 'University of Basel'
__license__ = "GPL v3"
__status__ = "Development"
__version__ = "0.0.1"
from
lxml
import
etree
as
ET
from
os.path
import
isfile
#from myxmlwriter import write_pretty
from
.class_spec
import
SemanticClass
from
.image
import
Image
from
.word
import
Word
from
.lineNumber
import
LineNumber
from
.word_insertion_mark
import
WordInsertionMark
from
.transkriptionField
import
TranskriptionField
class Page(SemanticClass):
    """
    This class represents a page.

    A page is either loaded from an existing xml file (``xml_source_file``)
    or prepared for writing (``xml_target_file``). If neither is given, only
    the default attribute values are set and no xml tree is created.

    Args:
        xml_source_file (str): name of the xml file to be instantiated.
        xml_target_file (str): name of the xml file to which page info will be written.
        title (str): page title; written to the root node's 'title' attribute
            when the target tree does not already provide a title.
        page_number: page number; stored as ``int`` in ``self.number`` and
            written as 'number' attribute of a newly created tree.
        pdfFile (str): pdf file name, stored in a 'pdf' node ('file' attribute)
            if the tree does not contain one.
        svg_file (str): svg file name, stored in a 'svg' node together with
            width and height if the tree does not contain one.
        extract_transkription_field_only (bool): written (lower-cased) as
            'transkription-field-only' attribute of a newly created tree.

    Raises:
        FileNotFoundError: if ``xml_source_file`` is given but does not exist.
    """
    # Cache for the semantic dictionary; filled once by create_semantic_dictionary.
    class_dictionary = {}

    def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None,
                 pdfFile=None, svg_file=None, extract_transkription_field_only=False):
        # Every attribute gets a default up front so that no code path leaves
        # the instance with missing attributes.
        self.title = title
        self.line_numbers = []
        self.style_dict = {}
        self.sonderzeichen_list = []
        self.letterspacing_list = []
        self.word_insertion_marks = []
        self.words = []
        self.svg_file = None
        self.pdfFile = None
        self.source = None
        self.page_tree = None
        self.width = 0.0
        self.height = 0.0
        self.number = int(page_number) if page_number is not None else -1
        if xml_source_file is not None:
            self._init_from_source_file(xml_source_file, pdfFile, svg_file)
        elif xml_target_file is not None:
            self.svg_file = svg_file
            self.pdfFile = pdfFile
            if isfile(xml_target_file):
                self._init_from_existing_target_file(xml_target_file, title)
            else:
                self._init_new_tree(title, page_number, extract_transkription_field_only)
        self.svg_image = Image(file_name=self.svg_file, height=self.height, width=self.width)\
            if self.svg_file is not None else None
        self.create_semantic_dictionary(Page.class_dictionary)

    def _first_xpath_value(self, xpath, default=None):
        """Return the first result of xpath on the page tree, or default if there is none."""
        results = self.page_tree.xpath(xpath)
        return results[0] if len(results) > 0 else default

    def _read_svg_dimensions(self):
        """Set width and height from the tree's svg node (0.0 if the attribute is missing)."""
        self.width = float(self._first_xpath_value('.//svg/@width', 0.0))
        self.height = float(self._first_xpath_value('.//svg/@height', 0.0))

    def _append_pdf_node(self):
        """Append a 'pdf' node referring to self.pdfFile to the root of the page tree."""
        ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})

    def _append_svg_node(self):
        """Take width/height from a TranskriptionField of self.svg_file and append a 'svg' node."""
        tf = TranskriptionField(self.svg_file)
        self.width = round(tf.documentWidth, 3)
        self.height = round(tf.documentHeight, 3)
        ET.SubElement(self.page_tree.getroot(), 'svg',
                      attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})

    def _init_from_source_file(self, xml_source_file, pdfFile, svg_file):
        """Load the page (words, line numbers, style, svg/pdf info) from an existing xml file."""
        if not isfile(xml_source_file):
            # FileNotFoundError is a subclass of Exception, so existing handlers still match.
            raise FileNotFoundError('File "{}" does not exist!'.format(xml_source_file))
        parser = ET.XMLParser(remove_blank_text=True)
        self.page_tree = ET.parse(xml_source_file, parser)
        root = self.page_tree.getroot()
        self.title = root.get('title')
        # NOTE(review): here number is the raw attribute value (str or None),
        # while __init__ stores an int when only page_number is given.
        self.number = root.get('number')
        self.source = root.get('source')
        self.init_words()
        self.add_style(style_node=root.find('.//style'))
        self.svg_file = self._first_xpath_value('.//svg/@file')
        self._read_svg_dimensions()
        self.pdfFile = self._first_xpath_value('.//pdf/@file')
        # The arguments only fill in what the file does not provide. The new
        # nodes are added to the in-memory tree; nothing is written to disk here.
        if pdfFile is not None and self.pdfFile is None:
            self.pdfFile = pdfFile
            self._append_pdf_node()
        if svg_file is not None and self.svg_file is None:
            self.svg_file = svg_file
            self._append_svg_node()

    def _init_from_existing_target_file(self, xml_target_file, title):
        """Reuse an existing target file: keep its meta data, drop its old content nodes."""
        parser = ET.XMLParser(remove_blank_text=True)
        self.page_tree = ET.parse(xml_target_file, parser)
        root = self.page_tree.getroot()
        self.source = root.get('source')
        if bool(root.get('title')):
            self.title = root.get('title')
        elif title is not None:
            root.set('title', title)
        if self.svg_file is None:
            self.svg_file = self._first_xpath_value('.//svg/@file')
            self._read_svg_dimensions()
        elif len(self.page_tree.xpath('.//svg/@file')) == 0:
            self._append_svg_node()
        else:
            # A svg node already exists: its dimensions take precedence.
            self._read_svg_dimensions()
        if self.pdfFile is None:
            self.pdfFile = self._first_xpath_value('.//pdf/@file')
        elif len(self.page_tree.xpath('.//pdf/@file')) == 0:
            self._append_pdf_node()
        # Remove the nodes of a previous run from the tree.
        for xpath2remove in ['word', 'style', 'freehand', LineNumber.XML_TAG()]:
            for node in self.page_tree.xpath('//' + xpath2remove):
                node.getparent().remove(node)

    def _init_new_tree(self, title, page_number, extract_transkription_field_only):
        """Create a fresh 'page' tree for a target file that does not exist yet."""
        self.page_tree = ET.ElementTree(ET.Element('page'))
        root = self.page_tree.getroot()
        if title is not None:
            root.set('title', title)
        root.set('transkription-field-only', str(extract_transkription_field_only).lower())
        if page_number is not None:
            root.set('number', str(page_number))
        if self.pdfFile is not None:
            self._append_pdf_node()
        if self.svg_file is not None:
            self._append_svg_node()

    def init_line_numbers(self, line_numbers, document_bottom):
        """Init line numbers.

        Builds self.line_numbers from line_numbers by inserting a generated
        LineNumber before the first entry (id 1, starting at top 0), between
        each pair of neighbours (id of the following entry - 1) and after the
        last entry (id of the last entry + 1, ending at document_bottom).
        All resulting line numbers are attached to the page tree.

        Args:
            line_numbers (list): objects providing id, top and bottom.
            document_bottom: bottom value of the generated last line number.
        """
        MINABOVE = 1  # gap kept between a generated line number and its neighbours
        self.line_numbers = []
        if len(line_numbers) > 0:
            first = line_numbers[0]
            self.line_numbers.append(LineNumber(id=1, top=0, bottom=first.top - MINABOVE))
            self.line_numbers.append(first)
            for previous, current in zip(line_numbers, line_numbers[1:]):
                self.line_numbers.append(LineNumber(id=current.id - 1,
                                                    top=previous.bottom + MINABOVE,
                                                    bottom=current.top - MINABOVE))
                self.line_numbers.append(current)
            last = line_numbers[-1]
            self.line_numbers.append(LineNumber(id=last.id + 1,
                                                top=last.bottom + MINABOVE,
                                                bottom=document_bottom))
        for line_number in self.line_numbers:
            line_number.attach_object_to_tree(self.page_tree)

    def init_words(self):
        """Init word insertion marks, words and line numbers from the page tree.
        """
        root = self.page_tree.getroot()
        self.word_insertion_marks = [WordInsertionMark(wim_node=wim_node)
                                     for wim_node in root.xpath('//' + WordInsertionMark.XML_TAG())]
        self.words = [Word.CREATE_WORD(word_node=word_node) for word_node in root.xpath('//word')]
        self.line_numbers = [LineNumber(xml_text_node=line_number_node)
                             for line_number_node in root.xpath('//' + LineNumber.XML_TAG())]
        for index, word in enumerate(self.words):
            for word_insertion_mark in self.word_insertion_marks:
                self.words[index] = word_insertion_mark.attach_and_update_word_if_involved(word)
                # Stop at the first insertion mark that returned a different word.
                if self.words[index] != word:
                    break

    def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
        """Adds a list of classes that are sonderzeichen and a style dictionary to page.

        If style_node is given, the style information is read from its 'class'
        child nodes and replaces the passed arguments. Otherwise, a non-empty
        style_dict is written to a new 'style' node of the page tree.

        Args:
            sonderzeichen_list (list): names of style classes that are sonderzeichen.
            letterspacing_list (list): names of style classes with letterspacing.
            style_dict (dict): maps a style class name to a dict of its attributes.
            style_node: xml node containing 'class' nodes to read the style from.
        """
        # None instead of mutable defaults: otherwise all pages created with
        # defaults would share (and could modify) the very same list/dict objects.
        self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else []
        self.letterspacing_list = letterspacing_list if letterspacing_list is not None else []
        self.style_dict = style_dict if style_dict is not None else {}
        if style_node is not None:
            class_nodes = style_node.findall('.//class')
            self.style_dict = {item.get('name'): {key: value for key, value in item.attrib.items() if key != 'name'}
                               for item in class_nodes}
            self.sonderzeichen_list = [item.get('name') for item in class_nodes
                                       if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family')]
            # NOTE(review): 'letterspacing-list' is looked up on the class nodes here,
            # while the branch below writes it to the style node - confirm this is intended.
            self.letterspacing_list = [item.get('name') for item in class_nodes
                                       if bool(item.get('letterspacing-list'))]
        elif bool(self.style_dict):
            style_node = ET.SubElement(self.page_tree.getroot(), 'style')
            if len(self.sonderzeichen_list) > 0:
                style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
            if len(self.letterspacing_list) > 0:
                style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
            for key in self.style_dict.keys():
                # The class name is stored in the attribute dict itself, i.e. the
                # entries of style_dict contain 'name' after they have been written.
                self.style_dict[key]['name'] = key
                ET.SubElement(style_node, 'class', attrib=self.style_dict[key])

    def get_biggest_fontSize4styles(self, style_set=None):
        """Returns biggest font size from style_dict for a set of style class names.

        Class names that are unknown to style_dict or that have no 'font-size'
        are ignored.

        [:returns:] (float) biggest font size OR 1 if no font size has been found
        """
        if style_set is None:
            style_set = ()
        font_sizes = [float(self.style_dict[key]['font-size'].replace('px', ''))
                      for key in style_set
                      if key in self.style_dict and bool(self.style_dict[key].get('font-size'))]
        return max(font_sizes) if len(font_sizes) > 0 else 1

    def get_line_number(self, y):
        """Returns line number id for element at y.

        The first line number whose top and bottom enclose y (both inclusive) wins.

        [:return:] (int) line number id or -1
        """
        return next((line_number.id for line_number in self.line_numbers
                     if line_number.top <= y <= line_number.bottom), -1)

    def create_semantic_dictionary(self, dictionary):
        """ Creates a semantic dictionary as specified by SemanticClass.

        The dictionary is only filled if it is empty; it then receives the
        entries 'class' and 'properties'.
        """
        if len(dictionary) > 0:
            return
        class_dict = self.get_class_dictionary()
        if self.__class__ == Page:
            class_dict.update({'rdf:subClassOf': 'http://www.knora.org/ontology/0000/information-carrier#Page'})
        properties = {'title': (str, 1), 'number': (str, 1),
                      'line_numbers': (LineNumber, SemanticClass.LIST),
                      'words': (Word, SemanticClass.LIST),
                      'svg_image': (Image, 1),
                      'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST)}
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})

    @staticmethod
    def get_semantic_dictionary():
        """Returns the semantic dictionary stored in Page.class_dictionary.
        """
        return Page.class_dictionary
Event Timeline
Log In to Comment