Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F60564770
faksimile.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, May 1, 03:24
Size
11 KB
Mime Type
text/x-python
Expires
Fri, May 3, 03:24 (2 d)
Engine
blob
Format
Raw Data
Handle
17375553
Attached To
rNIETZSCHEPYTHON nietzsche-python
faksimile.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a faksimile page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__
=
"Christian Steiner"
__maintainer__
=
__author__
__copyright__
=
'University of Basel'
__email__
=
"christian.steiner@unibas.ch"
__status__
=
"Development"
__license__
=
"GPL v3"
__version__
=
"0.0.1"
import
re
from
lxml
import
etree
as
ET
from
os
import
path
from
os.path
import
isdir
,
isfile
,
sep
,
basename
from
svgpathtools.parser
import
parse_path
from
.faksimile_image
import
FaksimileImage
from
.matrix
import
Matrix
from
.text_field
import
TextField
from
.word_position
import
WordPosition
class FaksimilePage:
    """
    This class represents a faksimile page.

    Args:
        xml_source_file (str): name of the xml file that will be instantiated.
        xml_target_file (str): name of the xml file to which page info will be written.
        title (str): title that is stored on the page.
        page_number (str): page number that is stored on the page.
        svg_source_file (str): name of the svg file that is stored on the page.
        faksimile_image (FaksimileImage): image that will be attached to the page tree.
        text_field (TextField): text field that will be attached to the page tree.
    """
    XML_TAG = 'faksimile-page'

    def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None,
                 svg_source_file=None, faksimile_image=None, text_field=None):
        xml_file = xml_source_file if xml_source_file is not None else xml_target_file
        self.title = title
        self.page_number = page_number
        self.xml_file = xml_file
        # Define the dimensions unconditionally, so that they also exist on pages
        # that are not instantiated from an existing xml file.
        self.width = 0.0
        self.height = 0.0
        if xml_file is not None and isfile(xml_file):
            parser = ET.XMLParser(remove_blank_text=True)
            self.page_tree = ET.parse(xml_file, parser)
            root = self.page_tree.getroot()
            # An existing file takes precedence over the title/page_number arguments.
            self.title = root.get('title')
            self.page_number = root.get('page-number')
            if bool(root.get('width')):
                self.width = float(root.get('width'))
            if bool(root.get('height')):
                self.height = float(root.get('height'))
        else:
            self.page_tree = ET.ElementTree(ET.Element(self.XML_TAG))
            root = self.page_tree.getroot()
            if title is not None:
                root.set('title', title)
            if page_number is not None:
                root.set('page-number', str(page_number))

        if xml_target_file is not None:
            # Old word positions are dropped before the new content is attached.
            self.remove_tags_from_page_tree([WordPosition.FAKSIMILE])
            if svg_source_file is not None:
                root.set('svg-source-file', svg_source_file)
            if faksimile_image is not None:
                faksimile_image.attach_object_to_tree(self.page_tree)
            if text_field is not None:
                text_field.attach_object_to_tree(self.page_tree)

        # The attributes are always read back from the tree, so that they reflect
        # what has been parsed or attached above.
        self.svg_source_file = root.get('svg-source-file')
        image_node = root.find('.//' + FaksimileImage.XML_TAG)
        self.faksimile_image = FaksimileImage(node=image_node) if image_node is not None else None
        text_field_node = root.find('.//' + TextField.XML_TAG)
        self.text_field = TextField(node=text_field_node) if text_field_node is not None else None
        self.word_positions = [WordPosition(node=node)
                               for node in root.findall('.//' + WordPosition.FAKSIMILE)]

    def append_word_position(self, word_position):
        """Appends word_position to word_positions and attaches it to page_tree.
        """
        self.word_positions.append(word_position)
        word_position.attach_object_to_tree(self.page_tree)

    @classmethod
    def get_faksimile_pages(cls, svg_file, page_number='') -> list:
        """Creates and returns the faksimile pages contained in a svg_file as a list.

        Args:
            svg_file (str): name of the svg file that will be parsed.
            page_number (str): only rects with an id that ends with page_number are used.

        Returns:
            list: the FaksimilePage objects created by GET_FAKSIMILEPAGES.
        """
        svg_tree = ET.parse(svg_file)
        # xpath does not accept the prefix None, hence the default namespace is mapped to 'ns'.
        namespaces = {k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items()}
        return cls.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces, page_number=page_number)

    @staticmethod
    def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='') -> list:
        """Creates and returns the faksimile pages contained in a svg_tree as a list.

        For each rect with an id that starts with the title derived from the name
        of the svg file and that ends with page_number, a text field and a page are
        created. The titled rects and paths that get_paths_inside_rect selects for
        this text field are appended to the page as word positions.

        Args:
            svg_tree: parsed svg tree.
            namespaces (dict): prefix to namespace mapping used for xpath, derived from the
                root of svg_tree if None.
            page_number (str): only rects with an id that ends with page_number are used.

        Returns:
            list: one FaksimilePage per matching rect.
        """
        THRESHOLD_X = 10
        root = svg_tree.getroot()
        nsmap = root.nsmap
        if namespaces is None:
            namespaces = {k if k is not None else 'ns': v for k, v in nsmap.items()}
        source_file_name = svg_tree.docinfo.URL
        image = FaksimileImage.CREATE_IMAGE(root.find('.//image', nsmap), source_file_name)
        xml_dir = '.{}xml'.format(sep)
        title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name))
        if re.match(r'.*-\d+[a-z]$', title_string):
            title_string = re.sub(r'-\d+[a-z]$', '', title_string)
        title = title_string.replace('-', ' ')
        id_suffix = str(page_number)
        word_xpaths = ['//ns:rect/ns:title', '//ns:path/ns:title']
        faksimile_pages = []
        for text_field_rect in root.findall('.//rect', nsmap):
            rect_id = text_field_rect.get('id')
            # Rects without an id can not be text fields and are skipped.
            if rect_id is None or not rect_id.startswith(title_string) or not rect_id.endswith(id_suffix):
                continue
            # The text field is positioned relative to the image.
            tf_x = FaksimilePage._float_attribute(text_field_rect, 'x') - image.x
            tf_y = FaksimilePage._float_attribute(text_field_rect, 'y') - image.y
            tf_width = FaksimilePage._float_attribute(text_field_rect, 'width')
            tf_height = FaksimilePage._float_attribute(text_field_rect, 'height')
            transform = text_field_rect.get('transform')
            tf_matrix = Matrix(transform_matrix_string=transform) if bool(transform) else None
            target_file_name = xml_dir + sep + rect_id + '.xml' if isdir(xml_dir) else rect_id + '.xml'
            # The page number is the part of the id after the last ',' or '_', without leading zeros.
            rect_page_number = re.sub(r'.*[,_]', '', rect_id).lstrip('0')
            text_field = TextField(id=rect_id, width=tf_width, height=tf_height, x=tf_x, y=tf_y, matrix=tf_matrix)
            faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,
                                           title=title, page_number=rect_page_number,
                                           faksimile_image=image, text_field=text_field)
            # The selection area is given in svg coordinates, hence the image offset is added again.
            x_min = text_field.xmin + image.x
            x_max = text_field.xmax + image.x - THRESHOLD_X
            y_min = text_field.ymin + image.y
            y_max = text_field.ymax + image.y
            for word_xpath in word_xpaths:
                for title_node in get_paths_inside_rect(svg_tree, word_xpath, x_min, x_max, y_min, y_max,
                                                        text_field.id, namespaces=namespaces):
                    faksimile_page.append_word_position(FaksimilePage._create_word_position(title_node))
            faksimile_pages.append(faksimile_page)
        return faksimile_pages

    @staticmethod
    def _float_attribute(node, attribute_name, default=0.0) -> float:
        """Returns the attribute of node as float, or default if node has no such attribute.
        """
        value = node.get(attribute_name)
        return float(value) if value is not None else default

    @staticmethod
    def _create_word_position(title_node):
        """Creates a word position from the parent (rect or path) of a svg title node.
        """
        shape_node = title_node.getparent()
        if shape_node.tag.endswith('path'):
            x, xmax, y, ymax = parse_path(shape_node.get('d')).bbox()
            width = xmax - x
            height = ymax - y
        else:
            x = FaksimilePage._float_attribute(shape_node, 'x')
            y = FaksimilePage._float_attribute(shape_node, 'y')
            width = FaksimilePage._float_attribute(shape_node, 'width')
            height = FaksimilePage._float_attribute(shape_node, 'height')
        transform = shape_node.get('transform')
        matrix = Matrix(transform_matrix_string=transform) if bool(transform) else None
        # Remove the whitespace before and after punctuation marks.
        text = re.sub(r'(\s(?=[-;:.,?!’–])|(?<=[-;:.,?!’–])\s)', '', title_node.text)
        return WordPosition(id=shape_node.get('id'), text=text, height=height, width=width,
                            x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE)

    def remove_tags_from_page_tree(self, list_of_tags_to_remove):
        """Removes the tags specified in the list from the target tree.
        """
        for xpath2remove in list_of_tags_to_remove:
            for node in self.page_tree.xpath('//' + xpath2remove):
                parent = node.getparent()
                # A node without parent is the root and can not be removed.
                if parent is not None:
                    parent.remove(node)
def get_paths_inside_rect(svg_tree, xpath, x_min, x_max, y_min, y_max, not_id, namespaces=None):
    """Returns a list of all paths selected by xpath that are inside x_min, x_max, y_min, y_max and do not have id == not_id.

    A selected node that is neither a path nor a rect (e.g. a title) is judged by
    its parent, but the selected node itself is returned. Only the upper left
    corner of the rect or of the bounding box of the path is tested, and the
    borders of the area are excluded.

    Args:
        svg_tree: parsed svg tree.
        xpath (str): xpath that selects the candidate nodes.
        x_min (float): left border of the area.
        x_max (float): right border of the area.
        y_min (float): upper border of the area.
        y_max (float): lower border of the area.
        not_id (str): nodes with this id are skipped.
        namespaces (dict): prefix to namespace mapping used for xpath, derived from the
            root of svg_tree if empty or None.

    Returns:
        list: the selected nodes that meet the conditions, in document order of the selection.
    """
    if not namespaces:
        # xpath does not accept the prefix None, hence the default namespace is mapped to 'ns'.
        namespaces = {k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items()}
    paths = []
    for selected_node in svg_tree.xpath(xpath, namespaces=namespaces):
        shape_node = selected_node
        if not shape_node.tag.endswith(('path', 'rect')):
            shape_node = shape_node.getparent()
        if shape_node.tag.endswith('rect'):
            # A missing coordinate is represented by -1.
            x = float(shape_node.get('x')) if bool(shape_node.get('x')) else -1
            y = float(shape_node.get('y')) if bool(shape_node.get('y')) else -1
        elif shape_node.tag.endswith('path') and bool(shape_node.get('d')):
            x, _, y, _ = parse_path(shape_node.get('d')).bbox()
        else:
            # Neither a rect nor a path with path data: there is no position to test.
            continue
        transform = shape_node.get('transform')
        if bool(transform):
            matrix = Matrix(transform_matrix_string=transform)
            # Both new coordinates have to be calculated from the untransformed point:
            # the right hand side is evaluated completely before x and y are assigned.
            x, y = matrix.get_new_x(x=x, y=y), matrix.get_new_y(x=x, y=y)
        if x_min < x < x_max and y_min < y < y_max and shape_node.get('id') != not_id:
            paths.append(selected_node)
    return paths
Event Timeline
Log In to Comment