Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F65533994
extractFaksimilePosition.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Jun 4, 11:50
Size
12 KB
Mime Type
text/x-python
Expires
Thu, Jun 6, 11:50 (2 d)
Engine
blob
Format
Raw Data
Handle
18089198
Attached To
rNIETZSCHEPYTHON nietzsche-python
extractFaksimilePosition.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the positions of the word hovers in a faksimile svg file and write them to an xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import
re
import
getopt
import
sys
from
os
import
sep
,
listdir
,
mkdir
,
path
from
os.path
import
exists
,
isfile
,
isdir
from
datetime
import
datetime
from
lxml
import
etree
as
ET
from
svgpathtools
import
svg2paths2
from
myxmlwriter
import
write_pretty
from
datatypes.faksimile
import
Faksimile
from
datatypes.faksimile_image
import
FaksimileImage
from
datatypes.matrix
import
Matrix
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.word
import
Word
# Module metadata: authorship, licensing and version information of this script.
__author__ = "Christian Steiner"
# The maintainer is the author.
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Extractor:
    """
    This class can be used to extract the word hover positions in a faksimile svg file and write it to a xml file.

    Args:
        [xml_dir (str): target directory; it is created if it does not exist yet.
            If omitted, an existing directory 'xml' in the current working directory is used,
            otherwise no target directory is set.]
        [title (str): title of document]
        [manuscript_file (str): stored on the instance; no method of this class reads it.]
        [extract_transkription_field_only (bool): stored on the instance; no method of this class reads it.]

    NOTE(review): extract_faksimile_word_position calls self.get_text_items,
    self.get_word_object_multi_char_x, self.get_word_from_part_obj and self.add_word.
    None of them is defined in this class and the class has no base class, so they
    have to be provided before the extraction can run.

    TODO change everything!!!
    """

    def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False):
        if xml_dir:
            self.xml_dir = xml_dir
            if not isdir(self.xml_dir):
                mkdir(self.xml_dir)
        else:
            self.xml_dir = 'xml' if isdir('xml') else ''
        # A non-empty xml_dir always ends with a separator; get_file_name relies on
        # path.dirname(self.xml_dir) to get the directory back.
        if self.xml_dir:
            self.xml_dir = self.xml_dir + sep
        self.title = title
        # main() passes these two options to the constructor, so they are accepted and kept.
        self.manuscript_file = manuscript_file
        self.extract_transkription_field_only = extract_transkription_field_only

    def get_page_number(self, file_name, page_number=None):
        """ Returns page number as a string (with leading zero(s) if len(page_number) < 3).

        If no page_number is given and file_name contains digits, the last group of
        consecutive digits in file_name is used. Returns '' if no page number can be found.
        """
        if not page_number:
            digit_groups = re.findall(r'\d+', file_name)
            if digit_groups:
                page_number = digit_groups[-1]
        if not page_number:
            return ''
        # str() first, so that an int page number can be padded as well.
        return str(page_number).rjust(3, '0')

    def get_file_name(self, file_name, page_number=None):
        """Returns the file_name of the target xml file.

        With a title the name is '<title with underscores>_page<page number>.xml',
        otherwise it is the basename of file_name with '.svg' replaced by '.xml'.
        The name is prefixed with the target directory if there is one.
        """
        dir_name = path.dirname(self.xml_dir) + sep if self.xml_dir else ''
        if self.title:
            return dir_name + self.title.replace(' ', '_') + '_page'\
                + self.get_page_number(file_name, page_number=page_number) + '.xml'
        return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml'))

    def _resolve_xml_target_file(self, file_name, page_number=None, xml_target_file=None):
        """Returns the path of the xml target file, derived from file_name if none is given.
        """
        if not xml_target_file:
            xml_target_file = self.get_file_name(file_name, page_number)
        # A bare file name is put into the target directory.
        if self.xml_dir and not path.dirname(xml_target_file):
            xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
        return xml_target_file

    def extract_faksimile_word_position(self, svg_tree, page, transkription_field=None):
        """Extracts faksimile word hover positions.

        Iterates over the text items of svg_tree, collects the text parts of a word
        and hands them to self.add_word whenever a word ends: at a line break, at an
        empty or whitespace-only text node, at a tspan with a letterspacing class
        and after the last text item.

        Args:
            svg_tree: parsed svg tree; must provide getroot().
            page: the page the words are added to; page.letterspacing_list is read.
            transkription_field: passed on to self.get_text_items and Matrix.
        """
        counter = 0
        word_part_obj = []
        endSign = '%'
        last_matrix = None
        MAXBOTTOMDIFF = 5
        MAXXDIFF = 6
        nsmap = svg_tree.getroot().nsmap
        for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
            current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
            # check for line breaks
            is_line_break = (last_matrix is not None and len(word_part_obj) > 0 and (
                Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)
                or abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF
                or abs(current_matrix.getX() - word_part_obj[-1]['x']) > MAXXDIFF))\
                or (len(word_part_obj) > 0
                    and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX())
            if is_line_break:
                endSign = '%'
                if self.get_word_from_part_obj(word_part_obj) != '':
                    # endX still holds the value of the previous text item here.
                    debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(
                        abs(current_matrix.getX() - word_part_obj[-1]['x']),
                        abs(current_matrix.getY() - last_matrix.getY()),
                        str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
                    counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                            matrix=current_matrix.clone_transformation_matrix(),
                                            debug_msg=debug_msg)
                word_part_obj = []
            endX = current_matrix.getX()
            tspan_items = text_item.findall(".//tspan", nsmap)
            if len(tspan_items) < 1:
                # case: <svg><text>TEXT
                if bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text)):
                    word_part_obj.append({"text": text_item.text, "x": current_matrix.getX(),
                                          "y": current_matrix.getY(), "class": text_item.get('class')})
                else:
                    # empty or whitespace-only text ends the current word
                    endSign = text_item.text
                    if self.get_word_from_part_obj(word_part_obj) != '':
                        counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                                matrix=current_matrix.clone_transformation_matrix(),
                                                debug_msg=r'svg/text/\s')
                    word_part_obj = []
                    endSign = '%'
            for tspan_item in tspan_items:
                # case: <svg><text><tspan>TEXT
                endX = current_matrix.add2X(tspan_item.get('x'))
                if tspan_item.text is not None and tspan_item.text != ''\
                        and not bool(re.search(r'^\s*$', tspan_item.text)):
                    y = current_matrix.add2Y(tspan_item.get('y'))
                    word_part_obj.append({"text": tspan_item.text, "x": endX, "y": y,
                                          "class": tspan_item.get('class')})
                    # A tspan without class attribute has no letterspacing class.
                    tspan_classes = (tspan_item.get('class') or '').split(' ')
                    if len(set(page.letterspacing_list) & set(tspan_classes)) > 0:
                        # text_item has letterspacing class
                        endSign = '%'
                        if self.get_word_from_part_obj(word_part_obj) != '':
                            counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                                    matrix=current_matrix.clone_transformation_matrix(),
                                                    debug_msg='tspan with letterspacing')
                        word_part_obj = []
                else:
                    # empty or whitespace-only tspan ends the current word
                    endSign = tspan_item.text
                    if self.get_word_from_part_obj(word_part_obj) != '':
                        counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                                matrix=current_matrix.clone_transformation_matrix(),
                                                debug_msg=r'svg/text/tspan/\s')
                    word_part_obj = []
                    endSign = '%'
            last_matrix = current_matrix
        # add the word that is still pending after the last text item
        if self.get_word_from_part_obj(word_part_obj) != '':
            counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                    matrix=current_matrix.clone_transformation_matrix(),
                                    debug_msg='end of loop')

    def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None):
        """Extracts information about positions of text elements.

        Args:
            file_name (str): svg file that is parsed.
            page_number (str): page number of the page.
            xml_target_file (str): target xml file; derived from file_name if omitted.
            svg_file: not used by this method.

        [:returns:] (svgscripts.Page) the page containing all information.

        Raises:
            FileNotFoundError: if file_name is not an existing file.
        """
        if not isfile(file_name):
            raise FileNotFoundError('"{}" is not an existing file!'.format(file_name))
        xml_target_file = self._resolve_xml_target_file(file_name, page_number, xml_target_file)
        transkription_field = TranskriptionField(file_name)
        svg_tree = ET.parse(file_name)
        # NOTE(review): Page is not among the names this file imports -- the import is missing.
        page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number)
        self.extract_faksimile_word_position(svg_tree, page, transkription_field=transkription_field)
        return page

    def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None):
        """Extracts faksimile word positions for each faksimile page and writes them to xml files.

        Args:
            file_name (str): svg file that is parsed.
            page_number (str): page number of the page.
            xml_target_file (str): target xml file; derived from file_name if omitted.
            svg_file: passed on to extract_information.

        :return: 0 on success (int)

        Raises:
            FileNotFoundError: if file_name is not an existing file.
        """
        if not isfile(file_name):
            raise FileNotFoundError('"{}" is not an existing file!'.format(file_name))
        # Resolve the target here, so that write_pretty gets the same path as extract_information.
        xml_target_file = self._resolve_xml_target_file(file_name, page_number, xml_target_file)
        #TODO
        faksimile = self.extract_information(file_name, page_number=page_number,
                                             xml_target_file=xml_target_file, svg_file=svg_file)
        write_pretty(xml_element_tree=faksimile.page_tree, file_name=xml_target_file,
                     script_name=__file__, file_type='svgWordPosition')
        return 0
def usage():
    """Print the help text of the script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to extract the position of the words in a svg file and write them to a xml file.

    svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file, -o|--only-transkription-field, -p|--page=pageNumber, -t|--title=title, -x|--xml-target-file=xmlOutputFile ] <file>

    svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file, -o|--only-transkription-field, -t|--title=title] <file|dir> ...

    svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file -o|--only-transkription-field]

        -h|--help: show help
        -d|--xml-dir=xmlDir: target directory for the xml output file(s)
        -m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s)
        -o|--only-transkription-field: extract only words that are part of the transkription field.
        -p|--page=pageNumber: page number of the current page. For use with _one_ file only.
        -x|--xml-target-file=xmlOutputFile: target file
        -t|--title=title: title of the manuscript to which the current page(s) belong(s)

    :return: exit code (int)
    """
    xml_dir = ".{}xml".format(sep)
    title = None
    page_number = None
    xml_target_file = None
    manuscript_file = None
    extract_transkription_field_only = False

    try:
        opts, args = getopt.getopt(argv, "hod:m:t:p:x:",
                                   ["help", "only-transkription-field", "xml-dir=", "manuscript-file=",
                                    "title=", "page=", "xml-target-file="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        # Options without any positional argument are answered with the help text as well.
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-o', '--only-transkription-field'):
            extract_transkription_field_only = True
        elif opt in ('-d', '--xml-dir'):
            xml_dir = arg
        elif opt in ('-m', '--manuscript-file'):
            manuscript_file = arg
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-x', '--xml-target-file'):
            xml_target_file = str(arg)

    files_to_process = []
    for arg in args:
        if isfile(arg):
            files_to_process.append(arg)
        elif isdir(arg):
            # listdir returns bare names: prefix them with the directory, otherwise
            # the files are not found when they are processed below.
            for name in sorted(listdir(arg)):
                candidate = path.join(arg, name)
                if name.endswith('.svg') and isfile(candidate):
                    files_to_process.append(candidate)
        else:
            print("'{}' does not exist!".format(arg))
            return 2
    if len(files_to_process) < 1:
        usage()
        return 2
    if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file)):
        print("ERROR: too many input files: option --page and --xml-target-file presuppose one input file!")
        usage()
        return 2

    extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file,
                          extract_transkription_field_only=extract_transkription_field_only)
    for svg_file_name in files_to_process:
        extractor.extractAndWriteInformation(svg_file_name, page_number=page_number,
                                             xml_target_file=xml_target_file)
    return 0
if __name__ == "__main__":
    # Run the command line interface; the return value of main is the exit status.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment