Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F62599095
process_files.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, May 14, 06:19
Size
15 KB
Mime Type
text/x-python
Expires
Thu, May 16, 06:19 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
17666854
Attached To
rNIETZSCHEPYTHON nietzsche-python
process_files.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract information from all text svg files in a directory.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
import
getopt
import
re
import
sys
from
os
import
listdir
,
sep
,
path
from
os.path
import
isfile
,
isdir
,
dirname
import
lxml.etree
as
ET
# Put the directory holding this script on the module search path (once),
# presumably so that the project modules imported right after this resolve
# when the script is started from a different working directory.
if not any(entry == dirname(__file__) for entry in sys.path):
    sys.path.append(dirname(__file__))
from
convertPDF2SVG4Web
import
Converter
from
datatypes.page
import
Page
from
datatypes.transkriptionField
import
TranskriptionField
from
extractWordPosition
import
Extractor
from
myxmlwriter
import
write_pretty
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
# Module metadata (author, maintainer, licensing and version information).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# When set to True, the colored progress/error messages that this module
# guards with `if not UNITTESTING` are not printed.
UNITTESTING = False
class MyErrorHandler:
    """Keep an XML log of pages whose processing failed and re-run them on demand.

    The log lives in ``ERROR_LOG`` (a path relative to the current working
    directory). Every failure is an ``error`` element with ``title``,
    ``number`` and, when an exception was given, ``type`` attributes plus
    ``svgfile``, ``pdffile`` and optionally ``error-msg`` child elements.
    """
    ERROR_LOG = 'error_log.xml'

    def __init__(self):
        """Load the existing error log or start with an empty ``error-log`` tree."""
        if isfile(MyErrorHandler.ERROR_LOG):
            parser = ET.XMLParser(remove_blank_text=True)
            self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser)
        else:
            self.tree = ET.ElementTree(ET.Element('error-log'))

    def record_error(self, svgfile, pdffile, title, page_number, error=None):
        """Records an error.

        An existing entry with the same title and page number is reused,
        otherwise a new one (including its ``svgfile``/``pdffile`` children)
        is created. If ``error`` is given, its class name is stored in the
        ``type`` attribute and its message replaces any previously stored
        ``error-msg``.

        :param svgfile: path of the text svg file that was being processed.
        :param pdffile: path of the corresponding pdf file.
        :param title: manuscript title; stored as ``str(title)``.
        :param page_number: page number; stored as ``str(page_number)``.
        :param error: the exception that was raised, or None.
        """
        # Attribute values must be strings, and the lookup has to compare the
        # very same strings that get stored.
        title, page_number = str(title), str(page_number)
        # XPath variables keep titles containing quotes from breaking the query.
        matches = self.tree.xpath('//error[@title=$title and @number=$number]',
                                  title=title, number=page_number)
        if matches:
            error_node = matches[0]
        else:
            error_node = ET.SubElement(self.tree.getroot(), 'error',
                                       attrib={'title': title, 'number': page_number})
            ET.SubElement(error_node, 'svgfile').text = svgfile
            ET.SubElement(error_node, 'pdffile').text = pdffile
        if error is None:
            return
        error_name = type(error).__name__
        error_node.set('type', error_name)
        # Drop messages of earlier failures so a page that fails repeatedly
        # does not accumulate stale or duplicate error-msg elements.
        for stale_msg in error_node.findall('error-msg'):
            error_node.remove(stale_msg)
        message = str(error)
        if message != '':
            if error_name == 'ExpatError':
                message += '->svgfile is empty!'
            ET.SubElement(error_node, 'error-msg').text = message

    def run(self, title=None, page_number=None, error_type=None):
        """Run all or some errors

        Only entries matching every filter that is not None are re-run.
        An entry is removed from the log, and the log is written, as soon as
        its re-run returns a status below 2.

        :param title: only re-run errors with this title.
        :param page_number: only re-run errors with this page number.
        :param error_type: only re-run errors with this ``type`` attribute.

        [:return:] exit status (int): the last non-zero status returned by
                   process_file, or 0.
        """
        conditions = []
        variables = {}
        for name, value in (('title', title), ('number', page_number), ('type', error_type)):
            if value is not None:
                conditions.append('@{0}=${0}'.format(name))
                variables[name] = str(value)
        query = '//error'
        if conditions:
            query += '[{}]'.format(' and '.join(conditions))
        exit_status = 0
        # xpath() returns a list, so removing nodes while looping is safe.
        for error in self.tree.xpath(query, **variables):
            svgfiles = error.xpath('./svgfile/text()')
            if not svgfiles:
                # Without an svg file there is nothing to re-run.
                continue
            pdffiles = error.xpath('./pdffile/text()')
            pdffile = pdffiles[0] if pdffiles else None
            logged_title = error.get('title')
            converter = Converter(title=logged_title)
            extractor = Extractor(title=logged_title, extract_transkription_field_only=True, compare2pdf=True)
            status = process_file(converter, extractor, svgfiles[0], pdffile, error.get('number'))
            if status > 0:
                exit_status = status
            if status < 2:
                error.getparent().remove(error)
                self.write()
        return exit_status

    def write(self):
        """Writes error log to ``ERROR_LOG`` using write_pretty.
        """
        write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG,
                     script_name=__file__, file_type='xmlErrorLog')
def is_page_ok(manuscript_file=None, page_number=None):
    """Returns true if page status is 'OK'.

    The first ``page`` element of the manuscript file whose ``number``
    attribute equals ``page_number`` is inspected. The page counts as ok when
    its ``status`` attribute is 'OK' and its ``output`` attribute names an
    existing file.

    :param manuscript_file: path of the xml manuscript file, or None.
    :param page_number: page number to look up, or None.
    :return: True if the page is ok; False otherwise, including when an
             argument is None, the manuscript file does not exist, the page is
             not listed or it has no ``output`` attribute.
    """
    if manuscript_file is None or page_number is None or not isfile(manuscript_file):
        return False
    manuscript_tree = ET.parse(manuscript_file)
    # Evaluate the query once; the XPath variable keeps it valid for any value.
    pages = manuscript_tree.getroot().xpath('//page[@number=$number]', number=str(page_number))
    if not pages:
        return False
    page = pages[0]
    output = page.get('output')
    # isfile(None) would raise a TypeError, so a missing attribute means "not ok".
    return page.get('status') == 'OK' and output is not None and isfile(output)
def is_svg_ok(manuscript_file=None, page_number=None):
    """Returns true if svgfile contains a valid svg graphic location.

    The page is looked up in the manuscript file by its ``number`` attribute.
    The file named by the page's ``output`` attribute is then parsed, and the
    first ``//svg/@file`` value found in it must name an existing file.

    :param manuscript_file: path of the xml manuscript file, or None.
    :param page_number: page number to look up, or None.
    :return: True if the referenced svg file exists; False otherwise,
             including when an argument is None, a file is missing, the page
             is not listed or it has no ``output`` attribute.
    """
    if manuscript_file is None or page_number is None or not isfile(manuscript_file):
        return False
    manuscript_tree = ET.parse(manuscript_file)
    # Evaluate the query once; the XPath variable keeps it valid for any value.
    pages = manuscript_tree.getroot().xpath('//page[@number=$number]', number=str(page_number))
    if not pages:
        return False
    output = pages[0].get('output')
    # isfile(None) would raise a TypeError, so a missing attribute means "not ok".
    if output is None or not isfile(output):
        return False
    svg_files = ET.parse(output).xpath('//svg/@file')
    return len(svg_files) > 0 and isfile(svg_files[0])
def process_file(converter, extractor, svgfile, pdffile, page_number):
    """Convert one pdf page to svg and extract the information of its text svg file.

    If converter.pdf2svg returns 0, the created svg is shrunk to its
    transkription field and extractor.extractAndWriteInformation is run.
    When the extraction status is below 2 and the extractor has a manuscript
    file, that file is updated: status 'OK' for extraction status 0,
    extractor.latest_status for extraction status 1.

    [:return:] exit status (int): 1 if the manuscript file was updated after
               an extraction status of 1, otherwise 0.
    """
    web_svg_file = converter.get_file_name(pdffile, page_number=page_number)
    if not UNITTESTING:
        print(Fore.BLUE + 'Processing file {} ...'.format(svgfile))
        print(Style.RESET_ALL)
    conversion_status = converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=web_svg_file)
    if conversion_status != 0:
        return 0
    TranskriptionField(web_svg_file).shrink_svg_to_transkription_field()
    xml_target_file = extractor.get_file_name(svgfile, page_number)
    extraction_status = extractor.extractAndWriteInformation(
        svgfile,
        xml_target_file=xml_target_file,
        page_number=page_number,
        pdfFile=pdffile,
        svg_file=web_svg_file,
        record_warnings=True,
    )
    if extraction_status >= 2 or extractor.manuscript_file is None:
        return 0
    if extraction_status == 1:
        page_status, exit_status = extractor.latest_status, 1
    else:
        page_status, exit_status = 'OK', 0
    update_manuscript_file(extractor.manuscript_file, page_number, xml_target_file, status=page_status)
    return exit_status
def update_graphical_svg(converter, svgfile, pdffile, page_number, xml_source_file):
    """Create a new graphical svg file and update xml output file.

    The pdf page is converted to svg, the svg is shrunk to its transkription
    field, and a Page built from ``xml_source_file`` and the new svg file is
    written back to ``xml_source_file``.

    :param converter: object providing get_file_name and pdf2svg.
    :param svgfile: name of the text svg file (only used for the progress message).
    :param pdffile: path of the pdf file to convert.
    :param page_number: page number passed to the converter.
    :param xml_source_file: path of the xml output file to update.

    [:return:] exit status (int): 0 if the xml file was updated, 2 if
               ``xml_source_file`` does not exist or the pdf could not be
               converted (converter.pdf2svg returned a non-zero value).
    """
    # Both failure modes report 2 so that callers can tell "nothing was
    # updated" apart from success.
    if not isfile(xml_source_file):
        return 2
    path_svg_file = converter.get_file_name(pdffile, page_number=page_number)
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(svgfile))
        print(Style.RESET_ALL)
    if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) != 0:
        return 2
    TranskriptionField(path_svg_file).shrink_svg_to_transkription_field()
    page = Page(xml_source_file=xml_source_file, svg_file=path_svg_file)
    write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,
                 script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return 0
def update_manuscript_file(manuscript_file, page_number, file_name, status='changed'):
    """Store status and output file of a page in the manuscript file.

    Nothing happens if ``manuscript_file`` does not exist. Otherwise the first
    ``page`` element with a matching ``number`` attribute gets its ``status``
    and ``output`` attributes set. If there is no such element, a new ``page``
    is appended to the ``pages`` element (created on demand), with an ``id``
    one higher than the current number of pages. The file is then rewritten.
    """
    if not isfile(manuscript_file):
        return
    manuscript_tree = ET.parse(manuscript_file, ET.XMLParser(remove_blank_text=True))
    root = manuscript_tree.getroot()
    matching_pages = root.xpath('//page[@number="%s"]' % page_number)
    if len(matching_pages) > 0:
        page_node = matching_pages[0]
        page_node.set('status', status)
        page_node.set('output', file_name)
    else:
        pages_node = root.find('pages')
        if pages_node is None:
            pages_node = ET.SubElement(root, 'pages')
        page_attributes = {
            'id': str(len(pages_node.findall('page')) + 1),
            'number': str(page_number),
            'status': status,
            'output': file_name,
        }
        ET.SubElement(pages_node, 'page', attrib=page_attributes)
    write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file,
                 script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT)
def usage():
    """Print the command line help, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to extract information from all text svg files in a directory.

    svgscripts/process_files.py [OPTIONS] <PDFDIR> <TEXT_SVG_DIR>
    svgscripts/process_files.py [OPTIONS] <xmlManuscriptFile>

        <PDFDIR>        Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
        <TEXT_SVG_DIR>  Directory containing svg files corresponding to pdf files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).

        OPTIONS:
        -h|--help:                              show help
        -e|--run-error                          Rerun error cases.
        -g|--check-graphic-svg                  Check that graphical svg file exists or generate a new svg file.
        -n|--number=pageNumber                  Use this with OPTION -e|--run-error in order to specify an error case.
        -t|--title=title:                       title of the manuscript to which all files belong.
        -T|--error-type:                        error type, use this with OPTION -e|--run-error in order to specify an error case.
        -s|--svg-target-dir=svg-target-dir      target directory for path svg files, i.e. svg files that can be displayed on the web.
        -x|--xml-target-dir=xml-target-dir      target directory for xml files.

        :return: exit code (int)
    """
    # NOTE: the docstring above is the user-facing help printed by usage();
    # developer notes therefore live in comments.
    title = None
    xml_target_dir = ".{}xml".format(sep)
    svg_target_dir = ".{}svg".format(sep)
    error_handler = MyErrorHandler()
    number = None
    rerun_errors = False
    error_type = None
    check_graphic_svg_exists = False

    try:
        opts, args = getopt.getopt(argv, "hegn:s:t:T:x:",
                                   ["help", "run-error", "check-graphic-svg", "number=",
                                    "svg-target-dir=", "title=", "error-type=", "xml-target-dir="])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-e', '--run-error'):
            rerun_errors = True
        elif opt in ('-g', '--check-graphic-svg'):
            check_graphic_svg_exists = True
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-T', '--error-type'):
            error_type = arg
        elif opt in ('-n', '--number'):
            number = arg
        elif opt in ('-s', '--svg-target-dir'):
            svg_target_dir = arg
        elif opt in ('-x', '--xml-target-dir'):
            xml_target_dir = arg

    if rerun_errors:
        return error_handler.run(title=title, page_number=number, error_type=error_type)

    # Work out which directories hold the pdf files and the text svg files.
    if len(args) == 1 and args[0].endswith('.xml'):
        source_tree = ET.parse(args[0])
        type_node = source_tree.getroot().find('metadata/type')
        # A file without metadata/type is not a manuscript file either.
        if type_node is None or type_node.text != FILE_TYPE_XML_MANUSCRIPT:
            print('File {} is not of type {}'.format(args[0], FILE_TYPE_XML_MANUSCRIPT))
            usage()
            return 2
        # The directories are taken from the first page's output file.
        svg_word_file_tree = ET.parse(source_tree.xpath('//page/@output')[0])
        svg_dir = dirname(svg_word_file_tree.xpath('//page/@source')[0])
        pdf_dir = dirname(svg_word_file_tree.xpath('//page/pdf/@file')[0])
    elif len(args) == 1 and not isdir(args[0]):
        # listdir() below would raise on a path that is not a directory.
        print("ERROR directory {} does not exist!".format(args[0]))
        return 2
    elif len(args) < 1 or \
            (len(args) == 1
             and (not any(name.endswith('pdf') for name in listdir(args[0]))
                  or not any(name.endswith('svg') for name in listdir(args[0])))):
        print("Please specify both PDFDIR and TEXT_SVG_DIR!")
        usage()
        return 2
    elif len(args) < 2:
        # A single directory containing both the pdf and the svg files.
        pdf_dir, svg_dir = args[0], args[0]
    elif isdir(args[0]) and isdir(args[1]):
        pdf_dir, svg_dir = args[0], args[1]
        # Accept the directories in either order: whichever contains pdfs is PDFDIR.
        if any(name.endswith('pdf') for name in listdir(args[1])):
            pdf_dir, svg_dir = args[1], args[0]
    else:
        not_existing = args[0] if not isdir(args[0]) else args[1]
        print("ERROR directory {} does not exist!".format(not_existing))
        return 2

    list_of_svg = [svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg')]
    available_pdfs = {pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf')}
    converter = Converter(target_dir=svg_target_dir, title=title)
    extractor = Extractor(xml_dir=xml_target_dir, title=title, extract_transkription_field_only=True, compare2pdf=True)
    exit_status = 0
    for svgfile in list_of_svg:
        pdf_name = svgfile.replace('.svg', '.pdf')
        if pdf_name not in available_pdfs:
            continue
        # The title is derived from the file name and replaces any -t value.
        # NOTE(review): a file name that does not match the pattern raises an
        # IndexError here - confirm whether such files should be skipped.
        title = re.split(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)[1].replace('_', ' ')
        if extractor.title is None or extractor.title != title:
            extractor.update_title_and_manuscript(title)
        if converter.title is None or converter.title != title:
            converter.title = title.replace(' ', '_')
        base_name = svgfile.replace('.svg', '')
        if 'page' in svgfile:
            page_number = base_name.split('page')[1]
        else:
            page_number = base_name.split('_')[-1]
        pdffile = '{}{}{}'.format(pdf_dir, sep, pdf_name)
        if not check_graphic_svg_exists \
                and not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
            svg_path = '{}{}{}'.format(svg_dir, sep, svgfile)
            try:
                status = process_file(converter, extractor, svg_path, pdffile, page_number)
            except Exception as err:
                # Deliberately broad: any failure is logged so the page can be
                # re-run later with -e, and the remaining files still get processed.
                error_handler.record_error(svg_path, pdffile, title, page_number, error=err)
                if not UNITTESTING:
                    print(Fore.RED)
                    print('There was an error ->', err)
                    print(Style.RESET_ALL)
            else:
                # Keep a non-zero status instead of letting a later success
                # overwrite it (same rule as MyErrorHandler.run).
                if status > 0:
                    exit_status = status
        elif not is_svg_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
            update_graphical_svg(converter, svgfile, pdffile, page_number,
                                 extractor.get_file_name(svgfile, page_number))
    error_handler.write()
    return exit_status
if __name__ == "__main__":
    # Run with the command line arguments (without the script name) and hand
    # main's return value to the shell as exit code.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment