Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F60644244
process_files.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, May 1, 16:35
Size
22 KB
Mime Type
text/x-python
Expires
Fri, May 3, 16:35 (2 d)
Engine
blob
Format
Raw Data
Handle
17389297
Attached To
rNIETZSCHEPYTHON nietzsche-python
process_files.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract information from all text svg files in directory.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
import
csv
import
getopt
import
re
import
sys
from
os
import
listdir
,
sep
,
path
from
os.path
import
isfile
,
isdir
,
dirname
import
lxml.etree
as
ET
if
dirname
(
__file__
)
not
in
sys
.
path
:
sys
.
path
.
append
(
dirname
(
__file__
))
from
convertPDF2SVG4Web
import
Converter
from
datatypes.page_creator
import
PageCreator
from
datatypes.transkriptionField
import
TranskriptionField
from
extractWordPosition
import
Extractor
from
fix_missing_glyphs
import
fix_missing_glyphs
from
util
import
update_svgposfile_status
,
update_manuscript_file
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# The functions of this module skip their coloured console output while this flag is True.
UNITTESTING = False
# Page status strings; MyCSVHandler.process_files passes them to page_has_status in order
# to find pages whose recorded status is one of these two warnings.
WARN_MISSING_USE_NODE = f'with warnings:{PageCreator.WARNING_MISSING_USE_NODE4PWP}:'
WARN_MISSING_GLYPH = f'with warnings:{PageCreator.WARNING_MISSING_GLYPH_ID4WIM}:'
class
MyCSVHandler
:
"""This class can be used to handle csv files that contain information about the tile and layout of the svg files.
"""
ENTRY_KEY_PAGE
=
'pdf_page_number'
ENTRY_KEY_FILE
=
'svg_source_file'
ENTRY_KEY_TITLE
=
'manuscript_title'
ENTRY_KEY_PAGE_NAMES
=
'page_names'
ENTRY_KEY_MARG_PAGE
=
'marginals_page_entry'
MANUSCRIPT_AE_REMOVAL
=
re
.
compile
(
'[a-e]'
)
MANUSCRIPT_KEY
=
'Ms'
MANUSCRIPT_PATTERN
=
re
.
compile
(
r'(\d+)(>\s)(.*)'
)
MANUSCRIPT_TITLE_EXTENSION
=
'Mp'
MANUSCRIPT_TITLE_PARTS
=
re
.
compile
(
r'([I-X]+[a-e]*)(\s)(\d+\w*)(/\d+\w*)*'
)
MARGINALS_PAGE
=
re
.
compile
(
r'([I-X]+[a-e]*)(\s)(\d+\w*)(\s)(Marg)'
)
REMOVE_NONNUMERIC
=
re
.
compile
(
'\D'
)
def
__init__
(
self
,
csv_file_name
,
pdf_file
,
svg_dir
,
title
=
None
):
self
.
csv_entries
=
[]
self
.
pdf_file
=
pdf_file
self
.
svg_dir
=
svg_dir
self
.
title
=
title
self
.
_init_csv_entries
(
csv_file_name
)
def
_init_csv_entries
(
self
,
csv_file_name
):
"""Init csv entries by reading the csv_file.
"""
with
open
(
csv_file_name
,
newline
=
''
)
as
csvfile
:
reader
=
csv
.
DictReader
(
csvfile
)
list_of_svg_files
=
[
svg_file
for
svg_file
in
listdir
(
self
.
svg_dir
)
if
svg_file
.
endswith
(
'.svg'
)
]
marg_entry
=
None
for
row
in
reader
:
ms_string
=
row
[
self
.
MANUSCRIPT_KEY
]
manuscript_match
=
re
.
match
(
self
.
MANUSCRIPT_PATTERN
,
ms_string
)
if
manuscript_match
is
not
None
:
page_number
=
int
(
manuscript_match
.
group
(
1
))
files_matching
=
[
svg_file
for
svg_file
in
list_of_svg_files
\
if
re
.
match
(
rf
'([0]*{page_number})(.svg)'
,
svg_file
.
replace
(
re
.
split
(
r'\d+\.svg'
,
svg_file
)[
0
],
''
))
]
if
len
(
files_matching
)
>
0
:
svg_file
=
files_matching
[
0
]
title_parts
=
re
.
match
(
self
.
MANUSCRIPT_TITLE_PARTS
,
manuscript_match
.
group
(
3
))
marginals_page
=
re
.
match
(
self
.
MARGINALS_PAGE
,
manuscript_match
.
group
(
3
))
if
marginals_page
is
not
None
:
marg_entry
=
{
self
.
ENTRY_KEY_PAGE
:
page_number
,
self
.
ENTRY_KEY_FILE
:
svg_file
}
elif
title_parts
is
not
None
:
title
=
self
.
MANUSCRIPT_AE_REMOVAL
.
sub
(
''
,
title_parts
.
group
(
1
))
manuscript_title
=
f
'{self.MANUSCRIPT_TITLE_EXTENSION} {title}'
entry
=
{
self
.
ENTRY_KEY_PAGE
:
page_number
,
\
self
.
ENTRY_KEY_FILE
:
svg_file
,
\
self
.
ENTRY_KEY_TITLE
:
manuscript_title
,
\
self
.
ENTRY_KEY_PAGE_NAMES
:
[
f
'{title_parts.group(3)}'
]
}
if
title_parts
.
group
(
4
)
is
not
None
:
entry
[
self
.
ENTRY_KEY_PAGE_NAMES
]
.
append
(
title_parts
.
group
(
4
)
.
replace
(
'/'
,
''
))
if
marg_entry
is
not
None
\
and
marg_entry
[
self
.
ENTRY_KEY_PAGE
]
==
page_number
-
1
:
entry
[
self
.
ENTRY_KEY_MARG_PAGE
]
=
marg_entry
marg_entry
=
None
if
self
.
title
is
None
\
or
self
.
title
==
manuscript_title
:
self
.
csv_entries
.
append
(
entry
)
def
process_files
(
self
,
svg_target_dir
,
xml_target_dir
,
error_handler
=
None
)
->
int
:
"""Process files and return exit status.
"""
exit_status
=
0
if
len
(
self
.
csv_entries
)
>
0
:
converter
=
Converter
(
target_dir
=
svg_target_dir
)
extractor
=
Extractor
(
xml_dir
=
xml_target_dir
)
for
entry
in
self
.
csv_entries
:
title
=
entry
[
self
.
ENTRY_KEY_TITLE
]
extractor
.
update_title_and_manuscript
(
title
)
#converter.title = title.replace(' ', '_')
pdf_page_number
=
entry
[
self
.
ENTRY_KEY_PAGE
]
svgfile
=
f
'{self.svg_dir}{sep}{entry[self.ENTRY_KEY_FILE]}'
for
index
,
page_number
in
enumerate
(
entry
[
self
.
ENTRY_KEY_PAGE_NAMES
]):
pdf_name_dictionary
=
{
pdf_page_number
:
title
.
replace
(
' '
,
'_'
)
+
'_'
+
str
(
page_number
)
+
'_web'
}
multipage_index
=
-
1
\
if
len
(
entry
[
self
.
ENTRY_KEY_PAGE_NAMES
])
==
1
\
else
index
marginals_page
=
None
\
if
not
bool
(
entry
.
get
(
self
.
ENTRY_KEY_MARG_PAGE
))
\
else
f
'{self.svg_dir}{sep}{entry[self.ENTRY_KEY_MARG_PAGE][self.ENTRY_KEY_FILE]}'
try
:
if
page_has_status
(
WARN_MISSING_USE_NODE
,
\
manuscript_file
=
extractor
.
manuscript_file
,
page_number
=
page_number
)
\
or
page_has_status
(
WARN_MISSING_GLYPH
,
\
manuscript_file
=
extractor
.
manuscript_file
,
page_number
=
page_number
):
svg_pos_file
=
get_page_output_file
(
page_number
,
manuscript_file
=
extractor
.
manuscript_file
)
if
svg_pos_file
is
not
None
and
isfile
(
svg_pos_file
):
fix_missing_glyphs
(
svg_pos_file
,
manuscript_file
=
extractor
.
manuscript_file
)
elif
not
is_page_ok
(
manuscript_file
=
extractor
.
manuscript_file
,
page_number
=
page_number
):
exit_status
=
process_file
(
converter
,
extractor
,
svgfile
,
self
.
pdf_file
,
page_number
,
\
pdf_name_dictionary
=
pdf_name_dictionary
,
multipage_index
=
multipage_index
,
\
marginals_page
=
marginals_page
)
except
Exception
as
err
:
print
(
err
)
if
error_handler
is
not
None
:
error_handler
.
record_error
(
svgfile
,
self
.
pdf_file
,
title
,
page_number
,
error
=
err
)
if
not
UNITTESTING
:
print
(
Fore
.
RED
)
print
(
'There was an error ->'
,
err
)
print
(
Style
.
RESET_ALL
)
if
error_handler
is
not
None
:
error_handler
.
write
()
return
exit_status
class MyErrorHandler:
    """This class can be used to handle errors executing extractWordPosition.Extractor.extractAndWriteInformation.

    Errors are kept in an xml tree with root 'error-log' and one 'error' element per
    title/page number pair. An existing ERROR_LOG file is loaded on construction.
    """
    ERROR_LOG = 'error_log.xml'

    def __init__(self):
        self.tree = ET.ElementTree(ET.Element('error-log'))
        if isfile(MyErrorHandler.ERROR_LOG):
            parser = ET.XMLParser(remove_blank_text=True)
            self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser)

    def record_error(self, svgfile, pdffile, title, page_number, error=None):
        """Records an error.

        If there is already a record for title and page_number, its content is replaced,
        so that recording the same page repeatedly does not pile up duplicate child nodes.

        :param svgfile: stored as text of the child 'svgfile'.
        :param pdffile: stored as text of the child 'pdffile'.
        :param title: stored as attribute 'title'.
        :param page_number: stored as attribute 'number' (converted to str).
        :param error: optional exception; its class name becomes the attribute 'type',
            its message (if not empty) the text of the child 'error-msg'.
        """
        number = str(page_number)
        # XPath variables keep quotes in title/number from breaking the expression.
        known_errors = self.tree.xpath('//error[@title=$title and @number=$number]', title=title, number=number)
        if known_errors:
            error_node = known_errors[0]
            for child in list(error_node):
                error_node.remove(child)
            error_node.attrib.pop('type', None)
        else:
            error_node = ET.SubElement(self.tree.getroot(), 'error', attrib={'title': title, 'number': number})
        ET.SubElement(error_node, 'svgfile').text = svgfile
        ET.SubElement(error_node, 'pdffile').text = pdffile
        if error is not None:
            error_type_name = type(error).__name__
            error_node.set('type', str(error_type_name))
            if str(error) != '':
                error_msg = ET.SubElement(error_node, 'error-msg')
                error_msg.text = str(error)
                if str(error_type_name) == 'ExpatError':
                    error_msg.text += '->svgfile is empty!'

    def run(self, title=None, page_number=None, error_type=None):
        """Run all or some errors

        The arguments that are not None select the recorded errors by their attributes
        'title', 'number' and 'type'. Every selected error with a svgfile is processed
        again with process_file; it is removed from the log if the status is lower than 2.
        The log is written afterwards.

        [:return:] exit status (int): the highest status returned by process_file.
        """
        conditions = []
        variables = {}
        if title is not None:
            conditions.append('@title=$title')
            variables['title'] = title
        if page_number is not None:
            conditions.append('@number=$number')
            variables['number'] = str(page_number)
        if error_type is not None:
            conditions.append('@type=$error_type')
            variables['error_type'] = error_type
        xpath = '//error'
        if conditions:
            xpath += '[{}]'.format(' and '.join(conditions))
        exit_status = 0
        # xpath() returns a list, hence removing nodes while looping is safe.
        for error in self.tree.xpath(xpath, **variables):
            error_title = error.get('title')
            error_number = error.get('number')
            svgfiles = error.xpath('./svgfile/text()')
            pdffiles = error.xpath('./pdffile/text()')
            svgfile = svgfiles[0] if len(svgfiles) > 0 else None
            pdffile = pdffiles[0] if len(pdffiles) > 0 else None
            if svgfile is not None:
                converter = Converter(title=error_title)
                extractor = Extractor(title=error_title, compare2pdf=True)
                status = process_file(converter, extractor, svgfile, pdffile, error_number)
                exit_status = max(exit_status, status)
                if status < 2:
                    error.getparent().remove(error)
        self.write()
        return exit_status

    def write(self):
        """Writes error log.
        """
        write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG, script_name=__file__, file_type='xmlErrorLog')
def get_page_output_file(page_number: str, manuscript_file=None, manuscript_tree=None) -> str:
    """Return filename of xml output file for page with page number page_number.

    :param page_number: value of the attribute 'number' of the wanted 'page' node.
    :param manuscript_file: xml file that is parsed if manuscript_tree is None.
    :param manuscript_tree: already parsed tree; takes precedence over manuscript_file.
    :return: the attribute 'output' of the first matching page, or None if there is
        no such page (or the page has no such attribute).
    :raises Exception: if manuscript_tree is None and manuscript_file is None or not a file.
    """
    if manuscript_tree is None:
        if manuscript_file is None or not isfile(manuscript_file):
            msg = f'File {manuscript_file} does not exist!'\
                    if manuscript_file is not None\
                    else 'Please specify either manuscript_file or manuscript_tree'
            raise Exception(msg)
        manuscript_tree = ET.parse(manuscript_file)
    # One lookup with a XPath variable instead of two lookups with an interpolated string.
    pages = manuscript_tree.getroot().xpath('//page[@number=$number]', number=str(page_number))
    if len(pages) > 0:
        return pages[0].get('output')
    return None
def is_page_ok(manuscript_file=None, page_number=None):
    """Tell whether the page is recorded with status 'OK'.

    Delegates to page_has_status with the status string 'OK'.
    """
    expected_status = 'OK'
    return page_has_status(expected_status, manuscript_file=manuscript_file, page_number=page_number)
def page_has_status(status, manuscript_file=None, page_number=None):
    """Returns true if the status of the page equals status.

    :param status: string that is compared with the attribute 'status' of the page.
    :param manuscript_file: xml file containing 'page' nodes with a 'number' attribute.
    :param page_number: value of the attribute 'number' of the wanted page.
    :return: False if manuscript_file is None or not a file, if page_number is None or
        if there is no such page; otherwise the result of the comparison.
    """
    if manuscript_file is None or not isfile(manuscript_file):
        return False
    manuscript_tree = ET.parse(manuscript_file)
    if page_number is None:
        return False
    # One lookup with a XPath variable instead of two lookups with an interpolated string.
    pages = manuscript_tree.getroot().xpath('//page[@number=$number]', number=str(page_number))
    if len(pages) > 0:
        return pages[0].get('status') == status
    return False
def is_svg_ok(manuscript_file=None, page_number=None):
    """Returns true if svgfile contains a valid svg graphic location.

    The page with the given number is looked up in the manuscript file, the file named
    by its attribute 'output' is parsed, and the first '//svg/@file' found there has to
    be an existing file.

    :param manuscript_file: xml file containing 'page' nodes with 'number' and 'output'.
    :param page_number: value of the attribute 'number' of the wanted page.
    :return: False whenever one of these steps fails, True otherwise.
    """
    if manuscript_file is None or not isfile(manuscript_file):
        return False
    manuscript_tree = ET.parse(manuscript_file)
    if page_number is None:
        return False
    pages = manuscript_tree.getroot().xpath('//page[@number=$number]', number=str(page_number))
    if len(pages) == 0:
        return False
    output_file = pages[0].get('output')
    # A page without 'output' attribute yields None, which isfile() would reject with a TypeError.
    if output_file is None or not isfile(output_file):
        return False
    xml_source_tree = ET.parse(output_file)
    svg_files = xml_source_tree.xpath('//svg/@file')
    return len(svg_files) > 0 and isfile(svg_files[0])
def process_file(converter, extractor, svgfile, pdffile, page_number, pdf_name_dictionary=None, multipage_index=-1, marginals_page=None):
    """Processes file.

    Converts the pdf page with converter.pdf2svg, shrinks every converted svg file to its
    transkription field and lets extractor extract and write the information of svgfile.
    If the extraction status is lower than 2 and the extractor has a manuscript file, the
    status of the xml target file is updated ('OK' or extractor.latest_status).

    :param converter: object providing pdf2svg(...) and latest_converted_files.
    :param extractor: object providing get_file_name(...), extractAndWriteInformation(...),
        manuscript_file and latest_status.
    :param svgfile: svg file handed to the extractor.
    :param pdffile: pdf file handed to converter and extractor.
    :param page_number: page number handed to converter and extractor.
    :param pdf_name_dictionary: passed to pdf2svg as name_dictionary.
    :param multipage_index: passed to TranskriptionField and the extractor.
    :param marginals_page: passed to the extractor.

    [:return:] exit status (int): 0 on success, 1 if an extraction returned status 1 and the
        page status was updated, 2 (or the higher extraction status) on failure.
    """
    exit_status = 0
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Processing file {} ...'.format(svgfile))
        print(Style.RESET_ALL)
    if converter.pdf2svg(pdffile, page_number=page_number, name_dictionary=pdf_name_dictionary) != 0:
        # NOTE(review): a non-zero result is taken as a failed conversion, because nothing
        # is extracted in this case -- confirm against Converter.pdf2svg.
        return 2
    for path_svg_file in converter.latest_converted_files:
        transkriptionField = TranskriptionField(path_svg_file, multipage_index=multipage_index)
        transkriptionField.shrink_svg_to_transkription_field()
        xml_target_file = extractor.get_file_name(svgfile, page_number)
        extraction_status = extractor.extractAndWriteInformation(svgfile, xml_target_file=xml_target_file,
                                                                 page_number=page_number, pdfFile=pdffile, svg_file=path_svg_file,
                                                                 record_warnings=True, multipage_index=multipage_index,
                                                                 marginals_page=marginals_page)
        if extraction_status >= 2:
            # Report the failure; MyErrorHandler.run keeps an error in the log only if the
            # status is 2 or higher.
            exit_status = max(exit_status, extraction_status)
        elif extractor.manuscript_file is not None:
            status = 'OK'
            if extraction_status == 1:
                status = extractor.latest_status
                exit_status = max(exit_status, 1)
            update_svgposfile_status(xml_target_file, manuscript_file=extractor.manuscript_file, status=status)
    return exit_status
def update_graphical_svg(converter, svgfile, pdffile, page_number, xml_target_file):
    """Create a new graphical svg file and update xml output file.

    :param converter: object providing get_file_name(...) and pdf2svg(...).
    :param svgfile: not used by this function.
    :param pdffile: pdf file that is converted.
    :param page_number: page number handed to the converter.
    :param xml_target_file: existing xml output file that is rewritten with the new svg file.

    [:return:] exit status (int): 0 if the xml output file has been updated, 2 if
        xml_target_file does not exist or the conversion failed.
    """
    if not isfile(xml_target_file):
        # Nothing can be updated, hence this is not reported as a success.
        return 2
    path_svg_file = converter.get_file_name(pdffile, page_number=page_number)
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(path_svg_file))
        print(Style.RESET_ALL)
    if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) != 0:
        return 2
    transkriptionField = TranskriptionField(path_svg_file)
    transkriptionField.shrink_svg_to_transkription_field()
    page = PageCreator(xml_target_file, svg_file=path_svg_file)
    write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return 0
def usage():
    """Print the help text of this script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to extract information from all text svg files in a directory.

    svgscripts/process_files.py [OPTIONS] <PDFDIR> <TEXT_SVG_DIR>
    svgscripts/process_files.py [OPTIONS] <CSVFILE> <PDFFILE> <TEXT_SVG_DIR>
    svgscripts/process_files.py [OPTIONS] <xmlManuscriptFile>

        <PDFDIR> Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
        <TEXT_SVG_DIR> Directory containing svg files corresponding to pdf files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).

        OPTIONS:
        -h|--help: show help
        -e|--run-error Rerun error cases.
        -g|--check-graphic-svg Check that graphical svg file exists or generate a new svg file.
        -n|--number=pageNumber Use this with OPTION -e|--run-error in order to specify an error case.
        -t|--title=title: title of the manuscript to which all files belong.
        -T|--error-type: error type, use this with OPTION -e|--run-error in order to specify an error case.
        -s|--svg-target-dir=svg-target-dir target directory for path svg files, i.e. svg files that can be displayed on the web.
        -x|--xml-target-dir=xml-target-dir target directory for xml files.

        :return: exit code (int)
    """
    # NOTE: the docstring above is the help text printed by usage().
    check_graphic_svg_exists = False
    error_handler = MyErrorHandler()
    error_type = None
    number = None
    rerun_errors = False
    svg_target_dir = ".{}svg".format(sep)
    title = None
    xml_target_dir = ".{}xml".format(sep)

    try:
        opts, args = getopt.getopt(argv, "hegn:s:t:T:x:",
                                   ["help", "run-error", "check-graphic-svg", "number=", "svg-target-dir=",
                                    "title=", "error-type=", "xml-target-dir="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-e', '--run-error'):
            rerun_errors = True
        elif opt in ('-g', '--check-graphic-svg'):
            check_graphic_svg_exists = True
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-T', '--error-type'):
            error_type = arg
        elif opt in ('-n', '--number'):
            number = arg
        elif opt in ('-s', '--svg-target-dir'):
            svg_target_dir = arg
        elif opt in ('-x', '--xml-target-dir'):
            xml_target_dir = arg

    if rerun_errors:
        return error_handler.run(title=title, page_number=number, error_type=error_type)

    # Find out which directories contain the pdf and the svg files.
    if len(args) == 1 and args[0].endswith('.xml'):
        source_tree = ET.parse(args[0])
        type_node = source_tree.getroot().find('metadata/type')
        # find() returns None for a xml file without 'metadata/type'.
        if type_node is not None and type_node.text == FILE_TYPE_XML_MANUSCRIPT:
            svg_word_file_tree = ET.parse(source_tree.xpath('//page/@output')[0])
            svg_dir = dirname(svg_word_file_tree.xpath('//page/@source')[0])
            pdf_dir = dirname(svg_word_file_tree.xpath('//page/pdf/@file')[0])
        else:
            print('File {} is not of type {}'.format(args[0], FILE_TYPE_XML_MANUSCRIPT))
            usage()
            return 2
    elif len(args) < 1 or\
            (len(args) == 1
             and not (isdir(args[0])
                      and any(pdffile.endswith('pdf') for pdffile in listdir(args[0]))
                      and any(svgfile.endswith('svg') for svgfile in listdir(args[0])))):
        # A single argument has to be a directory holding both the pdf and the svg files.
        print("Please specify both PDFDIR and TEXT_SVG_DIR!")
        usage()
        return 2
    elif len(args) < 2:
        pdf_dir, svg_dir = args[0], args[0]
    elif isdir(args[0]) and isdir(args[1]):
        pdf_dir, svg_dir = args[0], args[1]
        # Accept the directories in reversed order as well.
        if any(svgfile.endswith('pdf') for svgfile in listdir(args[1])):
            pdf_dir, svg_dir = args[1], args[0]
    elif len(args) == 3 \
            and isfile(args[0]) and args[0].endswith('.csv')\
            and isfile(args[1]) and args[1].endswith('.pdf')\
            and isdir(args[2]):
        csv_handler = MyCSVHandler(args[0], args[1], args[2], title=title)
        return csv_handler.process_files(svg_target_dir, xml_target_dir, error_handler)
    else:
        not_existing = args[0] if not isdir(args[0]) else args[1]
        print("ERROR directory {} does not exist!".format(not_existing))
        return 2

    list_of_svg = [svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg')]
    list_of_pdf = [pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf')]
    converter = Converter(target_dir=svg_target_dir, title=title)
    extractor = Extractor(xml_dir=xml_target_dir, title=title, compare2pdf=True)
    exit_status = 0
    for svgfile in list_of_svg:
        pdf_name = svgfile.replace('.svg', '.pdf')
        if pdf_name not in list_of_pdf:
            continue
        # The manuscript title is the leading part of the file name.
        title_match = re.match(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)
        if title_match is None:
            # Skip this file instead of aborting the whole run with an IndexError.
            if not UNITTESTING:
                print(Fore.RED)
                print('Skipping file {}: its name does not start with a manuscript title!'.format(svgfile))
                print(Style.RESET_ALL)
            exit_status = max(exit_status, 2)
            continue
        title = title_match.group(1).replace('_', ' ')
        if extractor.title is None or extractor.title != title:
            extractor.update_title_and_manuscript(title)
        if converter.title is None or converter.title != title:
            converter.title = title.replace(' ', '_')
        if 'page' in svgfile:
            page_number = svgfile.replace('.svg', '').split('page')[1]
        else:
            page_number = svgfile.replace('.svg', '').split('_')[-1]
        pdffile = '{}{}{}'.format(pdf_dir, sep, pdf_name)
        if not check_graphic_svg_exists and not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
            svg_path = '{}{}{}'.format(svg_dir, sep, svgfile)
            try:
                # Keep the worst status: a later success must not hide an earlier failure.
                exit_status = max(exit_status, process_file(converter, extractor, svg_path, pdffile, page_number))
            except Exception as err:
                error_handler.record_error(svg_path, pdffile, title, page_number, error=err)
                if not UNITTESTING:
                    print(Fore.RED)
                    print('There was an error ->', err)
                    print(Style.RESET_ALL)
        elif not is_svg_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
            update_graphical_svg(converter, svgfile, pdffile, page_number, extractor.get_file_name(svgfile, page_number))
    error_handler.write()
    return exit_status
if __name__ == "__main__":
    # Run the command line interface; the return value of main becomes the exit code.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment