Page MenuHomec4science

process_files.py
No OneTemporary

File Metadata

Created
Sat, Apr 27, 08:02

process_files.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract information from all text svg files in a directory.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import csv
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convertPDF2SVG4Web import Converter
from datatypes.image import SVGImage
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.text_field import TextField
from extractWordPosition import Extractor
from fix_missing_glyphs import fix_missing_glyphs
from util import update_svgposfile_status, update_manuscript_file
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# When True, the colored progress and error messages of this module are not printed.
UNITTESTING = False
# Page status strings for which MyCSVHandler.process_files calls fix_missing_glyphs
# instead of processing the page again.
WARN_MISSING_USE_NODE = f'with warnings:{PageCreator.WARNING_MISSING_USE_NODE4PWP}:'
WARN_MISSING_GLYPH = f'with warnings:{PageCreator.WARNING_MISSING_GLYPH_ID4WIM}:'
class MyCSVHandler:
    """Handle csv files that contain information about the title and layout of the svg files.

    Every usable csv row becomes an entry (a dictionary keyed by the
    ENTRY_KEY_* constants) in self.csv_entries; process_files works
    through these entries.
    """
    # Keys of the entry dictionaries stored in self.csv_entries.
    ENTRY_KEY_PAGE = 'pdf_page_number'
    ENTRY_KEY_FILE = 'svg_source_file'
    ENTRY_KEY_TITLE = 'manuscript_title'
    ENTRY_KEY_PAGE_NAMES = 'page_names'
    ENTRY_KEY_MARG_PAGE = 'marginals_page_entry'
    MANUSCRIPT_AE_REMOVAL = re.compile('[a-e]')
    # Name of the csv column that is read.
    MANUSCRIPT_KEY = 'Ms'
    # '<pdf page number>> <description>'
    MANUSCRIPT_PATTERN = re.compile(r'(\d+)(>\s)(.*)')
    MANUSCRIPT_TITLE_EXTENSION = 'Mp'
    MANUSCRIPT_TITLE_PARTS = re.compile(r'([I-X]+[a-e]*)(\s)(\d+\w*)(/\d+\w*)*')
    MARGINALS_PAGE = re.compile(r'([I-X]+[a-e]*)(\s)(\d+\w*)(\s)(Marg)')
    # Raw string: '\D' in a plain string literal is an invalid escape sequence.
    REMOVE_NONNUMERIC = re.compile(r'\D')
    # Leftmost run of digits that directly precedes '.svg' in a file name.
    SVG_FILE_PAGE_NUMBER = re.compile(r'(\d+)\.svg')

    def __init__(self, csv_file_name, pdf_file, svg_dir=None, title=None, createBlanks=False):
        """Read csv_file_name and collect the entries that should be processed.

        :param csv_file_name: csv file with a column named MANUSCRIPT_KEY.
        :param pdf_file: pdf file the csv rows refer to.
        :param svg_dir: directory containing the text svg files (may be None when creating blanks).
        :param title: if set, only entries with this manuscript title are kept.
        :param createBlanks: if True, rows without a matching svg file are kept, too.
        """
        self.createBlanks = createBlanks
        self.csv_entries = []
        self.pdf_file = pdf_file
        self.svg_dir = svg_dir
        self.title = title
        self._init_csv_entries(csv_file_name)

    def _index_svg_files_by_page(self):
        """Return a dictionary that maps page numbers to the first svg file found for them.

        The page number of a file is the integer value of the digits in front
        of '.svg' (leading zeros are ignored). Without a svg_dir there are no
        svg files.
        """
        files_by_page = {}
        if self.svg_dir is None:
            return files_by_page
        for svg_file in listdir(self.svg_dir):
            if not svg_file.endswith('.svg'):
                continue
            number_match = self.SVG_FILE_PAGE_NUMBER.search(svg_file)
            if number_match is not None:
                # setdefault keeps the first file in directory order.
                files_by_page.setdefault(int(number_match.group(1)), svg_file)
        return files_by_page

    def _init_csv_entries(self, csv_file_name):
        """Init csv entries by reading the csv_file.

        A row describing a marginals page is not stored as an entry of its
        own; it is attached to the entry of the directly following pdf page.
        """
        with open(csv_file_name, newline='') as csvfile:
            # Index the svg files once instead of scanning them for every row.
            svg_files_by_page = self._index_svg_files_by_page()
            marg_entry = None
            for row in csv.DictReader(csvfile):
                # Short rows yield None for a missing cell.
                ms_string = row[self.MANUSCRIPT_KEY] or ''
                manuscript_match = self.MANUSCRIPT_PATTERN.match(ms_string)
                if manuscript_match is None:
                    continue
                page_number = int(manuscript_match.group(1))
                svg_file = svg_files_by_page.get(page_number)
                if svg_file is None and not self.createBlanks:
                    continue
                description = manuscript_match.group(3)
                if self.MARGINALS_PAGE.match(description) is not None:
                    marg_entry = { self.ENTRY_KEY_PAGE: page_number, self.ENTRY_KEY_FILE: svg_file }
                    continue
                title_parts = self.MANUSCRIPT_TITLE_PARTS.match(description)
                if title_parts is None:
                    continue
                title = self.MANUSCRIPT_AE_REMOVAL.sub('', title_parts.group(1))
                manuscript_title = f'{self.MANUSCRIPT_TITLE_EXTENSION} {title}'
                entry = { self.ENTRY_KEY_PAGE: page_number,
                          self.ENTRY_KEY_FILE: svg_file,
                          self.ENTRY_KEY_TITLE: manuscript_title,
                          self.ENTRY_KEY_PAGE_NAMES: [ f'{title_parts.group(3)}' ] }
                if title_parts.group(4) is not None:
                    # A second page name ('/<name>') means the pdf page shows two pages.
                    entry[self.ENTRY_KEY_PAGE_NAMES].append(title_parts.group(4).replace('/', ''))
                if marg_entry is not None\
                   and marg_entry[self.ENTRY_KEY_PAGE] == page_number-1:
                    entry[self.ENTRY_KEY_MARG_PAGE] = marg_entry
                    marg_entry = None
                if self.title is None or self.title == manuscript_title:
                    self.csv_entries.append(entry)

    def process_files(self, svg_target_dir, xml_target_dir, error_handler=None) -> int:
        """Process files and return exit status.

        :param svg_target_dir: target directory passed to the Converter.
        :param xml_target_dir: target directory passed to the Extractor.
        :param error_handler: optional handler that records exceptions and is written at the end.
        :return: the highest exit status returned for any page (0 if nothing failed).
        """
        exit_status = 0
        if self.csv_entries:
            converter = Converter(target_dir=svg_target_dir)
            extractor = Extractor(xml_dir=xml_target_dir)
            for entry in self.csv_entries:
                title = entry[self.ENTRY_KEY_TITLE]
                extractor.update_title_and_manuscript(title)
                pdf_page_number = entry[self.ENTRY_KEY_PAGE]
                svgfile = f'{self.svg_dir}{sep}{entry[self.ENTRY_KEY_FILE]}'
                page_names = entry[self.ENTRY_KEY_PAGE_NAMES]
                marginals_page = None
                if bool(entry.get(self.ENTRY_KEY_MARG_PAGE)):
                    marginals_page = f'{self.svg_dir}{sep}{entry[self.ENTRY_KEY_MARG_PAGE][self.ENTRY_KEY_FILE]}'
                for index, page_number in enumerate(page_names):
                    pdf_name_dictionary = { pdf_page_number: title.replace(' ', '_') + '_' + str(page_number) + '_web' }
                    # -1 marks a pdf page that does not show several pages.
                    multipage_index = -1 if len(page_names) == 1 else index
                    try:
                        status = 0
                        if self.createBlanks:
                            svg_pos_file = f'.{sep}xml{sep}' + title.replace(' ', '_') + '_page' + str(page_number) + '.xml'
                            status = process_blank_file(converter, extractor, self.pdf_file, page_number, pdf_name_dictionary,
                                                        pdf_name_dictionary[pdf_page_number], svg_pos_file,
                                                        multipage_index=multipage_index)
                        elif page_has_status(WARN_MISSING_USE_NODE,
                                             manuscript_file=extractor.manuscript_file, page_number=page_number)\
                                or page_has_status(WARN_MISSING_GLYPH,
                                                   manuscript_file=extractor.manuscript_file, page_number=page_number):
                            svg_pos_file = get_page_output_file(page_number, manuscript_file=extractor.manuscript_file)
                            if svg_pos_file is not None and isfile(svg_pos_file):
                                fix_missing_glyphs(svg_pos_file, manuscript_file=extractor.manuscript_file)
                        elif not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
                            status = process_file(converter, extractor, svgfile, self.pdf_file, page_number,
                                                  pdf_name_dictionary=pdf_name_dictionary, multipage_index=multipage_index,
                                                  marginals_page=marginals_page)
                        # Keep the worst status: a later success must not hide an earlier failure.
                        exit_status = max(exit_status, status)
                    except Exception as err:
                        print(err)
                        if error_handler is not None:
                            error_handler.record_error(svgfile, self.pdf_file, title, page_number, error=err)
                        if not UNITTESTING:
                            print(Fore.RED)
                            print('There was an error ->', err)
                            print(Style.RESET_ALL)
        if error_handler is not None:
            error_handler.write()
        return exit_status
class MyErrorHandler:
    """Record errors that occur while processing files in an xml error log and rerun them.

    The log is kept in the file ERROR_LOG; every failed page is an <error>
    node identified by its 'title' and 'number' attributes.
    """
    ERROR_LOG = 'error_log.xml'

    def __init__(self):
        """Load the existing error log or start with an empty one.
        """
        if isfile(MyErrorHandler.ERROR_LOG):
            parser = ET.XMLParser(remove_blank_text=True)
            self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser)
        else:
            self.tree = ET.ElementTree(ET.Element('error-log'))

    def record_error(self, svgfile, pdffile, title, page_number, error=None):
        """Records an error.

        If there is already an error node for title and page_number, its
        old content is replaced instead of being duplicated.

        :param svgfile: text svg file that was processed.
        :param pdffile: pdf file that was processed.
        :param title: manuscript title.
        :param page_number: page number (stored as string).
        :param error: the exception that was raised, if any.
        """
        number = str(page_number)
        # XPath variables keep quotes or brackets in the title from breaking the expression.
        known_errors = self.tree.xpath('//error[@title=$title and @number=$number]', title=title, number=number)
        if known_errors:
            error_node = known_errors[0]
            for stale_child in list(error_node):
                error_node.remove(stale_child)
        else:
            error_node = ET.SubElement(self.tree.getroot(), 'error', attrib={'title': title, 'number': number})
        ET.SubElement(error_node, 'svgfile').text = svgfile
        ET.SubElement(error_node, 'pdffile').text = pdffile
        if error is not None:
            error_type = type(error).__name__
            error_node.set('type', error_type)
            if str(error) != '':
                error_msg = ET.SubElement(error_node, 'error-msg')
                error_msg.text = str(error)
                if error_type == 'ExpatError':
                    error_msg.text += '->svgfile is empty!'

    def run(self, title=None, page_number=None, error_type=None):
        """Run all or some errors

        Only errors matching all given criteria are rerun. An error is removed
        from the log if process_file returns a status below 2.

        [:return:] exit status (int)
        """
        conditions, variables = [], {}
        if title is not None:
            conditions.append('@title=$title')
            variables['title'] = title
        if page_number is not None:
            conditions.append('@number=$number')
            variables['number'] = str(page_number)
        if error_type is not None:
            conditions.append('@type=$error_type')
            variables['error_type'] = error_type
        xpath = '//error'
        if conditions:
            xpath += '[' + ' and '.join(conditions) + ']'
        exit_status = 0
        for error in self.tree.xpath(xpath, **variables):
            error_title = error.get('title')
            error_page_number = error.get('number')
            # Missing or empty nodes count as "not available".
            svgfile = error.findtext('svgfile') or None
            pdffile = error.findtext('pdffile') or None
            if svgfile is not None:
                converter = Converter(title=error_title)
                extractor = Extractor(title=error_title, compare2pdf=True)
                status = process_file(converter, extractor, svgfile, pdffile, error_page_number)
                if status > 0:
                    exit_status = status
                if status < 2:
                    error.getparent().remove(error)
        self.write()
        return exit_status

    def write(self):
        """Writes error log.
        """
        write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG, script_name=__file__, file_type='xmlErrorLog')
def get_page_output_file(page_number: str, manuscript_file=None, manuscript_tree=None) ->str:
    """Look up the xml output file of a page in the manuscript.

    The manuscript is either passed as an already parsed tree or read from
    manuscript_file.

    :return: value of the 'output' attribute of the first page node with the
        given number, or None if there is no such page.
    :raises Exception: if neither a tree nor an existing manuscript file is given.
    """
    if manuscript_tree is None:
        if manuscript_file is None:
            raise Exception('Please specify either manuscript_file or manuscript_tree')
        if not isfile(manuscript_file):
            raise Exception(f'File {manuscript_file} does not exist!')
        manuscript_tree = ET.parse(manuscript_file)
    page_nodes = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)
    if len(page_nodes) == 0:
        return None
    return page_nodes[0].get('output')
def is_page_ok(manuscript_file=None, page_number=None):
    """Tell whether the page is listed with status 'OK' in the manuscript file.
    """
    ok_status = 'OK'
    return page_has_status(ok_status, manuscript_file=manuscript_file, page_number=page_number)
def page_has_status(status, manuscript_file=None, page_number=None):
    """Tell whether the page is listed with exactly the given status in the manuscript file.

    Returns False if the manuscript file does not exist, no page number is
    given or the manuscript contains no page with this number.
    """
    if manuscript_file is None or not isfile(manuscript_file):
        return False
    manuscript_tree = ET.parse(manuscript_file)
    if page_number is None:
        return False
    page_nodes = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)
    if len(page_nodes) == 0:
        return False
    return page_nodes[0].get('status') == status
def is_svg_ok(manuscript_file=None, page_number=None):
    """Returns true if the page's xml output file refers to an existing svg graphic file.

    The page is looked up by number in the manuscript file; its 'output'
    attribute names the xml file whose first '//svg/@file' value is checked.
    Returns False if any of these pieces is missing.
    """
    if manuscript_file is None or not isfile(manuscript_file):
        return False
    manuscript_tree = ET.parse(manuscript_file)
    if page_number is None:
        return False
    page_nodes = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)
    if not page_nodes:
        return False
    output_file = page_nodes[0].get('output')
    # isfile(None) raises a TypeError, so a page without 'output' attribute is checked first.
    if output_file is None or not isfile(output_file):
        return False
    svg_files = ET.parse(output_file).xpath('//svg/@file')
    return len(svg_files) > 0 and isfile(svg_files[0])
def _update_svg_image_according_to_extended_text_field(page, text_field):
    """Give the page's svg image the size of text_field and shrink the svg file accordingly.

    The svg image gets the width, height and text field of text_field and is
    attached to the page tree; afterwards the svg file is shrunk again
    (redo=True) to a transkription field with the position and size of
    text_field.
    """
    svg_image = page.svg_image
    svg_image.width = text_field.width
    svg_image.height = text_field.height
    svg_image.text_field = text_field
    svg_image.attach_object_to_tree(page.page_tree)
    transkription_field = TranskriptionField(svg_image.file_name, multipage_index=page.multipage_index)
    transkription_field.xmin, transkription_field.ymin = text_field.left, text_field.top
    transkription_field.width, transkription_field.height = text_field.width, text_field.height
    transkription_field.shrink_svg_to_transkription_field(redo=True)
def process_blank_file(converter, extractor, pdffile, page_number, pdf_name_dictionary, path_svg_file, xml_target_file, multipage_index):
    """Processes file as blank.

    Converts the pdf page to a web svg file, determines the extended text
    field of that svg file and writes it to the xml file of the page. If
    xml_target_file already exists it is updated, otherwise a new page is
    created and registered with status 'blank'.

    :param path_svg_file: name of the created svg file without directory and extension.
    [:return:] exit status (int): 0 on success, 2 if the conversion failed or produced no svg file.
    """
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + f'Processing page {page_number} of {pdffile} ...')
        print(Style.RESET_ALL)
    if converter.pdf2svg(pdffile, page_number=page_number, name_dictionary=pdf_name_dictionary) != 0:
        return 2
    svg_file = converter.target_dir + sep + path_svg_file + '.svg'
    if not isfile(svg_file):
        return 2
    text_field = get_extended_text_field(svg_file, multipage_index=multipage_index)
    page_exists = isfile(xml_target_file)
    if page_exists:
        # 'Page' is not imported at module level, so this branch used to raise a NameError.
        # NOTE(review): module path assumed from the sibling datatypes imports -- confirm.
        from datatypes.page import Page
        page = Page.create_cls(xml_target_file)
    else:
        page = PageCreator(xml_target_file, title=extractor.title, page_number=page_number, svg_file=svg_file,
                           multipage_index=multipage_index, svg_text_field=text_field)
    _update_svg_image_according_to_extended_text_field(page, text_field)
    write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    if not page_exists:
        # NOTE(review): the status is only set for newly created pages; this nesting was
        # reconstructed from a source without indentation -- confirm against the repository.
        update_svgposfile_status(xml_target_file, manuscript_file=extractor.manuscript_file, status='blank')
    return 0
def get_extended_text_field(svg_file, multipage_index=-1) ->TextField:
    """Get an extended text field, i.e. the text_field and the marginals.

    The transkription field of svg_file is widened to the left and right and
    extended downwards so that it also covers the <use> nodes found beside
    and below it (with a margin of 5).
    """
    field = TranskriptionField(svg_file, multipage_index=multipage_index)
    svg_tree = ET.parse(svg_file)
    namespaces = {}
    for prefix, uri in svg_tree.getroot().nsmap.items():
        # XPath needs a prefix for the default namespace.
        namespaces['ns' if prefix is None else prefix] = uri
    # Lower bound of the area that is searched for <use> nodes.
    if field.second_field is not None:
        max_y = field.second_field.ymin_without_title
    elif field.is_shrunk():
        ys_below_field = [ float(y) for y in svg_tree.xpath(f'//ns:use[@y>{field.ymax}]/@y', namespaces=namespaces) ]
        if ys_below_field:
            max_y = max(ys_below_field)-40
        else:
            max_y = field.documentHeight+100
    else:
        max_y = field.documentHeight-30
    left_xs = [ float(x) for x in svg_tree.xpath(f'//ns:use[@x<{field.xmin} and @y>{field.ymin} and @y <{max_y}]/@x', namespaces=namespaces) ]
    right_xs = [ float(x) for x in svg_tree.xpath(f'//ns:use[@x>{field.xmax} and @y>{field.ymin} and @y <{max_y}]/@x', namespaces=namespaces) ]
    lower_ys = [ float(y) for y in svg_tree.xpath(f'//ns:use[@y>{field.ymax} and @y <{max_y}]/@y', namespaces=namespaces) ]
    xmin = field.xmin
    if left_xs and min(left_xs)-5 < field.xmin:
        xmin = min(left_xs)-5
    xmax = field.xmax
    if right_xs and max(right_xs)+5 > field.xmax:
        xmax = max(right_xs)+5
    ymax = field.ymax
    if lower_ys:
        ymax = max(lower_ys)+5
    return TextField(x=xmin, y=field.ymin, width=xmax-xmin, height=ymax-field.ymin)
def process_file(converter, extractor, svgfile, pdffile, page_number, pdf_name_dictionary=None, multipage_index=-1, marginals_page=None):
    """Processes file.

    Converts the pdf page to web svg file(s), shrinks each of them to its
    transkription field and extracts the information of svgfile to the xml
    file of the page. If the extractor has a manuscript file, the status of
    the xml file is updated ('OK', or the extractor's latest status if the
    extraction returned 1).

    [:return:] exit status (int): 0 on success, 1 if an extraction returned 1,
        2 if the pdf conversion failed, or the extraction status if that is 2 or higher.
    """
    exit_status = 0
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Processing file {} ...'.format(svgfile))
        print(Style.RESET_ALL)
    if converter.pdf2svg(pdffile, page_number=page_number, name_dictionary=pdf_name_dictionary) != 0:
        # Same status as process_blank_file and update_graphical_svg return for a failed conversion.
        return 2
    for path_svg_file in converter.latest_converted_files:
        transkription_field = TranskriptionField(path_svg_file, multipage_index=multipage_index)
        transkription_field.shrink_svg_to_transkription_field()
        xml_target_file = extractor.get_file_name(svgfile, page_number)
        extraction_status = extractor.extractAndWriteInformation(svgfile, xml_target_file=xml_target_file,
                                                                 page_number=page_number, pdfFile=pdffile, svg_file=path_svg_file,
                                                                 record_warnings=True, multipage_index=multipage_index,
                                                                 marginals_page=marginals_page)
        if extraction_status >= 2:
            # Not reported as success, so that MyErrorHandler.run keeps the error in its log.
            exit_status = max(exit_status, extraction_status)
        elif extractor.manuscript_file is not None:
            status = 'OK'
            if extraction_status == 1:
                status = extractor.latest_status
                exit_status = max(exit_status, 1)
            update_svgposfile_status(xml_target_file, manuscript_file=extractor.manuscript_file, status=status)
    return exit_status
def update_graphical_svg(converter, svgfile, pdffile, page_number, xml_target_file):
    """Create a new graphical svg file and update xml output file.

    Nothing is done if xml_target_file does not exist.

    [:return:] exit status (int): 2 if the pdf conversion failed, else 0.
    """
    if not isfile(xml_target_file):
        return 0
    path_svg_file = converter.get_file_name(pdffile, page_number=page_number)
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(path_svg_file))
        print(Style.RESET_ALL)
    conversion_status = converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file)
    if conversion_status != 0:
        return 2
    TranskriptionField(path_svg_file).shrink_svg_to_transkription_field()
    page = PageCreator(xml_target_file, svg_file=path_svg_file)
    write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return 0
def usage():
    """Print the help text of the script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def _dir_contains_pdf_and_svg(directory):
    """Return True if directory is an existing directory containing both pdf and svg files."""
    if not isdir(directory):
        return False
    file_names = listdir(directory)
    return any(file_name.endswith('pdf') for file_name in file_names)\
        and any(file_name.endswith('svg') for file_name in file_names)


def main(argv):
    """This program can be used to extract information from all text svg files in a directory.
    svgscripts/process_files.py [OPTIONS] <PDFDIR> <TEXT_SVG_DIR>
    svgscripts/process_files.py [OPTIONS] <CSVFILE> <PDFFILE> <TEXT_SVG_DIR>
    svgscripts/process_files.py --blank <CSVFILE> <PDFFILE>
    svgscripts/process_files.py [OPTIONS] <xmlManuscriptFile>
        <PDFDIR> Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
        <TEXT_SVG_DIR> Directory containing svg files corresponding to pdf files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
        OPTIONS:
        -h|--help: show help
        -b|--blank Create web svg files and empty xml files.
        -e|--run-error Rerun error cases.
        -g|--check-graphic-svg Check that graphical svg file exists or generate a new svg file.
        -n|--number=pageNumber Use this with OPTION -e|--run-error in order to specify an error case.
        -t|--title=title: title of the manuscript to which all files belong.
        -T|--error-type: error type, use this with OPTION -e|--run-error in order to specify an error case.
        -s|--svg-target-dir=svg-target-dir target directory for path svg files, i.e. svg files that can be displayed on the web.
        -x|--xml-target-dir=xml-target-dir target directory for xml files.
        :return: exit code (int)
    """
    blank = False
    check_graphic_svg_exists = False
    csv_handler = None
    error_handler = MyErrorHandler()
    error_type = None
    number = None
    rerun_errors = False
    svg_target_dir = ".{}svg".format(sep)
    title = None
    xml_target_dir = ".{}xml".format(sep)

    try:
        opts, args = getopt.getopt(argv, "hbegn:s:t:T:x:", ["help", "blank", "run-error", "check-graphic-svg", "number=", "svg-target-dir=", "title=", "error-type=", "xml-target-dir="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-b', '--blank'):
            blank = True
        elif opt in ('-e', '--run-error'):
            rerun_errors = True
        elif opt in ('-g', '--check-graphic-svg'):
            check_graphic_svg_exists = True
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-T', '--error-type'):
            error_type = arg
        elif opt in ('-n', '--number'):
            number = arg
        elif opt in ('-s', '--svg-target-dir'):
            svg_target_dir = arg
        elif opt in ('-x', '--xml-target-dir'):
            xml_target_dir = arg

    if blank:
        if len(args) == 2\
           and isfile(args[0]) and args[0].endswith('.csv')\
           and isfile(args[1]) and args[1].endswith('.pdf'):
            csv_handler = MyCSVHandler(args[0], args[1], title=title, createBlanks=blank)
            return csv_handler.process_files(svg_target_dir, xml_target_dir, error_handler)
        print("Please specify both CSV- and PDF-file!")
        usage()
        return 2
    if rerun_errors:
        return error_handler.run(title=title, page_number=number, error_type=error_type)

    if len(args) == 1 and args[0].endswith('.xml'):
        # Take the source directories from the first page of the manuscript file.
        source_tree = ET.parse(args[0])
        type_node = source_tree.getroot().find('metadata/type')
        if type_node is not None and type_node.text == FILE_TYPE_XML_MANUSCRIPT:
            svg_word_file_tree = ET.parse(source_tree.xpath('//page/@output')[0])
            svg_dir = dirname(svg_word_file_tree.xpath('//page/@source')[0])
            pdf_dir = dirname(svg_word_file_tree.xpath('//page/pdf/@file')[0])
        else:
            print('File {} is not of type {}'.format(args[0], FILE_TYPE_XML_MANUSCRIPT))
            usage()
            return 2
    elif len(args) < 1 or (len(args) == 1 and not _dir_contains_pdf_and_svg(args[0])):
        # A single argument has to be a directory that contains pdf as well as svg files.
        print("Please specify both PDFDIR and TEXT_SVG_DIR!")
        usage()
        return 2
    elif len(args) < 2:
        pdf_dir, svg_dir = args[0], args[0]
    elif isdir(args[0]) and isdir(args[1]):
        pdf_dir, svg_dir = args[0], args[1]
        # Swap the directories if they were passed in reverse order.
        if any(file_name.endswith('pdf') for file_name in listdir(args[1])):
            pdf_dir, svg_dir = args[1], args[0]
    elif len(args) == 3\
            and isfile(args[0]) and args[0].endswith('.csv')\
            and isfile(args[1]) and args[1].endswith('.pdf')\
            and isdir(args[2]):
        csv_handler = MyCSVHandler(args[0], args[1], args[2], title=title)
        return csv_handler.process_files(svg_target_dir, xml_target_dir, error_handler)
    else:
        not_existing = args[0] if not isdir(args[0]) else args[1]
        print("ERROR directory {} does not exist!".format(not_existing))
        return 2

    list_of_svg = [ svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg') ]
    pdf_file_names = { pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf') }
    converter = Converter(target_dir=svg_target_dir, title=title)
    extractor = Extractor(xml_dir=xml_target_dir, title=title, compare2pdf=True)
    exit_status = 0
    for svgfile in list_of_svg:
        if svgfile.replace('.svg', '.pdf') not in pdf_file_names:
            continue
        # The title is the leading part of the file name, with '_' replaced by ' '.
        title = re.split(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)[1].replace('_', ' ')
        if extractor.title != title:
            extractor.update_title_and_manuscript(title)
        if converter.title != title:
            converter.title = title.replace(' ', '_')
        if 'page' in svgfile:
            page_number = svgfile.replace('.svg', '').split('page')[1]
        else:
            page_number = svgfile.replace('.svg', '').split('_')[-1]
        pdffile = '{}{}{}'.format(pdf_dir, sep, svgfile.replace('.svg', '.pdf'))
        if not check_graphic_svg_exists and not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
            svg_path = '{}{}{}'.format(svg_dir, sep, svgfile)
            try:
                # Keep the worst status: a later success must not hide an earlier failure.
                exit_status = max(exit_status, process_file(converter, extractor, svg_path, pdffile, page_number))
            except Exception as err:
                error_handler.record_error(svg_path, pdffile, title, page_number, error=err)
                if not UNITTESTING:
                    print(Fore.RED)
                    print('There was an error ->', err)
                    print(Style.RESET_ALL)
        elif not is_svg_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
            graphic_status = update_graphical_svg(converter, svgfile, pdffile, page_number, extractor.get_file_name(svgfile, page_number))
            exit_status = max(exit_status, graphic_status)
    error_handler.write()
    return exit_status
if __name__ == "__main__":
    # Run the script with the command line arguments and use main's return value as exit code.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline