Page MenuHomec4science

process_files.py
No OneTemporary

File Metadata

Created
Tue, May 14, 06:19

process_files.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract information from all text svg files in directory.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convertPDF2SVG4Web import Converter
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
from extractWordPosition import Extractor
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# When set to True, the progress and error messages that the functions in this
# module print to the console are suppressed (every such print is guarded by
# `if not UNITTESTING`).
UNITTESTING = False
class MyErrorHandler:
    """This class can be used to handle errors executing extractWordPosition.Extractor.extractAndWriteInformation.

    Failures are kept in an XML log (``ERROR_LOG``, relative to the current
    working directory).  Every failure is an ``<error title=".." number="..">``
    element with ``<svgfile>``/``<pdffile>`` children and, if an exception was
    supplied, a ``type`` attribute and ``<error-msg>`` children.
    """
    ERROR_LOG = 'error_log.xml'

    def __init__(self):
        """Load the existing error log or start with an empty ``<error-log>`` tree.
        """
        if isfile(MyErrorHandler.ERROR_LOG):
            parser = ET.XMLParser(remove_blank_text=True)
            self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser)
        else:
            self.tree = ET.ElementTree(ET.Element('error-log'))

    @staticmethod
    def _first_text(node, child_tag):
        """Return the first text node of ``node``'s child ``child_tag`` or None.
        """
        texts = node.xpath('./{}/text()'.format(child_tag))
        return texts[0] if texts else None

    def _find_errors(self, title=None, page_number=None, error_type=None):
        """Return all ``<error>`` nodes matching every criterion that is not None.

        The values are handed to lxml as XPath variables instead of being pasted
        into the expression, so quotes or brackets inside a title cannot break
        or alter the query.
        """
        criteria = {}
        if title is not None:
            criteria['title'] = str(title)
        if page_number is not None:
            criteria['number'] = str(page_number)
        if error_type is not None:
            criteria['type'] = str(error_type)
        xpath = '//error'
        if criteria:
            # Attribute name and variable name are identical, e.g. @title=$title.
            xpath += '[' + ' and '.join('@{0}=${0}'.format(name) for name in criteria) + ']'
        return self.tree.xpath(xpath, **criteria)

    def record_error(self, svgfile, pdffile, title, page_number, error=None):
        """Records an error.

        :param svgfile: text svg file that was being processed
        :param pdffile: pdf file that belongs to ``svgfile``
        :param title: manuscript title (``title`` attribute of the entry)
        :param page_number: page number (``number`` attribute of the entry)
        :param error: optional exception; its class name is stored as ``type``
            and a non-empty message is appended as ``<error-msg>``
        """
        matches = self.tree.xpath('//error[@title=$title and @number=$number]',
                                  title=str(title), number=str(page_number))
        if matches:
            error_node = matches[0]
        else:
            error_node = ET.SubElement(self.tree.getroot(), 'error',
                                       attrib={'title': title, 'number': str(page_number)})
            # The file references are written once, when the entry is created.
            ET.SubElement(error_node, 'svgfile').text = svgfile
            ET.SubElement(error_node, 'pdffile').text = pdffile
        if error is None:
            return
        error_name = type(error).__name__
        error_node.set('type', error_name)
        message = str(error)
        if message != '':
            if error_name == 'ExpatError':
                message += '->svgfile is empty!'
            ET.SubElement(error_node, 'error-msg').text = message

    def run(self, title=None, page_number=None, error_type=None):
        """Run all or some errors

        Re-processes the logged errors that match the given filters (a filter
        that is None is ignored) and writes the updated log.

        :param title: only re-run errors with this title
        :param page_number: only re-run errors with this page number
        :param error_type: only re-run errors with this ``type`` attribute
        [:return:] exit status (int): 0, or the last non-zero status returned
            by process_file
        """
        exit_status = 0
        for error in self._find_errors(title, page_number, error_type):
            error_title = error.get('title')
            error_page = error.get('number')
            svgfile = self._first_text(error, 'svgfile')
            pdffile = self._first_text(error, 'pdffile')
            if svgfile is None:
                # Nothing to process: the entry stays in the log.
                continue
            converter = Converter(title=error_title)
            extractor = Extractor(title=error_title, extract_transkription_field_only=True, compare2pdf=True)
            status = process_file(converter, extractor, svgfile, pdffile, error_page)
            if status > 0:
                exit_status = status
            if status < 2:
                # A status below 2 counts as handled; drop the entry from the log.
                error.getparent().remove(error)
        self.write()
        return exit_status

    def write(self):
        """Writes error log.
        """
        write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG, script_name=__file__, file_type='xmlErrorLog')
def is_page_ok(manuscript_file=None, page_number=None):
    """Returns true if page status is 'OK'.

    A page counts as OK when the manuscript file contains a ``<page>`` with the
    given number whose ``status`` attribute is 'OK' and whose ``output``
    attribute names an existing file.

    :param manuscript_file: path of the xml manuscript file (may be None)
    :param page_number: page number to look up (may be None)
    :return: True if the page is OK, False in every other case (bool)
    """
    if manuscript_file is None or page_number is None or not isfile(manuscript_file):
        return False
    manuscript_tree = ET.parse(manuscript_file)
    # The number is passed as an XPath variable so that it cannot break the expression.
    pages = manuscript_tree.getroot().xpath('//page[@number=$number]', number=str(page_number))
    if not pages:
        return False
    page = pages[0]
    output = page.get('output')
    # isfile(None) raises TypeError, so a missing output attribute is checked first.
    return page.get('status') == 'OK' and output is not None and isfile(output)
def is_svg_ok(manuscript_file=None, page_number=None):
    """Returns true if svgfile contains a valid svg graphic location.

    Looks up the ``<page>`` with the given number in the manuscript file, opens
    the xml file named by its ``output`` attribute and checks that the first
    ``svg/@file`` found there names an existing file.

    :param manuscript_file: path of the xml manuscript file (may be None)
    :param page_number: page number to look up (may be None)
    :return: True if the referenced svg file exists, False otherwise (bool)
    """
    if manuscript_file is None or page_number is None or not isfile(manuscript_file):
        return False
    manuscript_tree = ET.parse(manuscript_file)
    # The number is passed as an XPath variable so that it cannot break the expression.
    pages = manuscript_tree.getroot().xpath('//page[@number=$number]', number=str(page_number))
    if not pages:
        return False
    output = pages[0].get('output')
    # isfile(None) raises TypeError, so a missing output attribute is checked first.
    if output is None or not isfile(output):
        return False
    svg_files = ET.parse(output).xpath('//svg/@file')
    return len(svg_files) > 0 and isfile(svg_files[0])
def process_file(converter, extractor, svgfile, pdffile, page_number):
    """Processes file.

    Converts the pdf page to a graphical svg file, calls
    ``shrink_svg_to_transkription_field`` on it, lets the extractor write the
    xml output for ``svgfile`` and records the page status in the extractor's
    manuscript file (if it has one).

    NOTE(review): a failed pdf conversion and an extraction status of 2 or more
    both end with exit status 0 here, although MyErrorHandler.run treats only a
    status of 2 or more as "keep the error" -- confirm that this is intended.

    [:return:] exit status (int): 1 if the extraction returned 1 and the
        manuscript file was updated, otherwise 0
    """
    graphic_svg_file = converter.get_file_name(pdffile, page_number=page_number)
    if not UNITTESTING:
        print(Fore.BLUE + 'Processing file {} ...'.format(svgfile))
        print(Style.RESET_ALL)
    conversion_status = converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=graphic_svg_file)
    if conversion_status != 0:
        return 0
    field = TranskriptionField(graphic_svg_file)
    field.shrink_svg_to_transkription_field()
    xml_target_file = extractor.get_file_name(svgfile, page_number)
    extraction_status = extractor.extractAndWriteInformation(
        svgfile, xml_target_file=xml_target_file, page_number=page_number,
        pdfFile=pdffile, svg_file=graphic_svg_file, record_warnings=True)
    if not extraction_status < 2 or extractor.manuscript_file is None:
        return 0
    if extraction_status == 1:
        # The extractor finished with status 1: store its own status text.
        page_status, exit_status = extractor.latest_status, 1
    else:
        page_status, exit_status = 'OK', 0
    update_manuscript_file(extractor.manuscript_file, page_number, xml_target_file, status=page_status)
    return exit_status
def update_graphical_svg(converter, svgfile, pdffile, page_number, xml_source_file):
"""Create a new graphical svg file and update xml output file.

The visible steps are, in order: check that `xml_source_file` exists, ask the
converter for the target svg file name, print a progress message (unless
UNITTESTING is set), convert `pdffile` to that svg file, call
`shrink_svg_to_transkription_field` on it, build a Page from `xml_source_file`
and the new svg file, and write the page tree back to `xml_source_file`.
`svgfile` is used for the progress message only.

[:return:] exit status (int): 0 by default, 2 if the `else` branch is taken
"""
# NOTE(review): the indentation of this copy of the file is lost, so it cannot be
# decided here whether the `else` below belongs to the `isfile` check (missing
# xml file -> 2) or to the `pdf2svg` check (failed conversion -> 2). Confirm
# against the repository before changing this function.
exit_status = 0
if isfile(xml_source_file):
path_svg_file = converter.get_file_name(pdffile, page_number=page_number)
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(svgfile))
print(Style.RESET_ALL)
# pdf2svg returning 0 is treated as a successful conversion.
if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) == 0:
transkriptionField = TranskriptionField(path_svg_file)
transkriptionField.shrink_svg_to_transkription_field()
# Re-create the page with the new svg file and overwrite the xml source file.
page = Page(xml_source_file=xml_source_file, svg_file=path_svg_file)
write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
else:
exit_status = 2
return exit_status
def update_manuscript_file(manuscript_file, page_number, file_name, status='changed'):
    """Updates manuscript file: adds status information about page.

    If the manuscript file has a ``<page>`` with the given number, its
    ``status`` and ``output`` attributes are overwritten.  Otherwise a new
    ``<page>`` is appended to ``<pages>`` (which is created if missing).
    Nothing happens if ``manuscript_file`` is not an existing file.

    :param manuscript_file: path of the xml manuscript file
    :param page_number: number of the page
    :param file_name: value for the page's ``output`` attribute
    :param status: value for the page's ``status`` attribute
    """
    if not isfile(manuscript_file):
        return
    manuscript_tree = ET.parse(manuscript_file, ET.XMLParser(remove_blank_text=True))
    root = manuscript_tree.getroot()
    matching_pages = root.xpath('//page[@number="%s"]' % page_number)
    if len(matching_pages) > 0:
        page_node = matching_pages[0]
        page_node.set('status', status)
        page_node.set('output', file_name)
    else:
        pages_node = root.find('pages')
        if pages_node is None:
            pages_node = ET.SubElement(root, 'pages')
        # New pages get the next running id within <pages>.
        page_attributes = {
            'id': str(len(pages_node.findall('page')) + 1),
            'number': str(page_number),
            'status': status,
            'output': file_name,
        }
        ET.SubElement(pages_node, 'page', attrib=page_attributes)
    write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT)
def usage():
    """Print the command line help, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to extract information from all text svg files in a directory.

    svgscripts/process_files.py [OPTIONS] <PDFDIR> <TEXT_SVG_DIR>
    svgscripts/process_files.py [OPTIONS] <xmlManuscriptFile>

        <PDFDIR>        Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
        <TEXT_SVG_DIR>  Directory containing svg files corresponding to pdf files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).

        OPTIONS:
        -h|--help: show help
        -e|--run-error Rerun error cases.
        -g|--check-graphic-svg Check that graphical svg file exists or generate a new svg file.
        -n|--number=pageNumber Use this with OPTION -e|--run-error in order to specify an error case.
        -t|--title=title: title of the manuscript to which all files belong.
        -T|--error-type: error type, use this with OPTION -e|--run-error in order to specify an error case.
        -s|--svg-target-dir=svg-target-dir target directory for path svg files, i.e. svg files that can be displayed on the web.
        -x|--xml-target-dir=xml-target-dir target directory for xml files.

        :return: exit code (int)
    """
    # The docstring above doubles as the help text printed by usage(), so the
    # implementation notes are kept in comments:
    #  * argv is the argument list without the program name;
    #  * 2 is returned for usage errors, otherwise the last non-zero status
    #    returned by process_file (0 if there was none);
    #  * exceptions raised while processing a file are recorded in the error
    #    log and do not abort the run.
    title = None
    xml_target_dir = ".{}xml".format(sep)
    svg_target_dir = ".{}svg".format(sep)
    error_handler = MyErrorHandler()
    number = None
    rerun_errors = False
    error_type = None
    check_graphic_svg_exists = False

    try:
        opts, args = getopt.getopt(argv, "hegn:s:t:T:x:", ["help", "run-error", "check-graphic-svg", "number=", "svg-target-dir=", "title=", "error-type=", "xml-target-dir="])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-e', '--run-error'):
            rerun_errors = True
        elif opt in ('-g', '--check-graphic-svg'):
            check_graphic_svg_exists = True
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-T', '--error-type'):
            error_type = arg
        elif opt in ('-n', '--number'):
            number = arg
        elif opt in ('-s', '--svg-target-dir'):
            svg_target_dir = arg
        elif opt in ('-x', '--xml-target-dir'):
            xml_target_dir = arg

    if rerun_errors:
        return error_handler.run(title=title, page_number=number, error_type=error_type)

    # Determine the directories holding the pdf files and the text svg files.
    if len(args) == 1 and args[0].endswith('.xml'):
        source_tree = ET.parse(args[0])
        # findtext returns None when metadata/type is missing, so such a file is
        # reported as "not of type" instead of raising AttributeError.
        if source_tree.getroot().findtext('metadata/type') == FILE_TYPE_XML_MANUSCRIPT:
            # The directories are taken from the first page listed in the manuscript file.
            svg_word_file_tree = ET.parse(source_tree.xpath('//page/@output')[0])
            svg_dir = dirname(svg_word_file_tree.xpath('//page/@source')[0])
            pdf_dir = dirname(svg_word_file_tree.xpath('//page/pdf/@file')[0])
        else:
            print('File {} is not of type {}'.format(args[0], FILE_TYPE_XML_MANUSCRIPT))
            usage()
            return 2
    elif not args:
        print("Please specify both PDFDIR and TEXT_SVG_DIR!")
        usage()
        return 2
    elif len(args) == 1:
        # A single directory is accepted only if it holds both pdf and svg files.
        if not isdir(args[0]):
            print("ERROR directory {} does not exist!".format(args[0]))
            return 2
        file_names = listdir(args[0])
        has_pdf = any(name.endswith('pdf') for name in file_names)
        has_svg = any(name.endswith('svg') for name in file_names)
        if not (has_pdf and has_svg):
            print("Please specify both PDFDIR and TEXT_SVG_DIR!")
            usage()
            return 2
        pdf_dir, svg_dir = args[0], args[0]
    elif isdir(args[0]) and isdir(args[1]):
        pdf_dir, svg_dir = args[0], args[1]
        # Accept the directories in reverse order as well.
        if any(name.endswith('pdf') for name in listdir(args[1])):
            pdf_dir, svg_dir = args[1], args[0]
    else:
        not_existing = args[0] if not isdir(args[0]) else args[1]
        print("ERROR directory {} does not exist!".format(not_existing))
        return 2

    list_of_svg = [svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg')]
    pdf_names = {pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf')}
    converter = Converter(target_dir=svg_target_dir, title=title)
    extractor = Extractor(xml_dir=xml_target_dir, title=title, extract_transkription_field_only=True, compare2pdf=True)
    exit_status = 0

    for svgfile in list_of_svg:
        pdf_name = svgfile.replace('.svg', '.pdf')
        if pdf_name not in pdf_names:
            continue
        # The manuscript title is derived from the file name and overrides -t.
        # NOTE(review): for a file name that does not start with this pattern
        # re.split returns a single item and [1] raises IndexError; this happens
        # outside the try block below and therefore aborts the whole run.
        title = re.split(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)[1].replace('_', ' ')
        if extractor.title != title:
            extractor.update_title_and_manuscript(title)
        if converter.title != title:
            converter.title = title.replace(' ', '_')
        stem = svgfile.replace('.svg', '')
        if 'page' in svgfile:
            page_number = stem.split('page')[1]
        else:
            page_number = stem.split('_')[-1]
        pdffile = '{}{}{}'.format(pdf_dir, sep, pdf_name)
        if not check_graphic_svg_exists and not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
            svg_path = '{}{}{}'.format(svg_dir, sep, svgfile)
            try:
                status = process_file(converter, extractor, svg_path, pdffile, page_number)
                # Keep an earlier failure: a later success must not reset the
                # exit status to 0 (same rule as in MyErrorHandler.run).
                if status > 0:
                    exit_status = status
            except Exception as err:
                # Deliberately broad: any failure is logged for a later re-run
                # (option -e) and the remaining files are still processed.
                error_handler.record_error(svg_path, pdffile, title, page_number, error=err)
                if not UNITTESTING:
                    print(Fore.RED)
                    print('There was an error ->', err)
                    print(Style.RESET_ALL)
        elif not is_svg_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
            update_graphical_svg(converter, svgfile, pdffile, page_number, extractor.get_file_name(svgfile, page_number))

    error_handler.write()
    return exit_status
# Script entry point: hand the command line arguments (without the program
# name) to main and use its return value as the process exit code.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    sys.exit(main(cli_arguments))

Event Timeline