Page MenuHomec4science

extractAndConvert.py
No OneTemporary

File Metadata

Created
Wed, Apr 24, 16:40

extractAndConvert.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program starts the extraction of word positions and converts the extracted word positions to an HTML file for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import getopt
import re
import sys
from os import sep, path
from os.path import isfile, dirname
import lxml.etree as ET
from extractWordPosition import Extractor
from convert_wordPositions import HTMLConverter
# Extend the module search path before the import below.
# NOTE(review): presumably 'myxmlwriter' lives in 'shared_util'; the path is relative,
# so it depends on the current working directory -- confirm.
sys.path.append('shared_util')
from myxmlwriter import write_pretty
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
def usage():
    """Print the command line help, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to start extraction of word position and showing the word positions to HTML for testing purposes.

    svgscripts/extractAndConvert.py [OPTIONS] <file>

        <file>  svg file OR xml target file containing file name of svg file as "/page/@source".

        OPTIONS:
        -h|--help: show help
        -s|--svg=svgFile: svg web file
        -H|--HTML [default] convert to HTML test file
        -T|--Testing: run the HTML conversion in testing mode
        -x|--xml-target-file=xmlOutputFile: target file
        -p|--page=pageNumber: page number of the current page. For use with _one_ file only.
        -P|--PDF=pdfFile: pdf file - used for word correction
        -t|--title=title: title of the manuscript to which the current page(s) belong(s)

        :param argv: command line arguments without the program name
        :return: exit code (int)
    """
    # NOTE: this docstring is printed by usage(), so it is user-facing help text.
    file_name = None
    non_testing = True
    page_number = None
    pdfFile = None
    svg_file = None
    title = None
    xml_dir = ".{}xml".format(sep)
    xml_target_file = None

    try:
        opts, args = getopt.getopt(
            argv,
            "hTHt:p:s:x:P:",
            ["help", "Testing", "HTML", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
    except getopt.GetoptError:
        usage()
        return 2

    # '-H'/'--HTML' is accepted but needs no handling: HTML is the only conversion done here.
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-T', '--Testing'):
            non_testing = False
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = arg
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-P', '--PDF'):
            pdfFile = arg
        elif opt in ('-x', '--xml-target-file'):
            xml_target_file = arg

    if len(args) < 1 or args[0].endswith('xml'):
        # No positional file or a xml file: the source file name is read from the xml target file.
        if xml_target_file is None and len(args) > 0:
            xml_target_file = args[0]
        if xml_target_file is not None and isfile(xml_target_file):
            xml_dir = dirname(xml_target_file)
            target_file_tree = ET.parse(xml_target_file)
            root = target_file_tree.getroot()
            file_name = root.get('source')
            # Values given on the command line take precedence over those stored in the xml file.
            if title is None:
                title = root.get('title')
            if page_number is None:
                page_number = root.get('number')
            if svg_file is None:
                # Use 'svg-image/@file-name' if the tree has any svg-image node, else 'svg/@file'.
                if len(target_file_tree.xpath('//svg-image')) > 0:
                    svg_file_xpath = './/svg-image/@file-name'
                else:
                    svg_file_xpath = './/svg/@file'
                svg_file_names = target_file_tree.xpath(svg_file_xpath)
                svg_file = svg_file_names[0] if len(svg_file_names) > 0 else None
    else:
        file_name = args[0]

    if file_name is None or not isfile(file_name):
        if file_name is not None:
            print("'{}' does not exist!".format(file_name))
        else:
            usage()
        return 2

    extractor = Extractor(xml_dir=xml_dir, title=title)
    page = extractor.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file,
                                         pdfFile=pdfFile, svg_file=svg_file)
    if page.svg_file is None:
        print('Please specify a svg file!')
        usage()
        return 2

    converter = HTMLConverter(page, non_testing=non_testing)
    converter.convert()

    if xml_target_file is not None:
        # path.join instead of 'xml_dir + sep + ...': dirname() yields '' for a bare file name,
        # and plain concatenation would then point to the file system root ('/name.xml').
        xml_target_file = path.join(xml_dir, path.basename(xml_target_file))
    # NOTE(review): the nesting of the next two statements was reconstructed; they are assumed
    # to run unconditionally, so file_name may be None here -- presumably write_pretty can
    # handle that; confirm against myxmlwriter.
    page.page_tree.getroot().set('source', file_name)
    write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__,
                 file_type='svgWordPosition')
    return 0
if __name__ == "__main__":
    # Run the command line interface and hand the return value of main to the shell as exit status.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline