Index: error_log.xml
===================================================================
--- error_log.xml (revision 0)
+++ error_log.xml (revision 33)
@@ -0,0 +1,72 @@
+
+
+
+ xmlErrorLog
+
+
+ 2019-06-17 21:59:01
+
+ 2019-06-17 23:19:43
+ 2019-06-17 22:23:55
+
+
+ not enough values to unpack (expected 4, got 0)
+
+
+
+
+
+
+
+
+ maximum recursion depth exceeded in comparison
+
+
+
+
+
+
+
+
+ maximum recursion depth exceeded in comparison
+
+
+
+
+
+
+
+
+
+
+
+ not enough values to unpack (expected 4, got 0)
+
+
+ maximum recursion depth exceeded in comparison
+
+
+
+
+
+
+
+
+
+
+
+ list index out of range
+
+
+
+
+
+ list index out of range
+
+
+
+
+
+
+
+
Index: svgscripts/test_process_files.py
===================================================================
--- svgscripts/test_process_files.py (revision 32)
+++ svgscripts/test_process_files.py (revision 33)
@@ -1,28 +1,31 @@
import unittest
from os import sep, path
import lxml.etree as ET
import warnings
import process_files
-
+from process_files import MyErrorHandler
class TestProcessFiles(unittest.TestCase):
def setUp(self):
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.dir = DATADIR + sep + 'pdfsvg'
- @unittest.skipUnless(__name__ == "__main__", 'test takes to long, we do not run it with unittest discover')
+ @unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover')
def test_main(self):
argv = [ '-x', self.dir, '-s', self.dir, self.dir ]
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- self.assertEqual(process_files.main(argv), 0)
+ self.assertEqual(process_files.main(argv), 1)
def test_update_manuscript_file(self):
#manuscript_file = self.dir + sep + 'W_I_7.xml'
#process_files.update_manuscript_file(manuscript_file, 82, 'asdf.xml')
pass
+ @unittest.skip('')
+ def test_run(self):
+ error_handler = MyErrorHandler()
+ error_handler.run(page_number='15')
+
if __name__ == "__main__":
unittest.main()
Index: svgscripts/process_files.py
===================================================================
--- svgscripts/process_files.py (revision 32)
+++ svgscripts/process_files.py (revision 33)
@@ -1,151 +1,256 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract information from all text svg files in directory.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
+from colorama import Fore, Style
import getopt
import re
import sys
from os import listdir, sep, path
-from os.path import isfile, isdir
+from os.path import isfile, isdir, dirname
import lxml.etree as ET
from convertPDF2SVG4Web import Converter
from datatypes.transkriptionField import TranskriptionField
from extractWordPosition import Extractor
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
+class MyErrorHandler:
+ """This class can be used to handle errors executing extractWordPosition.Extractor.extractAndWriteInformation.
+ """
+ ERROR_LOG = 'error_log.xml'
+
+ def __init__(self):
+ self.tree = ET.ElementTree(ET.Element('error-log'))
+ if isfile(MyErrorHandler.ERROR_LOG):
+ parser = ET.XMLParser(remove_blank_text=True)
+ self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser)
+
+ def record_error(self, svgfile, pdffile, title, page_number, xml_target_file, error=None):
+ """Records an error.
+ """
+ if len(self.tree.xpath('//error[@svgfile="%s"]' % svgfile)) > 0:
+ error_node = self.tree.xpath('//error[@svgfile="%s"]' % svgfile)[0]
+ else:
+ error_node = ET.SubElement(self.tree.getroot(), 'error', attrib={'svgfile': svgfile})
+ error_node.set('pdffile', pdffile)
+ error_node.set('title', title)
+ error_node.set('number', page_number)
+ error_node.set('output', xml_target_file)
+ if error is not None:
+ error_msg = ET.SubElement(error_node, 'error-msg', attrib={'type': str(type(error).__name__)})
+ error_msg.text = str(error)
+
+ def run(self, title=None, page_number=None):
+ """Run all or some errors
+
+ [:return:] exit status (int)
+ """
+ xpath = '//error'
+ if title is not None and page_number is not None:
+ xpath = '//error[@title="{0}" and @number="{1}"]'.format(title, page_number)
+ elif title is not None:
+ xpath = '//error[@title="{0}"]'.format(title)
+ elif page_number is not None:
+ xpath = '//error[@number="{0}"]'.format(page_number)
+ exit_status = 0
+ for error in self.tree.xpath(xpath):
+ xml_output_file = error.get('output')
+ title = error.get('title')
+ page_number = error.get('number')
+ svgfile = error.get('svgfile')
+ pdffile = error.get('pdffile')
+ xml_dir = dirname(xml_output_file) if bool(xml_output_file) else None
+ svg_target_dir = dirname(svgfile) if bool(svgfile) else None
+ converter = Converter(target_dir=svg_target_dir, title=title)
+ extractor = Extractor(xml_dir=xml_dir, title=title, extract_transkription_field_only=True, compare2pdf=True)
+ status = process_file(converter, extractor, svgfile, pdffile, page_number)
+ if status > 0:
+ exit_status = status
+ return exit_status
+
+
+ def write(self):
+ """Writes error log.
+ """
+ write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG, script_name=__file__, file_type='xmlErrorLog')
+
+def is_page_ok(manuscript_file=None, page_number=None):
+ """Returns true if page status is 'OK'.
+ """
+ if manuscript_file is not None and isfile(manuscript_file):
+ manuscript_tree = ET.parse(manuscript_file)
+ if page_number is not None\
+ and len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0:
+ return manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('status') == 'OK'
+ return False
def update_manuscript_file(manuscript_file, page_number, file_name, status='changed'):
"""Updates manuscript file: adds status information about page.
"""
if isfile(manuscript_file):
parser = ET.XMLParser(remove_blank_text=True)
manuscript_tree = ET.parse(manuscript_file, parser)
if len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0:
node = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0]
node.set('status', status)
node.set('output', file_name)
else:
pages_node = manuscript_tree.getroot().find('pages')\
if manuscript_tree.getroot().find('pages') is not None\
else ET.SubElement(manuscript_tree.getroot(), 'pages')
new_id = len(pages_node.findall('page')) + 1
ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name})
write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type='xmlManuscriptFile')
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
+def process_file(converter, extractor, svgfile, pdffile, page_number):
+ """Processes file.
+
+ [:return:] exit status (int)
+ """
+ exit_status = 0
+ path_svg_file = converter.get_file_name(pdffile, page_number=page_number)
+ print(Fore.BLUE + 'Processing file {} ...'.format(svgfile))
+ print(Style.RESET_ALL)
+ if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) == 0:
+ transkriptionField = TranskriptionField(path_svg_file)
+ transkriptionField.shrink_svg_to_transkription_field()
+ xml_target_file = extractor.get_file_name(svgfile, page_number)
+ extraction_status = extractor.extractAndWriteInformation(svgfile, xml_target_file=xml_target_file,\
+ page_number=page_number, pdfFile=pdffile, svg_file=path_svg_file, record_warnings=True)
+ if extraction_status < 2 and extractor.manuscript_file is not None:
+ status = 'OK'
+ if extraction_status == 1:
+ status = 'with warnings'
+ exit_status = 1
+ update_manuscript_file(extractor.manuscript_file, page_number, xml_target_file, status=status)
+ return exit_status
+
def main(argv):
- """This program can be used to extract information from all text svg files in directory.
+ """This program can be used to extract information from all text svg files in a directory.
svgscripts/process_files.py [OPTIONS] Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
Directory containing svg files corresponding to pdf files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
OPTIONS:
-h|--help: show help
+ -e|--run-error Rerun error cases.
+ -n|--number=pageNumber Use this with -e in order to specify an error case.
-t|--title=title: title of the manuscript to which all files belong.
-s|--svg-target-dir=svg-target-dir target directory for path svg files, i.e. svg files that can be displayed on the web.
-x|--xml-target-dir=xml-target-dir target directory for xml files.
:return: exit code (int)
"""
title = None
xml_target_dir = ".{}xml".format(sep)
svg_target_dir = ".{}svg".format(sep)
+ error_handler = MyErrorHandler()
+ number = None
+ rerun_errors = False
try:
- opts, args = getopt.getopt(argv, "hs:t:x:", ["help", "svg-target-dir=", "title=", "xml-target-dir="])
+ opts, args = getopt.getopt(argv, "hen:s:t:x:", ["help", "run-error", "number=", "svg-target-dir=", "title=", "xml-target-dir="])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
+ elif opt in ('-e', '--run-error'):
+ rerun_errors = True
elif opt in ('-t', '--title'):
title = arg
+ elif opt in ('-n', '--number'):
+ number = arg
elif opt in ('-s', '--svg-target-dir'):
svg_target_dir = arg
elif opt in ('-x', '--xml-target-dir'):
xml_target_dir = arg
-
+
+ if rerun_errors:
+ error_handler.run(title=title, page_number=number)
if len(args) < 1 or\
(len(args) == 1\
and (True not in [ pdffile.endswith('pdf') for pdffile in listdir(args[0]) ]\
or True not in [ svgfile.endswith('svg') for svgfile in listdir(args[0]) ])\
):
print("Please specify both PDFDIR and TEXT_SVG_DIR!")
usage()
return 2
elif len(args) < 2:
pdf_dir, svg_dir = args[0], args[0]
elif isdir(args[0]) and isdir(args[1]):
pdf_dir, svg_dir = args[0], args[1]
if True in [ svgfile.endswith('pdf') for svgfile in listdir(args[1]) ]:
pdf_dir, svg_dir = args[1], args[0]
else:
not_existing = args[0] if not isdir(args[0]) else args[1]
print("ERROR directory {} does not exist!".format(not_existing))
return 2
list_of_svg = [ svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg') ]
list_of_pdf = [ pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf') ]
converter = Converter(target_dir=svg_target_dir, title=title)
extractor = Extractor(xml_dir=xml_target_dir, title=title, extract_transkription_field_only=True, compare2pdf=True)
+ exit_status = 0
for svgfile in list_of_svg:
if svgfile.replace('.svg', '.pdf') in list_of_pdf:
title = re.split(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)[1].replace('_', ' ')
if extractor.title is None or extractor.title != title:
extractor.update_title_and_manuscript(title)
if converter.title is None or converter.title != title:
converter.title = title.replace(' ', '_')
if 'page' in svgfile:
page_number = svgfile.replace('.svg','').split('page')[1]
else:
page_number = svgfile.replace('.svg','').split('_')[len(svgfile.replace('.svg','').split('_'))-1]
- pdffile = '{}{}{}'.format(pdf_dir, sep, svgfile.replace('.svg', '.pdf'))
- path_svg_file = converter.get_file_name(pdffile, page_number=page_number)
- if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) == 0:
- transkriptionField = TranskriptionField(path_svg_file)
- transkriptionField.shrink_svg_to_transkription_field()
- svgfile = '{}{}{}'.format(svg_dir, sep, svgfile)
- xml_target_file = extractor.get_file_name(svgfile, page_number)
- if extractor.extractAndWriteInformation(svgfile, xml_target_file=xml_target_file,\
- page_number=page_number, pdfFile=pdffile, svg_file=path_svg_file) == 0\
- and extractor.manuscript_file is not None:
- update_manuscript_file(extractor.manuscript_file, page_number, xml_target_file, status='websvg created,information extracted')
- return 0
+ if not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
+ pdffile = '{}{}{}'.format(pdf_dir, sep, svgfile.replace('.svg', '.pdf'))
+ try:
+ svgfile = '{}{}{}'.format(svg_dir, sep, svgfile)
+ exit_status = process_file(converter, extractor, svgfile, pdffile, page_number)
+ except Exception as err:
+ error_handler.record_error(svgfile, pdffile, title, page_number, xml_target_file, error=err)
+ print(Fore.RED)
+ print('There was an error ->', err)
+ print(Style.RESET_ALL)
+ error_handler.write()
+ return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: svgscripts/test_data/pdfsvg/W_II_1_page001.xml
===================================================================
--- svgscripts/test_data/pdfsvg/W_II_1_page001.xml (revision 32)
+++ svgscripts/test_data/pdfsvg/W_II_1_page001.xml (revision 33)
@@ -1,2784 +1,2784 @@
-
+svgWordPosition2019-06-17 17:01:32
- 2019-06-17 18:28:35
+ 2019-06-17 23:19:43
Index: svgscripts/test_data/pdfsvg/W_II_1.xml
===================================================================
--- svgscripts/test_data/pdfsvg/W_II_1.xml (revision 32)
+++ svgscripts/test_data/pdfsvg/W_II_1.xml (revision 33)
@@ -1,14 +1,14 @@
xmlManuscriptFile2019-06-17 17:44:05
- 2019-06-17 18:28:35
+ 2019-06-17 23:19:43
-
+
Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py (revision 32)
+++ svgscripts/extractWordPosition.py (revision 33)
@@ -1,569 +1,575 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import inspect
import getopt
from lxml import etree as ET
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from progress.bar import Bar
import re
import sys
import warnings
from myxmlwriter import write_pretty
from datatypes.lineNumber import LineNumber
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.pdf import PDFText
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from datatypes.word_insertion_mark import WordInsertionMark
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Extractor:
"""
This class can be used to extract the word positions in a svg file and write it to a xml file.
Args:
[xml_dir (str): target directory]
[title (str): title of document]
[manuscript_file (str): xml file containing information about the archival unity to which the current page belongs
[extract_transkription_field_only (Boolean): if true extract_word_position will extract word positions only that
are part of the transkription field.
"""
UNITTESTING = False
SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]
def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False, compare2pdf=False):
if bool(xml_dir):
self.xml_dir = xml_dir
not isdir(self.xml_dir) and mkdir(self.xml_dir)
else:
self.xml_dir = 'xml' if(isdir('xml')) else ''
self.compare2pdf = compare2pdf
self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
self.title = title
self.manuscript_file = manuscript_file
self.extract_transkription_field_only = extract_transkription_field_only
self.manuscript_tree = None
if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
self.manuscript_tree = ET.parse(self.manuscript_file)
self.title = self.manuscript_tree.getroot().get('title')
elif bool(self.manuscript_file):
raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
elif bool(self.title):
self.update_title_and_manuscript(self.title, False)
def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None):
"""Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).
If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.
:returns: the new word counter (int)
"""
break_points = []
if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
for Sonderzeichen in self.SONDERZEICHEN_LIST:
contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
if True in contains_Sonderzeichen:
break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]]
for sz_point in [i for i, e in break_points]:
wim_index = len(page.word_insertion_marks)
x = float(word_part_objs[sz_point]['x'])
y = float(word_part_objs[sz_point]['y'])
if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None:
svg_path_tree = ET.parse(page.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
xmin = transkription_field.xmin
ymin = transkription_field.ymin
wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
page.word_insertion_marks.append(wim)
if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points
THRESHOLDX = 20 # Threshold between line number and text
last_x = -1
for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
if(last_x > -1 and (x - last_x > THRESHOLDX)):
break_points.append((i, i))
last_x = x
if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
from_index = 0
for end_point, next_from_index in break_points:
new_word_part_objs = word_part_objs[from_index:end_point]
new_endX = word_part_objs[end_point]['x']
from_index = next_from_index
index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
if from_index > 0 and from_index < len(word_part_objs):
new_word_part_objs = word_part_objs[from_index:]
index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
return index
else:
if len(word_part_objs) > 0:
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
debug_msg_string=debug_msg, transkription_field=transkription_field)
text = self.get_word_from_part_obj(word_part_objs)
line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
#newWord = Word.CREATE_WORD(page=page, word_part_objs=word_part_objs, id=index, endX=endX, endSign=endSign, matrix=matrix, debug_msg=debug_msg)
#newWord.attach_word_to_tree(page.page_tree) -> now we attach all words with update_and_attach_words2tree()
page.words.append(newWord)
return int(index) + 1
else:
return int(index)
- def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None):
+ def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default'):
"""Extracts information about positions of text elements and writes them to a xml file.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
- page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile)
+ exit_status = 0
+ with warnings.catch_warnings(record=record_warnings) as w:
+ warnings.simplefilter(warning_filter)
+ page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile)
+ if w is not None and len(w) > 0:
+ page.page_tree.getroot().set('status', 'with warnings')
+ exit_status = 1
write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition')
- return 0
+ return exit_status
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None):
"""Extracts information about positions of text elements.
[:returns:] (datatypes.page) the Page containing all information.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
transkription_field = TranskriptionField(file_name) if bool(self.extract_transkription_field_only) else None
svg_tree = ET.parse(file_name)
page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\
svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only)
page.add_source(file_name)
sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot())
page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
if transkription_field is not None:
page.init_line_numbers(self.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax)
self.extract_word_position(svg_tree, page, transkription_field=transkription_field)
if page.pdfFile is not None and isfile(page.pdfFile):
pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST)
pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field, split_wrongly_concatenated_words=self.compare2pdf)
page.create_writing_processes_and_attach2tree()
page.categorize_paths(transkription_field=transkription_field)
self.update_and_attach_words2tree(page)
for word_insertion_mark in page.word_insertion_marks:
# it is not clear if we really need to know this alternative word ordering. See 'TODO.md'
#word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark)
word_insertion_mark.attach_object_to_tree(page.page_tree)
return page
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def extract_line_numbers(self, svg_tree, transkription_field):
"""Extracts line numbers and write them to a xml file.
"""
nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
line_numbers = [ LineNumber(raw_text_node=item, transkription_field=transkription_field)\
for item in filter(lambda x: LineNumber.IS_A_LINE_NUMBER(x), nodes_near_tf)]
if len(line_numbers) > 0:
MINABOVE = 3
last_to_position = transkription_field.ymin
for line_number in line_numbers:
above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE
bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom)
last_to_position = above_current_line_bottom
if len(bottoms) > 0:
current_line_top = float(bottoms[len(bottoms)-1]) - transkription_field.ymin + MINABOVE
line_number.setTop(current_line_top)
return line_numbers
def extract_word_position(self, svg_tree, page, transkription_field=None):
"""Extracts word positions.
"""
counter = 0
word_part_obj = []
endSign = '%'
last_matrix = None
MAXBOTTOMDIFF = 5
MAXXDIFF = 6
if not Extractor.UNITTESTING:
bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
# check for line breaks
if (last_matrix is not None and len(word_part_obj) > 0 and (\
Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
endSign = '%'
if(self.get_word_from_part_obj(word_part_obj) != ''):
debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
word_part_obj = []
endX = current_matrix.getX()
if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: