Index: svgscripts/datatypes/page.py
===================================================================
--- svgscripts/datatypes/page.py	(revision 71)
+++ svgscripts/datatypes/page.py	(revision 72)
@@ -1,655 +1,495 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

from lxml import etree as ET
from os.path import isfile
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import sys
import warnings

from .box import Box
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_insertion_mark import WordInsertionMark

sys.path.append('py2ttl')
from class_spec import SemanticClass

FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
+STATUS_MERGED_OK = 'faksimile merged'
+STATUS_POSTMERGED_OK = 'words processed'

class Page(SemanticClass):
    """ This class represents a page.

    Args:
        xml_source_file (str): name of the xml file to be instantiated.
        xml_target_file (str): name of the xml file to which page info will be written.
""" UNITTESTING = False WARNING_MISSING_USE_NODE4PWP = PositionalWordPart.WARN_NO_USE_NODE_FOUND WARNING_MISSING_GLYPH_ID4WIM = WordInsertionMark.WARN_NO_GLYPH_ID PAGE_RECTO = 'recto' PAGE_VERSO = 'verso' def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, faksimile_image=None, faksimile_svgFile=None, pdfFile=None, svg_file=None, orientation='North', page_type=PAGE_VERSO, extract_transkription_field_only=True): self.title = title self.mark_foreign_hands = [] self.text_connection_marks = [] self.line_numbers = [] self.style_dict = {} self.sonderzeichen_list = [] self.svg_file = None self.svg_image = None self.pdfFile = None self.faksimile_svgFile = None self.source = None self.number = page_number if page_number is not None else -1 self.orientation = orientation self.page_type = page_type self.word_deletion_paths = [] self.faksimile_image = faksimile_image self.text_field = None self.lines = [] if xml_source_file is not None: if isfile(xml_source_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_source_file, parser) self.title = self.page_tree.getroot().get('title') self.number = self.page_tree.getroot().get('number') self.source = self.page_tree.getroot().get('source') self.orientation = self.page_tree.getroot().get('orientation') self.page_type = self.page_tree.getroot().get('pageType') self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None - self.faksimile_svgFile = self.page_tree.xpath('.//faksimile-svg/@file')[0]\ - if len(self.page_tree.xpath('.//faksimile-svg/@file')) > 0 else None + self.faksimile_svgFile = self.page_tree.xpath('.//data-source/@file')[0]\ + if len(self.page_tree.xpath('.//data-source/@file')) > 0 else None self.svg_image = SVGImage(node=self.page_tree.xpath('.//' + SVGImage.XML_TAG)[0])\ if len(self.page_tree.xpath('.//' + SVGImage.XML_TAG)) > 0 else None if len(self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)) > 0: self.faksimile_image = FaksimileImage(node=self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)[0]) self.text_field = self.faksimile_image.text_field self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 if pdfFile is not None and self.pdfFile is None: self.pdfFile = pdfFile ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) if faksimile_svgFile is not None and self.faksimile_svgFile is None: - self.faksimile_svgFile = faksimile_svgFile - ET.SubElement(self.page_tree.getroot(), 'faksimile-svg', attrib={'file': self.faksimile_svgFile}) + self.update_data_source(faksimile_svgFile=faksimile_svgFile) if faksimile_image is not None: self.faksimile_image = faksimile_image self.faksimile_image.attach_object_to_tree(self.page_tree) if svg_file is not None and self.svg_file is None: self.svg_file = svg_file tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height) self.svg_image.attach_object_to_tree(self.page_tree) if self.svg_image is not None and self.svg_file is None: self.svg_file = 
self.svg_image.file_name if self.svg_image is not None and self.width == 0.0: self.width = self.svg_image.width if self.svg_image is not None and self.height == 0.0: self.height = self.svg_image.height self.init_node_objects() else: raise Exception('File "{}" does not exist!'.format(xml_source_file)) elif xml_target_file is not None: self.word_insertion_marks = [] self.words = [] self.writing_processes = [] self.svg_file = svg_file self.pdfFile = pdfFile self.faksimile_svgFile = faksimile_svgFile if isfile(xml_target_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_target_file, parser) self.source = self.page_tree.getroot().get('source') if bool(self.page_tree.getroot().get('orientation')): self.orientation = self.page_tree.getroot().get('orientation') elif orientation is not None: self.page_tree.getroot().set('orientation', orientation) if bool(self.page_tree.getroot().get('title')): self.title = self.page_tree.getroot().get('title') elif title is not None: self.page_tree.getroot().set('title', title) if self.svg_file is None: self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 elif len(self.page_tree.xpath('.//svg/@file')) == 0: tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height) self.svg_image.attach_object_to_tree(self.page_tree) #ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) else: self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 if self.pdfFile is None: self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None elif len(self.page_tree.xpath('.//pdf/@file')) == 0: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG,\ WritingProcess.XML_TAG, Path.WORD_DELETION_PATH_TAG ]: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) else: self.page_tree = ET.ElementTree(ET.Element('page')) self.pdfFile = pdfFile self.svg_file = svg_file if title is not None: self.page_tree.getroot().set('title', title) if orientation is not None: self.page_tree.getroot().set('orientation', orientation) self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower()) if page_number is not None: self.page_tree.getroot().set('number', str(page_number)) if self.pdfFile is not None: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) if self.svg_file is not None: tf = TranskriptionField(self.svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height) self.svg_image.attach_object_to_tree(self.page_tree) #ET.SubElement(self.page_tree.getroot(), 'svg', 
attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) if self.svg_image is None and self.svg_file is not None: self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height) self.svg_image.attach_object_to_tree(self.page_tree) def add_style(self, sonderzeichen_list=[], letterspacing_list=[], style_dict={}, style_node=None): """Adds a list of classes that are sonderzeichen and a style dictionary to page. """ self.sonderzeichen_list = sonderzeichen_list self.letterspacing_list = letterspacing_list self.style_dict = style_dict if style_node is not None: self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') } self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ] self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('letterspacing-list')) ] elif bool(self.style_dict): style_node = ET.SubElement(self.page_tree.getroot(), 'style') if len(self.sonderzeichen_list) > 0: style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list)) if len(self.letterspacing_list) > 0: style_node.set('letterspacing-list', ' '.join(self.letterspacing_list)) for key in self.style_dict.keys(): self.style_dict[key]['name'] = key ET.SubElement(style_node, 'class', attrib=self.style_dict[key]) fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value } fontsizes = sorted(fontsize_dict.values(), reverse=True) # create a mapping between fontsizes and word stages self.fontsizekey2stage_mapping = {} for fontsize_key, value in fontsize_dict.items(): if value >= fontsizes[0]-1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION }) elif value <= fontsizes[len(fontsizes)-1]+1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION }) else: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION }) def add_source(self, source): """Adds a source to page and attaches it to page_tree. """ self.source = source self.page_tree.getroot().set('source', self.source) - def categorize_paths(self, transkription_field=None): - """Categorize all paths that are part of the transkription field. - - :return: a dictionary containig a list for each category of path. 
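The fontsizekey2stage_mapping built at the end of add_style buckets style classes into the three writing stages purely by font size: within 1px of the largest size counts as the first version, within 1px of the smallest as the latest stage, everything between as an intermediate insertion. A minimal standalone sketch of that bucketing, assuming plain integers as stand-ins for the WritingProcess constants and hypothetical style-class names:

# --- illustrative sketch, not part of the diff ---
# Assumed stand-ins for the WritingProcess stage constants:
FIRST_VERSION = 0
INSERTION_AND_ADDITION = 1
LATER_INSERTION_AND_ADDITION = 2

def map_fontsize_keys_to_stages(fontsize_dict):
    """Mirror add_style's mapping of style-class keys to writing stages."""
    fontsizes = sorted(fontsize_dict.values(), reverse=True)
    mapping = {}
    for key, size in fontsize_dict.items():
        if size >= fontsizes[0] - 1:
            mapping[key] = FIRST_VERSION
        elif size <= fontsizes[-1] + 1:
            mapping[key] = LATER_INSERTION_AND_ADDITION
        else:
            mapping[key] = INSERTION_AND_ADDITION
    return mapping

# hypothetical class names: the 11px class lands in the first version,
# 9px in an intermediate stage, 7px in the latest stage.
print(map_fontsize_keys_to_stages({'st21': 11.0, 'st5': 9.0, 'st3': 7.0}))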
- """ - if self.source is not None and isfile(self.source): - MAX_HEIGHT_LINES = 1 - max_line = sorted(\ - [line_number.bottom-line_number.top for line_number in self.line_numbers if line_number.id % 2 == 0],\ - reverse=True)[0] + 2 if len(self.line_numbers) > 0 else 17 - tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0 - tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0 - paths, attributes = svg_to_paths.svg2paths(self.source) - allpaths_on_tf = [] - allpaths_outside_tf = [] - attributes_outside_tf = [] - if transkription_field is not None: - for index in range(0, len(paths)): - path = paths[index] - attribute = attributes[index] - if len(path) > 0\ - and path != transkription_field.path\ - and path.bbox()[0] > tr_xmin\ - and path.bbox()[1] < transkription_field.xmax: - allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class'))) - elif len(path) > 0\ - and path != transkription_field.path: - allpaths_outside_tf.append(path) - attributes_outside_tf.append(attribute) - path_dict = { 'text_area_deletion_paths': [],\ - 'deletion_or_underline_paths': [],\ - 'box_paths': [],\ - 'dots_paths': [],\ - 'word_connector_paths': [],\ - 'uncategorized_paths': [] } - for mypath in allpaths_on_tf: - xmin, xmax, ymin, ymax = mypath.path.bbox() - start_line_number = self.get_line_number(mypath.path.start.imag-tr_ymin) - if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1: - path_dict.get('dots_paths').append(mypath) - elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed(): - path_dict.get('box_paths').append(mypath) - elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed(): - path_dict.get('word_connector_paths').append(mypath) - elif abs(ymax-ymin) < MAX_HEIGHT_LINES: - path_dict.get('deletion_or_underline_paths').append(mypath) - elif start_line_number != -1 and start_line_number != self.get_line_number(mypath.path.end.imag-tr_ymin): - path_dict.get('text_area_deletion_paths').append(mypath) - else: - path_dict.get('uncategorized_paths').append(mypath) - underline_path = self.mark_words_intersecting_with_paths_as_deleted(path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin) - path_dict.update({'underline_path': underline_path}) - self.process_word_boxes(path_dict.get('box_paths'), transkription_field,\ - paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line) - return path_dict - elif not Page.UNITTESTING: - error_msg = 'Svg source file {} does not exist!'.format(self.source)\ - if self.source is not None else 'Page does not contain a source file!' - raise FileNotFoundError(error_msg) - return {} - def create_writing_processes_and_attach2tree(self): """Creates three stages of Nietzsche's process of writing. 
""" self.writing_processes = [ WritingProcess(version=WritingProcess.FIRST_VERSION),\ WritingProcess(version=WritingProcess.INSERTION_AND_ADDITION),\ WritingProcess(version=WritingProcess.LATER_INSERTION_AND_ADDITION) ] for writing_process in self.writing_processes: writing_process.attach_object_to_tree(self.page_tree) - for word in self.words: - for transkription_position in word.transkription_positions: - for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): - if font_key in self.fontsizekey2stage_mapping.keys(): - transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key) - - def find_special_words(self, transkription_field=None): - """Find special words, remove them from words, process their content. - """ - if self.source is None or not isfile(self.source): - raise FileNotFoundError('Page does not have a source!') - if transkription_field is None: - transkription_field = TranskriptionField(self.source) - special_char_list = MarkForeignHands.get_special_char_list() - special_char_list += TextConnectionMark.get_special_char_list() - single_char_words = [ word for word in self.words if len(word.text) == 1 and word.text in special_char_list ] - for word in single_char_words: - if word.text == MarkForeignHands.CLASS_MARK: - id = len(self.mark_foreign_hands) - self.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id)) - self.words.remove(word) - elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\ - or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\ - and any(style in self.sonderzeichen_list for style\ - in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))): - id = len(self.text_connection_marks) - self.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id)) - self.words.remove(word) - svg_tree = ET.parse(self.source) - self.update_page_type(transkription_field=transkription_field) - self.update_line_number_area(transkription_field, svg_tree=svg_tree) - italic_classes = [ key for key in self.style_dict\ - if bool(self.style_dict[key].get('font-family')) and self.style_dict[key]['font-family'].endswith('Italic') ] - if len(self.mark_foreign_hands) > 0: - MarkForeignHands.find_content(self.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\ - SonderzeichenList=self.sonderzeichen_list) - if len(self.text_connection_marks) > 0: - TextConnectionMark.find_content_in_footnotes(self.text_connection_marks, transkription_field, svg_tree,\ - title=self.title, page_number=self.number) + #for word in self.words: + # for transkription_position in word.transkription_positions: + # for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): + # if font_key in self.fontsizekey2stage_mapping.keys(): + # transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key) def get_biggest_fontSize4styles(self, style_set={}): """Returns biggest font size from style_dict for a set of style class names. [:returns:] (float) biggest font size OR 1 if style_dict is empty """ if bool(self.style_dict): sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True) return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1 else: return 1 def get_line_number(self, y): """Returns line number id for element at y. 
    def get_line_number(self, y):
        """Returns line number id for element at y.

        [:return:] (int) line number id or -1
        """
        if len(self.line_numbers) > 0:
            result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ]
            return result_list[0] if len(result_list) > 0 else -1
        else:
            return -1

    @classmethod
-    def get_pages_from_xml_file(cls, xml_file, status_contains='', word_selection_function=None):
+    def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
        """Returns a list of Page objects instantiated either from an xml_file of type FILE_TYPE_SVG_WORD_POSITION
            or from the xml files listed in an xml_file of type FILE_TYPE_XML_MANUSCRIPT.

            [optional: instantiation depends on the fulfilment of status_contains and/or status_not_contain
            and/or on the selection of some words by a word_selection_function].
        """
        source_tree = ET.parse(xml_file)
        if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION:
            page = cls(xml_source_file=xml_file)
            if word_selection_function is None or len(word_selection_function(page.words)) > 0:
                return [ page ]
            else:
                return []
        elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
            pages = []
-            xpath = '//page/@output'\
-                    if status_contains == ''\
-                    else '//page[contains(@status, "{0}")]/@output'.format(status_contains)
+            xpath = '//page/@output'
+            if status_contains != '' and status_not_contain != '':
+                xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
+            elif status_contains != '':
+                xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
+            elif status_not_contain != '':
+                xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
            for xml_source_file in source_tree.xpath(xpath):
                if isfile(xml_source_file):
                    pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
            return pages
        else:
            return []

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        properties = { 'number': { 'class': str, 'cardinality': 1},\
                'faksimile_image': { 'class': FaksimileImage, 'cardinality': 1},\
                'orientation': { 'class': str, 'cardinality': 1},\
                'svg_image': { 'class': SVGImage, 'cardinality': 1}}
        properties.update(cls.create_semantic_property_dictionary('text_field', TextField,\
                cardinality=1, name='pageIsOnTextField', label='page is on text field',\
                comment='Relates a page to the text field on a faksimile image.'))
        for key in [ 'lines', 'words', 'writing_processes', 'word_deletion_paths', 'word_insertion_marks']:
            properties.update(cls.create_semantic_property_dictionary(key, list))
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary
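The new status_not_contain parameter combines with status_contains into four xpath variants. A small sketch of how the new STATUS_MERGED_OK/STATUS_POSTMERGED_OK constants might be used to select pages whose faksimile has been merged but whose words are not yet processed; this is a plain lxml demonstration against a minimal manuscript stub with hypothetical output file names (the real entry point is Page.get_pages_from_xml_file):

# --- illustrative sketch, not part of the diff ---
from lxml import etree as ET

STATUS_MERGED_OK = 'faksimile merged'
STATUS_POSTMERGED_OK = 'words processed'

manuscript = ET.fromstring(
    '<manuscript>'
    '<page status="faksimile merged" output="a.xml"/>'
    '<page status="faksimile merged words processed" output="b.xml"/>'
    '<page status="blank" output="c.xml"/>'
    '</manuscript>')

# the xpath built when both status_contains and status_not_contain are given:
xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'\
        .format(STATUS_MERGED_OK, STATUS_POSTMERGED_OK)
print(manuscript.xpath(xpath))   # ['a.xml'] -- merged, but not yet post-processed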
""" even_index = 0 MINABOVE = 1 self.line_numbers = [] if len(line_numbers) > 0: first_line_bottom = line_numbers[even_index].top - MINABOVE self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 while even_index < len(line_numbers): self.line_numbers.append(LineNumber(id=line_numbers[even_index].id-1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=line_numbers[even_index].top-MINABOVE)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 self.line_numbers.append(LineNumber(id=line_numbers[even_index-1].id+1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=document_bottom)) for line_number in self.line_numbers: line_number.attach_object_to_tree(self.page_tree) def init_node_objects(self): """Initialize all node objects. """ self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ] self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ] self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] self.word_deletion_paths = [ Path(node=node) for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG) ] if self.faksimile_image is not None and self.text_field is not None: for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks: simple_word.init_word(self) for wim in self.word_insertion_marks: if wim.line_number > -1: wim.line = [ line for line in self.lines if line.id == wim.line_number ][0] def is_locked(self): """Return true if page is locked. """ return len(self.page_tree.xpath('//metadata/lock')) > 0 def lock(self, reference_file, message=''): """Lock tree such that ids of words etc. correspond to ids in reference_file, optionally add a message that will be shown. """ if not self.is_locked(): metadata = self.page_tree.xpath('./metadata')[0]\ if len(self.page_tree.xpath('./metadata')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'metadata') lock = ET.SubElement(metadata, 'lock') ET.SubElement(lock, 'reference-file').text = reference_file if message != '': ET.SubElement(lock, 'message').text = message - - def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): - """Marks all words that intersect with deletion paths as deleted - and adds these paths to word_deletion_paths. 
-
-    def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
-        """Marks all words that intersect with deletion paths as deleted
-            and adds these paths to word_deletion_paths.
-
-        [:return:] list of .path.Path that might be word_underline_paths
-        """
-        if not Page.UNITTESTING:
-            bar = Bar('mark words that intersect with deletion paths', max=len(self.words))
-        for word in self.words:
-            not bool(Page.UNITTESTING) and bar.next()
-            word.deleted = False
-            for transkription_position in word.transkription_positions:
-                word_path = Path.create_path_from_transkription_position(transkription_position,\
-                        tr_xmin=tr_xmin, tr_ymin=tr_ymin)
-                intersecting_paths = [ deletion_path for deletion_path in deletion_paths\
-                        if do_paths_intersect_saveMode(deletion_path.path, word_path.path) ]
-                if len(intersecting_paths) > 0:
-                    transkription_position.deleted = True
-                    for deletion_path in intersecting_paths:
-                        if deletion_path not in self.word_deletion_paths:
-                            deletion_path.tag = Path.WORD_DELETION_PATH_TAG
-                            deletion_path.attach_object_to_tree(self.page_tree)
-                            self.word_deletion_paths.append(deletion_path)
-            word.partition_according_to_writing_process_id()
-            word.partition_according_to_deletion()
-        not bool(Page.UNITTESTING) and bar.finish()
-        # return those paths in deletion_paths that are not in self.word_deletion_paths
-        return [ word_underline_path for word_underline_path in set(deletion_paths) - set(self.word_deletion_paths) ]
-
-    def process_word_boxes(self, box_paths, transkription_field, paths=None, attributes=None, max_line=17):
-        """Process word boxes: partition words according to word boxes.
-        """
-        MAX_HEIGHT_LINES = 1
-        if not Page.UNITTESTING:
-            bar = Bar('process word boxes', max=len(self.words))
-        svg_tree = ET.parse(self.source)
-        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
-        allpaths_on_margin_field = []
-        if paths is None or attributes is None:
-            paths, attributes = svg_to_paths.svg2paths(self.source)
-        for index in range(0, len(paths)):
-            path = paths[index]
-            xmin, xmax, ymin, ymax = path.bbox()
-            attribute = attributes[index]
-            if len(path) > 0\
-            and path != transkription_field.path\
-            and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
-                or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
-            and abs(ymax-ymin) < max_line:
-                allpaths_on_margin_field.append(Path(id=index, path=path, style_class=attribute.get('class')))
-        box_line_number_dict = {}
-        for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
-            line_number = self.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin))
-            if line_number not in box_line_number_dict.keys():
-                box_line_number_dict.update({ line_number: [ box_path ]})
-            else:
-                box_line_number_dict.get(line_number).append(box_path)
-        boxes = []
-        for line_number in box_line_number_dict.keys():
-            box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
-            margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
-                    if self.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\
-                    key=lambda path: path.get_x())
-            threshold = 3 if line_number % 2 == 0 else 1.5
-            for box_path in box_paths_on_line:
-                box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
-                        transkription_field=transkription_field, namespaces=namespaces, threshold=threshold)
-                if box is not None:
-                    boxes.append(box)
-        for word in self.words:
-            not bool(Page.UNITTESTING) and bar.next()
-            word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
-        not bool(Page.UNITTESTING) and bar.finish()

    def unlock(self):
        """Unlock tree by removing the lock node from metadata.
        """
        if self.is_locked():
            lock = self.page_tree.xpath('//metadata/lock')[0]
            lock.getparent().remove(lock)

    def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
        """Update word ids and attach them to page.page_tree.
        """
        if not self.is_locked():
            update_function_on_word = [ update_function_on_word ]\
                    if type(update_function_on_word) != list\
                    else update_function_on_word
            for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
                node.getparent().remove(node)
            for index, word in enumerate(self.words):
                word.id = index
                for func in update_function_on_word:
                    if callable(func):
                        func(word)
                word.attach_word_to_tree(self.page_tree)
            for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
                mark_foreign_hands.id = index
                if MarkForeignHands in include_special_words_of_type:
                    for func in update_function_on_word:
                        if callable(func):
                            func(mark_foreign_hands)
                mark_foreign_hands.attach_word_to_tree(self.page_tree)
            for index, text_connection_mark in enumerate(self.text_connection_marks):
                text_connection_mark.id = index
                if TextConnectionMark in include_special_words_of_type:
                    for func in update_function_on_word:
                        if callable(func):
                            func(text_connection_mark)
                text_connection_mark.attach_word_to_tree(self.page_tree)
        else:
            print('locked')

+    def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
+        """Update the data source of page.
+        """
+        if faksimile_svgFile is not None:
+            self.faksimile_svgFile = faksimile_svgFile
+        data_node = self.page_tree.xpath('.//data-source')[0]\
+                if len(self.page_tree.xpath('.//data-source')) > 0\
+                else ET.SubElement(self.page_tree.getroot(), 'data-source')
+        data_node.set('file', self.faksimile_svgFile)
+        if xml_correction_file is not None:
+            data_node.set('xml-corrected-words', xml_correction_file)
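update_data_source replaces the old <faksimile-svg> element with a single <data-source> node carrying the faksimile svg path and, optionally, a pointer to a word-correction file. A minimal lxml sketch of the resulting XML, with hypothetical file paths:

# --- illustrative sketch, not part of the diff ---
from lxml import etree as ET

tree = ET.ElementTree(ET.Element('page'))
# reuse an existing data-source node if present, else create one:
data_node = tree.xpath('.//data-source')[0]\
        if len(tree.xpath('.//data-source')) > 0\
        else ET.SubElement(tree.getroot(), 'data-source')
data_node.set('file', 'faksimile/W_I_8.svg')                   # hypothetical path
data_node.set('xml-corrected-words', 'corrections/W_I_8.xml')  # hypothetical path
print(ET.tostring(tree.getroot(), pretty_print=True).decode())
# <page>
#   <data-source file="faksimile/W_I_8.svg" xml-corrected-words="corrections/W_I_8.xml"/>
# </page>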
""" THRESHOLD = 0.4 if svg_tree is None: svg_tree = ET.parse(self.source) if len(self.line_numbers) > 1: line_number = self.line_numbers[9]\ if transkription_field.is_page_verso() and len(self.line_numbers) > 8\ else self.line_numbers[1] ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\ if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\ and LineNumber.IS_A_LINE_NUMBER(item)\ and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ] if len(ln_nodes) > 0: matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform')) if transkription_field.is_page_verso(): transkription_field.add_line_number_area_width(matrix.getX()) elif self.svg_file is not None and isfile(self.svg_file): svg_path_tree = ET.parse(self.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } svg_x = matrix.getX() svg_y = self.line_numbers[1].bottom + transkription_field.ymin use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) if len(d_strings) > 0 and d_strings[0] != '': path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin transkription_field.add_line_number_area_width(matrix.getX() + width) def update_page_type(self, transkription_field=None): """Adds a source to page and attaches it to page_tree. """ if transkription_field is None: if self.source is None or not isfile(self.source): raise FileNotFoundError('Page does not have a source!') transkription_field = TranskriptionField(self.source) self.page_type = Page.PAGE_VERSO\ if transkription_field.is_page_verso()\ else Page.PAGE_RECTO self.page_tree.getroot().set('pageType', self.page_type) - -def do_paths_intersect_saveMode(path1, path2): - """Returns true if paths intersect, false if not or if there was an exception. - """ - try: - return path1.intersect(path2, justonemode=True) - except AssertionError: - return False Index: svgscripts/datatypes/word_insertion_mark.py =================================================================== --- svgscripts/datatypes/word_insertion_mark.py (revision 71) +++ svgscripts/datatypes/word_insertion_mark.py (revision 72) @@ -1,138 +1,138 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word insertion mark. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from svgpathtools.parser import parse_path import warnings from .line import Line from .positional_object import PositionalObject from .word import Word class WordInsertionMark(PositionalObject): """ This class represents a word insertion mark. Args: wim_node (etree.Element): element that contains information about a word_insertion_mark. OR id (int): word id x (float) y (float) height (float) width (float) previous_word_id (int): id of the word to which word insertion mark is attached inserted_words: Array->Word of inserted words marked by the word insertion mark. """ WARN_NO_GLYPH_ID = 'No glyph_id found' XML_TAG = 'word-insertion-mark' extraStringKeys = [ 'mark_type', 'symbol_id' ] def __init__(self, wim_node=None, id=0, x=-1.0, y=-1.0, height=0, width=0, previous_word_id=-1, next_word_id=-1, line_number=-1, symbol_id=None, inserted_words=[], inserted_word_id=-1, mark_type='A'): super(WordInsertionMark, self).__init__(id=id, node=wim_node, height=height, width=width, x=x, y=y, tag=WordInsertionMark.XML_TAG) self.stringKeys += [ 'mark_type', 'symbol_id' ] self.intKeys += [ 'line_number', 'next_word_id', 'previous_word_id' ] self.symbol_id = symbol_id self.mark_type = mark_type self.line_number = line_number self.line = None self.previous_word_id = previous_word_id self.next_word_id = next_word_id if wim_node is not None: self.mark_type = wim_node.get('mark-type') self.line_number = int(wim_node.get('line-number')) if bool(wim_node.get('line-number')) else -1 self.previous_word_id = int(wim_node.get('previous-word-id')) if bool(wim_node.get('previous-word-id')) else -1 self.next_word_id = int(wim_node.get('next-word-id')) if bool(wim_node.get('next-word-id')) else -1 def init_inserted_words(self, inserted_words=[], wim_node=None, inserted_word_id_string=None): if wim_node is not None and inserted_word_id_string is not None: ids = inserted_word_id_string.split(' ') inserted_words = [ Word.CREATE_WORD(word_node=word_node) for word_node in wim_node.getroottree().getroot().xpath('.//word[@id>="{0}" and @id<="{1}"]'.format(ids[0], ids[len(ids)-1])) ] if len(inserted_words) > 0: inserted_words[0].is_head_of_inserted_words = True inserted_words[len(inserted_words)-1].is_tail_of_inserted_words = True for word in inserted_words: word.set_word_insertion_mark(self) return inserted_words def attach_and_update_word_if_involved(self, word): if word.id == self.previous_word_id: word.is_before_inserted_words = True word.word_insertion_mark = self elif word.id == self.next_word_id: word.is_after_inserted_words = True word.word_insertion_mark = self elif word.id in [ inserted.id for inserted in self.inserted_words ]: word = [ inserted for inserted in self.inserted_words if inserted.id == word.id ][0] return word @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. 
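init_inserted_words recovers the inserted words from an id range stored as a space-separated string. The xpath comparison works numerically even though @id is an attribute string, because XPath 1.0 converts both operands of <=/>= to numbers. A small lxml sketch of that lookup, with hypothetical word nodes:

# --- illustrative sketch, not part of the diff ---
from lxml import etree as ET

root = ET.fromstring(
    '<page>'
    + ''.join('<word id="{}" text="w{}"/>'.format(i, i) for i in range(20, 26))
    + '</page>')

ids = '21 22 23'.split(' ')  # inserted_word_id_string as stored in the xml
# XPath 1.0 converts both operands of >=/<= to numbers, so the quoted ids
# compare numerically, not lexicographically:
nodes = root.xpath('.//word[@id>="{0}" and @id<="{1}"]'.format(ids[0], ids[-1]))
print([n.get('id') for n in nodes])   # ['21', '22', '23']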
""" dictionary = super(cls,cls).get_semantic_dictionary() word_dicts = { key: { 'class': Word, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality',\ 'label': 'has {} word'.format(key.replace('_word_id','')),\ 'name': 'has{}'.format(key.title().replace('_Id','').replace('_','')) }\ for key in [ 'previous_word_id', 'next_word_id' ] } dictionary['properties'].update(word_dicts) dictionary['properties'].update({'line': {'class': Line, 'cardinality': 1,\ 'name': 'wordInsertionMarkBelongsToLine', 'label': 'word insertion mark belongs to a specific line'}}) - for extraStringKey in cls.extraStringKeys: - dictionary['properties'].update(cls.create_semantic_property_dictionary(extraStringKey, str, cardinality=1)) + dictionary['properties'].update(cls.create_semantic_property_dictionary('mark_type', str, cardinality=1)) + dictionary['properties'].update(cls.create_semantic_property_dictionary('symbol_id', str, cardinality=1, cardinality_restriction='maxCardinality')) return dictionary @staticmethod def CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=0, x=0.0, y=0.0, xmin=0.0, ymin=0.0, line_number=-1, mark_type='A'): """Creates a (datatypes.word_insertion_mark) WordInsertionMark using a (lxml.ElementTree) svg_path_tree and the corresponding namespaces. """ THRESHOLD = 0.4 svg_x = x + xmin svg_y = y + ymin use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) height = 0.0 width = 0.0 if len(d_strings) > 0 and d_strings[0] != '': path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin height = ymax - ymin return WordInsertionMark(id=id, x=x, y=y-height, height=height, width=width, line_number=line_number,\ mark_type=mark_type, symbol_id=symbol_id) else: warnings.warn('{} for word insertion mark {} on line {}'.format(WordInsertionMark.WARN_NO_GLYPH_ID, id, line_number)) return WordInsertionMark(id=id, x=x, y=y, line_number=line_number, mark_type=mark_type) Index: svgscripts/datatypes/image.py =================================================================== --- svgscripts/datatypes/image.py (revision 71) +++ svgscripts/datatypes/image.py (revision 72) @@ -1,116 +1,116 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent all image types. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .attachable_object import AttachableObject from .text_field import TextField sys.path.append('py2ttl') from class_spec import SemanticClass class Image(AttachableObject,SemanticClass): """ This super class represents all types of images. Args: file_name (str): name of the image file. node (lxml.etree.Element) node, containing information URL (str): URL of image file. height (float): height of image width (float): width of image text_field (.text_field.TextField) text_field on image representation """ stringKeys = [ 'file_name', 'URL', 'local_path' ] floatKeys = [ 'height', 'width' ] XML_TAG = 'image' def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG): self.text_field = text_field if node is not None: self.file_name = node.get('file-name') self.local_path = node.get('local-path') self.URL = node.get('URL') self.height = float(node.get('height')) self.width = float(node.get('width')) if len(node.findall(TextField.XML_TAG)) > 0: self.text_field = TextField(node=node.find(TextField.XML_TAG)) else: self.tag = tag self.file_name = file_name self.local_path = local_path self.URL = URL self.height = height self.width = width def attach_object_to_tree(self, target_tree): """Attach object to tree. """ obj_node = target_tree.getroot().find('.//' + self.tag) \ if(len(target_tree.getroot().findall('.//' + self.tag)) > 0) \ else ET.SubElement(target_tree.getroot(), self.tag) for key in self.floatKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) for key in self.stringKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), self.__dict__[key]) if self.text_field is not None: self.text_field.attach_object_to_tree(obj_node) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {} for floatKey in Image.floatKeys: - properties.update(cls.create_semantic_property_dictionary(floatKey, float)) + properties.update(cls.create_semantic_property_dictionary(floatKey, float, cardinality=1)) properties.update(cls.create_semantic_property_dictionary('file_name', str, cardinality=1)) - properties.update(cls.create_semantic_property_dictionary('URL', str)) + #properties.update(cls.create_semantic_property_dictionary('URL', str)) dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary class SVGImage(Image): """This class represents a svg image. 
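Both CREATE_WORD_INSERTION_MARK above and Page.update_line_number_area resolve a <use> element to its <symbol> definition and measure the glyph from the path's bounding box. A condensed sketch of that lookup, assuming an Illustrator-style svg where glyphs are referenced via xlink:href (the namespace handling mirrors the methods above; the glyph geometry is made up):

# --- illustrative sketch, not part of the diff ---
from lxml import etree as ET
from svgpathtools.parser import parse_path

SVG = ('<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">'
       '<symbol id="glyph0-1"><path d="M 0,0 L 4,0 L 4,6 L 0,6 Z"/></symbol>'
       '<use xlink:href="#glyph0-1" x="100" y="200"/></svg>')
tree = ET.ElementTree(ET.fromstring(SVG))
namespaces = { k if k is not None else 'ns': v for k, v in tree.getroot().nsmap.items() }

use_node = tree.xpath('//ns:use', namespaces=namespaces)[0]
symbol_id = use_node.get('{%s}href' % namespaces['xlink']).replace('#', '')
d_string = use_node.xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)[0]
xmin, xmax, ymin, ymax = parse_path(d_string).bbox()
print(xmax - xmin, ymax - ymin)   # glyph width and height: 4.0 6.0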
""" XML_TAG = 'svg-image' def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG): if node is not None and node.tag != self.XML_TAG: file_name = node.get('file') height = float(node.get('height')) if bool(node.get('height')) else 0.0 width = float(node.get('width')) if bool(node.get('width')) else 0.0 node = None super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\ height=height, width=width, text_field=text_field, tag=self.XML_TAG) Index: svgscripts/datatypes/simple_word.py =================================================================== --- svgscripts/datatypes/simple_word.py (revision 71) +++ svgscripts/datatypes/simple_word.py (revision 72) @@ -1,118 +1,121 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent a simple word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc from lxml import etree as ET import sys from .line import Line from .faksimile_position import FaksimilePosition from .transkription_position import TranskriptionPosition from .word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass class SimpleWord(SemanticClass, metaclass=abc.ABCMeta): """ This class represents a simple word. """ XML_TAG = 'simple-word' XML_SUB_TAG = 'content' def __init__(self, id=0, line_number=-1, line=None, text='', deleted=False, transkription_positions=None, faksimile_positions=None): self.id = id self.text = text self.line_number = line_number self.lines = [] if line is not None: self.lines.append(line) self.transkription_positions = transkription_positions if transkription_positions is not None else [] self.faksimile_positions = faksimile_positions if faksimile_positions is not None else [] def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() if len(target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)) > 0: word_node = target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)[0] word_node.getparent().remove(word_node) word_node = ET.SubElement(target_tree, self.XML_TAG, attrib={'id': str(self.id)}) word_node.set('text', self.text) if self.line_number > -1: word_node.set('line-number', str(self.line_number)) for transkription_position in self.transkription_positions: transkription_position.attach_object_to_tree(word_node) for faksimile_position in self.faksimile_positions: faksimile_position.attach_object_to_tree(word_node) return word_node @classmethod def create_cls(cls, word_node): """Creates a cls from a (lxml.Element) node. 
        [:return:] cls
        """
        if word_node is not None:  # init word from xml node
            id = int(word_node.get('id'))
            line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else -1
            text = word_node.get('text')
            transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
            faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
            return cls(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
                    faksimile_positions=faksimile_positions)
        else:
            error_msg = 'word_node has not been defined'
            raise Exception('Error: {}'.format(error_msg))

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
-        properties = { 'lines': {'class': Line, 'cardinality': 1,\
+        properties = { 'lines': {'class': Line,\
+                'cardinality': 1,\
                'cardinality_restriction': 'minCardinality',\
                'name': 'wordBelongsToLine',\
                'label': 'word belongs to a line',\
                'comment': 'Relating a word to a line.'}}
        properties.update(cls.create_semantic_property_dictionary('transkription_positions', list, cardinality=1, cardinality_restriction='minCardinality'))
        properties.update(cls.create_semantic_property_dictionary('faksimile_positions', list, cardinality=1, cardinality_restriction='minCardinality'))
        properties.update(cls.create_semantic_property_dictionary('text', str, cardinality=1))
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary

    def init_word(self, page):
        """Initialize word with objects from page.
        """
+        for transkription_position in self.transkription_positions:
+            transkription_position.svg_image = page.svg_image
        self.faksimile_positions = FaksimilePosition.create_list_of_cls(self.faksimile_positions, page.faksimile_image, page.text_field)
        if self.line_number > -1:
            self.lines += [ line for line in page.lines if line.id == self.line_number ]

Index: svgscripts/datatypes/faksimile_image.py
===================================================================
--- svgscripts/datatypes/faksimile_image.py	(revision 71)
+++ svgscripts/datatypes/faksimile_image.py	(revision 72)
@@ -1,102 +1,103 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent faksimile images.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

import fnmatch
from lxml import etree as ET
import os
from os.path import basename, dirname, isfile, realpath, sep
import sys

from .image import Image
from .text_field import TextField

sys.path.append('svgscripts')
from local_config import FAKSIMILE_LOCATION

class FaksimileImage(Image):
    """ This class represents a faksimile image.

    Args:
        file_name (str): name of the image file.
        node (lxml.etree.Element) node, containing information
        URL (str): URL of image file.
        height (float): height of image
        width (float): width of image
        x (float): x
        y (float): y
    """
    XML_TAG = 'faksimile-image'
    NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/'

    def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, text_field=None):
        super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL, local_path=local_path,\
                height=height, width=width, text_field=text_field, tag=self.XML_TAG)
        self.x = x
        self.y = y

    def get_image_joined_with_text_field(self, text_field):
        """Returns a new instance of itself that has a text_field (text_field.TextField).
        """
        return FaksimileImage(file_name=self.file_name, local_path=self.local_path, URL=self.URL, height=self.height,\
                width=self.width, x=self.x, y=self.y, text_field=text_field)

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = super(FaksimileImage,cls).get_semantic_dictionary()
        dictionary['properties'].update(cls.create_semantic_property_dictionary('text_field', TextField))
+        dictionary['properties'].update(cls.create_semantic_property_dictionary('URL', str, cardinality=1))
        return dictionary

    @staticmethod
    def CREATE_IMAGE(image_node, source_file=None):
        """Instantiates a FaksimileImage from a (lxml.etree.Element) image_node.
        """
        namespaces = image_node.nsmap
        if len(namespaces) == 0:
            namespaces = { 'xlink': '' }
        local_path = image_node.get('{%s}href' % namespaces['xlink'])
        file_name = basename(local_path)
        if file_name != local_path and source_file is not None:
            local_path = realpath(dirname(source_file)) + sep + local_path
            local_path = realpath(local_path)
        if not isfile(local_path):
            local_path = None
            for path, dirs, files in os.walk(os.path.abspath(FAKSIMILE_LOCATION)):
                for filename in fnmatch.filter(files, file_name):
                    local_path = os.path.join(path, filename)
                    break
        URL = FaksimileImage.NIETZSCHE_SOURCES_URL + file_name.replace('.jpg','')
        height = float(image_node.get('height')) if bool(image_node.get('height')) else 0.0
        width = float(image_node.get('width')) if bool(image_node.get('width')) else 0.0
        x = float(image_node.get('x')) if bool(image_node.get('x')) else 0.0
        y = float(image_node.get('y')) if bool(image_node.get('y')) else 0.0
        return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y)
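CREATE_IMAGE resolves the image's local path from the svg's xlink:href (falling back to a directory walk below FAKSIMILE_LOCATION) and derives the nietzschesource.org download URL from the bare file name. A sketch of just the URL derivation, the only part that needs no local files (href value hypothetical):

# --- illustrative sketch, not part of the diff ---
from os.path import basename

NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/'

href = 'images/W-II-1,1.jpg'   # hypothetical xlink:href value
file_name = basename(href)
URL = NIETZSCHE_SOURCES_URL + file_name.replace('.jpg', '')
print(URL)   # http://www.nietzschesource.org/DFGAapi/api/page/download/W-II-1,1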
""" # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from operator import attrgetter import sys import warnings from .matrix import Matrix from .path import Path from .simple_word import SimpleWord from .word_position import WordPosition from .transkription_position import TranskriptionPosition from .writing_process import WritingProcess sys.path.append('py2ttl') from class_spec import SemanticClass +def execute_function_on_parts(word_parts, func_name): + """Execute function on parts and add those parts instead of original word to word_parts. + + :return: new word_parts, output from func + """ + copy_parts = word_parts[:] + for word in word_parts: + output = eval('word.{0}()'.format(func_name)) + if len(word.word_parts) > 0: + for part_word in word.word_parts: + copy_parts.insert(copy_parts.index(word), part_word) + copy_parts.remove(word) + word.word_parts = [] + return copy_parts, output + +def update_transkription_position_ids(word): + """Update transkription_position' ids according to index. + """ + for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))): + transkription_position.id = index + class Word(SimpleWord): """ This class represents a word. """ DATA = 'debug-data' XML_TAG = 'word' XML_EARLIER_VERSION = 'earlier-version' def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None): super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.deleted = deleted self.debug_container = {} if len([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ]) > len(self.text): self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ]) self.word_part_objs = word_part_objs if word_part_objs is not None else [] self.is_head_of_inserted_words = False self.is_tail_of_inserted_words = False self.is_before_inserted_words = False self.is_after_inserted_words = False self.word_insertion_mark = None self.debug_msg = None self.writing_process_id = writing_process_id self.writing_processes = [] self.word_parts = word_parts if word_parts is not None else [] self.earlier_version = earlier_version self.box_paths = box_paths if box_paths is not None else [] def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. 
""" word_node = super(Word,self).attach_word_to_tree(target_tree) if self.deleted is not None: word_node.set('deleted', str(self.deleted).lower()) if self.writing_process_id > -1: word_node.set('writing-process-id', str(self.writing_process_id)) for word_part in self.word_parts: word_part.attach_word_to_tree(word_node) if self.earlier_version is not None: earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION) self.earlier_version.attach_word_to_tree(earlier_node) for index, box_path in enumerate(self.box_paths): box_path.id = index box_path.attach_object_to_tree(word_node) return word_node def belongs_to_multiple_writing_processes(self, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. """ if len(self.word_parts) > 0 and include_parts: return len(set(word.writing_process_id for word in self.word_parts)) > 1 return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1 + @classmethod + def create_cls(cls, word_node): + """Creates a word from a (lxml.Element) node. + + [:return:] Word + """ + cls = super(Word,cls).create_cls(word_node) + cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 + cls.split_strings = None + if bool(word_node.get('split')): + cls.split_strings = word_node.get('split').split(' ') + if ''.join(cls.split_strings) != cls.text: + error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ + format(word_node.getroottree().docinfo.URL, str(cls.id))\ + + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ + + 'Text attribute: "{0}".\n'.format(cls.text) + raise Exception(error_msg) + cls.deleted = word_node.get('deleted') == 'true'\ + if bool(word_node.get('deleted')) else None + cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_TAG) ] + cls.box_paths = [ Path(node=node) for node in word_node.xpath('.//' + Path.BOX_TAG ) ] + earlier_versions = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ] + if len(earlier_versions) > 0: + cls.earlier_version = earlier_versions[0] + return cls + + @staticmethod + def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): + """Creates a word from a (lxml.Element) node or word_part_objs. 
+
+        [:return:] Word
+        """
+        if word_node is not None: # init word from xml node
+            id = int(word_node.get('id'))
+            line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
+            text = word_node.get('text')
+            deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
+            transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
+            faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
+            word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
+                    if len(word_node.findall('.//' + Word.DATA)) > 0\
+                    else [ item.attrib for item in word_node.findall('.//part')]
+            return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
+                    faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
+        elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
+            WIDTH = 5
+            TOPCORRECTION = 2.0
+            FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
+            height = height
+            x = round(float(word_part_objs[0]['x']), 3)
+            if(page is not None and bool(page.style_dict)):
+                HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
+                style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
+                biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
+                height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
+                TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
+            if endSign is not None and '%' in endSign:
+                lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
+                        for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
+                        if bool(page.style_dict[key].get('font-size'))]
+                lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
+                endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
+            elif endSign is not None and '%' not in endSign: # fallback width for other end signs
+                endX = float(endX) + WIDTH
+            bottom = round(float(word_part_objs[0]['y']), 3)
+            y = round(bottom - height + TOPCORRECTION, 3)
+            width = round(float(endX) - x, 3)
+            transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
+            text = ''.join([ dict['text'] for dict in word_part_objs])
+            line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
+            word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
+            word.debug_msg = debug_msg
+            return word
+        else:
+            error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
+            raise Exception('Error: {}'.format(error_msg))
+
+    @classmethod
+    def get_semantic_dictionary(cls):
+        """ Creates and returns a semantic dictionary as specified by SemanticClass.
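+
+        A hypothetical sketch of use (only the 'properties' key is visible in this patch;
+        the inner key 'deleted' is an assumption about SemanticClass):
+
+            dictionary = Word.get_semantic_dictionary()
+            dictionary['properties']['deleted'] # presumably the property dictionary for 'isWordDeleted'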
+ """ + dictionary = super(Word,cls).get_semantic_dictionary() + dictionary['properties'].update(cls.create_semantic_property_dictionary('deleted', bool,\ + name='isWordDeleted', label='has word been deleted')) + #dictionary['properties'].update(cls.create_semantic_property_dictionary('writing_processes', WritingProcess, cardinality=1,\ + # cardinality_restriction='minCardinality', name='wordBelongsToWritingProcess', label='word has been written in a specific writing process')) + # TODO: change me after fixing word box issue!!!! + dictionary['properties'].update(cls.create_semantic_property_dictionary('writing_processes', WritingProcess,\ + name='wordBelongsToWritingProcess', label='word has been written in a specific writing process')) + #dictionary['properties'].update(cls.create_semantic_property_dictionary('word_parts', list,\ + # name='wordHasWordParts', label='word has word parts', comment='word consists of a list of words')) + return dictionary + def get_partial_word_over_box(self): """Partition a word according to its transkription_positions' has_box ->split word and add partial words as its parts. :return: word over box or self """ word_over_box = self if self.has_mixed_status('has_box'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.has_box != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_status: word_over_box = newWord transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.has_box if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_status: word_over_box = newWord self.transkription_positions = [] self.line_number = -1 elif len(self.word_parts) > 0: self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, 'get_partial_word_over_box') return word_over_box def has_mixed_status(self, property_key, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. """ if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions): return False if len(self.word_parts) > 0 and include_parts: if False in set(property_key in word.__dict__.keys() for word in self.word_parts): return False return len(set(word.deleted for word in self.word_parts)) > 1 return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1 def init_word(self, page): """Initialize word with objects from page. 
""" super(Word,self).init_word(page) if self.writing_process_id > -1: self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ] writing_processes = self.writing_processes for word_part in self.word_parts: word_part.init_word(page) self.lines += word_part.lines - self.writing_processes + word_part.writing_processes + self.writing_processes += word_part.writing_processes self.lines = [ line for line in set(self.lines) ] self.writing_processes = [ wp for wp in set(self.writing_processes)] - + + def join(self, other_word, append_at_end_of_new_word=True): + """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. + """ + if append_at_end_of_new_word: + self.text = self.text + other_word.text + for position in other_word.transkription_positions: + position.id = str(len(self.transkription_positions)) + self.transkription_positions.append(position) + else: + self.text = other_word.text + self.text + index = 0 + for position in other_word.transkription_positions: + self.transkription_positions.insert(index, position) + index += 1 + while index < len(self.transkription_positions): + self.transkription_positions[index].id = str(index) + index += 1 + self.simplify_transkription_positions() + def partition_according_to_deletion(self): """Partition a word according to its transkription_positions' deletion status ->split word and add partial words as its parts. """ if self.has_mixed_status('deleted'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.deleted != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.deleted if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] self.line_number = -1 self.deleted = False elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion') elif not self.deleted\ and len(self.transkription_positions) > 0\ and self.transkription_positions[0].deleted: self.deleted = True def partition_according_to_writing_process_id(self): """Partition a word according to its transkription_positions' writing_process_ids ->split word and add partial words as its parts. 
""" if self.belongs_to_multiple_writing_processes(): last_writing_process_id = -1 transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.writing_process_id != last_writing_process_id\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_writing_process_id = transkription_position.writing_process_id if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] - self.line_number = -1 elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id') if self.belongs_to_multiple_writing_processes(include_parts=True): self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0] elif len(self.transkription_positions) > 0: self.writing_process_id = self.transkription_positions[0].writing_process_id def process_boxes(self, box_paths, parent_word=None, tr_xmin=0.0, tr_ymin=0.0): """Determines whether word is over a word box. """ test_case = len(box_paths) == 1 later_version_word = None if len(self.word_parts) > 0: for word in self.word_parts: later_version = word.process_boxes(box_paths, parent_word=self, tr_xmin=tr_xmin, tr_ymin=tr_ymin) - if later_version is not None and later_version.earlier_version is not None: + if later_version_word is None and later_version is not None and later_version.earlier_version is not None: later_version_word = later_version else: new_tp_dict = {} for transkription_position in self.transkription_positions: word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) containing_boxes = [ box_path for box_path in box_paths\ if word_path.is_partially_contained_by(box_path)\ or box_path.do_paths_intersect(word_path) ] + #containing_boxes = [ box_path for box_path in box_paths\ + # if box_path.contains_start_of_path(word_path) ] + if len(containing_boxes) > 0: + box_path = containing_boxes[0] + if box_path.contains_path(word_path): + transkription_position.has_box = box_path + elif box_path.contains_start_of_path(word_path): + split_position = box_path.path.bbox()[1] - tr_xmin + new_tps = transkription_position.split(split_position) + if len(new_tps) == 2: + new_tps[0].has_box = box_path + new_tp_dict.update({ transkription_position: new_tps }) + else: + transkription_position.has_box = box_path + elif box_path.contains_end_of_path(word_path): + split_position = box_path.path.bbox()[0] - tr_xmin + new_tps = transkription_position.split(split_position) + if len(new_tps) == 2: + new_tps[1].has_box = box_path + new_tp_dict.update({ transkription_position: new_tps }) + else: + transkription_position.has_box = box_path + else: + split_position1 = box_path.path.bbox()[0] - tr_xmin + split_position2 = box_path.path.bbox()[1] - tr_xmin + new_tps = transkription_position.split(split_position1, split_position2) + if len(new_tps) >= 2: + new_tps[1].has_box = box_path + new_tp_dict.update({ transkription_position: new_tps }) + else: + transkription_position.has_box = box_path + 
            for replace_tp in new_tp_dict.keys():
                for tp in new_tp_dict.get(replace_tp):
                    self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
                self.transkription_positions.remove(replace_tp)
            update_transkription_position_ids(self)
            later_version_word = self.get_partial_word_over_box()
            if len(later_version_word.transkription_positions) > 0\
                    and later_version_word.transkription_positions[0].has_box is not None:
                box_holder = self if parent_word is None else parent_word
                box_holder.box_paths.append(later_version_word.transkription_positions[0].has_box)
                box_text = later_version_word.transkription_positions[0].has_box.earlier_text
                transkription_positions = TranskriptionPosition.copy_list_of_cls(later_version_word.transkription_positions)
                later_version_word.earlier_version = Word(text=box_text, transkription_positions=transkription_positions)
                #print(later_version_word.text, later_version_word.earlier_version.text)
                return later_version_word
        return later_version_word

+    def set_word_insertion_mark(self, word_insertion_mark):
+        """Sets word_insertion_mark
+        """
+        self.word_insertion_mark = word_insertion_mark
+
+    def set_writing_process_id_to_transkription_positions(self, page):
+        """Determines the writing process id of the transkription_positions.
+        """
+        for transkription_position in self.transkription_positions:
+            if len(transkription_position.positional_word_parts) > 0:
+                for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
+                    if font_key in page.fontsizekey2stage_mapping.keys():
+                        transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
+
+    def simplify_transkription_positions(self):
+        """Merge transkription_positions if possible.
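+
+        Hypothetical effect (positions with positional_word_parts assumed):
+
+            word.simplify_transkription_positions()
+            # adjacent positions that share a writing_process_id are rebuilt as a
+            # single position from their combined positional_word_parts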
+ """ + index = len(self.transkription_positions)-1 + while index > 0\ + and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]: + current_tp = self.transkription_positions[index] + index -= 1 + previous_tp = self.transkription_positions[index] + if previous_tp.writing_process_id == current_tp.writing_process_id: + positional_word_parts = previous_tp.positional_word_parts + positional_word_parts += current_tp.positional_word_parts + transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ + positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id) + if len(transkription_positions) == 1: + transkription_positions[0].writing_process_id = previous_tp.writing_process_id + self.transkription_positions.pop(index+1) + self.transkription_positions[index] = transkription_positions[0] + #print(self.text, len(self.transkription_positions)) + def split(self, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. """ previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split') next_text = ''.join([ pwp.text for pwp in next_pwps ]) nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, 
                id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
        return previousWord, currentWord, nextWord

    def split_according_to_status(self, status):
        """Split a word according to the value its transkription_positions have for the property status.

        :return: a list of new word.Word
        """
        new_words = []
        if self.has_mixed_status(status):
            last_status = None
            transkription_positions = []
            copy_keys = [ 'line_number', 'text', 'deleted', 'writing_process_id' ]
            for transkription_position in self.transkription_positions:
                if transkription_position.__dict__[status] != last_status\
                        and len(transkription_positions) > 0:
                    newWord = Word(id=self.id+len(new_words), transkription_positions=transkription_positions)
                    for key in copy_keys:
                        if key != status and key in self.__dict__.keys():
                            newWord.__dict__[key] = self.__dict__[key]
                    newWord.__dict__[status] = transkription_positions[0].__dict__[status]
                    new_words.append(newWord)
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_status = transkription_position.__dict__[status]
            if len(transkription_positions) > 0:
                newWord = Word(id=self.id+len(new_words), transkription_positions=transkription_positions)
                for key in copy_keys:
                    if key != status and key in self.__dict__.keys():
                        newWord.__dict__[key] = self.__dict__[key]
                newWord.__dict__[status] = transkription_positions[0].__dict__[status]
                new_words.append(newWord)
        return new_words

-    def join(self, other_word, append_at_end_of_new_word=True):
-        """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions.
-        """
-        if append_at_end_of_new_word:
-            self.text = self.text + other_word.text
-            for position in other_word.transkription_positions:
-                position.id = str(len(self.transkription_positions))
-                self.transkription_positions.append(position)
-        else:
-            self.text = other_word.text + self.text
-            index = 0
-            for position in other_word.transkription_positions:
-                self.transkription_positions.insert(index, position)
-                index += 1
-            while index < len(self.transkription_positions):
-                self.transkription_positions[index].id = str(index)
-                index += 1
-        self.simplify_transkription_positions()
-
-    def set_word_insertion_mark(self, word_insertion_mark):
-        """Sets word_insertion_mark
-        """
-        self.word_insertion_mark = word_insertion_mark
-
-    def simplify_transkription_positions(self):
-        """Merge transkription_positions if possible.
-        """
-        index = len(self.transkription_positions)-1
-        while index > 0\
-                and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
-            current_tp = self.transkription_positions[index]
-            index -= 1
-            previous_tp = self.transkription_positions[index]
-            if previous_tp.writing_process_id == current_tp.writing_process_id:
-                positional_word_parts = previous_tp.positional_word_parts
-                positional_word_parts += current_tp.positional_word_parts
-                transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
-                        positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
-                if len(transkription_positions) == 1:
-                    transkription_positions[0].writing_process_id = previous_tp.writing_process_id
-                    self.transkription_positions.pop(index+1)
-                    self.transkription_positions[index] = transkription_positions[0]
-        #print(self.text, len(self.transkription_positions))
-
-    @classmethod
-    def create_cls(cls, word_node):
-        """Creates a word from a (lxml.Element) node.
- [:return:] Word - """ - cls = super(Word,cls).create_cls(word_node) - cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 - cls.split_strings = None - if bool(word_node.get('split')): - cls.split_strings = word_node.get('split').split(' ') - if ''.join(cls.split_strings) != cls.text: - error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ - format(word_node.getroottree().docinfo.URL, str(cls.id))\ - + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ - + 'Text attribute: "{0}".\n'.format(cls.text) - raise Exception(error_msg) - cls.deleted = word_node.get('deleted') == 'true'\ - if bool(word_node.get('deleted')) else None - cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_TAG) ] - cls.box_paths = [ Path(node=node) for node in word_node.xpath('.//' + Path.BOX_TAG ) ] - earlier_versions = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ] - if len(earlier_versions) > 0: - cls.earlier_version = earlier_versions[0] - return cls - - @staticmethod - def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): - """Creates a word from a (lxml.Element) node or word_part_objs. - - [:return:] Word - """ - if word_node is not None: # init word from xml node - id = int(word_node.get('id')) - line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number - text = word_node.get('text') - deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true' - transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ] - faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ] - word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\ - if len(word_node.findall('.//' + Word.DATA)) > 0\ - else [ item.attrib for item in word_node.findall('.//part')] - return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\ - faksimile_positions=faksimile_positions, word_part_objs=word_part_objs) - elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file - WIDTH = 5 - TOPCORRECTION = 2.0 - FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize - height = height - x = round(float(word_part_objs[0]['x']), 3) - if(page is not None and bool(page.style_dict)): - HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height - style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' ')) - biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set) - height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3) - TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size - if endSign is not None and '%' in endSign: - lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\ - for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\ - if bool(page.style_dict[key].get('font-size'))] - lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1 - endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR - elif endSign is not None and '%' in endSign: - endX = 
float(endX) + WIDTH - bottom = round(float(word_part_objs[0]['y']), 3) - y = round(bottom - height + TOPCORRECTION, 3) - width = round(float(endX) - x, 3) - transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ] - text = ''.join([ dict['text'] for dict in word_part_objs]) - line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number - word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs) - word.debug_msg = debug_msg - return word - else: - error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty' - raise Exception('Error: {}'.format(error_msg)) - - @classmethod - def get_semantic_dictionary(cls): - """ Creates and returns a semantic dictionary as specified by SemanticClass. - """ - dictionary = super(Word,cls).get_semantic_dictionary() - dictionary['properties'].update(cls.create_semantic_property_dictionary('deleted', bool,\ - name='isWordDeleted', label='has word been deleted')) - dictionary['properties'].update(cls.create_semantic_property_dictionary('writing_processes', WritingProcess, cardinality=1,\ - cardinality_restriction='minCardinality', name='wordBelongsToWritingProcess', label='word has been written in a specific writing process')) - dictionary['properties'].update(cls.create_semantic_property_dictionary('word_parts', list,\ - name='wordHasWordParts', label='word has word parts', comment='word consists of a list of words')) - return dictionary - -def execute_function_on_parts(word_parts, func_name): - """Execute function on parts and add those parts instead of original word to word_parts. - - :return: new word_parts, output from func - """ - copy_parts = word_parts[:] - for word in word_parts: - output = eval('word.{0}()'.format(func_name)) - if len(word.word_parts) > 0: - for part_word in word.word_parts: - copy_parts.insert(copy_parts.index(word), part_word) - copy_parts.remove(word) - word.word_parts = [] - return copy_parts, output - -def update_transkription_position_ids(word): - """Update transkription_position' ids according to index. - """ - for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))): - transkription_position.id = index Index: svgscripts/process_words_post_merging.py =================================================================== --- svgscripts/process_words_post_merging.py (revision 0) +++ svgscripts/process_words_post_merging.py (revision 72) @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" This program can be used to process words after they have been merged with faksimile data. +""" +# Copyright (C) University of Basel 2019 {{{1 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. 
If not, see 1}}}
+
+from colorama import Fore, Style
+from deprecated import deprecated
+from functools import cmp_to_key
+import getopt
+import inspect
+import lxml.etree as ET
+import re
+import shutil
+import string
+from svgpathtools import svg2paths2, svg_to_paths
+import sys
+import tempfile
+from operator import attrgetter
+import os
+from os import listdir, sep, path, setpgrp, devnull
+from os.path import exists, isfile, isdir, dirname, basename
+from progress.bar import Bar
+import warnings
+
+if dirname(__file__) not in sys.path:
+    sys.path.append(dirname(__file__))
+
+from datatypes.box import Box
+from datatypes.mark_foreign_hands import MarkForeignHands
+from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
+from datatypes.path import Path
+from datatypes.text_connection_mark import TextConnectionMark
+from datatypes.transkriptionField import TranskriptionField
+from util import back_up
+from process_files import update_svgposfile_status
+
+sys.path.append('shared_util')
+from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+UNITTESTING = False
+
+def categorize_paths(page, transkription_field=None):
+    """Categorize all paths that are part of the transkription field.
+
+    :return: a dictionary containing a list for each category of path.
+    """
+    if page.source is not None and isfile(page.source):
+        MAX_HEIGHT_LINES = 1 # paths flatter than this count as potential deletion/underline strokes
+        max_line = sorted(\
+                [line_number.bottom-line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0],\
+                reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17
+        tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
+        tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
+        paths, attributes = svg_to_paths.svg2paths(page.source)
+        allpaths_on_tf = []
+        allpaths_outside_tf = []
+        attributes_outside_tf = []
+        if transkription_field is None:
+            transkription_field = TranskriptionField(page.source)
+        for index in range(0, len(paths)):
+            path = paths[index]
+            attribute = attributes[index]
+            if len(path) > 0\
+                    and path != transkription_field.path\
+                    and path.bbox()[0] > tr_xmin\
+                    and path.bbox()[1] < transkription_field.xmax:
+                allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class')))
+            elif len(path) > 0\
+                    and path != transkription_field.path:
+                allpaths_outside_tf.append(path)
+                attributes_outside_tf.append(attribute)
+        path_dict = { 'text_area_deletion_paths': [],\
+                'deletion_or_underline_paths': [],\
+                'box_paths': [],\
+                'dots_paths': [],\
+                'word_connector_paths': [],\
+                'uncategorized_paths': [] }
+        for mypath in allpaths_on_tf:
+            xmin, xmax, ymin, ymax = mypath.path.bbox()
+            start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin)
+            if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1:
+                path_dict.get('dots_paths').append(mypath)
+            elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
+                path_dict.get('box_paths').append(mypath)
+            elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
+                path_dict.get('word_connector_paths').append(mypath)
+            elif abs(ymax-ymin) < MAX_HEIGHT_LINES:
path_dict.get('deletion_or_underline_paths').append(mypath) + elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin): + path_dict.get('text_area_deletion_paths').append(mypath) + else: + path_dict.get('uncategorized_paths').append(mypath) + underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin) + path_dict.update({'underline_path': underline_path}) + process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\ + paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line) + return path_dict + elif not UNITTESTING: + error_msg = 'Svg source file {} does not exist!'.format(page.source)\ + if page.source is not None else 'Page does not contain a source file!' + raise FileNotFoundError(error_msg) + return {} + +def do_paths_intersect_saveMode(path1, path2): + """Returns true if paths intersect, false if not or if there was an exception. + """ + try: + return path1.intersect(path2, justonemode=True) + except AssertionError: + return False + +def find_special_words(page, transkription_field=None): + """Find special words, remove them from words, process their content. + """ + if page.source is None or not isfile(page.source): + raise FileNotFoundError('Page does not have a source!') + if transkription_field is None: + transkription_field = TranskriptionField(page.source) + special_char_list = MarkForeignHands.get_special_char_list() + special_char_list += TextConnectionMark.get_special_char_list() + single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ] + if not UNITTESTING: + bar = Bar('find special words', max=len(single_char_words)) + for word in single_char_words: + not bool(UNITTESTING) and bar.next() + if word.text == MarkForeignHands.CLASS_MARK: + id = len(page.mark_foreign_hands) + page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id)) + page.words.remove(word) + elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\ + or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\ + and any(style in page.sonderzeichen_list for style\ + in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))): + id = len(page.text_connection_marks) + page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id)) + page.words.remove(word) + not bool(UNITTESTING) and bar.finish() + svg_tree = ET.parse(page.source) + page.update_page_type(transkription_field=transkription_field) + page.update_line_number_area(transkription_field, svg_tree=svg_tree) + italic_classes = [ key for key in page.style_dict\ + if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ] + if len(page.mark_foreign_hands) > 0: + MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\ + SonderzeichenList=page.sonderzeichen_list) + if len(page.text_connection_marks) > 0: + TextConnectionMark.find_content_in_footnotes(page.text_connection_marks, transkription_field, svg_tree,\ + title=page.title, page_number=page.number) + +def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): + """Marks all words that intersect with deletion paths as deleted + and adds these paths to word_deletion_paths. 
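+
+        Hypothetical call (mirrors its use in categorize_paths above; tf assumed to be a TranskriptionField):
+
+            underline_paths = mark_words_intersecting_with_paths_as_deleted(page,
+                    path_dict.get('deletion_or_underline_paths'), tf.xmin, tf.ymin)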
+ + [:return:] list of .path.Path that might be word_underline_paths + """ + if not UNITTESTING: + bar = Bar('mark words that intersect with deletion paths', max=len(page.words)) + for word in page.words: + not bool(UNITTESTING) and bar.next() + word.deleted = False + for transkription_position in word.transkription_positions: + word_path = Path.create_path_from_transkription_position(transkription_position,\ + tr_xmin=tr_xmin, tr_ymin=tr_ymin) + intersecting_paths = [ deletion_path for deletion_path in deletion_paths\ + if do_paths_intersect_saveMode(deletion_path.path, word_path.path) ] + if len(intersecting_paths) > 0: + transkription_position.deleted = True + for deletion_path in intersecting_paths: + if deletion_path not in page.word_deletion_paths: + deletion_path.tag = Path.WORD_DELETION_PATH_TAG + deletion_path.attach_object_to_tree(page.page_tree) + page.word_deletion_paths.append(deletion_path) + word.partition_according_to_deletion() + not bool(UNITTESTING) and bar.finish() + # return those paths in deletion_paths that are not in page.word_deletion_paths + return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ] + +def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None): + """Process words after merging with faksimile word positions. + """ + if page is None and svg_pos_file is None: + raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!') + if page is None: + page = Page(xml_source_file=svg_pos_file) + if page.source is None or not isfile(page.source): + raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file)) + if svg_pos_file is None: + svg_pos_file = page.page_tree.docinfo.URL + if new_words is not None: + page.words = sorted(new_words, key=attrgetter('id')) + for word_node in page.page_tree.xpath('.//word'): + word_node.getparent().remove(word_node) + transkription_field = TranskriptionField(page.source) + find_special_words(page, transkription_field=transkription_field) + update_writing_process_ids(page) + #TODO: find_hyphenated_words(page) + categorize_paths(page, transkription_field=transkription_field) + page.update_and_attach_words2tree() + if not UNITTESTING: + if target_svg_pos_file is None: + target_svg_pos_file = svg_pos_file + status = STATUS_MERGED_OK + ":" + STATUS_POSTMERGED_OK + update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status) + write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) + +def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17): + """Process word boxes: partition words according to word boxes. 
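+
+        Hypothetical call (mirrors its use in categorize_paths above):
+
+            process_word_boxes(page, path_dict.get('box_paths'), transkription_field,
+                    paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)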
+ """ + MAX_HEIGHT_LINES = 1 + if not UNITTESTING: + bar = Bar('process word boxes', max=len(page.words)) + svg_tree = ET.parse(page.source) + namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } + allpaths_on_margin_field = [] + if paths is None or attributes is None: + paths, attributes = svg_to_paths.svg2paths(page.source) + for index in range(0, len(paths)): + path = paths[index] + xmin, xmax, ymin, ymax = path.bbox() + attribute = attributes[index] + if len(path) > 0\ + and path != transkription_field.path\ + and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\ + or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\ + and abs(ymax-ymin) < max_line: + allpaths_on_margin_field.append(Path(id=index, path=path, style_class=attribute.get('class'))) + box_line_number_dict = {} + for box_path in sorted(box_paths, key=lambda path: path.get_median_y()): + line_number = page.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin)) + if line_number not in box_line_number_dict.keys(): + box_line_number_dict.update({ line_number: [ box_path ]}) + else: + box_line_number_dict.get(line_number).append(box_path) + boxes = [] + for line_number in box_line_number_dict.keys(): + box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x()) + margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\ + if page.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\ + key=lambda path: path.get_x()) + threshold = 3 if line_number % 2 == 0 else 1.5 + for box_path in box_paths_on_line: + box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\ + transkription_field=transkription_field, namespaces=namespaces, threshold=threshold) + if box is not None: + boxes.append(box) + for word in page.words: + not bool(UNITTESTING) and bar.next() + word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin) + not bool(UNITTESTING) and bar.finish() + +def update_writing_process_ids(page): + """Update the writing_process_ids of the words and split accordingly. + """ + for word in page.words: + word.set_writing_process_id_to_transkription_positions(page) + word.partition_according_to_writing_process_id() + +def usage(): + """prints information on how to use the script + """ + print(main.__doc__) + +def main(argv): + """This program can be used to process words after they have been merged with faksimile data. + + svgscripts/process_words_post_merging.py [OPTIONS] + + a xml file about a manuscript, containing information about its pages. + a xml file about a page, containing information about svg word positions. 
+
+    OPTIONS:
+        -h|--help: show help
+
+    :return: exit code (int)
+    """
+    try:
+        opts, args = getopt.getopt(argv, "h", ["help"])
+    except getopt.GetoptError:
+        usage()
+        return 2
+    for opt, arg in opts:
+        if opt in ('-h', '--help'):
+            usage()
+            return 0
+    if len(args) < 1:
+        usage()
+        return 2
+    exit_status = 0
+    file_a = args[0]
+    if isfile(file_a):
+        manuscript_file = file_a\
+                if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
+                else None
+        for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=STATUS_POSTMERGED_OK):
+            back_up(page, page.page_tree.docinfo.URL)
+            post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file)
+    else:
+        raise FileNotFoundError('File {} does not exist!'.format(file_a))
+    return exit_status
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py	(revision 71)
+++ svgscripts/extractWordPosition.py	(revision 72)
@@ -1,590 +1,586 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}

import inspect
import getopt
from lxml import etree as ET
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from progress.bar import Bar
import re
import sys
import warnings

from datatypes.lineNumber import LineNumber
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.pdf import PDFText
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from datatypes.word_insertion_mark import WordInsertionMark

sys.path.append('shared_util')
from myxmlwriter import write_pretty

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

class Extractor:
    """ This class can be used to extract the word positions in a svg file and write them to a xml file.

    Args:
        [xml_dir (str): target directory]
        [title (str): title of document]
        [manuscript_file (str): xml file containing information about the archival unit to which the current page belongs]
        [extract_transkription_field_only (Boolean): if true extract_word_position will extract only word positions that are part of the transkription field.]
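        A hypothetical usage sketch (file names and title assumed):

            extractor = Extractor(xml_dir='xml', title='Mp XIV', extract_transkription_field_only=True)
            extractor.extractAndWriteInformation('page416_web.svg', page_number='416')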
""" UNITTESTING = False SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ] def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False, compare2pdf=False): if bool(xml_dir): self.xml_dir = xml_dir not isdir(self.xml_dir) and mkdir(self.xml_dir) else: self.xml_dir = 'xml' if(isdir('xml')) else '' self.latest_status = None self.compare2pdf = compare2pdf self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else '' self.title = title self.manuscript_file = manuscript_file self.extract_transkription_field_only = extract_transkription_field_only self.manuscript_tree = None if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file): self.manuscript_tree = ET.parse(self.manuscript_file) self.title = self.manuscript_tree.getroot().get('title') elif bool(self.manuscript_file): raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file)) elif bool(self.title): self.update_title_and_manuscript(self.title, False) def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None): """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word). If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created. :returns: the new word counter (int) """ break_points = [] if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points for Sonderzeichen in self.SONDERZEICHEN_LIST: contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ] if True in contains_Sonderzeichen: break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]] for sz_point in [i for i, e in break_points]: wim_index = len(page.word_insertion_marks) x = float(word_part_objs[sz_point]['x']) y = float(word_part_objs[sz_point]['y']) if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None: svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } xmin = transkription_field.xmin ymin = transkription_field.ymin wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\ line_number=page.get_line_number(y-1), mark_type=Sonderzeichen) page.word_insertion_marks.append(wim) if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points THRESHOLDX = 20 # Threshold between line number and text last_x = -1 for i, x in enumerate([float(dict['x']) for dict in word_part_objs]): if(last_x > -1 and (x - last_x > THRESHOLDX)): break_points.append((i, i)) last_x = x if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words from_index = 0 for end_point, next_from_index in break_points: new_word_part_objs = word_part_objs[from_index:end_point] new_endX = word_part_objs[end_point]['x'] from_index = next_from_index index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) if from_index > 0 and from_index < len(word_part_objs): new_word_part_objs = 
word_part_objs[from_index:] index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) return index else: if len(word_part_objs) > 0: transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\ debug_msg_string=debug_msg, transkription_field=transkription_field) text = self.get_word_from_part_obj(word_part_objs) line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2) if line_number == -1: if len(page.words) > 0: lastWord = page.words[len(page.words)-1] lastWord_lastTP = lastWord.transkription_positions[len(lastWord.transkription_positions)-1] lastTP = transkription_positions[len(transkription_positions)-1] if transkription_positions[0].left > lastWord_lastTP.left\ and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2: line_number = lastWord.line_number else: line_number = lastWord.line_number+1 newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions) page.words.append(newWord) return int(index) + 1 else: return int(index) def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default'): """Extracts information about positions of text elements and writes them to a xml file. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file exit_status = 0 with warnings.catch_warnings(record=record_warnings) as w: warnings.simplefilter(warning_filter) page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile) status_message = 'OK' if w is not None and len(w) > 0: status_message = 'with warnings' if True in [ str(warn.message).startswith(Page.WARNING_MISSING_USE_NODE4PWP) for warn in w ]: status_message += ':{}:'.format(Page.WARNING_MISSING_USE_NODE4PWP.lower()) if True in [ str(warn.message).startswith(Page.WARNING_MISSING_GLYPH_ID4WIM) for warn in w ]: status_message += ':{}:'.format(Page.WARNING_MISSING_GLYPH_ID4WIM.lower()) self.latest_status = status_message exit_status = 1 else: self.latest_status = None page.page_tree.getroot().set('status', status_message) write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition') return exit_status else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None): """Extracts information about positions of text elements. [:returns:] (datatypes.page) the Page containing all information. 
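        Hypothetical call (file name assumed):

            page = extractor.extract_information('page416_web.svg', page_number='416')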
""" if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file transkription_field = TranskriptionField(file_name) if bool(self.extract_transkription_field_only) else None svg_tree = ET.parse(file_name) page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\ svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only) page.add_source(file_name) sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot()) page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict) if transkription_field is not None: page.init_line_numbers(self.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax) self.extract_word_position(svg_tree, page, transkription_field=transkription_field) - #if page.pdfFile is not None and isfile(page.pdfFile): - # pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST) - # pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field, split_wrongly_concatenated_words=self.compare2pdf) page.create_writing_processes_and_attach2tree() - #page.categorize_paths(transkription_field=transkription_field) page.update_and_attach_words2tree() for word_insertion_mark in page.word_insertion_marks: # it is not clear if we really need to know this alternative word ordering. See 'TODO.md' #word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark) word_insertion_mark.attach_object_to_tree(page.page_tree) return page else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extract_line_numbers(self, svg_tree, transkription_field): """Extracts line numbers and write them to a xml file. """ nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] line_numbers = [ LineNumber(raw_text_node=item, transkription_field=transkription_field)\ for item in filter(lambda x: LineNumber.IS_A_LINE_NUMBER(x), nodes_near_tf)] if len(line_numbers) > 0: MINABOVE = 3 last_to_position = transkription_field.ymin for line_number in line_numbers: above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom) last_to_position = above_current_line_bottom if len(bottoms) > 0: current_line_top = float(bottoms[len(bottoms)-1]) - transkription_field.ymin + MINABOVE line_number.setTop(current_line_top) return line_numbers def extract_word_position(self, svg_tree, page, transkription_field=None): """Extracts word positions. 
""" counter = 0 word_part_obj = [] endSign = '%' last_matrix = None MAXBOTTOMDIFF = 5 MAXXDIFF = 6 if not Extractor.UNITTESTING: bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)])) for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field): current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field) # check for line breaks if (last_matrix is not None and len(word_part_obj) > 0 and (\ Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\ (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\ (abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\ or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()): endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\ round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\ str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix))) counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field) word_part_obj = [] endX = current_matrix.getX() if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))): word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} ) else: endSign = text_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT endX = current_matrix.add2X(tspan_item.get('x')) if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))): y = current_matrix.add2Y(tspan_item.get('y')) word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix }) if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0: """text_item has letterspacing class (set s & set t = new set with elements common to s and t) """ endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='tspan with letterspacing', transkription_field=transkription_field) word_part_obj = [] else: endSign = tspan_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='svg/text/tspan/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' last_matrix = current_matrix not bool(Extractor.UNITTESTING) and bar.next() if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\ transkription_field=transkription_field) word_part_obj = [] endSign = '%' not 

    def find_inserted_words_by_position(self, target_tree, x, y):
        """Returns a list with the words that are inserted above the x, y position or [] if not found.
        """
        warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.')
        MINY = 31.0
        MAXY = 10.0
        DIFFX = 9.0
        if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
            result_list = []
            minus2left = 20.0
            minus2top = 19.0
            while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX:
                result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                        '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ]
                minus2left -= 1
                minus2top += 1
            if len(result_list) > 0:
                result_bottom = result_list[len(result_list)-1].bottom
                result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
                for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)):
                    result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
                    result_left_max = result_left_min + DIFFX
                    if float(item.get('left')) - result_left_max < DIFFX:
                        result_list.append(Word.CREATE_WORD(item))
                    else:
                        break
            return result_list
        else:
            return []

    def find_inserted_words(self, target_tree, word_insertion_mark):
        """Returns a list with the words that are inserted above/underneath the word_insertion_mark.
        """
        warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.')
        if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1:
            return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y)
        if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
            MINY = 31.0
            MAXY = 10.0
            DIFFX = 9.0
            result_list = []
            x = word_insertion_mark.x
            y = word_insertion_mark.y
            if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line
                line_number = word_insertion_mark.line_number - 1
                words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                        '//word[@line-number={0}]'.format(line_number)) ]
                if len(words_on_line) > 0:
                    minus2top = 1.0
                    while len(result_list) == 0 and minus2top < MINY:
                        for word in words_on_line:
                            for transkription_position in word.transkription_positions:
                                if transkription_position.top > y - minus2top\
                                        and transkription_position.left > x - DIFFX\
                                        and transkription_position.left < x + DIFFX:
                                    result_list.append(word)
                                    break
                        minus2top += 1
            elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line
                line_number = word_insertion_mark.line_number + 1
                words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                        '//word[@line-number={0}]'.format(line_number)) ]
                if len(words_on_line) > 0:
                    plus2top = 1.0
                    while len(result_list) == 0 and plus2top < MINY:
                        for word in words_on_line:
                            for transkription_position in word.transkription_positions:
                                if transkription_position.top > y + plus2top\
                                        and transkription_position.left > x - DIFFX\
                                        and transkription_position.left < x + DIFFX:
                                    result_list.append(word)
                                    break
                        plus2top += 1
            if len(result_list) > 0:
                # now, collect more words that are right of already collected words
                result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom
                result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
                        + result_list[len(result_list)-1].transkription_positions[0].width
                for item in target_tree.getroot().xpath(\
                        '//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)):
                    result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
                            + result_list[len(result_list)-1].transkription_positions[0].width
                    result_left_max = result_left_min + DIFFX
                    if float(item.get('left')) - result_left_max < DIFFX:
                        result_list.append(Word.CREATE_WORD(item))
                    else:
                        break
            return result_list
        else:
            return []

    def get_bottoms(self, tree_root, from_position=-1.0, to_position=-1.0, transkription_field=None):
        """Returns all unique bottom values (Float) as a sorted list.
        """
        bottom_list = sorted(set(item.get('transform').split(' ')[5].replace(')','') for item in tree_root.findall(".//text", tree_root.nsmap)), key=float)
        if transkription_field is not None:
            from_position = transkription_field.ymin
            to_position = transkription_field.ymax
        if (from_position > 0.0 and to_position > 0.0):
            return [ item for item in filter(lambda x: float(x) > from_position and float(x) < to_position, bottom_list) ]
        else:
            return bottom_list

    def get_file_name(self, file_name, page_number=None):
        """Returns the file_name of the target xml file.
        """
        dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else ''
        if bool(self.title):
            return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml'
        else:
            return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml'))

    def get_page_number(self, file_name, page_number=None):
        """Returns page number as a string (with leading zero(s) if len(page_number) < 3).
        """
        if not bool(page_number) and bool(re.search(r'\d', file_name)):
            # if page_number is None and the file name contains digits, split the file name into
            # its parts that contain only digits, remove empty strings and return the last part.
            page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop()
        if bool(page_number):
            leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else ''
            return leading_zeros + str(page_number)
        else:
            return ''

    def get_style(self, etree_root):
        """Returns the style specification as a dictionary.

        :returns:
            sonderzeichen_list: list of keys for classes that are 'Sonderzeichen'
            style_dict: dictionary: key = class name (str), value = style specification (dictionary)
        """
        style_dict = {}
        sonderzeichen_list = []
        letterspacing_list = []
        style = etree_root.find('style', etree_root.nsmap)
        if style is not None:
            for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))):
                style_key = style_item.split('{')[0].replace('.', '')
                style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','')\
                        for item in list(filter(lambda x: x != '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';'))) }
                style_dict[style_key] = style_value_dict
                if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'):
                    sonderzeichen_list.append(style_key)
                if bool(style_value_dict.get('letter-spacing')):
                    letterspacing_list.append(style_key)
        return sonderzeichen_list, letterspacing_list, style_dict
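
    # Illustrative sketch (hypothetical style text, not from the original sources):
    # given an SVG style element whose text contains the entry
    #
    #     .st21{font-family:'Sonderzeichen I';letter-spacing:2}
    #
    # get_style would yield style_dict == {'st21': {'font-family': 'Sonderzeichen I', 'letter-spacing': '2'}},
    # and 'st21' would appear in both sonderzeichen_list and letterspacing_list.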
""" if transkription_field is not None: return filter(lambda x: Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=x),\ tree_root.iterfind(".//text", tree_root.nsmap)) else: return tree_root.iterfind(".//text", tree_root.nsmap) def get_word_from_part_obj(self, word_part_obj): """Extracts all 'text' from a list of dicitonaries and concats it to a string. """ return ''.join([ dict['text'] for dict in word_part_obj]) def get_word_object_multi_char_x(self, word_part_obj_dict): """Returns the x of the last char of word_part_object. TODO: get real widths from svg_file!!! """ WIDTHFACTOR = 2.6 return word_part_obj_dict['x'] if len(word_part_obj_dict['text']) < 2 else word_part_obj_dict['x'] + len(word_part_obj_dict['text']) * WIDTHFACTOR def update_title_and_manuscript(self, title, update_manuscript=True): """Updates title and manuscript. """ self.title = title if update_manuscript or not bool(self.manuscript_file): self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml' if not isfile(self.manuscript_file): self.manuscript_tree = ET.ElementTree(ET.Element('manuscript', attrib={"title": self.title})) write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile') def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract the position of the words in a svg file and write them to a xml file. svgscripts/extractWordPosition.py [OPTIONS] svg file OR xml target file containing file name of svg file as "/page/@source". directory containing svg files OPTIONS: -h|--help: show help -c|--compare-to-pdf compare words to pdf and autocorrect -d|--xml-dir=xmlDir: target directory for the xml output file(s) -m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s) -o|--only-transkription-field: extract only words that are part of the transkription field. -p|--page=pageNumber: page number of the current page. For use with _one_ file only. 

    def update_title_and_manuscript(self, title, update_manuscript=True):
        """Updates title and manuscript.
        """
        self.title = title
        if update_manuscript or not bool(self.manuscript_file):
            self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml'
            if not isfile(self.manuscript_file):
                self.manuscript_tree = ET.ElementTree(ET.Element('manuscript', attrib={"title": self.title}))
            write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile')

def usage():
    """prints information on how to use the script
    """
    print(main.__doc__)

def main(argv):
    """This program can be used to extract the position of the words in an svg file and write them to an xml file.

    svgscripts/extractWordPosition.py [OPTIONS] file|directory

        file        svg file OR xml target file containing the file name of the svg file as "/page/@source".
        directory   directory containing svg files

    OPTIONS:
        -h|--help: show help
        -c|--compare-to-pdf: compare words to pdf and autocorrect
        -d|--xml-dir=xmlDir: target directory for the xml output file(s)
        -m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s)
        -o|--only-transkription-field: extract only words that are part of the transkription field.
        -p|--page=pageNumber: page number of the current page. For use with _one_ file only.
        -P|--PDF=pdfFile: pdf file - used for word correction
        -s|--svg=svgFile: svg web file
        -t|--title=title: title of the manuscript to which the current page(s) belong(s)
        -x|--xml-target-file=xmlOutputFile: xml target file

    :return: exit code (int)
    """
    compare2pdf = True
    extract_transkription_field_only = True
    manuscript_file = None
    page_number = None
    pdfFile = None
    svg_file = None
    title = None
    xml_target_file = None
    xml_dir = ".{}xml".format(sep)
    try:
        opts, args = getopt.getopt(argv, "hocd:m:t:p:s:x:P:", ["help", "only-transkription-field",\
                "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-c', '--compare-to-pdf'):
            compare2pdf = True
        elif opt in ('-o', '--only-transkription-field'):
            extract_transkription_field_only = True
        elif opt in ('-d', '--xml-dir'):
            xml_dir = arg
        elif opt in ('-m', '--manuscript-file'):
            manuscript_file = arg
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-P', '--PDF'):
            pdfFile = arg
        elif opt in ('-x', '--xml-target-file'):
            xml_target_file = str(arg)
    files_to_process = list()
    for arg in args:
        if isfile(arg):
            files_to_process.append(arg)
        elif isdir(arg):
            files_to_process = files_to_process + list(filter(lambda file: '.svg' in file, listdir(arg)))
        else:
            print("'{}' does not exist!".format(arg))
            return 2
    if len(files_to_process) < 1 or args[0].endswith('xml'):
        if xml_target_file is None:
            xml_target_file = args[0] if len(args) > 0 else None
        if xml_target_file is not None and isfile(xml_target_file):
            target_file_tree = ET.parse(xml_target_file)
            file_name = target_file_tree.getroot().get('source')
            title = target_file_tree.getroot().get('title') if title is None else title
            page_number = target_file_tree.getroot().get('number') if page_number is None else page_number
            extract_transkription_field_only = (target_file_tree.getroot().get('transkription-field-only') == 'true')\
                    if target_file_tree.getroot().get('transkription-field-only') is not None else False
            if svg_file is None:
                if len(target_file_tree.xpath('//svg-image')) > 0:
                    svg_file = target_file_tree.xpath('.//svg-image/@file-name')[0]\
                            if len(target_file_tree.xpath('.//svg-image/@file-name')) > 0 else None
                else:
                    svg_file = target_file_tree.xpath('.//svg/@file')[0]\
                            if len(target_file_tree.xpath('.//svg/@file')) > 0 else None
            files_to_process.insert(0, file_name)
            if xml_target_file in files_to_process:
                files_to_process.remove(xml_target_file)
        else:
            usage()
            return 2
    if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)):
        print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!")
        usage()
        return 2
    extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file,\
            extract_transkription_field_only=extract_transkription_field_only, compare2pdf=compare2pdf)
    for file in files_to_process:
        extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file,\
                pdfFile=pdfFile, svg_file=svg_file)
    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/join_faksimileAndTranskription.py
===================================================================
--- svgscripts/join_faksimileAndTranskription.py	(revision 71)
+++ svgscripts/join_faksimileAndTranskription.py	(revision 72)
@@ -1,597 +1,579 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}

from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings

if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))

from convert_wordPositions import create_pdf_with_highlighted_words, create_svg_with_highlighted_words
from create_task import CorrectWords
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.lineNumber import LineNumber
-from datatypes.page import Page
+from datatypes.page import Page, STATUS_MERGED_OK
from datatypes.transkriptionField import TranskriptionField
from process_files import update_svgposfile_status
+from process_words_post_merging import post_merging_processing_and_saving
from util import ExternalViewer, create_highlighted_svg_file, get_empty_node_ids, record_changes,\
        record_changes_on_svg_file_to_page, record_changes_on_xml_file_to_page, get_mismatching_ids,\
        replace_chars

sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

UNITTESTING = False
PUNCTUATION_PATTERN = r"[{}]".format(string.punctuation)
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)
SINGLE_WORD_PATTERN = r"^[\w{}]$".format(string.punctuation)
-STATUS_MERGED_OK = 'faksimile merged'
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
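
# Illustrative note (examples made up for clarity): SINGLE_PUNCTUATION_PATTERN matches a
# lone punctuation character such as ',' or '–', SINGLE_WORD_PATTERN matches any single
# word or punctuation character ('a', '.'), and PUNCTUATION_EOW_PATTERN matches a word
# that ends with a double quote, e.g. 'sagte"'.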
""" exit_status = 0 if xml_source_file is None or source_svg_file is None: if xml_source_file is None and page is not None and isfile(page.page_tree.docinfo.URL): xml_source_file = page.page_tree.docinfo.URL if xml_source_file is None else xml_source_file elif xml_source_file is None: raise Exception('create_task_correct_words needs a xml_source_file or a page that has a valid tree source!') if source_svg_file is None and page is not None and isfile(page.faksimile_svgFile): source_svg_file = page.faksimile_svgFile if source_svg_file is None else source_svg_file elif source_svg_file is None: raise Exception('create_task_correct_words needs a source_svg_file or a page that has a faksimile_svgFile!') if page is None: page = Page(xml_source_file=xml_source_file) correct_words = CorrectWords(xml_source_file, source_svg_file, target_dir, page=page,\ unmatched_node_ids=unmatched_node_ids) if not correct_words.has_been_created(page): if not page.is_locked(): reference_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.(xml|svg)') lock_dict = { 'reference_file': reference_file,\ 'message': 'Run:$ python3 {0} -c {1} {2}'.format(__file__, target_dir, source_svg_file)} write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file, script_name=__file__,\ file_type=FILE_TYPE_SVG_WORD_POSITION, **lock_dict) correct_words.create() if not UNITTESTING: print('Created a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description)) elif correct_words.has_been_finished(page): msg = 'Task "correct words" for page {} has been finished!'.format(str(page.number)) xml_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.xml', is_finished=True) transkription_svg = correct_words.get_target_filepath(page, is_faksimile_svg=False, is_finished=True) faksimile_svg = correct_words.get_target_filepath(page, is_finished=True) + faksimile_file = faksimile_svg if isfile(faksimile_svg) else source_svg_file if isfile(xml_file): msg += '\n Words loaded from file {}.'.format(xml_file) page = record_changes_on_xml_file_to_page(xml_source_file, xml_file) + page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=xml_file) elif isfile(transkription_svg): msg += '\n Words loaded from file {}.'.format(transkription_svg) page = record_changes_on_svg_file_to_page(xml_source_file, transkription_svg, word_ids=unmatched_word_ids) - faksimile_file = faksimile_svg if isfile(faksimile_svg) else source_svg_file + page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=transkription_svg) msg += '\n Faksimile loaded from file {}.'.format(faksimile_file) if not UNITTESTING: print(msg) exit_status = join_faksimileAndTranskription(faksimile_file, page=page) elif not UNITTESTING: print('There is a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description)) return exit_status def debug_function(words, input=''): """Custon debug function. 
""" if len([ word for word in words if word.debug_container.get('marked') ]) > 0: print(Fore.RED + 'marked word(s): {}'.format([ word.text for word in words if word.debug_container.get('marked') ])) if input != '': print('input: {}'.format(input)) print(Fore.RESET) def fix_errors(faksimile_file, unmerged_faksimile_positions, unmerged_words, text_field_id=None, faksimile_page=None, xml_source_file=None, manuscript_file=None, namespaces={}): """Creates a faksimile svg file and a pdf file highlighting the positions of the word positions that could not been merged. After correction, results are inserted into origianl file and processed again. :return: exit status (int) """ parser = ET.XMLParser(remove_blank_text=True) faksimile_tree = ET.parse(faksimile_file, parser) if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } if faksimile_page is None: faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces) if text_field_id is not None\ and text_field_id in [ faksimile_page.text_field.id for faksimile_page in faksimile_pages ]: faksimile_page = [ faksimile_page for faksimile_page in faksimile_pages if faksimile_page.text_field.id == text_field_id ][0] else: faksimile_page = faksimile_pages[0] if xml_source_file is None or manuscript_file is None: xml_source_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file) tmp_dir = tempfile.mkdtemp() tmp_pdf_file = tmp_dir + sep + 'output.pdf' tmp_svg_file = tmp_dir + sep + 'output.svg' tmp_faksimile = tmp_dir + sep + 'faksimile.svg' empyt_node_ids = get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)\ if len(unmerged_faksimile_positions) < len(unmerged_words) else [] highlight_node_ids = [ faksimile_position.id for faksimile_position in unmerged_faksimile_positions ] highlight_node_ids += empyt_node_ids create_highlighted_svg_file(faksimile_tree, highlight_node_ids, target_file=tmp_faksimile, local_image_path=faksimile_page.faksimile_image.local_path, namespaces=namespaces, highlight_color=HIGHLIGHT_COLOR) #create_pdf_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, pdf_file_name=tmp_pdf_file, bg_color=HIGHLIGHT_COLOR) create_svg_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, svg_file_name=tmp_svg_file, bg_color=HIGHLIGHT_COLOR) exit_status = 2 if isfile(tmp_svg_file) and isfile(tmp_faksimile): ExternalViewer.show_files(list_of_files=[tmp_svg_file, tmp_faksimile]) record_changes_on_svg_file_to_page(xml_source_file, tmp_svg_file, word_ids=[ word.id for word in unmerged_words ]) record_changes(faksimile_file, tmp_faksimile, highlight_node_ids, namespaces=namespaces) shutil.rmtree(tmp_dir) exit_status = join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False, join_single_char_words=True) return exit_status def get_filelist_and_manuscript_file(file_a, file_b=None): """Returns a file list and a manuscript file (or None) """ file_list = [] manuscript_file = None if isfile(file_a) and file_a.endswith('svg'): file_list.append(file_a) if file_b is not None and isfile(file_b): manuscript_file = file_b elif isfile(file_a) and file_a.endswith('xml'): manuscript_file = file_a if file_b is not None and isfile(file_b): file_list.append(file_b) elif isdir(file_b): file_list = [ svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ] elif isdir(file_a): file_list = [ file_a + sep 

def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False):
    """Return svg_pos_file and manuscript_file if they are ready for processing.
    """
    svg_pos_file = None
    manuscript_tree = None
    if manuscript_file is not None\
            and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')):
        manuscript_tree = ET.parse(manuscript_file)
    else:
        title_string = faksimile_page.title.replace(' ', '_')
        manuscript_file = '.{}xml'.format(sep) + sep + title_string + '.xml'\
                if isdir('.{}xml'.format(sep)) else title_string + '.xml'
        if isfile(manuscript_file):
            manuscript_tree = ET.parse(manuscript_file)
    if manuscript_tree is not None:
        if redo_ok and len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0:
            svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
        if len(manuscript_tree.getroot().xpath('//page[@number="%s" and @status="OK"]/@output' % faksimile_page.page_number)) > 0:
            svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
        else:
            if not UNITTESTING:
                if len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0:
                    msg = Fore.LIGHTBLUE_EX + '->' + Fore.CYAN + 'Data from page {0} already merged with {1}!'.format(\
                            faksimile_page.page_number,\
                            manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)[0])
                else:
                    msg = Fore.MAGENTA + 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, faksimile_page.page_number)
                print(msg, end='')
                print(Style.RESET_ALL)
    return svg_pos_file, manuscript_file
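
# Illustrative note (hypothetical manuscript xml): a page counts as "ready for joining"
# when its entry looks like
#
#     <page number="5" status="OK" output="xml/N_VII_1_page005.xml"/>
#
# whereas a status that merely contains "OK" (e.g. "OK:faksimile merged") is only accepted
# when redo_ok is set via --ignore-status-ok.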
""" if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='') print(Style.RESET_ALL) if not do_fix_errors and 'do_fix_errors' in kwargs.keys(): do_fix_errors = kwargs.get('do_fix_errors') if not redo_ok and 'redo_ok' in kwargs.keys(): redo_ok = kwargs.get('redo_ok') if debug_word_text == '' and 'debug_word_text' in kwargs.keys(): debug_word_text = kwargs.get('debug_word_text') faksimile_tree = ET.parse(faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces) if page is not None: faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\ if get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)[0]\ == page.page_tree.docinfo.URL ] exit_status = 0 for faksimile_page in faksimile_pages: svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok) if svg_pos_file is not None: image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field) if page is None: page = Page(xml_source_file=svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file) write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\ file_type=FILE_TYPE_SVG_WORD_POSITION) if not UNITTESTING: print(Fore.LIGHTBLUE_EX + '->', end='') print(Fore.CYAN + 'Joining data from page {0} with file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='') words = sort_words(page) if debug_word_text != '' and len([ word for word in words if word.text == debug_word_text ]) > 0: for word in words: if word.text == debug_word_text: word.debug_container.update({'marked': True}) if bool(kwargs.get('join_single_char_words')): removed_words = join_single_char_words(words) page.words = words page.update_and_attach_words2tree() #print([ word.text for word in page.words if word in removed_words ]) faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions) new_words = [] unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\ key=lambda text: len(text)) faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions, unique_faksimile_words) for word_text in unique_faksimile_words: process_word_text(new_words, word_text, faksimile_positions, words) if False not in [ word.joined for word in words ]\ and False not in [ position.joined for position in faksimile_positions]\ and not UNITTESTING: if page.is_locked(): page.unlock() - post_merging_processing_and_saving(svg_pos_file, new_words, page=page, manuscript_file=manuscript_file) + post_merging_processing_and_saving(svg_pos_file=svg_pos_file, new_words=new_words, page=page, manuscript_file=manuscript_file) print(Fore.GREEN + '[OK]') print(Style.RESET_ALL) elif not UNITTESTING: mismatch_words, mismatch_faksimile_positions = get_mismatching_ids(words, faksimile_positions) not_joined_fp = [ (position.id, position.text) for position in sorted(mismatch_faksimile_positions, key=lambda fp: fp.top) ] plural_fp = '' if len(not_joined_fp) < 2 else 's' not_joined_tw = [ (word.id, word.line_number, word.text) for word in sorted(mismatch_words, key=lambda word: word.transkription_positions[0].top) ] plural_tw = '' if len(not_joined_tw) < 2 else 's' print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: 
                print('--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
                debug_function(new_words, input='new_words')
                debug_function(words, input='words')
                print(Style.RESET_ALL)
                if kwargs.get('correct_words') is not None:
                    unmatched_node_ids = [ position.id for position in mismatch_faksimile_positions ]
                    unmatched_node_ids += get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)
                    exit_status = create_task_correct_words(kwargs.get('correct_words'), page=page, source_svg_file=faksimile_file,\
                            unmatched_word_ids=[ word.id for word in mismatch_words ],\
                            unmatched_node_ids=unmatched_node_ids)
                elif do_fix_errors:
                    exit_status = fix_errors(faksimile_file, [ position for position in faksimile_positions if not position.joined ],\
                            [ word for word in words if not word.joined ], text_field_id=faksimile_page.text_field.id,\
                            faksimile_page=faksimile_page, xml_source_file=svg_pos_file,\
                            manuscript_file=manuscript_file, namespaces=namespaces)
                else:
                    exit_status = 2
            elif False in [ word.joined for word in words ]:
                print([ (word.id, word.text) for word in words if not word.joined ])
                exit_status = 2
        page = None
    return exit_status

def join_single_char_words(words, threshold_x=5, threshold_y=5):
    """Join single char words.

    :return: a list of removed words
    """
    #all_single_char_words = [ word for word in words if re.match(r'^\w$', word.text) ]
    removed_words = []
    all_single_char_words = [ word for word in words if re.match(SINGLE_WORD_PATTERN, word.text) ]
    if not UNITTESTING:
        bar = Bar('Joining single char words', max=len(all_single_char_words))
    line_numbers = sorted(set(word.line_number for word in all_single_char_words))
    for line_number in line_numbers:
        single_char_words = [ word for word in all_single_char_words if word.line_number == line_number ]
        index = len(single_char_words)
        while index > 0:
            index -= 1
            word = None
            not UNITTESTING and bar.next()
            if single_char_words[index] in words:
                single_char_word_index = words.index(single_char_words[index])
                if re.match(SINGLE_PUNCTUATION_PATTERN, single_char_words[index].text)\
                        and words_close_enough(words[single_char_word_index-1], single_char_words[index], 15, 12):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
                    #print('{0} -> {1}, {2}'.format(word.text, words[single_char_word_index-1].text))
                elif index > 0\
                        and words_close_enough(single_char_words[index-1], single_char_words[index], threshold_x, threshold_y):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
                elif single_char_word_index > 0\
                        and words[single_char_word_index-1].line_number == line_number\
                        and words_close_enough(words[single_char_word_index-1], single_char_words[index], threshold_x, threshold_y):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
    not UNITTESTING and bar.finish()
    return removed_words
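
# Illustrative note (made-up coordinates): on one line, a word 'd' whose last transkription
# position has left=100 and a following '.' with left=103 and a similar bottom pass
# words_close_enough(..., 15, 12) above, so join_single_char_words merges them into a single
# word 'd.' and returns the popped '.' in the list of removed words.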
- """ - if page is None: - page = Page(xml_source_file=svg_pos_file) - page.words = sorted(new_words, key=attrgetter('id')) - for word_node in page.page_tree.xpath('.//word'): - word_node.getparent().remove(word_node) - if page.source is None or not isfile(page.source): - raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file)) - transkription_field = TranskriptionField(page.source) - page.find_special_words(transkription_field=transkription_field) - #TODO: page.find_hyphenated_words() - page.categorize_paths(transkription_field=transkription_field) - page.update_and_attach_words2tree() - if target_svg_pos_file is None: - target_svg_pos_file = svg_pos_file - update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=STATUS_MERGED_OK) - write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) - def process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text='', min_length_split=5): """Joins faksimile_positions with text == word_text with words with text == word_text. """ text = word_text if alt_word_text == '' else alt_word_text fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ] words4word = [ word for word in words if word.text == word_text and not word.joined ] if alt_word_text != '': words4word += [ word for word in words if word.text == text and not word.joined ] words4word = sorted(words4word, key=attrgetter('id')) if len(fposition4word) == len(words4word): for index, faksimile_position in enumerate(fposition4word): faksimile_position.joined = True words4word[index].faksimile_positions = [ faksimile_position ] words[words.index(words4word[index])].joined = True new_words.append(words4word[index]) elif len(words4word) < len(fposition4word): if re.match(r'(.*)ss(.*)', text): alt_word_text = re.sub(r'ss', 'ß', text) process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split) elif re.match(SINGLE_PUNCTUATION_PATTERN, text): if text == '-': alt_word_text = text.replace('-', '–') process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split) else: print('single', word_text, len(fposition4word), len(words4word)) elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text): alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text) debug_function(words4word, input='elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text) text {0}'.format(text)) if alt_word_text != '': pattern = r'(.*){0}(.*)'.format(alt_word_text) words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ] if len(words4word) < len(fposition4word): process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split) else: words4word = sorted(words4word, key=attrgetter('id')) for index, faksimile_position in enumerate(fposition4word): faksimile_position.joined = True words4word[index].faksimile_positions = [ faksimile_position ] if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\ and words.index(words4word[index])+1 < len(words)\ and words[words.index(words4word[index])+1].text == word_text[len(word_text)-1]: 
                            words4word[index].join(words[words.index(words4word[index])+1])
                            words[words.index(words4word[index])+1].joined = True
                        words[words.index(words4word[index])].joined = True
                        words4word[index].text = word_text
                        new_words.append(words4word[index])
        elif len(text) >= min_length_split and len([ word for word in words if word.text.startswith(text) and not word.joined ]) == len(fposition4word):
            new_words4word = [ word for word in words if word.text.startswith(text) and not word.joined ]
            debug_function(new_words4word, input='word.startswith {}'.format(text))
            for index, fposition in enumerate(fposition4word):
                old_word = new_words4word[index]
                none_word, new_word, next_word = old_word.split(text, start_id=old_word.id)
                fposition4word[index].joined = True
                new_word.faksimile_positions = [ fposition4word[index] ]
                words[words.index(old_word)] = new_word
                if next_word is not None:
                    next_word.id = len(words)
                    next_word.joined = False
                    words.append(next_word)
                new_word.joined = True
                new_words.append(new_word)
        elif len(text) >= min_length_split and len([ word for word in words if word.text.endswith(text) and not word.joined ]) == len(fposition4word):
            new_words4word = [ word for word in words if word.text.endswith(text) and not word.joined ]
            debug_function(new_words4word, input='word.endswith {}'.format(text))
            for index, fposition in enumerate(fposition4word):
                old_word = new_words4word[index]
                before_word, new_word, none_word = old_word.split(text, start_id=old_word.id)
                fposition4word[index].joined = True
                new_word.faksimile_positions = [ fposition4word[index] ]
                words[words.index(old_word)] = new_word
                if before_word is not None:
                    before_word.id = len(words)
                    before_word.joined = False
                    words.append(before_word)
                new_word.joined = True
                new_words.append(new_word)
        else:
            if len(text) > 1:
                new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ]
                debug_function(new_words4word, input='else text {0}'.format(text))
                if len(new_words4word) == 0:
                    alt_word_text = text[1:]
                    process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
                else:
                    for new_word in new_words4word:
                        collected_text = new_word.text
                        current_word = new_word
                        while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0:
                            previous_word = words[current_word.id-1]
                            if word_text.endswith(previous_word.text + collected_text):
                                words[current_word.id].joined = True
                                previous_word.join(current_word)
                                current_word = previous_word
                                collected_text = current_word.text
                            else:
                                collected_text = previous_word.text + collected_text
                        words4word.append(current_word)
                    words4word = sorted(words4word, key=attrgetter('id'))
                    for index, faksimile_position in enumerate(fposition4word):
                        if index < len(words4word):
                            faksimile_position.joined = True
                            words4word[index].faksimile_positions = [ faksimile_position ]
                            words4word[index].text = word_text
                            words[words.index(words4word[index])].joined = True
                            new_words.append(words4word[index])
                        else:
                            print('<{0}> f{1}/t{2}, ids: {3}'.\
                                    format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ]))
            else:
                print('<{0}> f{1}/t{2}'.format(word_text, len(fposition4word), len(words4word)))
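
# Illustrative note (made-up data): if the faksimile offers one position with text 'dass'
# while the transcription word reads 'daß', the ss->ß branch above retries with
#
#     process_word_text(new_words, 'dass', faksimile_positions, words, alt_word_text='daß')
#
# so the 'daß' word is matched and still recorded under the faksimile text; a lone '-' is
# retried as an en dash '–' in the same way.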
""" if -1 in [ word.line_number for word in page.words ]: warnings.warn('{} in page file {} for word with ids {}'.format(LineNumber.WARN_NO_LINE_NUMBER, page.page_tree.docinfo.URL, page.page_tree.xpath('./word[not(@line-number)]/@id'))) words = [] for line_number in page.line_numbers: word_on_line = [ word for word in page.words if word.line_number == line_number.id ] if line_number.id % 2 == 0: words += sorted(word_on_line, key=lambda word: word.transkription_positions[0].left) else: words += sorted(word_on_line, key=cmp_to_key(\ lambda wordA, wordB: -1\ if wordA.transkription_positions[0].left < wordB.transkription_positions[0].left\ and abs(wordA.transkription_positions[0].bottom-wordB.transkription_positions[0].bottom) < wordA.transkription_positions[0].height/2\ else 1)) for index, word in enumerate(words): words[index].id = index words[index].joined = False return words def sort_faksimile_positions(faksimile_positions): """Returns sorted words (from top left to bottom right). """ for faksimile_position in faksimile_positions: faksimile_position.joined = False return sorted(faksimile_positions, key=cmp_to_key(\ lambda positionA, positionB: -1\ if positionA.left+positionA.width/2 <= positionB.left+positionB.width/2\ and positionA.top+positionA.height/2 <= positionB.top+positionB.height/2\ else 1\ )\ ) @deprecated(reason="Writing process id is now set to word not word_position, TODO: check faksimile_positions for split candidates!") def update_writing_process(word): """Updates the writing process of the faksimile word position by synchronizing it with the corresponding transkription word position. If there are several transkription positions belonging to different writing processes but just one faksimile position, then we skip the update. We will fix these faksimile positions by manually adding more word positions and processing those additions in a later stage. """ writing_processes = [ writing_process_id for writing_process_id in set( tp.writing_process_id for tp in word.transkription_positions ) ] if len(writing_processes) == 1 and len(word.faksimile_positions) > 0: word.faksimile_positions[0].writing_process_id = writing_processes[0] def words_close_enough(wordA, wordB, threshold_x=10, threshold_y=5): """Return true if words are closer than thresholds """ return abs((wordA.transkription_positions[len(wordA.transkription_positions)-1].left)\ -wordB.transkription_positions[0].left) < threshold_x\ and abs(wordA.transkription_positions[len(wordA.transkription_positions)-1].bottom-wordB.transkription_positions[0].bottom) < threshold_y #return abs((wordA.transkription_positions[len(wordA.transkription_positions)-1].left+wordA.transkription_positions[len(wordA.transkription_positions)-1].width)\ # -wordB.transkription_positions[0].left) < threshold_x\ # and abs(wordA.transkription_positions[len(wordA.transkription_positions)-1].bottom-wordB.transkription_positions[0].bottom) < threshold_y def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION. svgscripts/join_faksimileAndTranskription.py [OPTIONS] [xmlManuscriptFile] a directory containing a svg file containing information about the word positions on the faksimile. a xml file about a manuscript, containing information about its pages. 

def usage():
    """prints information on how to use the script
    """
    print(main.__doc__)

def main(argv):
    """This program can be used to merge the data of some faksimile pages with the data of xml files
    that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.

    svgscripts/join_faksimileAndTranskription.py [OPTIONS] [xmlManuscriptFile]

        a directory containing an svg file with information about the word positions on the faksimile.
        an xml file about a manuscript, containing information about its pages.

    OPTIONS:
        -h|--help: show help
        -c|--correct-words=DIR      create a task "CorrectWords" in target dir DIR
        -d|--debug-word=WORD        show debug information for word == WORD
        -f|--fix-errors:            open faksimile svg file if there are errors
        -i|--ignore-status-ok       ignore status "OK:faksimile merged" in manuscript file and redo merging.
        -j|--join-single-char-words join single char words

    :return: exit code (int)
    """
    commando_dict = { 'do_fix_errors': False, 'redo_ok': False, 'debug_word_text': '', 'correct_words': None,\
            'join_single_char_words': False }
    try:
        opts, args = getopt.getopt(argv, "hc:d:fij", ["help", "correct-words=", "debug-word=", "fix-errors", "ignore-status-ok",\
                "join-single-char-words" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-c', '--correct-words'):
            commando_dict['correct_words'] = arg
        elif opt in ('-d', '--debug-word'):
            commando_dict['debug_word_text'] = arg
        elif opt in ('-f', '--fix-errors'):
            commando_dict['do_fix_errors'] = True
        elif opt in ('-i', '--ignore-status-ok'):
            commando_dict['redo_ok'] = True
        elif opt in ('-j', '--join-single-char-words'):
            commando_dict['join_single_char_words'] = True
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if exists(file_a):
        file_b = None
        if len(args) > 1 and exists(args[1]):
            file_b = args[1]
        file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
        for faksimile_file in file_list:
            join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, **commando_dict)
    else:
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    return exit_status

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

Index: tests_svgscripts/test_manuscript.py
===================================================================
--- tests_svgscripts/test_manuscript.py	(revision 71)
+++ tests_svgscripts/test_manuscript.py	(revision 72)
@@ -1,36 +1,36 @@
import unittest
from os import sep, path
from os.path import basename, dirname, isfile
import lxml.etree as ET
import sys

sys.path.append('svgscripts')
from datatypes.manuscript import ArchivalManuscriptUnity

class TestArchivalManuscriptUnity(unittest.TestCase):
    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        self.test_manuscript = DATADIR + sep + 'N_VII_1.xml'

    def test_init(self):
        title = 'Test I 1'
        manuscript = ArchivalManuscriptUnity(title=title)
        self.assertEqual(manuscript.title, title)

    def test_get_semanticAndDataDict(self):
        semantic_dict = ArchivalManuscriptUnity.get_semantic_dictionary()
        #print(semantic_dict)

    def test_create_cls(self):
        manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript)
        self.assertEqual(manuscript.title, basename(self.test_manuscript).replace('.xml','').replace('_', ' '))
        self.assertEqual(manuscript.manuscript_type, 'Notizheft')
-        self.assertEqual(len(manuscript.pages), 2)
+        self.assertEqual(len(manuscript.pages), 3)
        manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript, page_status_list=['faksimile merged'])
-        self.assertEqual(len(manuscript.pages), 1)
-        manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript, page_xpath='//pages/page/@output')
        self.assertEqual(len(manuscript.pages), 2)
+        manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript, page_xpath='//pages/page/@output')
+        self.assertEqual(len(manuscript.pages), 3)

if __name__ == "__main__":
    unittest.main()

Index: tests_svgscripts/test_process_words_post_merging.py
===================================================================
--- tests_svgscripts/test_process_words_post_merging.py	(revision 0)
+++ tests_svgscripts/test_process_words_post_merging.py	(revision 72)
@@ -0,0 +1,95 @@
+import unittest
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname
+import shutil
+import sys
+import lxml.etree as ET
+import warnings
+import sys
+
+sys.path.append('svgscripts')
+
+import process_words_post_merging
+from datatypes.faksimile import FaksimilePage
+from datatypes.mark_foreign_hands import MarkForeignHands
+from datatypes.page import Page
+from datatypes.path import Path
+from datatypes.positional_word_part import PositionalWordPart
+from datatypes.text_connection_mark import TextConnectionMark
+from datatypes.transkriptionField import TranskriptionField
+from datatypes.word_position import WordPosition
+
+class TestPostMerge(unittest.TestCase):
+    def setUp(self):
+        process_words_post_merging.UNITTESTING = True
+        DATADIR = path.dirname(__file__) + sep + 'test_data'
+        self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
+        self.manuscript = DATADIR + sep + 'N_VII_1.xml'
+        self.manuscript_copy = self.manuscript.replace('.', '_copy.')
+        self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
+        self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
+        self.Mp_XIV_1_mytest_421 = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml'
+        self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page001.xml'
+        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
+        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
+
+    def test_main(self):
+        process_words_post_merging.main([self.manuscript])
+
+    def test_categorize_paths(self):
+        page = Page(xml_source_file=self.pdf_xml)
+        page.source = self.pdf_xml_source
+        tr = TranskriptionField(page.source)
+        page.words = [ word for word in page.words if word.line_number == 33 ]
+        path_dict = process_words_post_merging.categorize_paths(page, tr)
+        self.assertEqual(True in [ word.deleted for word in page.words if word.id == 269 ], False)
+        self.assertEqual(len(path_dict.get('deletion_or_underline_paths')) > 0, True)
+        self.assertEqual(len(path_dict.get('box_paths')), 5)
+        words = [ word for word in page.words if len(word.box_paths) > 0 ]
+        self.assertEqual(len(words), 1)
+        self.assertEqual(words[0].word_parts[0].earlier_version is not None, True)
+        self.assertEqual(words[0].word_parts[0].earlier_version.text, ')')
+
+    def test_find_special_words(self):
+        page = Page(xml_source_file=self.xml_file)
+        process_words_post_merging.find_special_words(page)
+        self.assertEqual(len(page.mark_foreign_hands), 1)
+        self.assertEqual(page.mark_foreign_hands[0].foreign_hands_text, 'x')
+        page.update_and_attach_words2tree()
+        nodes = page.page_tree.xpath('//' + MarkForeignHands.XML_TAG)
+        page = Page(xml_source_file=self.test_tcm_xml)
+        process_words_post_merging.find_special_words(page)
+        self.assertEqual(len(page.text_connection_marks), 1)
+        self.assertEqual(page.text_connection_marks[0].text_source.first_line, 2)
+        """
+        page.update_and_attach_words2tree()
+        nodes = page.page_tree.xpath('//' + TextConnectionMark.XML_TAG)
+        print(ET.dump(nodes[0]))
+        """
+
+    def test_process_word_boxes(self):
+        page = Page(xml_source_file=self.pdf_xml)
+        page.source = self.pdf_xml_source
+        for word in page.words:
+            word.partition_according_to_writing_process_id()
+        tr = TranskriptionField(page.source)
+        box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
+                'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
+                'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
+                'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
+                'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
+        box_paths = [ Path(d_string=d_string) for d_string in box_path_d ]
+        process_words_post_merging.process_word_boxes(page, box_paths, tr)
+        words_with_boxes = [ word for word in page.words if len(word.box_paths) > 0 ]
+        self.assertEqual(len(words_with_boxes), 5)
+
+    def test_update_writing_process_ids(self):
+        page = Page(xml_source_file=self.pdf_xml)
+        page.words = [ word for word in page.words if word.text == 'Aber' and word.line_number == 2 ]
+        process_words_post_merging.update_writing_process_ids(page)
+        self.assertEqual(len(page.words[0].word_parts), 2)
+        self.assertEqual(page.words[0].word_parts[0].writing_process_id, 1)
+        self.assertEqual(page.words[0].word_parts[1].writing_process_id, 0)
+
+if __name__ == "__main__":
+    unittest.main()

Index: tests_svgscripts/test_word.py
===================================================================
--- tests_svgscripts/test_word.py	(revision 71)
+++ tests_svgscripts/test_word.py	(revision 72)
@@ -1,235 +1,244 @@
import unittest
from os import sep, path
import lxml.etree as ET
import sys

sys.path.append('svgscripts')
from datatypes.box import Box
from datatypes.matrix import Matrix
import datatypes.page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, execute_function_on_parts
from datatypes.word_position import WordPosition

class Page:
    def __init__(self):
        self.svg_file = None

    def get_line_number(self, input=0):
        return -1

    def get_biggest_fontSize4styles(self, style_set={}):
        return 7

class TestWord(unittest.TestCase):
    def setUp(self):
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.test_file = DATADIR + sep + 'N_VII_1_page009.xml'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }]
        x = 0
        for dict in self.word_part_objs:
            dict['class'] = 'st22'
            dict['x'] = x
            dict['y'] = 11
            x += 1
        mylist = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true' }
        word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)'))
        self.transkription_positions = [ word_position ]
        self.word_node = ET.Element('word', attrib=mylist)
        word_position.attach_object_to_tree(self.word_node)
        x = 0
        for char in mylist['text']:
            ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' })
            x += 1

    def test_Word_with_word_part_objs(self):
        word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        self.assertEqual(word.id, 0)
        self.assertEqual(word.transkription_positions[0].bottom, 13)
        self.assertEqual(word.transkription_positions[0].height, 10)
        self.assertEqual(word.transkription_positions[0].top, 3)
        self.assertEqual(word.transkription_positions[0].left, 0)
        self.assertEqual(word.transkription_positions[0].width, 10)
        self.assertEqual(word.text, 'abc')

    def test_Word_with_word_node(self):
        word = Word.create_cls(self.word_node)
        self.assertEqual(word.id, 0)
        self.assertEqual(word.deleted, True)
        self.assertEqual(word.transkription_positions[0].bottom, 11)
        self.assertEqual(word.transkription_positions[0].height, 10)
        self.assertEqual(word.transkription_positions[0].top, 1)
        self.assertEqual(word.transkription_positions[0].left, 0)
        self.assertEqual(word.transkription_positions[0].width, 10)
        self.assertEqual(word.text, 'abc')
        self.assertEqual(word.line_number, 2)
        self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True)

    def test_attach_word_to_tree(self):
        newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        empty_tree = ET.ElementTree(ET.Element('page'))
        newWord.attach_word_to_tree(empty_tree)
        for word_node in empty_tree.getroot().xpath('//word'):
            word = Word.CREATE_WORD(word_node=word_node)
            self.assertEqual(word.id, 0)
            self.assertEqual(word.deleted, False)
            self.assertEqual(word.transkription_positions[0].bottom, 13)
            self.assertEqual(word.transkription_positions[0].height, 10)
            self.assertEqual(word.transkription_positions[0].top, 3)
            self.assertEqual(word.transkription_positions[0].left, 0)
            self.assertEqual(word.transkription_positions[0].width, 10)
            self.assertEqual(word.text, 'abc')

    def test_split(self):
        page = Page()
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        previousWord, currentWord, nextWord = word.split('b')
        self.assertEqual(previousWord.id, 0)
        self.assertEqual(previousWord.text, 'a')
        self.assertEqual(currentWord.id, 1)
        self.assertEqual(nextWord.id, 2)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        previousWord, currentWord, nextWord = word.split('bc')
        self.assertEqual(previousWord.id, 0)
        self.assertEqual(previousWord.text, 'a')
        self.assertEqual(currentWord.id, 1)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        previousWord, currentWord, nextWord = word.split('ab', start_id=10)
        self.assertEqual(currentWord.id, 10)
        self.assertEqual(currentWord.text, 'ab')
        self.assertEqual(currentWord.transkription_positions[0].width, 2.1)
        self.assertEqual(nextWord.id, 11)
        self.assertEqual(nextWord.transkription_positions[0].width, 5.2)
        word_part_objs = [{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\
                {'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\
                {'text': 'x', 'class':'st22', 'x': 10, 'y': 0}]
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        with self.assertWarns(Warning):
            previousWord, currentWord, nextWord = word.split('Insofer')
        word_part_objs = [{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}]
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        with self.assertWarns(Warning):
            previousWord, currentWord, nextWord = word.split('Insofern')

    def test_join(self):
        word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}])
        word.join(other_word)
        self.assertEqual(word.text, 'abc.')
        other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}])
        word.join(other_word, append_at_end_of_new_word=False)
        self.assertEqual(word.text, '.abc.')
        """
        tree = ET.ElementTree(ET.Element('page'))
        word.attach_word_to_tree(tree)
        print(ET.dump(tree.getroot()))
        """

    def test_get_semanticAndDataDict(self):
        dictionary = Word.get_semantic_dictionary()
        #print(dictionary)

    def test_simplify_transkription_positions(self):
        node_string = """ """
        nodeA = ET.fromstring(node_string)
        node_string = """ """
        nodeB = ET.fromstring(node_string)
        word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ])
        self.assertEqual(len(word.transkription_positions), 2)
        word.simplify_transkription_positions()
        self.assertEqual(len(word.transkription_positions), 1)
        """
        tree = ET.ElementTree(ET.Element('page'))
        word.attach_word_to_tree(tree)
        print(ET.dump(tree.getroot()))
        """

    def test_partition(self):
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word = page.words[67]
        self.assertEqual(word.belongs_to_multiple_writing_processes(), True)
        word.partition_according_to_writing_process_id()
        self.assertEqual(len(word.word_parts), 3)
        self.assertEqual(word.belongs_to_multiple_writing_processes(), False)
        self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True)
        empty_tree = ET.ElementTree(ET.Element('page'))
        word_node = word.attach_word_to_tree(empty_tree)
        newWord = Word.create_cls(word_node)
        self.assertEqual(len(newWord.word_parts), 3)
-        self.assertEqual(newWord.line_number, -1)
        #print(ET.dump(empty_tree.getroot()))

    def test_partition_deletion(self):
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word = page.words[67]
        for transkription_position in word.transkription_positions:
            transkription_position.deleted = transkription_position.writing_process_id == 1
        self.assertEqual(word.has_mixed_status('deleted'), True)
        word.partition_according_to_deletion()
        self.assertEqual(len(word.word_parts), 3)
        self.assertEqual(word.has_mixed_status('deleted'), False)
        self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True)
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word = page.words[67]
        word.partition_according_to_writing_process_id()
        #print([(word.text, word.deleted) for word in word.word_parts])
        word.word_parts[1].transkription_positions[1].deleted = True
        word.partition_according_to_deletion()
        self.assertEqual(len(word.word_parts), 4)
        #print([(word.text, word.deleted) for word in word.word_parts])

    def test_execute_function_on_parts(self):
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word_parts = [ page.words[67], page.words[68] ]
        word_parts, none = execute_function_on_parts(word_parts, 'partition_according_to_writing_process_id')
        self.assertEqual(len(word_parts) == 4, True)

    def test_process_word_boxes(self):
        page = datatypes.page.Page(xml_source_file=self.pdf_xml)
        page.source = self.pdf_xml_source
        for word in page.words:
+            word.set_writing_process_id_to_transkription_positions(page)
            word.partition_according_to_writing_process_id()
        tr = TranskriptionField(page.source)
Index: tests_svgscripts/test_page.py
===================================================================
--- tests_svgscripts/test_page.py	(revision 71)
+++ tests_svgscripts/test_page.py	(revision 72)
@@ -1,189 +1,140 @@
import unittest
from os import sep, path
from os.path import isdir, isfile, dirname, basename
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
dir_changed = False
if not isdir('datatypes'):
    sys.path.append(dirname(sys.path[0]))
    dir_changed = True
from datatypes.lineNumber import LineNumber
from datatypes.mark_foreign_hands import MarkForeignHands
-from datatypes.page import Page
+from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.writing_process import WritingProcess
from datatypes.word import Word

class TestPage(unittest.TestCase):
    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        if not isdir(DATADIR):
            DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
        self.test_file = DATADIR + sep + 'test.xml'
        self.test_svg_file = DATADIR + sep + 'test421.svg'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
        self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page001.xml'
        self.test_manuscript = DATADIR + sep + 'N_VII_1.xml'

    def test_Page(self):
        page = Page(xml_source_file=self.test_file, svg_file=self.test_svg_file)
        self.assertEqual(page.title, 'Mp XIV 1')
        self.assertEqual(page.number, '421')
        self.assertEqual(len(page.sonderzeichen_list), 2)
        self.assertEqual('st21' in page.sonderzeichen_list, True)
        self.assertEqual('st23' in page.sonderzeichen_list, True)
        self.assertEqual(page.style_dict['st0']['fill'], '#F8F9F8')
        self.assertEqual(page.width, 493.23)
        stage0 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 0 ]
        stage1 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 1 ]
        stage2 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 2 ]
        fontStage0 = float(page.style_dict.get(stage0[0]).get('font-size').replace('px',''))
        fontStage1 = float(page.style_dict.get(stage1[0]).get('font-size').replace('px',''))
        fontStage2 = float(page.style_dict.get(stage2[0]).get('font-size').replace('px',''))
        self.assertEqual(fontStage0 > fontStage1, True)
        self.assertEqual(fontStage1 > fontStage2, True)

    def test_get_biggest_fontSize4styles(self):
        page = Page(xml_source_file=self.test_file)
        style_set = { 'st12', 'st2', 'st14', 'st13' }
        self.assertEqual(page.get_biggest_fontSize4styles(style_set=style_set), 10)

    def test_get_words(self):
        page = Page(xml_source_file=self.test_file)
        words = page.words
        self.assertEqual(len(words), 440)
        self.assertEqual(words[0].text, '$')
        self.assertEqual(words[439].text, 'mußte!')

    def test_create_writing_process(self):
        page = Page(xml_source_file=self.test_file)
        page.create_writing_processes_and_attach2tree()
-        self.assertEqual(page.words[97].transkription_positions[0].writing_process_id, WritingProcess.LATER_INSERTION_AND_ADDITION)
-        self.assertEqual(page.words[129].transkription_positions[0].writing_process_id, WritingProcess.LATER_INSERTION_AND_ADDITION)
+        #self.assertEqual(page.words[97].transkription_positions[0].writing_process_id, WritingProcess.LATER_INSERTION_AND_ADDITION)
+        #self.assertEqual(page.words[129].transkription_positions[0].writing_process_id, WritingProcess.LATER_INSERTION_AND_ADDITION)

    def test_init_line_numbers(self):
        page = Page(xml_source_file=self.test_file)
        line_numbers = [ LineNumber(id=2, top=20, bottom=40), LineNumber(id=4, top=50, bottom=60), LineNumber(id=6, top=70, bottom=90) ]
        page.init_line_numbers(line_numbers, 122.345)
        self.assertEqual(len(page.line_numbers), 7)
        self.assertEqual(page.line_numbers[0].id, 1)
        self.assertEqual(page.line_numbers[6].id, 7)
        self.assertEqual(page.line_numbers[6].top, 91)
        self.assertEqual(page.line_numbers[6].bottom, 122.345)
        self.assertEqual(page.get_line_number(122), 7)
        self.assertEqual(page.get_line_number(92), 7)
        self.assertEqual(page.get_line_number(22), 2)

    def test_get_line_number(self):
        page = Page(xml_source_file=self.test_file)
        self.assertEqual(page.get_line_number((page.words[0].transkription_positions[0].bottom+page.words[0].transkription_positions[0].top)/2), 1)
        self.assertEqual(page.get_line_number((page.words[27].transkription_positions[0].bottom+page.words[27].transkription_positions[0].top)/2), 2)
        self.assertEqual(page.get_line_number((page.words[105].transkription_positions[0].bottom+page.words[105].transkription_positions[0].top)/2), 7)

-    def test_categorize_paths(self):
-        Page.UNITTESTING = True
-        page = Page(xml_source_file=self.pdf_xml)
-        page.source = self.pdf_xml_source
-        tr = TranskriptionField(page.source)
-        page.words = [ word for word in page.words if word.line_number == 33 ]
-        path_dict = page.categorize_paths(tr)
-        self.assertEqual(True in [ word.deleted for word in page.words if word.id == 269 ], False)
-        self.assertEqual(len(path_dict.get('deletion_or_underline_paths')) > 0, True)
-        self.assertEqual(len(path_dict.get('box_paths')), 5)
-        words = [ word for word in page.words if len(word.box_paths) > 0 ]
-        self.assertEqual(len(words), 1)
-        self.assertEqual(words[0].word_parts[0].earlier_version is not None, True)
-        self.assertEqual(words[0].word_parts[0].earlier_version.text, ')')
-
-    def test_find_special_words(self):
-        page = Page(xml_source_file=self.xml_file)
-        page.find_special_words()
-        self.assertEqual(len(page.mark_foreign_hands), 1)
-        self.assertEqual(page.mark_foreign_hands[0].foreign_hands_text, 'x')
-        page.update_and_attach_words2tree()
-        nodes = page.page_tree.xpath('//' + MarkForeignHands.XML_TAG)
-        page = Page(xml_source_file=self.test_tcm_xml)
-        page.find_special_words()
-        self.assertEqual(len(page.text_connection_marks), 1)
-        self.assertEqual(page.text_connection_marks[0].text_source.first_line, 2)
-        """
-        page.update_and_attach_words2tree()
-        nodes = page.page_tree.xpath('//' + TextConnectionMark.XML_TAG)
-        print(ET.dump(nodes[0]))
-        """
-
    def test_update_page_type(self):
        page = Page(xml_source_file=self.pdf_xml)
        tf = TranskriptionField(self.pdf_xml_source)
        page.update_page_type(transkription_field=tf)
        self.assertEqual(page.page_type, Page.PAGE_VERSO)
        #page = Page(xml_source_file=self.xml_fileB)
        #page.update_page_type()
        #self.assertEqual(page.page_type, Page.PAGE_RECTO)

    def test_update_line_number_area(self):
        page = Page(xml_source_file=self.xml_file)
        transkription_field = TranskriptionField(page.source)
        page.update_line_number_area(transkription_field)
        self.assertEqual(transkription_field.line_number_area_width > 0, True)
        self.assertEqual(transkription_field.line_number_area_width < 15, True)
        page = Page(xml_source_file=self.xml_fileB)
        transkription_field = TranskriptionField(page.source)
        page.update_line_number_area(transkription_field)
        self.assertEqual(transkription_field.line_number_area_width > 0, True)
        self.assertEqual(transkription_field.line_number_area_width < 15, True)

    def test_get_pages_from_xml_file(self):
        pages = Page.get_pages_from_xml_file(self.test_manuscript)
-        self.assertEqual(len(pages), 2)
+        self.assertEqual(len(pages), 3)
        self.assertEqual(pages[0].number, '5')
        self.assertEqual(pages[1].number, '6')
-        pages = Page.get_pages_from_xml_file(self.test_manuscript, status_contains='faksimile merged')
-        self.assertEqual(len(pages), 1)
+        pages = Page.get_pages_from_xml_file(self.test_manuscript, status_contains=STATUS_MERGED_OK)
+        self.assertEqual(len(pages), 2)
        self.assertEqual(pages[0].number, '5')
+        pages = Page.get_pages_from_xml_file(self.test_manuscript, status_contains=STATUS_MERGED_OK, status_not_contain=STATUS_POSTMERGED_OK)
+        self.assertEqual(len(pages), 1)

    def test_get_semantic_dictionary(self):
        dictionary = Page.get_semantic_dictionary()
        #print(dictionary)

-    def test_process_word_boxes(self):
-        page = Page(xml_source_file=self.pdf_xml)
-        page.source = self.pdf_xml_source
-        for word in page.words:
-            word.partition_according_to_writing_process_id()
-        tr = TranskriptionField(page.source)
-        box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
-            'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
-            'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
-            'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
-            'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
-        box_paths = [ Path(d_string=d_string) for d_string in box_path_d ]
-        page.process_word_boxes(box_paths, tr)
-        page = Page(xml_source_file='xml/N_VII_1_page015.xml')
-        tr = TranskriptionField(page.source)
-        path_dict = page.categorize_paths(transkription_field=tr)
-        words_with_boxes = [ word for word in page.words if len(word.box_paths) > 0 ]
-        self.assertEqual(len(words_with_boxes), 1)
-
    def test_lock(self):
        page = Page(xml_source_file=self.test_tcm_xml)
        self.assertEqual(page.is_locked(), False)
        page.lock('asdf.txt')
        self.assertEqual(page.is_locked(), True)
        self.assertEqual(page.page_tree.xpath('//lock/reference-file/text()')[0], 'asdf.txt')
        page.unlock()
        self.assertEqual(page.is_locked(), False)

if __name__ == "__main__":
    unittest.main()
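Note on the updated test_get_pages_from_xml_file: it now drives Page.get_pages_from_xml_file with the new STATUS_MERGED_OK and STATUS_POSTMERGED_OK constants and the status_not_contain parameter. A sketch of how such substring-based status filtering can behave, assuming a page's status is a plain string of markers; filter_pages and the sample data are hypothetical stand-ins, not the project's API:

STATUS_MERGED_OK = 'faksimile merged'
STATUS_POSTMERGED_OK = 'words processed'

def filter_pages(pages, status_contains='', status_not_contain=''):
    """Keep pages whose status contains one marker and lacks another."""
    result = []
    for page in pages:
        status = page.get('status', '')
        if status_contains and status_contains not in status:
            continue
        if status_not_contain and status_not_contain in status:
            continue
        result.append(page)
    return result

# Hypothetical sample data mirroring the test's expectations:
# three pages total, two merged, one merged but not yet post-processed.
pages = [
    {'number': '5', 'status': 'faksimile merged'},
    {'number': '6', 'status': 'faksimile merged words processed'},
    {'number': '9', 'status': 'blank'},
]
assert len(filter_pages(pages, status_contains=STATUS_MERGED_OK)) == 2
assert len(filter_pages(pages, status_contains=STATUS_MERGED_OK,
                        status_not_contain=STATUS_POSTMERGED_OK)) == 1

The combination of a positive and a negative substring filter lets the pipeline select pages that have reached one processing stage but not yet the next.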
Index: tests_svgscripts/test_data/N_VII_1_page006.xml
===================================================================
--- tests_svgscripts/test_data/N_VII_1_page006.xml	(revision 71)
+++ tests_svgscripts/test_data/N_VII_1_page006.xml	(revision 72)
@@ -1,1275 +1,1276 @@
[The XML markup of the test-data hunks below was stripped during text extraction; only element text and the +/- change markers remain.]
 svgWordPosition
 2019-08-02 15:17:37
 2019-08-02 15:17:37
 2019-08-02 15:30:59
 2019-08-02 15:30:59
-2019-11-11 08:50:13
+2019-11-14 09:38:45
-
+
+

Index: tests_svgscripts/test_data/N_VII_1_page009.xml
===================================================================
--- tests_svgscripts/test_data/N_VII_1_page009.xml	(revision 71)
+++ tests_svgscripts/test_data/N_VII_1_page009.xml	(revision 72)
@@ -1,1727 +1,1727 @@
 svgWordPosition
 2019-08-02 15:17:42
 2019-08-02 15:17:43
 2019-08-15 14:39:17
 2019-08-12 11:53:36
-
+

Index: tests_svgscripts/test_data/N_VII_1_page005_faksimile_merged.xml
===================================================================
--- tests_svgscripts/test_data/N_VII_1_page005_faksimile_merged.xml	(revision 71)
+++ tests_svgscripts/test_data/N_VII_1_page005_faksimile_merged.xml	(revision 72)
@@ -1,1688 +1,1688 @@
 svgWordPosition
 2019-08-02 15:17:40
 2019-08-02 15:17:40
 2019-08-19 11:43:03
 2019-08-19 11:43:03
 2019-08-19 11:42:56
 2019-08-19 11:43:27
 2019-08-15 11:39:29
-
+
 x
Index: tests_svgscripts/test_data/N_VII_1.xml
===================================================================
--- tests_svgscripts/test_data/N_VII_1.xml	(revision 71)
+++ tests_svgscripts/test_data/N_VII_1.xml	(revision 72)
@@ -1,170 +1,171 @@
 xmlManuscriptFile
 2019-08-02 15:28:57
 2019-08-02 15:31:25
 H. J. Mette
 In schwarzen Lederdeckel gebundenes Oktavheft (10,5x17), 194 durchweg beschriebene Seiten; Studien aus der Umwertungszeit, die zum Teil für das <i>Jenseits</i> verwandt worden sind.
 BAW 1, XCVI
 M. Montinari
 Oktavheft, 10,5x17. 194 Seiten. Schwarze und violette Tinte, sowie Bleistift. Überwiegend deutsche Schrift. Von hinten nach vorn beschrieben. Alte Signatur: N XLIII.
 KGW VII 4/2, 632
 Oktavheft. Schwarzer Ledereinband mit Goldprägung (vorn und hinten senkrechte Linie, parallel zum Rücken; vorn rechts unten Initialen „F. N.“, Einzelstempel) und umlaufender Blindlinie. Am hinteren Deckel lederne Stifthülse. Buchblock stellenweise gelockert. Vorsätze aus Moiré-Papier. 194 Seiten, 10,8x17,3, unliniiert.
 April bis Juni 1885
 von KGW VII Fragmentgruppe 34
 16. April 1885 bis Anfang Juni 1885 der identifizierten Briefentwürfe
+KGW VII 34[1-256]
 M. Montinari (zu 34[257]): „dieses und die beiden folgenden Fragmente 34[258.259] wurden von N in einen Brief von Paul Lanzky von Anfang Juni 1885 (KGB III 4, S. 28, Nr. 281) eingetragen.“ KGW VII 4/2, 374.
 Vorderer Deckel
 Vorsatz Rekto
 Vorsatz Verso (kaschiert)
 1  1  194  Lage, 6 Blatt
 Vorsatz - 11  1  11  Einzelblatt
 12-13  12  13  Lage, 4 Blatt
 14-21  14  21  Lage, 8 Blatt
 22-37  22  37  Lage, 8 Blatt
 38-53  38  53  Lage, 8 Blatt
 54-69  54  69  Lage, 8 Blatt
 70-85  70  85  Lage, 8 Blatt
 86-101  86  101  Lage, 8 Blatt
 102-117  102  117  Lage, 8 Blatt
 118-133  118  133  Lage, 8 Blatt
 134-149  134  149  Lage, 8 Blatt
 150-165  150  165  Lage, 8 Blatt
 166-181  166  181  Lage, 8 Blatt
 182 - Vorsatz  182  194
 Vorsatz Rekto (kaschiert)  194
 1  194
 Vorsatz Verso
 Hinterer Deckel
 1885-4-1
 1885-6-28
 KGW IX 1
 2001
 Bearbeitet von Marie-Luise Haase, Michael Kohlenbach, Johannes Neininger, Wolfert von Rahden, Thomas Riebe und René Stockmar unter Mitarbeit von Dirk Setton.
 Marie-Luise Haase und Michael Kohlenbach
 71/209
 N XLIII

Index: tests_svgscripts/test_data/W_I_8_page125.xml
===================================================================
--- tests_svgscripts/test_data/W_I_8_page125.xml	(revision 71)
+++ tests_svgscripts/test_data/W_I_8_page125.xml	(revision 72)
@@ -1,4441 +1,4441 @@
 svgWordPosition
 2019-04-18 08:59:59
 2019-04-23 14:56:32
 2019-05-03 12:23:12
-
+
-
+
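Note on test_lock in the test_page.py hunk above: the assertions suggest the locking scheme is XML-based. Locking writes a lock node with a reference-file child into the page tree, is_locked checks for its presence, and unlock removes it again. A minimal sketch consistent with those assertions; the standalone functions are assumptions, since in the project these are presumably Page methods:

import lxml.etree as ET

def lock(page_tree, reference_file):
    # Record which file holds the lock inside the page's XML tree.
    lock_node = ET.SubElement(page_tree.getroot(), 'lock')
    ET.SubElement(lock_node, 'reference-file').text = reference_file

def unlock(page_tree):
    # Remove any lock nodes again.
    for lock_node in page_tree.xpath('//lock'):
        lock_node.getparent().remove(lock_node)

def is_locked(page_tree):
    return len(page_tree.xpath('//lock')) > 0

# Usage mirroring the test's assertions.
page_tree = ET.ElementTree(ET.Element('page'))
assert not is_locked(page_tree)
lock(page_tree, 'asdf.txt')
assert is_locked(page_tree)
assert page_tree.xpath('//lock/reference-file/text()')[0] == 'asdf.txt'
unlock(page_tree)
assert not is_locked(page_tree)

Storing the lock inside the page document itself keeps the lock state and the data in one file, at the cost that a crashed writer leaves a stale lock node behind.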