Index: Friedrich-Nietzsche-late-work-ontology.ttl
===================================================================
--- Friedrich-Nietzsche-late-work-ontology.ttl	(revision 92)
+++ Friedrich-Nietzsche-late-work-ontology.ttl	(revision 93)
@@ -1,24 +1,26 @@
@prefix dct: <http://purl.org/dc/terms/>.
@prefix homotypic: .
+@prefix stoff: .
+@prefix text: <http://www.e-editiones.ch/ontology/text#>.
@prefix owl: <http://www.w3.org/2002/07/owl#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.
@prefix tln: .

 a owl:Ontology;
    dct:license ;
    dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en;
    dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsche's late work."""@en;
    dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en;
    dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en;
    dct:publisher "Basel University, Switzerland"@en.

tln:inheritOverwritesWord a owl:ObjectProperty ;
    rdfs:subPropertyOf tln:overwritesWord;
    rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ;
    rdfs:comment "The author has used this word in order to overwrite that word."@en ;
    rdfs:isDefinedBy ;
    owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ).
Index: svgscripts/datatypes/word_deletion_path.py
===================================================================
--- svgscripts/datatypes/word_deletion_path.py	(revision 0)
+++ svgscripts/datatypes/word_deletion_path.py	(revision 93)
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This class can be used to represent word deletion paths.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+from lxml import etree as ET
+from os.path import isfile
+from svgpathtools.parser import parse_path
+from svgpathtools.path import Line
+from svgpathtools.path import Path as SVGPath
+import sys
+
+from .path import Path
+from .style import Style
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
+class WordDeletionPath(Path):
+    """
+    This class represents word deletion paths.
+
+    Args:
+        path (svgpathtools.path.Path) svg path representation.
+        style (datatypes.style.Style)
+    """
+    XML_TAG = 'word-deletion-path'
+    BOX_TAG = 'box-path'
+
+    def __init__(self, super_path, style):
+        super(WordDeletionPath,self).__init__(id=super_path.id, path=super_path.path)
+        self.style = style
+
+    @classmethod
+    def create_cls(cls, node, page):
+        """Create and return a cls.
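+
+        Illustrative usage (a sketch; the node content and the style class
+        'st5' are hypothetical, and page is assumed to define that class):
+
+            node = ET.fromstring('<path id="0" d="M 10,20 L 60,20" style-class="st5"/>')
+            deletion_path = WordDeletionPath.create_cls(node, page)
+            # deletion_path.style is the Style resolved from page for 'st5'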
+ """ + path = Path(node=node) + style = Style.create_cls(page, path.style_class) + return cls(path, style) + + @classmethod + def get_semantic_dictionary(cls): + """ Creates and returns a semantic dictionary as specified by SemanticClass. + """ + dictionary = super(WordDeletionPath,cls).get_semantic_dictionary() + dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('style', Style)) + return cls.return_dictionary_after_updating_super_classes(dictionary) + + Index: svgscripts/datatypes/super_page.py =================================================================== --- svgscripts/datatypes/super_page.py (revision 92) +++ svgscripts/datatypes/super_page.py (revision 93) @@ -1,290 +1,290 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a super page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile, basename, dirname from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import sys import warnings from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .mark_foreign_hands import MarkForeignHands from .text_connection_mark import TextConnectionMark from .text_field import TextField from .writing_process import WritingProcess class SuperPage: """ This super class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. 
""" FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition' FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile' PAGE_RECTO = 'recto' PAGE_VERSO = 'verso' STATUS_MERGED_OK = 'faksimile merged' STATUS_POSTMERGED_OK = 'words processed' UNITTESTING = False XML_TAG = 'page' def __init__(self, xml_file, title=None, page_number='', orientation='North', page_type=PAGE_VERSO, should_xml_file_exist=False): self.properties_dictionary = {\ 'faksimile_image': (FaksimileImage.XML_TAG, None, FaksimileImage),\ 'faksimile_svgFile': ('data-source/@file', None, str),\ 'number': ('page/@number', str(page_number), str),\ 'orientation': ('page/@orientation', orientation, str),\ 'page_type': ('page/@pageType', page_type, str),\ 'pdfFile': ('pdf/@file', None, str),\ 'source': ('page/@source', None, str),\ 'svg_file': ('svg/@file', None, str),\ 'svg_image': (SVGImage.XML_TAG, None, SVGImage),\ 'text_field': (FaksimileImage.XML_TAG + '/' + TextField.XML_TAG, None, TextField),\ 'title': ('page/@title', title, str),\ } self.online_properties = [] self.line_numbers = [] self.lines = [] self.mark_foreign_hands = [] self.page_tree = None self.sonderzeichen_list = [] self.style_dict = {} self.text_connection_marks = [] self.word_deletion_paths = [] self.word_insertion_marks = [] self.words = [] self.writing_processes = [] self.xml_file = xml_file if not self.is_page_source_xml_file(): msg = f'ERROR: xml_source_file {self.xml_file} is not of type "{FILE_TYPE_SVG_WORD_POSITION}"' raise Exception(msg) self._init_tree(should_xml_file_exist=should_xml_file_exist) def add_style(self, sonderzeichen_list=[], letterspacing_list=[], style_dict={}, style_node=None): """Adds a list of classes that are sonderzeichen and a style dictionary to page. """ self.sonderzeichen_list = sonderzeichen_list self.letterspacing_list = letterspacing_list self.style_dict = style_dict if style_node is not None: self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') } self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ] self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('letterspacing-list')) ] elif bool(self.style_dict): style_node = ET.SubElement(self.page_tree.getroot(), 'style') if len(self.sonderzeichen_list) > 0: style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list)) if len(self.letterspacing_list) > 0: style_node.set('letterspacing-list', ' '.join(self.letterspacing_list)) for key in self.style_dict.keys(): self.style_dict[key]['name'] = key ET.SubElement(style_node, 'class', attrib=self.style_dict[key]) fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value } fontsizes = sorted(fontsize_dict.values(), reverse=True) # create a mapping between fontsizes and word stages self.fontsizekey2stage_mapping = {} for fontsize_key, value in fontsize_dict.items(): if value >= fontsizes[0]-1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION }) - elif value <= fontsizes[len(fontsizes)-1]+1: + elif value <= fontsizes[-1]+1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION }) else: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION }) def get_biggest_fontSize4styles(self, style_set={}): 
"""Returns biggest font size from style_dict for a set of style class names. [:returns:] (float) biggest font size OR 1 if style_dict is empty """ if bool(self.style_dict): sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True) return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1 else: return 1 def get_line_number(self, y): """Returns line number id for element at y. [:return:] (int) line number id or -1 """ if len(self.line_numbers) > 0: result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ] return result_list[0] if len(result_list) > 0 else -1 else: return -1 def init_all_properties(self, overwrite=False): """Initialize all properties. """ for property_key in self.properties_dictionary.keys(): if property_key not in self.online_properties: self.init_property(property_key, overwrite=overwrite) def init_property(self, property_key, value=None, overwrite=False): """Initialize all properties. Args: property_key: key of property in self.__dict__ value: new value to set to property overwrite: whether or not to update values from xml_file (default: read only) """ if value is None: if property_key not in self.online_properties: xpath, value, cls = self.properties_dictionary.get(property_key) if len(self.page_tree.xpath('//' + xpath)) > 0: value = self.page_tree.xpath('//' + xpath)[0] if value is not None: if cls.__module__ == 'builtins': self.update_tree(value, xpath) self.__dict__.update({property_key: cls(value)}) else: value = cls(node=value)\ if type(value) != cls\ else value self.__dict__.update({property_key: value}) self.__dict__.get(property_key).attach_object_to_tree(self.page_tree) else: self.__dict__.update({property_key: value}) self.online_properties.append(property_key) elif overwrite or property_key not in self.online_properties: xpath, default_value, cls = self.properties_dictionary.get(property_key) if cls.__module__ == 'builtins': self.__dict__.update({property_key: cls(value)}) self.update_tree(value, xpath) else: self.__dict__.update({property_key: value}) self.__dict__.get(property_key).attach_object_to_tree(self.page_tree) self.online_properties.append(property_key) def is_locked(self): """Return true if page is locked. """ return len(self.page_tree.xpath('//metadata/lock')) > 0 def is_page_source_xml_file(self, source_tree=None): """Return true if xml_file is of type FILE_TYPE_SVG_WORD_POSITION. """ if not isfile(self.xml_file): return True if source_tree is None: source_tree = ET.parse(self.xml_file) return source_tree.getroot().find('metadata/type').text == self.FILE_TYPE_SVG_WORD_POSITION def lock(self, reference_file, message=''): """Lock tree such that ids of words etc. correspond to ids in reference_file, optionally add a message that will be shown. """ if not self.is_locked(): metadata = self.page_tree.xpath('./metadata')[0]\ if len(self.page_tree.xpath('./metadata')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'metadata') lock = ET.SubElement(metadata, 'lock') ET.SubElement(lock, 'reference-file').text = reference_file if message != '': ET.SubElement(lock, 'message').text = message def unlock(self): """Lock tree such that ids of words etc. correspond to ids in reference_file, optionally add a message that will be shown. 
""" if self.is_locked(): lock = self.page_tree.xpath('//metadata/lock')[0] lock.getparent().remove(lock) def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]): """Update word ids and attach them to page.page_tree. """ if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_property_dictionary(self, property_key, default_value): """Update properties_dictionary. """ content = self.properties_dictionary.get(property_key) if content is not None: self.properties_dictionary.update({property_key: (content[0], default_value, content[2])}) else: msg = f'ERROR: properties_dictionary does not contain a key {property_key}!' raise Exception(msg) def update_tree(self, value, xpath): """Update tree. """ node_name = dirname(xpath) node = self.page_tree.xpath('//' + node_name)[0]\ if len(self.page_tree.xpath('//' + node_name)) > 0\ else ET.SubElement(self.page_tree.getroot(), node_name) node.set(basename(xpath).replace('@', ''), str(value)) def _init_tree(self, should_xml_file_exist=False): """Initialize page_tree from xml_file if it exists. """ if isfile(self.xml_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(self.xml_file, parser) elif not should_xml_file_exist: self.page_tree = ET.ElementTree(ET.Element('page')) self.page_tree.docinfo.URL = self.xml_file else: msg = f'ERROR: xml_source_file {self.xml_file} does not exist!' raise FileNotFoundError(msg) Index: svgscripts/datatypes/path.py =================================================================== --- svgscripts/datatypes/path.py (revision 92) +++ svgscripts/datatypes/path.py (revision 93) @@ -1,197 +1,197 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent all svg path types. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from svgpathtools.parser import parse_path from svgpathtools.path import Line from svgpathtools.path import Path as SVGPath import sys from .attachable_object import AttachableObject sys.path.append('py2ttl') from class_spec import SemanticClass class Path(AttachableObject,SemanticClass): """ This super class represents all types of svg paths. Args: node (lxml.etree.Element) node, containing information path (svgpathtools.path.Path) svg path representation. """ XML_TAG = 'path' WORD_DELETION_PATH_TAG = 'word-deletion-path' BOX_TAG = 'box-path' def __init__(self, id=0, node=None, path=None, parent_path=None, d_string=None, style_class='', tag=XML_TAG): self.intKeys = [ 'id' ] self.stringKeys = [ 'style_class' ] self.floatKeys = [] self.start_line_number = -1 self.parent_path = parent_path if node is not None: self.id = int(node.get('id')) if bool(node.get('id')) else 0 self.path = parse_path(node.get('d')) if bool(node.get('d')) else None self.d_attribute = node.get('d') self.style_class = node.get('style-class') self.tag = node.tag else: self.tag = tag self.id = id self.path = path if self.path is None\ and d_string is not None\ and d_string != '': self.path = parse_path(d_string) self.d_attribute = self.path.d() if self.path is not None else '' self.style_class = style_class def attach_object_to_tree(self, target_tree): """Attach object to tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree, self.tag) for key in self.floatKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) for key in self.intKeys + self.stringKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(self.__dict__[key])) if self.path is not None: obj_node.set('d', self.path.d()) @classmethod def create_cls(cls, id=0, path=None, style_class='', page=None, tag=XML_TAG): """Create and return a cls. """ if path is not None\ and path.start.imag <= path.end.imag\ and page is not None\ and style_class != ''\ and len(path._segments) == 1\ and type(path._segments[0]) == Line\ and style_class in page.style_dict.keys()\ and 'stroke-width' in page.style_dict[style_class].keys(): # If path is a Line and its style_class specifies a stroke-width, correct path stroke_width_correction = float(page.style_dict[style_class]['stroke-width'])/2 xmin = path.start.real xmax = path.end.real ymin = path.start.imag-stroke_width_correction ymax = path.end.imag+stroke_width_correction #path = parse_path(f'M {xmin}, {ymin} L {xmax}, {ymin} L {xmax}, {ymax} L {xmin}, {ymax} z') path = SVGPath(Line(start=(complex(f'{xmin}+{ymin}j')), end=(complex(f'{xmax}+{ymin}j'))),\ Line(start=(complex(f'{xmax}+{ymin}j')), end=(complex(f'{xmax}+{ymax}j'))),\ Line(start=(complex(f'{xmax}+{ymax}j')), end=(complex(f'{xmin}+{ymax}j'))),\ Line(start=(complex(f'{xmin}+{ymax}j')), end=(complex(f'{xmin}+{ymin}j')))) return cls(id=id, path=path, style_class=style_class, tag=tag) def contains_path(self, other_path): """Returns true if other_path is contained in this path. 
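
        Example (a sketch with hypothetical coordinates):

            outer = Path(d_string='M 0,0 L 100,0 L 100,50 L 0,50 z')
            inner = Path(d_string='M 10,10 L 20,10 L 20,20 L 10,20 z')
            outer.contains_path(inner)   # True: inner bbox lies inside outer bbox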
""" this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox() other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox() return other_xmin >= this_xmin and other_xmax <= this_xmax\ and other_ymin >= this_ymin and other_ymax <= this_ymax def contains_start_of_path(self, other_path): """Returns true if start of other_path is contained in this path. """ this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox() other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox() return other_xmin >= this_xmin and other_xmin < this_xmax\ and other_ymin >= this_ymin and other_ymax <= this_ymax def contains_end_of_path(self, other_path): """Returns true if end of other_path is contained in this path. """ this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox() other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox() return other_xmax >= this_xmin and other_xmax < this_xmax\ and other_ymin >= this_ymin and other_ymax <= this_ymax @classmethod def create_path_from_transkription_position(cls, transkription_position, tr_xmin=0.0, tr_ymin=0.0): """Create a .path.Path from a .transkription_position.TranskriptionPosition. """ if len(transkription_position.positional_word_parts) > 0: first_pwp = transkription_position.positional_word_parts[0] last_pwp = transkription_position.positional_word_parts[len(transkription_position.positional_word_parts)-1] xmin = tr_xmin + first_pwp.left xmax = tr_xmin + last_pwp.left + last_pwp.width ymin = tr_ymin + sorted(pwp.top for pwp in transkription_position.positional_word_parts)[0] ymax = tr_ymin + sorted([pwp.bottom for pwp in transkription_position.positional_word_parts], reverse=True)[0] else: xmin = tr_xmin + transkription_position.left xmax = xmin + transkription_position.width ymin = tr_ymin + transkription_position.top ymax = ymin + transkription_position.height word_path = parse_path('M {}, {} L {}, {} L {}, {} L {}, {} z'.format(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax)) return cls(path=word_path) def do_paths_intersect(self, other_path): """Returns true if paths intersect, false if not or if there was an exception. """ try: return self.path.intersect(other_path.path, justonemode=True) except AssertionError: return False def get_median_y(self, tr_ymin=0.0): """Return the median of ymin + ymax. """ return (self.path.bbox()[2] + self.path.bbox()[3])/2 - tr_ymin def get_x(self, tr_xmin=0.0): """Return xmin. """ return self.path.bbox()[0] - tr_xmin @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {'d_attribute': { 'class': str, 'cardinality': 0,\ 'name': 'hasDAttribute', 'label': 'svg path has d attribute',\ 'comment': 'The d attribute defines a path to be drawn.'}} - properties.update(cls.create_semantic_property_dictionary('style_class', str)) + #properties.update(cls.create_semantic_property_dictionary('style_class', str)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def is_partially_contained_by(self, other_path): """Returns true if other_path containes this path partially. 
""" return other_path.contains_start_of_path(self) or other_path.contains_end_of_path(self) Index: svgscripts/datatypes/word.py =================================================================== --- svgscripts/datatypes/word.py (revision 92) +++ svgscripts/datatypes/word.py (revision 93) @@ -1,800 +1,828 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy import inspect from lxml import etree as ET from operator import attrgetter import re import string import sys import warnings from .box import Box from .matrix import Matrix from .path import Path from .simple_word import SimpleWord from .style import Style +from .word_deletion_path import WordDeletionPath from .word_position import WordPosition from .transkription_position import TranskriptionPosition from .writing_process import WritingProcess SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation) def execute_function_on_parts(word_parts, func_name): """Execute function on parts and add those parts instead of original word to word_parts. :return: new word_parts, output from func """ copy_parts = word_parts[:] for word in word_parts: output = eval('word.{0}()'.format(func_name)) if len(word.word_parts) > 0: for part_word in word.word_parts: copy_parts.insert(copy_parts.index(word), part_word) copy_parts.remove(word) word.word_parts = [] return copy_parts, output def update_transkription_position_ids(word): """Update transkription_position' ids according to index. """ word_part_ids = [ wp.id for wp in word.word_parts ] if len(word_part_ids) != len(set(word_part_ids)): for id, wp in enumerate(word.word_parts): wp.id = id for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))): transkription_position.id = index transkription_position.has_box = None transkription_position.deleted = False class Word(SimpleWord): """ This class represents a word. 
""" COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ] APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' } DATA = 'debug-data' + RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText'] XML_TAG = 'word' XML_EARLIER_VERSION = 'earlier-version' XML_OVERWRITES = 'overwrites' XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\ 'isDeletionOfWord': 'deletesEarlierPart',\ 'isExtensionOfWord': 'extendsEarlierVersion',\ 'isTransformationOfWord': 'transformsEarlierPart' } def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None): super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.corrections = [] self.deleted = deleted + self.deletion_paths = [] self.debug_container = {} self.debug_msg = None self.earlier_version = earlier_version self.edited_text = None self.isClarificationOfWord = None self.isDeletionOfWord = None self.isExtensionOfWord = None self.isTransformationOfWord = None if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0: self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ]) self.overwrites_word = None self.styles = styles\ if styles is not None\ else [] self.verified = None self.writing_process_id = writing_process_id self.writing_processes = [] self.word_insertion_mark = None self.word_box = None self.word_parts = word_parts if word_parts is not None else [] self.word_part_objs = word_part_objs if word_part_objs is not None else [] - self.IS_DEBUG_WORD = (self.text == 'Übertreibung' and self.line_number == 8) + + def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): + """Add a word deletion path to word. + """ + if len(self.word_parts) > 0: + for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) + elif self.deleted and len(self.transkription_positions) > 0: + word_path = Path.create_path_from_transkription_position(self.transkription_positions[0],\ + tr_xmin=tr_xmin, tr_ymin=tr_ymin) + self.deletion_paths = [ deletion_path for deletion_path in deletion_paths\ + if do_paths_intersect_saveMode(deletion_path, word_path) ] + + def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. 
""" word_node = super(Word,self).attach_word_to_tree(target_tree) if self.deleted is not None: word_node.set('deleted', str(self.deleted).lower()) if self.verified is not None: word_node.set('verified', str(self.verified).lower()) if self.edited_text is not None: word_node.set('edited-text', self.edited_text) if self.writing_process_id > -1: word_node.set('writing-process-id', str(self.writing_process_id)) for index, word_part in enumerate(self.word_parts): word_part.id = index word_part.attach_word_to_tree(word_node) if self.earlier_version is not None: earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION) self.earlier_version.attach_word_to_tree(earlier_node) if self.overwrites_word is not None\ and len(self.overwrites_word.transkription_positions) > 0: overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES) self.overwrites_word.attach_word_to_tree(overwrite_node) if self.word_box is not None: self.word_box.attach_object_to_tree(word_node) if len(self.corrections) > 0: word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ]))) for key in self.XML_CORRECTION_DICT.keys(): if self.__dict__[key] is not None: word_node.set(self.XML_CORRECTION_DICT[key], 'true') return word_node def belongs_to_multiple_writing_processes(self, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. """ if len(self.word_parts) > 0 and include_parts: return len(set(word.writing_process_id for word in self.word_parts)) > 1 return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1 def set_parent_word_writing_process_id(self): """Set writing_process_id for parent word. """ ids = set(word.transkription_positions[0].style for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None) if len(ids) > 1: self.writing_process_id = max([style.writing_process_id for style in ids]) if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\ for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\ > 1: self.writing_process_id += 1 @classmethod def create_cls(cls, word_node): """Creates a word from a (lxml.Element) node. 
[:return:] Word """ cls = super(Word,cls).create_cls(word_node) cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 cls.split_strings = None cls.join_string = word_node.get('join') if bool(word_node.get('split')): cls.split_strings = word_node.get('split').split(' ') if ''.join(cls.split_strings) != cls.text: error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ format(word_node.getroottree().docinfo.URL, str(cls.id))\ + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ + 'Text attribute: "{0}".\n'.format(cls.text) raise Exception(error_msg) cls.verified = word_node.get('verified') == 'true'\ if bool(word_node.get('verified')) else None cls.deleted = word_node.get('deleted') == 'true'\ if bool(word_node.get('deleted')) else None cls.edited_text = word_node.get('edited-text') cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ] if bool(word_node.get('corrections')): for index in [ int(i) for i in word_node.get('corrections').split(' ') ]: if index < len(cls.word_parts): cls.corrections.append(cls.word_parts[index]) cls.earlier_version = None if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0: cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0] for key_value in cls.XML_CORRECTION_DICT.values(): if word_node.get(key_value) == 'true': cls.__dict__[key_value] = True if cls.earlier_version is not None: for word_part in cls.word_parts: for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\ and len(cls.word_parts) <= len(cls.earlier_version.word_parts): try: word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id] except Exception: msg = f'{cls.id} {cls.text}: {word_part.id}' raise Exception(msg) for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls.earlier_version for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\ if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\ else None cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\ if len(word_node.xpath('./' + Box.XML_TAG)) > 0\ else None return cls def create_earlier_version(self, root_word=None, id=0): """Create an earlier version of word. 
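
        Example (a sketch; assumes the word has been partitioned into parts):

            earlier = word.create_earlier_version()
            # earlier.text joins the earlier readings of all word parts:
            # deleted parts keep their text, overwriting parts are replaced
            # by the words they overwrite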
""" if root_word is None: root_word = self root_word.set_parent_word_writing_process_id() word_parts = [] non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\ if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ] non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts) if non_single_punctuation_word_parts_length > 0\ and len([ word_part for word_part in non_single_punctuation_word_parts\ if word_part.deleted ])\ == non_single_punctuation_word_parts_length: self.deleted = True for word_part in non_single_punctuation_word_parts: word_part.deleted = False for id, word_part in enumerate(self.word_parts): earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id) if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) elif word_part.overwrites_word is not None\ and (len(word_part.transkription_positions) > 0\ and word_part.overwrites_word.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style\ != word_part.overwrites_word.transkription_positions[0].style): word_part.overwrites_word.id = word_part.id word_parts.append(word_part.overwrites_word) word_part.isTransformationOfWord = word_part.overwrites_word #print(f'transform: {self.text}') if word_part not in self.corrections: self.corrections.append(word_part) elif root_word.writing_process_id > -1\ and (len(word_part.transkription_positions) > 0\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style.writing_process_id\ == root_word.writing_process_id): word_part.extendsEarlierVersion = True #print('extends') if word_part not in self.corrections: self.corrections.append(word_part) else: if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) else: #print(f'default: {self.text}') word_parts.append(earlierWordPart) text = ''.join([ word.text for word in word_parts ])\ if len(word_parts) > 0\ else self.text if len(word_parts) == 1: self.transkription_positions += word_parts[0].transkription_positions self.faksimile_positions += word_parts[0].faksimile_positions word_parts = [] new_transkription_positions = copy.deepcopy(self.transkription_positions) if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None: writing_process_id = self.transkription_positions[0].style.writing_process_id for new_tp in new_transkription_positions: new_tp.style.writing_process_id = writing_process_id return Word(id=id, text=text, transkription_positions=new_transkription_positions,\ faksimile_positions=self.faksimile_positions, line_number=self.line_number,\ word_parts=word_parts) def create_correction_history(self, page=None, box_style=None): """Create correction history. 
""" if self.word_box is not None: manuscript = self.transkription_positions[0].style.manuscript\ if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None\ else None style = Style() if box_style is not None: style = box_style if page is not None: style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript) for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]: style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions) for transkription_position in transkription_positions: transkription_position.style = style self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\ line_number=self.line_number) for word_part in self.word_parts: word_part.create_correction_history(page=page, box_style=box_style) if len(self.word_parts) > 0: earlier_version = self.create_earlier_version() extending_words = self._get_parts_with_property_key('extendsEarlierVersion') if len(extending_words) > 0: for word in extending_words: word.isExtensionOfWord = earlier_version if self.has_mixed_status('deleted', include_parts=True): self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ]) if len(self.corrections) > 0: self.earlier_version = earlier_version @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. [:return:] Word """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number text = word_node.get('text') deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true' transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ] word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\ if len(word_node.findall('.//' + Word.DATA)) > 0\ else [ item.attrib for item in word_node.findall('.//part')] return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions, word_part_objs=word_part_objs) elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file WIDTH = 5 TOPCORRECTION = 2.0 FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize height = height x = round(float(word_part_objs[0]['x']), 3) if(page is not None and bool(page.style_dict)): HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' ')) biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set) height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3) TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size if endSign is not None and '%' in endSign: lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\ for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\ if 
bool(page.style_dict[key].get('font-size'))] lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1 endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR elif endSign is not None and '%' in endSign: endX = float(endX) + WIDTH bottom = round(float(word_part_objs[0]['y']), 3) y = round(bottom - height + TOPCORRECTION, 3) width = round(float(endX) - x, 3) transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ] text = ''.join([ dict['text'] for dict in word_part_objs]) line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs) word.debug_msg = debug_msg return word else: error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = super(Word,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\ cardinality=1, cardinality_restriction='minCardinality',\ name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\ name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.')) - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deleted', bool,\ - name='isWordDeleted', label='has word been deleted', comment='Word has been deleted by the author.')) + dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\ + name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\ + comment='Word has been deleted by the author using a deletion path.')) + #dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deleted', bool,\ + # name='isWordDeleted', label='has word been deleted', comment='Word has been deleted by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\ name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\ name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\ name='isClarificationOfWord', label='word is a clarification of word',\ comment='The author has used this part of the word in order to clarify the appearance of that word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\ name='isDeletionOfWord', label='word is a deletion of word',\ comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\ name='isExtensionOfWord', label='word is a extension of 
word',\ comment='The author has used this part of a word in order to extend an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\ name='isTransformationOfWord', label='word is a transformation of word',\ comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\ name='overwritesWord', label='word overwrites word',\ comment='The author has used this word in order to overwrite that word.')) # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING, # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class. dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\ name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\ subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)) super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\ name='isCorrectionOfWord', label='word is a correction of word',\ comment='The author has used this word in order to correct that word.') for key in cls.XML_CORRECTION_DICT.keys(): correction_dict = dictionary[cls.PROPERTIES_KEY].get(key) correction_dict.update(super_property_dictionary) dictionary[cls.PROPERTIES_KEY].update({key: correction_dict}) return cls.return_dictionary_after_updating_super_classes(dictionary) def has_mixed_status(self, property_key, include_parts=False, concerns_word=True): """Returns true if transkription_positions have mixed status concerning the property_key in their __dict__. """ if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions): return False if len(self.word_parts) > 0 and include_parts: if concerns_word: if False in set(property_key in word.__dict__.keys() for word in self.word_parts): return False return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1 else: return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\ if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1 return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1 def init_word(self, page): """Initialize word with objects from page. """ super(Word,self).init_word(page) if self.writing_process_id > -1: self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ] writing_processes = self.writing_processes for word_part in self.word_parts: word_part.init_word(page) self.lines += word_part.lines self.writing_processes += word_part.writing_processes self.lines = [ line for line in set(self.lines) ] self.writing_processes = [ wp for wp in set(self.writing_processes)] if self.overwrites_word is not None: self.overwrites_word.init_word(page) if self.earlier_version is not None: if self.earlier_version.writing_process_id == -1: self.earlier_version.writing_process_id = self.writing_process_id-1 if self.earlier_version.line_number == -1: self.earlier_version.line_number = self.line_number self.earlier_version.init_word(page) def join(self, other_word, append_at_end_of_new_word=True): """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. 
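
        Example (a sketch with hypothetical words):

            word = Word(text='Über')
            word.join(Word(text='treibung'))
            # word.text == 'Übertreibung'; any transkription positions of the
            # joined word are appended (or prepended) and renumbered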
""" if append_at_end_of_new_word: self.text = self.text + other_word.text for position in other_word.transkription_positions: position.id = str(len(self.transkription_positions)) self.transkription_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 self.simplify_transkription_positions() def partition_according_to_deletion(self): """Partition a word according to its transkription_positions' deletion status ->split word and add partial words as its parts. """ if self.has_mixed_status('deleted'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.deleted != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.deleted if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] self.line_number = -1 self.deleted = False elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion') elif not self.deleted\ and len(self.transkription_positions) > 0\ and self.transkription_positions[0].deleted: self.deleted = True def partition_according_to_writing_process_id(self): """Partition a word according to its transkription_positions' writing_process_ids ->split word and add partial words as its parts. 
""" if self.belongs_to_multiple_writing_processes(): last_writing_process_id = -1 transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.writing_process_id != last_writing_process_id\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_writing_process_id = transkription_position.writing_process_id if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id') if self.belongs_to_multiple_writing_processes(include_parts=True): self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0] elif len(self.transkription_positions) > 0: self.writing_process_id = self.transkription_positions[0].writing_process_id def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False): """Determines whether word is over a word box. """ word_over_box = None if len(self.word_parts) > 0: for word in self.word_parts: current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None)) if current_word is not None and current_word.word_box is not None: word_over_box = current_word else: new_tp_dict = {} for index, transkription_position in enumerate(self.transkription_positions): if previous_word_has_box and index == 0: if len(transkription_position.positional_word_parts) > 0: transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2 #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}') else: transkription_position.left += 1 word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) containing_boxes = [ box_path for box_path in box_paths\ if word_path.is_partially_contained_by(box_path)\ or box_path.do_paths_intersect(word_path) ] if len(containing_boxes) > 0: if previous_word_has_box: print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}') self._set_box_to_transkription_position(containing_boxes[0], word_path,\ transkription_position, new_tp_dict, tr_xmin) box_paths.remove(containing_boxes[0]) for replace_tp in new_tp_dict.keys(): for tp in new_tp_dict.get(replace_tp): self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp) self.transkription_positions.remove(replace_tp) word_over_box = self._get_partial_word_over_box() update_transkription_position_ids(self) return word_over_box def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def set_writing_process_id_to_transkription_positions(self, page): """Determines the writing process id of the transkription_positions. 
""" for transkription_position in self.transkription_positions: if len(transkription_position.positional_word_parts) > 0: for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): if font_key in page.fontsizekey2stage_mapping.keys(): transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) def simplify_transkription_positions(self): """Merge transkription_positions if possible. """ index = len(self.transkription_positions)-1 while index > 0\ and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]: current_tp = self.transkription_positions[index] index -= 1 previous_tp = self.transkription_positions[index] if previous_tp.is_mergebale_with(current_tp): positional_word_parts = previous_tp.positional_word_parts positional_word_parts += current_tp.positional_word_parts transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id) if len(transkription_positions) == 1: transkription_positions[0].writing_process_id = previous_tp.writing_process_id\ if previous_tp.writing_process_id != -1\ else current_tp.writing_process_id self.transkription_positions.pop(index+1) self.transkription_positions[index] = transkription_positions[0] #print(self.text, len(self.transkription_positions)) def split(self, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. """ previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split') next_text = ''.join([ pwp.text for pwp in next_pwps ]) nextWord = 
Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) return previousWord, currentWord, nextWord def split_according_to_status(self, status, splits_are_parts=False): """Split a word according to its transkription_positions' text. :return: a list of new word.Word """ new_words = [] if self.has_mixed_status(status): last_status = None transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.__dict__[status] != last_status\ and len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.__dict__[status] if len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) if splits_are_parts: self.word_parts += new_words if len(self.word_parts) > 0: self.transkription_positions = [] return new_words def undo_partitioning(self): """Undo partitioning. """ if len(self.word_parts) > 0: for word_part in self.word_parts: word_part.undo_partitioning() if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]): self.transkription_positions += word_part.transkription_positions self.earlier_version = None self.edited_text = None self.word_box = None self.word_parts = [] self.corrections = [] self.earlier_versions = [] self.box_paths = [] def _create_new_word(self, transkription_positions, status, new_id=0): """Create a new word from self and transkription_positions. """ newWord = Word(id=new_id, transkription_positions=transkription_positions) for key in self.COPY_PROPERTY_KEY: if key != status and key in self.__dict__.keys(): newWord.__dict__[key] = self.__dict__[key] if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys(): newWord.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(transkription_positions[0].__dict__[status]) else: newWord.__dict__[status] = transkription_positions[0].__dict__[status] return newWord def _get_parts_with_property_key(self, property_key): """Return a list of word_parts with property == property_key. """ word_parts = [] for word_part in self.word_parts: if property_key in word_part.__dict__.keys(): word_parts.append(word_part) else: word_parts += word_part._get_parts_with_property_key(property_key) return word_parts def _get_partial_word_over_box(self): """Partition a word according to its transkription_positions' has_box ->split word and add partial words as its parts. 
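
        Example (a sketch): a word whose single transkription position carries
        a box is itself returned and receives that box as its word_box:

            word_over_box = word._get_partial_word_over_box()
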
:return: word over box or self """ word_over_box = None if self.has_mixed_status('has_box'): transkription_positions = [] last_word_box = None for transkription_position in self.transkription_positions: if transkription_position.has_box != last_word_box\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_word_box is not None: word_over_box = newWord word_over_box.word_box = last_word_box transkription_positions = [] transkription_positions.append(transkription_position) last_word_box = transkription_position.has_box if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_word_box is not None: word_over_box = newWord word_over_box.word_box = last_word_box self.transkription_positions = [] elif len(self.word_parts) > 0: #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box') for word_part in self.word_parts: if word_over_box is None: word_over_box = word_part._get_partial_word_over_box() else: break elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1: word_over_box = self word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box return word_over_box def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin): """Set box_path on the transkription_position that is contained by box_path. Create new transkription_positions by splitting old ones if necessary and add them to new_transkription_positions_dictionary. """ if box_path.contains_path(word_path): transkription_position.has_box = box_path elif box_path.contains_start_of_path(word_path): split_position = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[0].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path elif box_path.contains_end_of_path(word_path): split_position = box_path.path.bbox()[0] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path else: # box_path in the middle of word_path split_position1 = box_path.path.bbox()[0] - tr_xmin split_position2 = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position1, split_position2) if len(new_tps) >= 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path + +def do_paths_intersect_saveMode(mypath1, mypath2): + """Returns true if paths intersect, false if not or if there was an exception.
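+
+ A minimal sketch (hypothetical paths; assumes the keyword construction
+ Path(path=...) that is also used elsewhere in this codebase):
+
+ >>> from svgpathtools.path import Line, Path as SVGPath
+ >>> from datatypes.path import Path
+ >>> horizontal = Path(path=SVGPath(Line(start=0+0j, end=10+0j)))
+ >>> vertical = Path(path=SVGPath(Line(start=5-5j, end=5+5j)))
+ >>> bool(do_paths_intersect_saveMode(horizontal, vertical))
+ True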
+ """ + try: + return mypath1.path.intersect(mypath2.path, justonemode=True)\ + or mypath1.is_partially_contained_by(mypath2) + except AssertionError: + return False + Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 92) +++ svgscripts/datatypes/page.py (revision 93) @@ -1,280 +1,296 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import sys import warnings from .box import Box from .color import Color from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .faksimile_position import FaksimilePosition from .lineNumber import LineNumber from .line import Line from .mark_foreign_hands import MarkForeignHands from .matrix import Matrix from .path import Path from .positional_word_part import PositionalWordPart from .super_page import SuperPage from .style import Style from .text_connection_mark import TextConnectionMark from .text_field import TextField from .transkriptionField import TranskriptionField from .writing_process import WritingProcess from .word import Word +from .word_deletion_path import WordDeletionPath from .word_insertion_mark import WordInsertionMark sys.path.append('py2ttl') from class_spec import SemanticClass FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK class Page(SemanticClass,SuperPage): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. """ UNITTESTING = False - def __init__(self, xml_source_file, faksimile_image=None, faksimile_svgFile=None): + def __init__(self, xml_source_file, faksimile_image=None, faksimile_svgFile=None, add_deletion_paths_to_words=True): super(Page,self).__init__(xml_source_file) self.update_property_dictionary('faksimile_image', faksimile_image) self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile) self.init_all_properties() self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.init_node_objects() + if add_deletion_paths_to_words: + self.add_deletion_paths_to_words() + + def add_deletion_paths_to_words(self): + """Add deletion paths to words. 
+ """ + if (self.svg_file is not None and isfile(self.svg_file))\ + or (self.source is not None and isfile(self.source)): + svg_file = self.svg_file if self.svg_file is not None else self.source + transkription_field = TranskriptionField(svg_file) + words = [ word for word in self.words if word.deleted or True in [ part.deleted for part in word.word_parts ]] + for word in words: + word.add_deletion_paths(self.word_deletion_paths, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin) @classmethod def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None): """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT. [optional: instantiation depends on the fulfilment of a status_contains and/or on the selection of some words by a word_selection_function]. """ source_tree = ET.parse(xml_file) if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION: page = cls(xml_file) if word_selection_function is None or len(word_selection_function(page.words)) > 0: return [ page ] else: return [] elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: pages = [] xpath = '//page/@output' if status_contains != '' and status_not_contain != '': xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain) elif status_contains != '': xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains) elif status_not_contain != '': xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain) for xml_source_file in source_tree.xpath(xpath): if isfile(xml_source_file): pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function) return pages else: return [] @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'number': { 'class': str, 'cardinality': 1},\ 'faksimile_image': { 'class': FaksimileImage, 'cardinality': 1},\ 'orientation': { 'class': str, 'cardinality': 1},\ 'svg_image': { 'class': SVGImage, 'cardinality': 1}} properties.update(cls.create_semantic_property_dictionary('text_field', TextField,\ cardinality=1, name='pageIsOnTextField', label='page is on text field',\ comment='Relates a page to the text field on a faksimile image.')) for key in [ 'lines', 'words', 'writing_processes', 'word_deletion_paths', 'word_insertion_marks']: properties.update(cls.create_semantic_property_dictionary(key, list)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def init_node_objects(self): """Initialize all node objects. 
""" self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ] self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ] self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] - self.word_deletion_paths = [ Path(node=node) for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG) ] + self.word_deletion_paths = [ WordDeletionPath.create_cls(node, self) for node in self.page_tree.xpath('//' + WordDeletionPath.XML_TAG) ] if self.faksimile_image is not None and self.text_field is not None: for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks: simple_word.init_word(self) for wim in self.word_insertion_marks: if wim.line_number > -1: wim.line = [ line for line in self.lines if line.id == wim.line_number ][0] def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]): """Update word ids and attach them to page.page_tree. """ if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None): """Update the data source of page. """ if faksimile_svgFile is not None: self.faksimile_svgFile = faksimile_svgFile data_node = self.page_tree.xpath('.//data-source')[0]\ if len(self.page_tree.xpath('.//data-source')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'data-source') data_node.set('file', self.faksimile_svgFile) if xml_correction_file is not None: data_node.set('xml-corrected-words', xml_correction_file) def update_line_number_area(self, transkription_field, svg_tree=None): """Determines the width of the area where the line numbers are written in the page.source file. 
""" THRESHOLD = 0.4 if svg_tree is None: svg_tree = ET.parse(self.source) if len(self.line_numbers) > 1: line_number = self.line_numbers[9]\ if transkription_field.is_page_verso() and len(self.line_numbers) > 8\ else self.line_numbers[1] ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\ if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\ and LineNumber.IS_A_LINE_NUMBER(item)\ and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ] if len(ln_nodes) > 0: matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform')) if transkription_field.is_page_verso(): transkription_field.add_line_number_area_width(matrix.getX()) elif self.svg_file is not None and isfile(self.svg_file): svg_path_tree = ET.parse(self.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } svg_x = matrix.getX() svg_y = self.line_numbers[1].bottom + transkription_field.ymin use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) if len(d_strings) > 0 and d_strings[0] != '': path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin transkription_field.add_line_number_area_width(matrix.getX() + width) def update_page_type(self, transkription_field=None): """Adds a source to page and attaches it to page_tree. """ if transkription_field is None: if self.source is None or not isfile(self.source): raise FileNotFoundError('Page does not have a source!') transkription_field = TranskriptionField(self.source) self.page_type = Page.PAGE_VERSO\ if transkription_field.is_page_verso()\ else Page.PAGE_RECTO self.page_tree.getroot().set('pageType', self.page_type) def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False): """Update styles of words and add them to their transkription_positions. Args: add_to_parents: Add styles also to word (and if not None to manuscript). partition_according_to_styles: Partition word if its transkription_positions have different styles. 
""" style_dictionary = {} if words is None: words = self.words for word in words: if len(word.word_parts) > 0: self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\ add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles) for transkription_position in word.transkription_positions: if len(transkription_position.positional_word_parts) > 0: style_class = transkription_position.positional_word_parts[0].style_class writing_process_id = -1 for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]: writing_process_id = self.fontsizekey2stage_mapping.get(font_key) - style_class_key = (Style.remove_irrelevant_style_keys(style_class, self), writing_process_id) + style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id) if create_css: if style_dictionary.get((style_class_key, word.deleted)) is None: + color = word.deletion_paths[0].style.color\ + if len(word.deletion_paths) > 0 else None style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\ - create_css=create_css, deleted=word.deleted ) + create_css=create_css, deletion_color=color ) transkription_position.style = style_dictionary[(style_class_key, word.deleted)] #print(style_dictionary[(style_class_key, word.deleted)]) else: if style_dictionary.get(style_class_key) is None: style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css) style_dictionary[style_class_key].writing_process_id = style_class_key[1] transkription_position.style = style_dictionary[style_class_key] if add_to_parents and transkription_position.style not in word.styles: word.styles.append(transkription_position.style) if partition_according_to_styles: word.split_according_to_status('style', splits_are_parts=True) if manuscript is not None\ and add_to_parents: manuscript.update_styles(*style_dictionary.values()) Index: svgscripts/datatypes/style.py =================================================================== --- svgscripts/datatypes/style.py (revision 92) +++ svgscripts/datatypes/style.py (revision 93) @@ -1,193 +1,202 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent the style of a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy from lxml import etree as ET import re import sys from .color import Color sys.path.append('py2ttl') from class_spec import SemanticClass class Style(SemanticClass): """ This class represents the style of a word. 
Args: manuscript: an ArchivalManuscriptUnity """ NIETSCHES_FONTS = { 'german': 'deutsche Schreibschrift', 'latin': 'lateinische Schreibschrift' } COLOR_KEYS = [ 'black', 'red', 'blue', 'green', 'grey' ] RELEVANT_STYLE_KEYS = [ 'font-family', 'fill', 'stroke' ] ADDITIONAL_STYLE_KEYS = [ 'font-size' ] WRITING_INSTRUMENTS = { (COLOR_KEYS[0], False): 'schwarze Tinte',\ (COLOR_KEYS[0], True): 'Bleistift',\ (COLOR_KEYS[4], True): 'Bleistift',\ + (COLOR_KEYS[4], False): 'Bleistift',\ (COLOR_KEYS[1], False): 'braune Tinte',\ (COLOR_KEYS[1], True): 'Rotstift',\ (COLOR_KEYS[2], False): 'violette Tinte',\ (COLOR_KEYS[2], True): 'Blaustift',\ - (COLOR_KEYS[3], False): '„Tinte der letzten Korrektur“'} + (COLOR_KEYS[3], False): '„Tinte der letzten Korrektur“',\ + (COLOR_KEYS[3], True): '„Tinte der letzten Korrektur“'} - def __init__(self, manuscript=None, writing_process_id=-1, extended_styles=False, deleted=False): + def __init__(self, manuscript=None, writing_process_id=-1, extended_styles=False, deletion_color=None): self.color = Color.create_cls(manuscript=manuscript) self.css_styles = [] - self.deleted = deleted + self.css_string = None + self.deletion_color = deletion_color self.is_german = True self.font = self.NIETSCHES_FONTS['german'] self.font_family = 'Weidemann-Book' - self.font_size = None + self.font_size = '' self.manuscript = manuscript self.relevant_key_map = {} relevant_style_keys = self.RELEVANT_STYLE_KEYS + self.ADDITIONAL_STYLE_KEYS\ if extended_styles else self.RELEVANT_STYLE_KEYS for key in relevant_style_keys: if not key.startswith('font'): self.relevant_key_map.update({key: self.set_color}) elif key == 'font-family': self.relevant_key_map.update({key: self.set_font}) elif key == 'font-size': self.relevant_key_map.update({key: self.set_size}) self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, False)] self.writing_process_id = writing_process_id def create_a_copy_wo_writing_process_id(self): new_self = copy.deepcopy(self) new_self.writing_process_id = -1 return new_self def create_a_copy(self, reduce_writing_process_id=False): writing_process_id = self.writing_process_id\ if not reduce_writing_process_id\ else self.writing_process_id-1 copy = Style(manuscript=self.manuscript, writing_process_id=writing_process_id) copy.color = self.color copy.font_family = self.font_family copy.process_style_classes() if copy.manuscript is not None: copy.manuscript.update_styles(copy) return copy def create_css_styles(self): """Create css styles.
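
        A sketch of the result (the deletion color is taken from the tests; the
        default color is assumed to be black):

        >>> from datatypes.color import Color
        >>> style = Style(deletion_color=Color.create_cls(hex_color='#009CDE'))
        >>> style.create_css_styles()
        >>> 'text-decoration:line-through;' in style.css_string
        True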
""" - if self.deleted: - self.css_styles.append('text-decoration: line-through;') + if self.deletion_color is not None: + self.css_styles.append('text-decoration:line-through;') + self.css_styles.append(f'text-decoration-color:{self.deletion_color.hex_color};') + self.css_styles.append(f'-webkit-text-decoration-color:{self.deletion_color.hex_color};') if self.font_family.endswith('Bold'): - self.css_styles.append(f'font-weight: bold;') - if self.font_size is not None: - self.css_styles.append(f'font-size: {self.font_size};') - self.css_styles.append(f'color: {self.color.hex_color};') + self.css_styles.append(f'font-weight:bold;') + if self.font_size != '': + self.css_styles.append(f'font-size:{self.font_size};') + self.css_styles.append(f'color:{self.color.hex_color};') + self.css_string = ''.join(self.css_styles) @classmethod - def create_cls(cls, page, style_string, manuscript=None, create_css=False, deleted=False): + def create_cls(cls, page, style_string, manuscript=None, create_css=False, deletion_color=None): """Creates a Style from a style_string. :return: (datatypes.style) Style """ - style = cls(manuscript=manuscript, extended_styles=create_css, deleted=deleted) + style = cls(manuscript=manuscript, extended_styles=create_css, deletion_color=deletion_color) style_dict = { key: key_dict for key, key_dict in page.style_dict.items()\ if any(relevant_key in key_dict.keys() for relevant_key in style.relevant_key_map.keys()) } for style_key in style_string.split(' '): if style_key in style_dict.keys(): dictionary = style_dict[style_key] for key, set_function in [ (key, func) for key, func in style.relevant_key_map.items() if key in dictionary.keys() ]: if callable(set_function): set_function(dictionary[key]) style.process_style_classes() if create_css: style.create_css_styles() return style @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ properties = {} properties.update(cls.create_semantic_property_dictionary('font', str, cardinality=1,\ name='styleHasFont', label='style has font', comment='Connects a style with the kind of font Nietzsche used in writing.')) properties.update(cls.create_semantic_property_dictionary('writing_instrument', str, cardinality=1,\ name='styleHasWritingInstrument', label='style has writing instrument', comment='Connects a style with the description of a writing instrument.')) properties.update(cls.create_semantic_property_dictionary('color', Color, cardinality=1,\ name='styleHasColor', label='style has color', comment='Connects a style with a color.')) - properties.update(cls.create_semantic_property_dictionary('css_styles', str, cardinality=1,\ - subPropertyOf=cls.STOFF_STYLE_HAS_CSS_URL_STRING,cardinality_restriction='minCardinality',\ + #properties.update(cls.create_semantic_property_dictionary('css_styles', str,\ + properties.update(cls.create_semantic_property_dictionary('css_string', str,\ + subPropertyOf=cls.STOFF_STYLE_HAS_CSS_URL_STRING,\ name='styleHasCSS', label='style has css', comment='Connects a style with CSS style.')) dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties } return cls.return_dictionary_after_updating_super_classes(dictionary) def process_style_classes(self): """Infere writing instrument from font-family and color. 
""" if self.font_family.startswith('NewsGothic'): self.is_german = False self.font = self.NIETSCHES_FONTS['latin'] if self.color.name in self.COLOR_KEYS: self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, self.font_family.endswith('Bold'))] def set_color(self, hex_color: str): - self.color = Color.create_cls(hex_color=hex_color, manuscript=self.manuscript) + if hex_color != 'none': + self.color = Color.create_cls(hex_color=hex_color, manuscript=self.manuscript) def set_font(self, font_family: str): self.font_family = font_family def set_size(self, font_size: str): self.font_size = font_size @classmethod def remove_irrelevant_style_keys(cls, style_string, page, extended_styles=False) -> str: """Return a style_string without irrelevant style keys. """ relevant_style_keys = cls.RELEVANT_STYLE_KEYS + cls.ADDITIONAL_STYLE_KEYS\ if extended_styles else cls.RELEVANT_STYLE_KEYS return ' '.join(sorted( style_key for style_key in style_string.split(' ')\ if len(\ [ key for key in page.style_dict[style_key].keys()\ if key in relevant_style_keys ]\ ) > 0 )) def __eq__(self, other): """Returns true if self is qualitatively identical to other. Reason: For qualities, the idea of numerical identity is silly. """ if other is None: return False return self.color == other.color\ and self.font_family == other.font_family\ and self.writing_process_id == other.writing_process_id\ - and self.css_styles == other.css_styles + and self.css_styles == other.css_styles\ + and self.font_size == other.font_size def __hash__(self): """Return a hash value for self. """ return hash((self.color.__hash__, self.font_family, self.writing_process_id)) Index: svgscripts/process_words_post_merging.py =================================================================== --- svgscripts/process_words_post_merging.py (revision 92) +++ svgscripts/process_words_post_merging.py (revision 93) @@ -1,481 +1,473 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path as PathlibPath from progress.bar import Bar import re import shutil import string from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.path import Path as SVGPath from svgpathtools.path import Line import sys import tempfile import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.box import Box from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.path import Path from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField -from datatypes.word import Word, update_transkription_position_ids +from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids from util import back_up from process_files import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False DEBUG_WORD = None MERGED_DIR = 'merged' def categorize_paths(page, transkription_field=None): """Categorize all paths that are part of the transkription field. :return: a dictionary containing a list for each category of path.
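
    The keys of the returned dictionary are 'text_area_deletion_paths',
    'deletion_or_underline_paths', 'box_paths', 'dots_paths',
    'word_connector_paths', 'uncategorized_paths' and 'underline_path'.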
""" if page.source is not None and isfile(page.source): MAX_HEIGHT_LINES = 1 max_line = sorted(\ [line_number.bottom-line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0],\ reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17 tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0 tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0 paths, attributes = svg_to_paths.svg2paths(page.source) allpaths_on_tf = [] allpaths_outside_tf = [] attributes_outside_tf = [] if transkription_field is None: transkription_field = TranskriptionField(page.source) for index, path in enumerate(paths): attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and path.bbox()[0] >= tr_xmin\ and path.bbox()[1] <= transkription_field.xmax: allpaths_on_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) elif len(path) > 0\ and path != transkription_field.path: allpaths_outside_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) #print(index, allpaths_outside_tf[len(allpaths_outside_tf)-1].path, path) attributes_outside_tf.append(attribute) path_dict = { 'text_area_deletion_paths': [],\ 'deletion_or_underline_paths': [],\ 'box_paths': [],\ 'dots_paths': [],\ 'word_connector_paths': [],\ 'uncategorized_paths': [] } for mypath in allpaths_on_tf: xmin, xmax, ymin, ymax = mypath.path.bbox() start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin) if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1: path_dict.get('dots_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed(): path_dict.get('box_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed(): path_dict.get('word_connector_paths').append(mypath) elif abs(ymax-ymin) < MAX_HEIGHT_LINES: mypath.start_line_number = start_line_number path_dict.get('deletion_or_underline_paths').append(mypath) elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin): # Check for "ladder", i.e. 
a path with 3 segments (seg0 is horizontal on line x, seg1 moves to line x+1, seg2 is horizontal on line x+1) if start_line_number + 1 == page.get_line_number(mypath.path.end.imag-tr_ymin)\ and len(mypath.path._segments) == 3\ and abs(mypath.path._segments[0].bbox()[3]-mypath.path._segments[0].bbox()[2]) < MAX_HEIGHT_LINES\ and abs(mypath.path._segments[2].bbox()[3]-mypath.path._segments[2].bbox()[2]) < MAX_HEIGHT_LINES: for index in 0, 2: new_path = Path(parent_path=mypath, path=SVGPath(mypath.path._segments[index])) new_path.start_line_number = page.get_line_number(new_path.path.start.imag-tr_ymin) path_dict.get('deletion_or_underline_paths').append(new_path) else: path_dict.get('text_area_deletion_paths').append(mypath) else: path_dict.get('uncategorized_paths').append(mypath) underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin) path_dict.update({'underline_path': underline_path}) path_dict['uncategorized_paths'] += process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\ paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line) return path_dict elif not UNITTESTING: error_msg = 'Svg source file {} does not exist!'.format(page.source)\ if page.source is not None else 'Page does not contain a source file!' raise FileNotFoundError(error_msg) return {} def copy_page_to_merged_directory(page, manuscript_file=None): """Copy page to directory that contains the first version of all svg_pos_files that have been merged with the faksimile position data. MERGED_DIR is a subfolder of svg_pos_files-directory. """ svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) target_dir = svg_pos_file.parent / MERGED_DIR if not target_dir.is_dir(): target_dir.mkdir() target_pos_file = target_dir / svg_pos_file.name save_page(page, str(svg_pos_file), target_svg_pos_file=str(target_pos_file), status=STATUS_MERGED_OK, manuscript_file=manuscript_file) -def do_paths_intersect_saveMode(mypath1, mypath2): - """Returns true if paths intersect, false if not or if there was an exception. - """ - try: - return mypath1.path.intersect(mypath2.path, justonemode=True)\ - or mypath1.is_partially_contained_by(mypath2) - except AssertionError: - return False def find_special_words(page, transkription_field=None): """Find special words, remove them from words, process their content. 
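
    Special words are single-character words from the special char lists of
    MarkForeignHands and TextConnectionMark; they are removed from page.words
    and appended to page.mark_foreign_hands resp. page.text_connection_marks.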
""" if page.source is None or not isfile(page.source): raise FileNotFoundError('Page does not have a source!') if transkription_field is None: transkription_field = TranskriptionField(page.source) special_char_list = MarkForeignHands.get_special_char_list() special_char_list += TextConnectionMark.get_special_char_list() single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ] if not UNITTESTING: bar = Bar('find special words', max=len(single_char_words)) for word in single_char_words: not bool(UNITTESTING) and bar.next() if word.text == MarkForeignHands.CLASS_MARK: id = len(page.mark_foreign_hands) page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id)) page.words.remove(word) elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\ or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\ and any(style in page.sonderzeichen_list for style\ in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))): id = len(page.text_connection_marks) page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id)) page.words.remove(word) not bool(UNITTESTING) and bar.finish() svg_tree = ET.parse(page.source) page.update_page_type(transkription_field=transkription_field) page.update_line_number_area(transkription_field, svg_tree=svg_tree) italic_classes = [ key for key in page.style_dict\ if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ] if len(page.mark_foreign_hands) > 0: MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\ SonderzeichenList=page.sonderzeichen_list) if len(page.text_connection_marks) > 0: TextConnectionMark.find_content_in_footnotes(page.text_connection_marks, transkription_field, svg_tree,\ title=page.title, page_number=page.number) def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks all words that intersect with deletion paths as deleted and adds these paths to word_deletion_paths. [:return:] list of .path.Path that might be word_underline_paths """ if not UNITTESTING: bar = Bar('mark words that intersect with deletion paths', max=len(page.words)) for word in page.words: not bool(UNITTESTING) and bar.next() word = mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) for part_word in word.word_parts: part_word = mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) word.partition_according_to_deletion() not bool(UNITTESTING) and bar.finish() # return those paths in deletion_paths that are not in page.word_deletion_paths return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ] def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks word if it intersects with deletion paths as deleted and adds these paths to word_deletion_paths. 
[:return:] word """ word.deleted = False for transkription_position in word.transkription_positions: word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) intersecting_paths = [ deletion_path for deletion_path in deletion_paths\ if do_paths_intersect_saveMode(deletion_path, word_path) ] if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number: relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ] #print(word.line_number, word_path.path.bbox(), [ path.path.bbox() for path in relevant_paths]) if len(intersecting_paths) > 0: #print(f'{word.line_number}: {word.id}, {word.text}: {intersecting_paths}') transkription_position.deleted = True for deletion_path in intersecting_paths: if deletion_path.parent_path is not None: deletion_path = deletion_path.parent_path if deletion_path not in page.word_deletion_paths: deletion_path.tag = Path.WORD_DELETION_PATH_TAG deletion_path.attach_object_to_tree(page.page_tree) page.word_deletion_paths.append(deletion_path) return word def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None): """Process words after merging with faksimile word positions. """ if page is None and svg_pos_file is None: raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!') if page is None: page = Page(svg_pos_file) if page.source is None or not isfile(page.source): raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file)) if svg_pos_file is None: svg_pos_file = page.page_tree.docinfo.URL if new_words is not None: page.words = sorted(new_words, key=attrgetter('id')) for word_node in page.page_tree.xpath('.//word'): word_node.getparent().remove(word_node) manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\ if manuscript_file is not None\ else None copy_page_to_merged_directory(page, manuscript_file=manuscript_file) transkription_field = TranskriptionField(page.source) update_faksimile_line_positions(page) find_special_words(page, transkription_field=transkription_field) #update_writing_process_ids(page) page.update_styles(manuscript=manuscript, partition_according_to_styles=True) #TODO: find_hyphenated_words(page) categorize_paths(page, transkription_field=transkription_field) save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=STATUS_POSTMERGED_OK, manuscript_file=manuscript_file) def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list: """Process word boxes: partition words according to word boxes. 
[:return:] a list of paths that are not boxes """ MAX_HEIGHT_LINES = 1 not_boxes = [] if not UNITTESTING: bar = Bar('process word boxes', max=len(page.words)) svg_tree = ET.parse(page.source) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } allpaths_on_margin_field = [] if paths is None or attributes is None: paths = [] raw_paths, attributes = svg_to_paths.svg2paths(page.source) for index, raw_path in enumerate(raw_paths): paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page)) for index, mypath in enumerate(paths): path = mypath.path xmin, xmax, ymin, ymax = path.bbox() attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\ or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\ and abs(ymax-ymin) < max_line: allpaths_on_margin_field.append(mypath)#Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) box_line_number_dict = {} for box_path in sorted(box_paths, key=lambda path: path.get_median_y()): line_number = page.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin)) if line_number > 0: if line_number not in box_line_number_dict.keys(): box_line_number_dict.update({ line_number: [ box_path ]}) else: box_line_number_dict.get(line_number).append(box_path) boxes = [] for line_number in box_line_number_dict.keys(): box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x()) margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\ if page.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\ key=lambda path: path.get_x()) threshold = 3 if line_number % 2 == 0 else 1.5 if len(margin_boxes_on_line) > 0: for box_path in box_paths_on_line: #print(line_number, box_path.path.d(), len(margin_boxes_on_line)) box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\ transkription_field=transkription_field, namespaces=namespaces, threshold=threshold) if box is not None: boxes.append(box) else: not_boxes += box_paths_on_line if len(boxes) > 0: for word in page.words: word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin) word.create_correction_history(page) if not bool(UNITTESTING): bar.next() elif word.earlier_version is not None: #print(f'{word.text} -> {word.earlier_version.text}') if word.earlier_version.earlier_version is not None: print(f'{word.earlier_version.earlier_version.text}') not bool(UNITTESTING) and bar.finish() return not_boxes def reset_page(page): """Reset all words that have word_parts in order to run the script a second time. 
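
    If a first merge version of the page exists under MERGED_DIR, that version is
    loaded; otherwise all partitionings are undone in place and missing line
    numbers are recomputed from the transkription positions.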
""" svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) first_merge_version = svg_pos_file.parent / MERGED_DIR / svg_pos_file.name if first_merge_version.exists(): page = Page(str(first_merge_version)) else: word_with_wordparts = [ word for word in page.words if len(word.word_parts) > 0 ] word_with_wordparts += [ word for word in page.words if word.earlier_version is not None ] page_changed = False if len(word_with_wordparts) > 0: for word in word_with_wordparts: word.undo_partitioning() update_transkription_position_ids(word) page_changed = True no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if len(no_line_numbers) > 0: for word in no_line_numbers: if len(word.transkription_positions) > 0: word.line_number = page.get_line_number((word.transkription_positions[0].top+word.transkription_positions[0].bottom)/2) else: msg = f'Word {word.id} {word.text} has no transkription_position!' warnings.warn(msg) page_changed = True if page_changed: page.update_and_attach_words2tree() def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None): """Save page to target_file and update status of file. """ page.update_and_attach_words2tree() if not UNITTESTING: if target_svg_pos_file is None: target_svg_pos_file = svg_pos_file if status is not None: update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status) write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def update_faksimile_line_positions(page): """Update faksimile_positions of the lines """ num_lines = len(page.line_numbers) ymin = page.text_field.ymin\ if page.text_field is not None\ else 0.0 for line_number in page.line_numbers: if len([ word.faksimile_positions[0] for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) > 0: line_number.faksimile_inner_top = min([ word.faksimile_positions[0].top for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) line_number.faksimile_inner_bottom = max([ word.faksimile_positions[0].bottom for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) if line_number.id % 2 == 0: line_number.faksimile_outer_top = line_number.faksimile_inner_top - ymin line_number.faksimile_outer_bottom = line_number.faksimile_inner_bottom - ymin for index, line_number in enumerate(page.line_numbers): if line_number.faksimile_inner_bottom == 0.0\ or line_number.faksimile_inner_bottom < line_number.faksimile_inner_top: if index == 0 and num_lines > 1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].top elif index == num_lines-1 and page.text_field is not None: line_number.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3) elif index > 0 and index < num_lines-1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].faksimile_inner_top\ if page.line_numbers[index+1].faksimile_inner_top > page.line_numbers[index-1].faksimile_inner_bottom\ else page.line_numbers[index-1].faksimile_inner_bottom line_number.attach_object_to_tree(page.page_tree) def update_writing_process_ids(page): """Update the writing_process_ids of the words and split accordingly. 
""" for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to process words after they have been merged with faksimile data. svgscripts/process_words_post_merging.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help -i|--include-missing-line-number run script on files that contain words without line numbers -r|--rerun rerun script on a svg_pos_file that has already been processed :return: exit code (int) """ status_not_contain = STATUS_POSTMERGED_OK include_missing_line_number = False try: opts, args = getopt.getopt(argv, "hir", ["help", "include-missing-line-number", "rerun" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-i', '--include-missing-line-number'): include_missing_line_number = True elif opt in ('-r', '--rerun'): status_not_contain = '' if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain): reset_page(page) no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if not include_missing_line_number and len(no_line_numbers) > 0: not UNITTESTING and print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!') for word in no_line_numbers: not UNITTESTING and print(f'Word {word.id}: {word.text}') else: back_up(page, page.xml_file) not UNITTESTING and print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' 
+ Style.RESET_ALL) post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: tests_svgscripts/test_data/N_VII_1_page006.xml =================================================================== --- tests_svgscripts/test_data/N_VII_1_page006.xml (revision 92) +++ tests_svgscripts/test_data/N_VII_1_page006.xml (revision 93) @@ -1,1276 +1,1276 @@ svgWordPosition 2019-08-02 15:17:37 2019-08-02 15:17:37 2019-08-02 15:30:59 2019-08-02 15:30:59 - 2020-05-12 10:14:00 + 2020-05-13 17:06:34 Index: tests_svgscripts/test_style.py =================================================================== --- tests_svgscripts/test_style.py (revision 92) +++ tests_svgscripts/test_style.py (revision 93) @@ -1,92 +1,96 @@ import unittest from os import sep, path from os.path import dirname, basename, isfile, isdir import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.color import Color from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.page import Page from datatypes.style import Style class TestStyle(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.test_file = DATADIR + sep + 'test.xml' self.test_svg_file = DATADIR + sep + 'test421.svg' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml' self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.test_page = DATADIR + sep + 'N_VII_1_page001.xml' self.test_manuscript = DATADIR + sep + 'N_VII_1.xml' def test_create_cls(self): page = Page(self.test_page) style_string = "st11 st10 st5" style = Style.create_cls(page, style_string) self.assertEqual(style.font_family, 'Weidemann-Book') self.assertEqual(style.color.hex_color, "#DADADA") self.assertEqual(style.writing_instrument, 'schwarze Tinte') style_string = "st11 st10" style = Style.create_cls(page, style_string) self.assertEqual(style.font_family, 'Weidemann-Book') self.assertEqual(style.color.name, "black") self.assertEqual(style.writing_instrument, 'schwarze Tinte') style_string = "st11 st3" style = Style.create_cls(page, style_string, create_css=True) self.assertEqual(style.font_family, 'Weidemann-Book') self.assertEqual(style.font_size, '9px') + style_string = "st18" + page = Page(self.test_page) + style = Style.create_cls(page, style_string) + self.assertEqual(style.color.name, 'black') def test_remove_irrelevant_style_keys(self): page = Page(self.test_page) style_string = "st11 st10 st9 st5 st0" self.assertEqual(Style.remove_irrelevant_style_keys(style_string, page), "st11 st5 st9") def test_process_style_classes(self): style = Style() style.color = Color.create_cls(hex_color='#009CDE') style.process_style_classes() self.assertEqual(style.writing_instrument, 'violette Tinte') self.assertEqual(style.font, 'deutsche Schreibschrift') style.font_family = "NewsGothicBT-Bold" style.process_style_classes() self.assertEqual(style.writing_instrument, 'Blaustift') self.assertEqual(style.font, 'lateinische Schreibschrift') style = Style() style.font_family = "NewsGothicBT-Bold" style.process_style_classes() #print(style.css_styles) def test_get_semantic_dictionary(self): dictionary = 
Style.get_semantic_dictionary() #print(dictionary) def test_copy(self): manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript) page = Page(self.test_page) page.words = [ page.words[0] ] page.update_styles(manuscript=manuscript, add_to_parents=True) self.assertEqual(len(manuscript.styles), 1) styleA = page.words[0].transkription_positions[0].style styleB = styleA.create_a_copy() self.assertEqual(styleA == styleB, True) styleB = styleA.create_a_copy(reduce_writing_process_id=True) self.assertEqual(styleA != styleB, True) def test_eq(self): page = Page(self.test_page) style_string = "st11 st10 st5" styleA = Style.create_cls(page, style_string) styleB = Style.create_cls(page, style_string) self.assertEqual(styleA == styleB, True) style_string = "st11 st10" styleC = Style.create_cls(page, style_string) self.assertEqual(styleA != styleC, True) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_word_deletion_path.py =================================================================== --- tests_svgscripts/test_word_deletion_path.py (revision 0) +++ tests_svgscripts/test_word_deletion_path.py (revision 93) @@ -0,0 +1,40 @@ +import unittest +from os import sep, path +from os.path import isdir, dirname, basename +import lxml.etree as ET +from svgpathtools.parser import parse_path +from svgpathtools.path import Line +from svgpathtools.path import Path as SVGPath +import sys + +sys.path.append('svgscripts') + +from datatypes.page import Page +from datatypes.word_deletion_path import WordDeletionPath +from datatypes.transkription_position import TranskriptionPosition +from datatypes.positional_word_part import PositionalWordPart + +class TestWordDeletionPath(unittest.TestCase): + def setUp(self): + DATADIR = path.dirname(__file__) + sep + 'test_data' + self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml' + + def test_init(self): + page = Page(self.xml_file) + node = page.page_tree.xpath(WordDeletionPath.XML_TAG)[0] + path = WordDeletionPath.create_cls(node, page) + print(path) + + def test_get_semantic_dict(self): + #print(WordDeletionPath.get_semantic_dictionary()) + pass + +if __name__ == "__main__": + unittest.main() + Index: tests_svgscripts/test_word.py =================================================================== --- tests_svgscripts/test_word.py (revision 92) +++ tests_svgscripts/test_word.py (revision 93) @@ -1,456 +1,469 @@ import unittest from os import sep, path import lxml.etree as ET import sys sys.path.append('svgscripts') from process_words_post_merging import reset_page, update_writing_process_ids from datatypes.box import Box from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.matrix import Matrix import datatypes.page from datatypes.path import Path from datatypes.positional_word_part import PositionalWordPart from datatypes.style import Style from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word, execute_function_on_parts, update_transkription_position_ids +from datatypes.word_deletion_path import WordDeletionPath from datatypes.word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass class Page: def __init__(self): self.svg_file = None def get_line_number(self, input=0): return -1 def get_biggest_fontSize4styles(self, style_set={}): return 7 class TestWord(unittest.TestCase): TESTCASE = None def setUp(self): DATADIR = path.dirname(__file__) + sep + 'test_data'
self.test_file = DATADIR + sep + 'N_VII_1_page009.xml' + self.word_deletion_path_file = DATADIR + sep + 'N_VII_1_page138.xml' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }] x = 0 for dict in self.word_part_objs: dict['class'] = 'st22' dict['x'] = x dict['y'] = 11 x += 1 mylist = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true' } word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)')) self.transkription_positions = [ word_position ] self.word_node = ET.Element('word', attrib=mylist) word_position.attach_object_to_tree(self.word_node) x = 0 for char in mylist['text']: ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' }) x += 1 + def test_add_deletion_paths(self): + page = datatypes.page.Page(self.word_deletion_path_file, add_deletion_paths_to_words=False) + word = [ word for word in page.words if word.text == 'AufBau'][0] + #self.assertTrue(word.deleted) + self.assertTrue(len(word.word_parts) > 0) + self.assertTrue(word.word_parts[0].deleted) + word.add_deletion_paths(page.word_deletion_paths, tr_xmin=28.347656, tr_ymin=49.921875) + self.assertTrue(len(word.word_parts[0].deletion_paths) > 0) + #print(word.deletion_paths) + + def test_Word_with_word_part_objs(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) self.assertEqual(word.id, 0) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') def test_Word_with_word_node(self): word = Word.create_cls(self.word_node) self.assertEqual(word.id, 0) self.assertEqual(word.deleted, True) self.assertEqual(word.transkription_positions[0].bottom, 11) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 1) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') self.assertEqual(word.line_number, 2) self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True) def test_attach_word_to_tree(self): newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) empty_tree = ET.ElementTree(ET.Element('page')) newWord.attach_word_to_tree(empty_tree) for word_node in empty_tree.getroot().xpath('//word'): word = Word.CREATE_WORD(word_node=word_node) self.assertEqual(word.id, 0) self.assertEqual(word.deleted, False) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') @unittest.skipUnless(TESTCASE is None or TESTCASE == 0, 'Not testing this case') def test_create_correction_history_case0(self): # Case 1: whole word over box box = Box(earlier_text='XYX') word = Word(text='ASDF', transkription_positions=[TranskriptionPosition()]) word.word_box = box word.create_correction_history() self.assertEqual(word.earlier_version is 
    def test_Word_with_word_part_objs(self):
        word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        self.assertEqual(word.id, 0)
        self.assertEqual(word.transkription_positions[0].bottom, 13)
        self.assertEqual(word.transkription_positions[0].height, 10)
        self.assertEqual(word.transkription_positions[0].top, 3)
        self.assertEqual(word.transkription_positions[0].left, 0)
        self.assertEqual(word.transkription_positions[0].width, 10)
        self.assertEqual(word.text, 'abc')

    def test_Word_with_word_node(self):
        word = Word.create_cls(self.word_node)
        self.assertEqual(word.id, 0)
        self.assertEqual(word.deleted, True)
        self.assertEqual(word.transkription_positions[0].bottom, 11)
        self.assertEqual(word.transkription_positions[0].height, 10)
        self.assertEqual(word.transkription_positions[0].top, 1)
        self.assertEqual(word.transkription_positions[0].left, 0)
        self.assertEqual(word.transkription_positions[0].width, 10)
        self.assertEqual(word.text, 'abc')
        self.assertEqual(word.line_number, 2)
        self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True)

    def test_attach_word_to_tree(self):
        newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        empty_tree = ET.ElementTree(ET.Element('page'))
        newWord.attach_word_to_tree(empty_tree)
        for word_node in empty_tree.getroot().xpath('//word'):
            word = Word.CREATE_WORD(word_node=word_node)
            self.assertEqual(word.id, 0)
            self.assertEqual(word.deleted, False)
            self.assertEqual(word.transkription_positions[0].bottom, 13)
            self.assertEqual(word.transkription_positions[0].height, 10)
            self.assertEqual(word.transkription_positions[0].top, 3)
            self.assertEqual(word.transkription_positions[0].left, 0)
            self.assertEqual(word.transkription_positions[0].width, 10)
            self.assertEqual(word.text, 'abc')

    @unittest.skipUnless(TESTCASE is None or TESTCASE == 0, 'Not testing this case')
    def test_create_correction_history_case0(self):
        # Case 1: whole word over box
        box = Box(earlier_text='XYX')
        word = Word(text='ASDF', transkription_positions=[TranskriptionPosition()])
        word.word_box = box
        word.create_correction_history()
        self.assertEqual(word.earlier_version is None, True)
        self.assertEqual(word.overwrites_word is not None, True)

    @unittest.skipUnless(TESTCASE is None or TESTCASE == 1, 'Not testing this case')
    def test_create_correction_history_case1(self):
        # Case 2: part of word over box
        box = Box(earlier_text='XYX')
        partA = Word(text='A', transkription_positions=[TranskriptionPosition()])
        partA.word_box = box
        partB = Word(text='SDF', transkription_positions=[TranskriptionPosition()])
        word = Word(text='ASDF', word_parts=[ partA, partB])
        word.create_correction_history()
        self.assertEqual(word.earlier_version is None, True)
        self.assertEqual(word.word_parts[0].overwrites_word is not None, True)

    @unittest.skipUnless(TESTCASE is None or TESTCASE == 2, 'Not testing this case')
    def test_create_correction_history_case3(self):
        # Case 3: part of word over box, word under box is part of earlier version
        box = Box(earlier_text='XYX')
        tp0 = TranskriptionPosition()
        tp0.style = Style(writing_process_id=0)
        tp1 = TranskriptionPosition()
        tp1.style = Style(writing_process_id=1)
        partA = Word(id=0, text='Test', transkription_positions=[ tp0])
        partB = Word(id=1, text='er', transkription_positions=[ tp1])
        partB.word_box = box
        word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] )
        word.create_correction_history(box_style=tp0.style)
        self.assertEqual(word.text, 'Tester')
        self.assertEqual(word.earlier_version is not None, True)
        self.assertEqual(word.earlier_version.text, 'TestXYX')
        self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])

    @unittest.skipUnless(TESTCASE is None or TESTCASE == 3, 'Not testing this case')
    def test_create_correction_history_case4(self):
        # Case 4: part of word is deleted
        partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()])
        partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()])
        word = Word(text='ASDF', word_parts=[ partA, partB])
        word.create_correction_history()
        self.assertEqual(word.earlier_version is not None, True)
        self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True)
        self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
        self.assertEqual(word.edited_text, 'SDF')

    @unittest.skipUnless(TESTCASE is None or TESTCASE == 4, 'Not testing this case')
    def test_create_correction_history_case5(self):
        tp0 = TranskriptionPosition()
        tp0.style = Style(writing_process_id=0)
        tp1 = TranskriptionPosition()
        tp1.style = Style(writing_process_id=1)
        partA = Word(id=0, text='Test', transkription_positions=[ tp0])
        partB = Word(id=1, text='er', transkription_positions=[ tp1])
        word = Word(text='Tester', word_parts=[ partA, partB ] )
        word.create_correction_history()
        self.assertEqual(word.earlier_version is not None, True)
        self.assertEqual(word.word_parts[1].extendsEarlierVersion, True)
        self.assertEqual(word.word_parts[1].isExtensionOfWord, word.earlier_version)
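    # Note on the TESTCASE switch used by the skipUnless decorators above: the
    # conditions are evaluated while the class body executes, so assigning
    # TestWord.TESTCASE after the class is defined has no effect. To run a
    # single case, edit the assignment at the top of the class, e.g.:
    #
    #     TESTCASE = 2   # runs only test_create_correction_history_case3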
    #@unittest.skipUnless(TESTCASE is None or TESTCASE == 5, 'Not testing this case')
    #@unittest.skip('case tested, relies on a local xml file')
    def test_create_correction_history_case_full(self):
        page = datatypes.page.Page('xml/N_VII_1_page138.xml')
        manuscript = ArchivalManuscriptUnity()
        reset_page(page)
        update_writing_process_ids(page)
        word = [ word for word in page.words if word.text == 'Verschiedenes' and word.line_number == 4 ][0]
        wordAufBau = [ word for word in page.words if word.text == 'AufBau' ][0]
        #page.words = [ word ]
        page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
        word.word_parts[0].transkription_positions[0].has_box = Box(earlier_text='v')
        self.assertEqual(len(word.word_parts), 2)
        word_over_box = word._get_partial_word_over_box()
        update_transkription_position_ids(word)
        word.create_correction_history(page)
        self.assertEqual(word.writing_process_id, 1)
        self.assertEqual(word.earlier_version is not None, True)
        self.assertEqual(word.earlier_version.text, 'verschiedenes')
        #print(word.earlier_version.id, [ (w.id, w.text) for w in word.earlier_version.word_parts ])
        empty_tree = ET.ElementTree(ET.Element('page'))
        word_node = word.attach_word_to_tree(empty_tree)
        #print(ET.dump(word_node))
        """
        self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
        self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])
        self.assertEqual(word.word_parts[1].overwrites_word is not None, True)
        """
        word = wordAufBau
        page.words = [ word ]
        page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
        word.word_parts[0].deleted = True
        word.word_parts[1].transkription_positions[0].has_box = Box(earlier_text='b')
        self.assertEqual(len(word.word_parts), 3)
        word_over_box = word._get_partial_word_over_box()
        self.assertEqual(len(word.word_parts), 3)
        update_transkription_position_ids(word)
        word.create_correction_history(page)
        self.assertEqual(word.writing_process_id, 2)
        self.assertEqual(word.earlier_version is not None, True)
        self.assertEqual(word.text, 'AufBau')
        self.assertEqual(word.edited_text, 'Bau')
        self.assertEqual(word.earlier_version.text, 'Aufbau')
        self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
        self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])
        self.assertEqual(word.word_parts[1].overwrites_word is not None, True)
        empty_tree = ET.ElementTree(ET.Element('page'))
        word_node = word.attach_word_to_tree(empty_tree)
        #print(ET.dump(word_node))
        newWord = Word.create_cls(word_node)

    #@unittest.skip('')
    def test_earlier_version(self):
        partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()])
        partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()])
        word = Word(text='ASDF', word_parts=[ partA, partB])
        earlier_version = word.create_earlier_version()
        self.assertEqual(earlier_version is not None, True)
        self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True)
        self.assertEqual(word.word_parts[0].isDeletionOfWord, earlier_version.word_parts[0])

    def test_undo_partitioning(self):
        tps = []
        for i, xy in enumerate([ 3, 4, 5 ]):
            tps.append(TranskriptionPosition(id=i, x=xy, y=xy, height=10, width=10))
        partA = Word(id=0, text='Auf', writing_process_id=1, deleted=True, transkription_positions=[ tps[0]])
        partB = Word(id=1, text='B', writing_process_id=2, transkription_positions=[tps[1]])
        partC = Word(id=2, text='au', writing_process_id=1, transkription_positions=[tps[2]])
        word = Word(text='Aufbau', writing_process_id=2, word_parts=[ partA, partB, partC ] )
        word.undo_partitioning()
        self.assertEqual(len(word.transkription_positions), len(tps))
        self.assertEqual(len(word.word_parts), 0)
        """
        page = datatypes.page.Page('xml/N_VII_1_page138.xml')
        word = page.words[77]
        word.undo_partitioning()
        self.assertEqual(len(word.word_parts), 0)
        self.assertEqual(len(word.transkription_positions), 3)
        update_transkription_position_ids(word)
        empty_tree = ET.ElementTree(ET.Element('page'))
        word_node = word.attach_word_to_tree(empty_tree)
        print(ET.dump(word_node))
        """
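    # Condensed, the invariant exercised above: undo_partitioning folds the
    # parts' transkription positions back into the word itself (sketch, with
    # partA/partB/partC as constructed in the test):
    #
    #     word = Word(text='Aufbau', writing_process_id=2, word_parts=[ partA, partB, partC ])
    #     word.undo_partitioning()
    #     assert len(word.word_parts) == 0
    #     assert len(word.transkription_positions) == 3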
    def test_split(self):
        page = Page()
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        previousWord, currentWord, nextWord = word.split('b')
        self.assertEqual(previousWord.id, 0)
        self.assertEqual(previousWord.text, 'a')
        self.assertEqual(currentWord.id, 1)
        self.assertEqual(nextWord.id, 2)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        previousWord, currentWord, nextWord = word.split('bc')
        self.assertEqual(previousWord.id, 0)
        self.assertEqual(previousWord.text, 'a')
        self.assertEqual(currentWord.id, 1)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        previousWord, currentWord, nextWord = word.split('ab', start_id=10)
        self.assertEqual(currentWord.id, 10)
        self.assertEqual(currentWord.text, 'ab')
        self.assertEqual(currentWord.transkription_positions[0].width, 2.1)
        self.assertEqual(nextWord.id, 11)
        self.assertEqual(nextWord.transkription_positions[0].width, 5.2)
        word_part_objs = [{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\
                {'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\
                {'text': 'x', 'class':'st22', 'x': 10, 'y': 0}]
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        with self.assertWarns(Warning):
            previousWord, currentWord, nextWord = word.split('Insofer')
        word_part_objs = [{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}]
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
        with self.assertWarns(Warning):
            previousWord, currentWord, nextWord = word.split('Insofern')

    def test_join(self):
        word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}])
        word.join(other_word)
        self.assertEqual(word.text, 'abc.')
        other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}])
        word.join(other_word, append_at_end_of_new_word=False)
        self.assertEqual(word.text, '.abc.')
        """
        tree = ET.ElementTree(ET.Element('page'))
        word.attach_word_to_tree(tree)
        print(ET.dump(tree.getroot()))
        """

    def test_get_semanticAndDataDict(self):
        dictionary = Word.get_semantic_dictionary()
        #print(dictionary)
        info_dict = dictionary['properties'].get('isDeletionOfWord')
        self.assertEqual(SemanticClass.SUPER_PROPERTY in info_dict.keys(), True)
        super_info_dict = info_dict[SemanticClass.SUPER_PROPERTY]
        #print(info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME))
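    # The dictionary checked above is what the py2ttl conversion consumes:
    # each property name maps to an info dict, and a property that
    # specializes another carries that relation under
    # SemanticClass.SUPER_PROPERTY (sketch, names as used in the test):
    #
    #     info_dict = Word.get_semantic_dictionary()['properties']['isDeletionOfWord']
    #     super_property = info_dict[SemanticClass.SUPER_PROPERTY]
    #     print(super_property.get(SemanticClass.PROPERTY_NAME))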
    def test_simplify_transkription_positions(self):
        node_string = """ """
        nodeA = ET.fromstring(node_string)
        node_string = """ """
        nodeB = ET.fromstring(node_string)
        word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ])
        self.assertEqual(len(word.transkription_positions), 2)
        word.simplify_transkription_positions()
        self.assertEqual(len(word.transkription_positions), 1)
        word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ])
        word.transkription_positions[1].writing_process_id = -1
        word.simplify_transkription_positions()
        self.assertEqual(len(word.transkription_positions), 1)
        self.assertEqual(word.transkription_positions[0].writing_process_id, 0)
        """
        tree = ET.ElementTree(ET.Element('page'))
        word.attach_word_to_tree(tree)
        print(ET.dump(tree.getroot()))
        """

    def test_partition(self):
        page = datatypes.page.Page(self.test_file)
        word = page.words[67]
        self.assertEqual(word.belongs_to_multiple_writing_processes(), True)
        word.partition_according_to_writing_process_id()
        self.assertEqual(len(word.word_parts), 3)
        self.assertEqual(word.belongs_to_multiple_writing_processes(), False)
        self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True)
        empty_tree = ET.ElementTree(ET.Element('page'))
        word_node = word.attach_word_to_tree(empty_tree)
        newWord = Word.create_cls(word_node)
        self.assertEqual(len(newWord.word_parts), 3)
        #print(ET.dump(empty_tree.getroot()))

    def test_partition_deletion(self):
        page = datatypes.page.Page(self.test_file)
        word = page.words[67]
        for transkription_position in word.transkription_positions:
            transkription_position.deleted = transkription_position.writing_process_id == 1
        self.assertEqual(word.has_mixed_status('deleted'), True)
        word.partition_according_to_deletion()
        self.assertEqual(len(word.word_parts), 3)
        self.assertEqual(word.has_mixed_status('deleted'), False)
        self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True)
        page = datatypes.page.Page(self.test_file)
        word = page.words[67]
        word.partition_according_to_writing_process_id()
        #print([(word.text, word.deleted) for word in word.word_parts])
        word.word_parts[1].transkription_positions[1].deleted = True
        word.partition_according_to_deletion()
        self.assertEqual(len(word.word_parts), 4)
        #print([(word.text, word.deleted) for word in word.word_parts])
        partA = Word(text='A', deleted=True)
        partB = Word(text='SDF', deleted=False)
        word = Word(text='ASDF', word_parts=[ partA, partB])
        self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True)

    def test_execute_function_on_parts(self):
        page = datatypes.page.Page(self.test_file)
        word_parts = [ page.words[67], page.words[68] ]
        word_parts, none = execute_function_on_parts(word_parts, 'partition_according_to_writing_process_id')
        self.assertEqual(len(word_parts) == 4, True)
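    # The next two tests drive process_boxes with hand-written rectangular
    # d-strings. Their geometry can be sanity-checked independently of the
    # Box class, since svgpathtools paths expose their extent via bbox()
    # (illustrative sketch, not a test):
    def sketch_box_geometry(self):
        from svgpathtools.parser import parse_path
        box = parse_path('M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565')
        assert box.isclosed()
        xmin, xmax, ymin, ymax = box.bbox()   # ~ (598.11, 603.557, 626.565, 632.565)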
    def test_process_word_boxes(self):
        page = datatypes.page.Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        page.update_styles(partition_according_to_styles=True)
        tr = TranskriptionField(page.source)
        box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
                'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
                'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
                'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
                'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
        box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ]
        indices = [30, 277, 288, 297, 321]
        for word_id, index in enumerate(indices):
            word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin)
            self.assertEqual(word_over_box is not None, True)
            self.assertEqual(word_over_box == page.words[index] or word_over_box in page.words[index].word_parts, True)
            #self.assertEqual(word_over_box in page.words[index].word_parts, True)

    def test_process_word_several_boxesOn1Line(self):
        page = datatypes.page.Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        for word in page.words:
            word.set_writing_process_id_to_transkription_positions(page)
            word.partition_according_to_writing_process_id()
        tr = TranskriptionField(page.source)
        box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
                'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
                'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
                'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
                'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
        box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ]
        indices = [30, 277, 288, 297, 321]
        empty_tree = ET.ElementTree(ET.Element('page'))
        for word_id, index in enumerate(indices):
            word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin)
            self.assertEqual(word_over_box is not None, True)

    def test_split_according_to_status(self):
        page = datatypes.page.Page(self.test_file)
        word = page.words[67]
        for transkription_position in word.transkription_positions:
            transkription_position.text = 'asdf'\
                    if transkription_position.writing_process_id == 1\
                    else word.text
        self.assertEqual(word.has_mixed_status('text'), True)
        new_words = word.split_according_to_status('text')
        #print([word.text for word in new_words ])
        self.assertEqual(len(new_words) > 1, True)
        self.assertEqual(new_words[0].id, word.id)
        self.assertEqual(new_words[0].deleted, word.deleted)
        self.assertEqual(new_words[1].id, word.id+1)
        manuscript = ArchivalManuscriptUnity()
        page = datatypes.page.Page(self.test_file)
        word = page.words[67]
        page.words = [ word ]
        page.update_styles(manuscript=manuscript)
        new_words = word.split_according_to_status('style', splits_are_parts=True)
        self.assertEqual(len(word.word_parts), 3)

    def test__create_new_word(self):
        manuscript = ArchivalManuscriptUnity()
        page = datatypes.page.Page(self.test_file)
        word = page.words[67]
        page.words = [ word ]
        page.update_styles(manuscript=manuscript)
        newWord = word._create_new_word([ word.transkription_positions[0] ], 'style')
        for key in Word.COPY_PROPERTY_KEY:
            self.assertEqual(newWord.__dict__[key], word.__dict__[key])
        self.assertEqual(len(newWord.styles), 1)

    def test__get_partial_word_over_box(self):
        word = Word(text='test', transkription_positions=[ TranskriptionPosition(id=0), TranskriptionPosition(id=1) ])
        word.transkription_positions[0].has_box = Box(earlier_text='asdf')
        word._get_partial_word_over_box()
        self.assertEqual(len(word.word_parts), 2)
        partA = Word(id=0, text='A', transkription_positions=[TranskriptionPosition()])
        partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition(), TranskriptionPosition(id=1)])
        partB.transkription_positions[0].has_box = Box(earlier_text='asdf')
        word = Word(text='ASDF', word_parts=[ partA, partB])
        word._get_partial_word_over_box()
        self.assertEqual(len(word.word_parts), 2)

if __name__ == "__main__":
    unittest.main()
Index: tests_py2ttl/test_data/mapping_dict.xml
===================================================================
--- tests_py2ttl/test_data/mapping_dict.xml	(revision 92)
+++ tests_py2ttl/test_data/mapping_dict.xml	(revision 93)
@@ -1,277 +1,283 @@
tln
http://www.nie.org/ontology/nietzsche#
./tln-ontology_autogenerated.ttl

http://www.nie.org/ontology/nietzsche#ArchivalManuscriptUnity
    http://www.nie.org/ontology/nietzsche#hasTitle
    http://www.nie.org/ontology/nietzsche#hasManuscriptType
    http://www.nie.org/ontology/nietzsche#hasStyles
    http://www.nie.org/ontology/nietzsche#hasPages

http://www.nie.org/ontology/nietzsche#Path
    http://www.nie.org/ontology/nietzsche#hasDAttribute
-   http://www.nie.org/ontology/nietzsche#hasStyleClass

http://www.nie.org/ontology/nietzsche#Box
    http://www.nie.org/ontology/nietzsche#hasDAttribute
-   http://www.nie.org/ontology/nietzsche#hasStyleClass
    http://www.nie.org/ontology/nietzsche#hasEarlierText

http://www.nie.org/ontology/nietzsche#Color
    http://www.nie.org/ontology/nietzsche#colorHasName
    http://www.nie.org/ontology/nietzsche#hasHexadecimalValue

http://www.nie.org/ontology/nietzsche#Image
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasFileName

http://www.nie.org/ontology/nietzsche#FaksimileImage
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasFileName
    http://www.nie.org/ontology/nietzsche#hasUrl
    http://www.nie.org/ontology/nietzsche#hasTextField

http://www.nie.org/ontology/nietzsche#PositionalObject
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform

http://www.nie.org/ontology/nietzsche#WordPosition
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform

http://www.nie.org/ontology/nietzsche#FaksimilePosition
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
    http://www.nie.org/ontology/nietzsche#isOnFaksimileImage
    http://www.nie.org/ontology/nietzsche#isOnTextField

http://www.nie.org/ontology/nietzsche#Line
    http://www.nie.org/ontology/nietzsche#lineHasNumber
    http://www.nie.org/ontology/nietzsche#lineHasBottomValueOnTranskription
    http://www.nie.org/ontology/nietzsche#lineHasTopValueOnTranskription
    http://www.nie.org/ontology/nietzsche#lineHasInnerBottomValueOnFaksimile
    http://www.nie.org/ontology/nietzsche#lineHasInnerTopValueOnFaksimile
    http://www.nie.org/ontology/nietzsche#lineHasOuterBottomValueOnFaksimile
    http://www.nie.org/ontology/nietzsche#lineHasOuterTopValueOnFaksimile
    http://www.nie.org/ontology/nietzsche#isMainLine

http://www.nie.org/ontology/nietzsche#SimpleWord
    http://www.nie.org/ontology/nietzsche#hasText
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition

http://www.nie.org/ontology/nietzsche#SpecialWord
    http://www.nie.org/ontology/nietzsche#hasText
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition

http://www.nie.org/ontology/nietzsche#MarkForeignHands
    http://www.nie.org/ontology/nietzsche#hasText
    http://www.nie.org/ontology/nietzsche#textOfForeignHands
    http://www.nie.org/ontology/nietzsche#penOfForeignHands
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition

http://www.nie.org/ontology/nietzsche#Page
    http://www.nie.org/ontology/nietzsche#hasNumber
    http://www.nie.org/ontology/nietzsche#hasOrientation
    http://www.nie.org/ontology/nietzsche#hasLines
    http://www.nie.org/ontology/nietzsche#hasWords
    http://www.nie.org/ontology/nietzsche#hasWritingProcesses
    http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
    http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
    http://www.nie.org/ontology/nietzsche#hasFaksimileImage
    http://www.nie.org/ontology/nietzsche#hasSvgImage
    http://www.nie.org/ontology/nietzsche#pageIsOnTextField

http://www.nie.org/ontology/nietzsche#Reference
    http://www.nie.org/ontology/nietzsche#firstLineOfReference
    http://www.nie.org/ontology/nietzsche#lastLineOfReference
    http://www.nie.org/ontology/nietzsche#IsUncertain
    http://www.nie.org/ontology/nietzsche#hasTitle
    http://www.nie.org/ontology/nietzsche#hasPageNumber

http://www.nie.org/ontology/nietzsche#SVGImage
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasFileName

http://www.nie.org/ontology/nietzsche#StandoffTag
    http://www.nie.org/ontology/nietzsche#standoffTagHasMarkup
    http://www.nie.org/ontology/nietzsche#standoffTagHasStartIndex
    http://www.nie.org/ontology/nietzsche#standoffTagHasEndIndex

http://www.nie.org/ontology/nietzsche#Text
    http://www.nie.org/ontology/nietzsche#textHasContent
    http://www.nie.org/ontology/nietzsche#textHasMarkup

http://www.nie.org/ontology/nietzsche#TextConnectionMark
    http://www.nie.org/ontology/nietzsche#hasText
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
    http://www.nie.org/ontology/nietzsche#textConnectionMarkHasTextSource

http://www.nie.org/ontology/nietzsche#TextField
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform

http://www.nie.org/ontology/nietzsche#TranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
    http://www.nie.org/ontology/nietzsche#isOnSvgImage

http://www.nie.org/ontology/nietzsche#Word
    http://www.nie.org/ontology/nietzsche#hasText
-   http://www.nie.org/ontology/nietzsche#isWordDeleted
    http://www.nie.org/ontology/nietzsche#hasEditedText
    http://www.nie.org/ontology/nietzsche#wordHasWordParts
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
    http://www.nie.org/ontology/nietzsche#wordHasStyle
    http://www.nie.org/ontology/nietzsche#overwritesWord
    http://www.nie.org/ontology/nietzsche#isTransformationOfWord
    http://www.nie.org/ontology/nietzsche#isExtensionOfWord
    http://www.nie.org/ontology/nietzsche#isDeletionOfWord
    http://www.nie.org/ontology/nietzsche#isClarificationOfWord
    http://www.nie.org/ontology/nietzsche#wordHasEarlierVersion
    http://www.nie.org/ontology/nietzsche#wordHasCorrection
+   http://www.nie.org/ontology/nietzsche#wordIsDeletedByPath

+http://www.nie.org/ontology/nietzsche#WordDeletionPath
+   http://www.nie.org/ontology/nietzsche#hasDAttribute

http://www.nie.org/ontology/nietzsche#WordInsertionMark
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
    http://www.nie.org/ontology/nietzsche#hasMarkType
    http://www.nie.org/ontology/nietzsche#hasSymbolId
    http://www.nie.org/ontology/nietzsche#hasNextWord
    http://www.nie.org/ontology/nietzsche#hasPreviousWord
    http://www.nie.org/ontology/nietzsche#wordInsertionMarkBelongsToLine

http://www.nie.org/ontology/nietzsche#WritingProcess
    http://www.nie.org/ontology/nietzsche#hasVersion
    http://www.nie.org/ontology/nietzsche#hasDescription

xml-dictionary
-2020-02-21 14:31:03
+2020-05-13 11:04:44