Index: svgscripts/process_footnotes.py
===================================================================
--- svgscripts/process_footnotes.py	(revision 109)
+++ svgscripts/process_footnotes.py	(revision 110)
@@ -1,282 +1,294 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}

from colorama import Fore, Style
import getopt
import lxml.etree as ET
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path as PathlibPath
from progress.bar import Bar
+import inspect
import re
import shutil
import sys
import warnings

if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))

from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.atypical_writing import AtypicalWriting
from datatypes.clarification import Clarification
from datatypes.editor_comment import EditorComment
from datatypes.editor_correction import EditorCorrection
from datatypes.footnotes import extract_footnotes
+from datatypes.imprint import extract_imprints
from datatypes.line_continuation import LineContinuation
from datatypes.standoff_tag import StandoffTag
from datatypes.text import Text
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.uncertain_decipherment import UncertainDecipherment
from util import back_up
from process_files import update_svgposfile_status
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

UNITTESTING = False
ATYPICAL_GROUP = re.compile(r'(.*:.*]\s*)(¿)(.*)')
CLARIFICATION_GROUP = re.compile(r'(.*:.*]\s*)(Vk)(.*)')
CONTINUATION_GROUP = re.compile(r'(.*:\s*)(Fortsetzung\s*)')
COMMENT_GROUP = re.compile(r'(.*:.*])')
EDITOR_CORRECTION_GROUP = re.compile(r'(.*:.*]\s*)(>[?]*)(.*)')
LINE_REFERENCE_GROUP = re.compile(r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)')
LINE_REFERENCE_GROUP_START_INDEX = 1
LINE_REFERENCE_GROUP_MID_INDEX = 2
LINE_REFERENCE_GROUP_END_INDEX = 3
LINE_COMMENT_GROUP = re.compile(r'(.*\d+:)')
UNCERTAINTY_WORD_GROUP = re.compile(r'(.*:.*]\s*)([>]*\?)(.*)')
UNCERTAINTY_EDITOR_GROUP = re.compile(r'(.*)(\?)')
WORD_REFERENCE_GROUP = re.compile(r'(.*[0-9]+:\s*)(.*)(].*)')
DEBUG = False
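# Editor's note: a minimal sketch, not part of revision 110, showing what the
# line-reference regex above extracts. The footnote strings are invented samples.
def _demo_line_reference_regex():
    m = re.match(LINE_REFERENCE_GROUP, '12: Wort] gemeint ist ...')
    assert m.group(LINE_REFERENCE_GROUP_END_INDEX) == '12'      # a single line reference
    m = re.match(LINE_REFERENCE_GROUP, '9-12: Satz] ...')
    assert m.group(LINE_REFERENCE_GROUP_START_INDEX) == '9-'    # start of a line range
    assert m.group(LINE_REFERENCE_GROUP_END_INDEX) == '12'      # end of the line range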
""" DEBUG = debug if footnotes is None: footnotes = extract_footnotes(page, skip_after=skip_after) for footnote in footnotes: line_match = re.match(LINE_REFERENCE_GROUP, footnote.content) if line_match is not None: _process_line_match(page, footnote, line_match) else: warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>') if find_content and len(page.text_connection_marks) > 0: TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes) page.update_and_attach_words2tree() for line in page.lines: line.attach_object_to_tree(page.page_tree) DEBUG = False if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) +def save_imprints(page): + """Categorize footnotes. + """ + for imprint in extract_imprints(page): + imprint.attach_object_to_tree(page.page_tree) + if not UNITTESTING: + write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ + script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}', file_type=FILE_TYPE_SVG_WORD_POSITION) + def _is_uncertain(footnote) -> bool: """Return whether footnote contains sign for uncertainty. """ uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content) return (uncertain_match is not None\ and len([ markup for markup in footnote.standoff_markups\ if markup.css_string.endswith('italic;')\ and uncertain_match.end() >= markup.startIndex\ and uncertain_match.end() <= markup.endIndex ]) > 0) def _process_line_match(page, footnote, line_match): """Process footnote if reference to a line matches. """ word_match = re.match(WORD_REFERENCE_GROUP, footnote.content) end_line_number = int(line_match.group(LINE_REFERENCE_GROUP_END_INDEX)) lines = [] if line_match.group(LINE_REFERENCE_GROUP_START_INDEX) is not None: if line_match.group(LINE_REFERENCE_GROUP_MID_INDEX) is not None: line_ids = [ int(line_id) for line_id in\ line_match.group(LINE_REFERENCE_GROUP_START_INDEX).split('/')\ if line_id != '' ] + [ end_line_number ] lines = [ line for line in page.lines if line.id in line_ids ] else: start_line_number = int(line_match.group(1)[0:-1]) lines = [ line for line in page.lines if line.id >= start_line_number and line.id <= end_line_number ] else: lines = [ line for line in page.lines if line.id == end_line_number ] if word_match is not None: _process_word_match(page.words, footnote, line_match, word_match.group(2), end_line_number) elif len(lines) > 0: uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content) for line in lines: _process_line_reference(page, footnote, line, _is_uncertain(footnote)) else: warnings.warn(f'Footnote refers to missing line {line_number}: {footnote}') def _process_line_reference(page, footnote, line, is_uncertain): """Process footnote if there is a line reference. 
""" continuation_match = re.match(CONTINUATION_GROUP, footnote.content) if continuation_match is not None: reference_string = footnote.content[continuation_match.end():] if is_uncertain: reference_string = reference_string[:-1] line.editor_comments.append(LineContinuation.create_cls(reference_string=reference_string, is_uncertain=is_uncertain)) else: comment_match = re.match(LINE_COMMENT_GROUP, footnote.content) if comment_match is not None: is_uncertain = _is_uncertain(footnote) comment = footnote.content[comment_match.end():-1].strip()\ if is_uncertain\ else footnote.content[comment_match.end():].strip() line.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain)) else: warnings.warn(f'Unknown editor comment for line "{line.id}": <{footnote}>') def _process_word_match(words, footnote, line_match, word_text, line_number, parent_word_composition=None): """Process footnote if there is a word reference. """ referred_words = [ word for word in words\ if word.line_number == line_number\ and (word.text == word_text\ or re.match(rf'\W*{word_text}\W', word.text)\ or word.edited_text == word_text) ] referred_word_parts = [ word.word_parts for word in words\ if word.line_number == line_number\ and len(word.word_parts) > 0\ and word_text in [ wp.text for wp in word.word_parts ] ] overwritten_word_matches = [ word for word in words\ if word.line_number == line_number\ and len(word.word_parts) > 0\ and len([word_part for word_part in word.word_parts\ if word_part.overwrites_word is not None\ and word_part.overwrites_word.text == word_text]) > 0] if len(referred_words) > 0\ or len(overwritten_word_matches) > 0\ or len(referred_word_parts) > 0: word = None if len(referred_words) == 1: word = referred_words[0] elif len(overwritten_word_matches) > 0: word = [ word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts\ if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text][0] elif len(referred_word_parts) > 0: word = [ word_part for word_part in referred_word_parts[0] if word_part.text == word_text ][0] else: word = [ better_word for better_word in referred_words if better_word.text == word_text][0] atypical_match = re.match(ATYPICAL_GROUP, footnote.content) correction_match = re.match(EDITOR_CORRECTION_GROUP, footnote.content) clarification_match = re.match(CLARIFICATION_GROUP, footnote.content) is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, footnote.content) is not None if correction_match is not None: correction = correction_match.group(3).strip() word.editor_comment = EditorCorrection(correction_text=correction, is_uncertain=is_uncertain) if not is_uncertain: word.edited_text = correction elif clarification_match is not None: word.editor_comment = Clarification(text=footnote.extract_part(word_text, css_filter='bold;')) elif atypical_match is not None: text = footnote.extract_part(word_text, css_filter='bold;')\ if footnote.markup_contains_css_filter('bold;')\ else None word.editor_comment = AtypicalWriting(text=text) elif is_uncertain: word.editor_comment = UncertainDecipherment() else: comment_match = re.match(COMMENT_GROUP, footnote.content) if comment_match is not None: is_uncertain = _is_uncertain(footnote) comment = footnote.content[comment_match.end():-1].strip()\ if is_uncertain\ else footnote.content[comment_match.end():].strip() word.editor_comment = EditorComment(comment=comment, is_uncertain=is_uncertain) else: warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>') elif 
    elif re.match(r'.*\s.*', word_text):
        for word_part in word_text.split(' '):
            _process_word_match(words, footnote, line_match, word_part, line_number, parent_word_composition=word_text)
    elif len([word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]) > 0:
        new_words = []
        for word in [word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]:
            new_words += word.word_parts
        _process_word_match(new_words, footnote, line_match, word_text, line_number)
    else:
        warnings.warn(f'No word found with text "{word_text}" on line {line_number}: <{footnote}>')

def usage():
    """Prints information on how to use the script.
    """
    print(main.__doc__)

def main(argv):
    """This program can be used to process the footnotes of a page.

    svgscripts/process_footnotes.py [OPTIONS] <xml-file>

        <xml-file>    an xml file about a manuscript, containing information about its pages,
                      or an xml file about a page, containing information about svg word positions.

    OPTIONS:
        -h|--help               show help
        -s|--skip-until=left    skip all nodes.get('X') < left

    :return: exit code (int)
    """
    skip_after=-1.0
    try:
        opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until=" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-s', '--skip-until'):
            skip_after = float(arg)
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if isfile(file_a):
        manuscript_file = file_a\
                if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
                else None
        counter = 0
        for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
            if not UNITTESTING:
                print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
                back_up(page, page.xml_file)
            categorize_footnotes(page, skip_after=skip_after, find_content=True)
+           save_imprints(page)
            counter += 1
        not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]')
    else:
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    return exit_status

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
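# Editor's note: a self-contained sketch, not part of the diff, of the matching
# cascade _process_word_match uses above to resolve a word reference: exact
# text first, then a match that tolerates surrounding punctuation, then the
# edited text. `SampleWord` is a hypothetical stand-in for datatypes.word.Word.
import re
from dataclasses import dataclass
from typing import Optional

@dataclass
class SampleWord:
    text: str
    line_number: int
    edited_text: Optional[str] = None

def find_referred_words(words, word_text, line_number):
    return [ word for word in words
             if word.line_number == line_number
             and (word.text == word_text
                  or re.match(rf'\W*{word_text}\W', word.text)
                  or word.edited_text == word_text) ]

words = [ SampleWord('Gedanke,', 7), SampleWord('Wort', 7, edited_text='Worte') ]
assert len(find_referred_words(words, 'Gedanke', 7)) == 1   # punctuation-tolerant match
assert len(find_referred_words(words, 'Worte', 7)) == 1     # match via edited_text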
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py	(revision 109)
+++ svgscripts/datatypes/word.py	(revision 110)
@@ -1,907 +1,913 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

import copy
import inspect
from lxml import etree as ET
from operator import attrgetter
import re
import string
import sys
import warnings

from .box import Box
from .editor_comment import EditorComment
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .style import Style
from .word_deletion_path import WordDeletionPath
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess

SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)

def execute_function_on_parts(word_parts, func_name):
    """Execute function on parts and add those parts instead of original word to word_parts.

    :return: new word_parts, output from func
    """
    copy_parts = word_parts[:]
    for word in word_parts:
        output = eval('word.{0}()'.format(func_name))
        if len(word.word_parts) > 0:
            for part_word in word.word_parts:
                copy_parts.insert(copy_parts.index(word), part_word)
            copy_parts.remove(word)
            word.word_parts = []
    return copy_parts, output

def update_transkription_position_ids(word):
    """Update transkription_positions' ids according to index.
    """
    word_part_ids = [ wp.id for wp in word.word_parts ]
    if len(word_part_ids) != len(set(word_part_ids)):
        for id, wp in enumerate(word.word_parts):
            wp.id = id
    for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))):
        transkription_position.id = index
        transkription_position.has_box = None
        transkription_position.deleted = False

class Word(SimpleWord):
    """ This class represents a word.
""" COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ] APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' } DATA = 'debug-data' - RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText'] + RDFS_SUBCLASSOF_LIST = ['https://www.e-editiones.ch/ontology/text#HandwrittenText'] XML_TAG = 'word' XML_EARLIER_VERSION = 'earlier-version' XML_OVERWRITES = 'overwrites' XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\ 'isDeletionOfWord': 'deletesEarlierPart',\ 'isExtensionOfWord': 'extendsEarlierVersion',\ 'isTransformationOfWord': 'transformsEarlierPart' } def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None): super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.corrections = [] + self.clean_edited_text = None self.deleted = deleted self.deletion_paths = [] self.deletion_paths_near_word = [] self.debug_container = {} self.debug_msg = None self.earlier_version = earlier_version self.edited_text = None self.editor_comment = None self.isClarificationOfWord = None self.isDeletionOfWord = None self.isExtensionOfWord = None self.isTransformationOfWord = None if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0: self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ]) self.overwrites_word = None self.process_flags = [] self.styles = styles\ if styles is not None\ else [] self.verified = None self.writing_process_id = writing_process_id self.writing_processes = [] self.word_insertion_mark = None self.word_box = None self.word_parts = word_parts if word_parts is not None else [] self.word_part_objs = word_part_objs if word_part_objs is not None else [] def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Add a word deletion path to word. """ if len(self.word_parts) > 0: for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) elif self.deleted: index = 0 while len(self.deletion_paths) == 0 and index < len(self.transkription_positions): include_pwps = (len(self.transkription_positions[index].positional_word_parts) > 0 and abs(self.transkription_positions[index].left-self.transkription_positions[index].positional_word_parts[0].left) < 10) word_path = Path.create_path_from_transkription_position(self.transkription_positions[index],\ tr_xmin=tr_xmin, tr_ymin=tr_ymin, include_pwps=include_pwps) self.deletion_paths += [ deletion_path for deletion_path in deletion_paths\ if not Path.is_path_contained(self.deletion_paths, deletion_path)\ and deletion_path.do_paths_intersect(word_path) ] index += 1 def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. 
""" word_node = super(Word,self).attach_word_to_tree(target_tree) if self.deleted is not None: word_node.set('deleted', str(self.deleted).lower()) if self.verified is not None: word_node.set('verified', str(self.verified).lower()) if self.edited_text is not None: word_node.set('edited-text', self.edited_text) if self.editor_comment is not None: self.editor_comment.attach_object_to_tree(word_node) if self.writing_process_id > -1: word_node.set('writing-process-id', str(self.writing_process_id)) if len(self.process_flags) > 0: word_node.set('process-flags', ' '.join(self.process_flags)) for index, word_part in enumerate(self.word_parts): word_part.id = index word_part.attach_word_to_tree(word_node) if self.earlier_version is not None: earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION) self.earlier_version.attach_word_to_tree(earlier_node) if self.overwrites_word is not None\ and len(self.overwrites_word.transkription_positions) > 0: overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES) self.overwrites_word.attach_word_to_tree(overwrite_node) if self.word_box is not None: self.word_box.attach_object_to_tree(word_node) if len(self.corrections) > 0: word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ]))) for deletion_id, deletion_path in enumerate(self.deletion_paths): deletion_path.id = deletion_id deletion_path.tag = WordDeletionPath.XML_TAG deletion_path.attach_object_to_tree(word_node) for key in self.XML_CORRECTION_DICT.keys(): if self.__dict__[key] is not None: word_node.set(self.XML_CORRECTION_DICT[key], 'true') return word_node def belongs_to_multiple_writing_processes(self, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. """ if len(self.word_parts) > 0 and include_parts: return len(set(word.writing_process_id for word in self.word_parts)) > 1 return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1 def set_parent_word_writing_process_id(self): """Set writing_process_id for parent word. """ ids = set(word.transkription_positions[0].style for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None) if len(ids) > 1: self.writing_process_id = max([style.writing_process_id for style in ids]) if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\ for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\ > 1: self.writing_process_id += 1 @classmethod def create_cls(cls, word_node): """Creates a word from a (lxml.Element) node. 
[:return:] Word """ cls = super(Word,cls).create_cls(word_node) cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 cls.split_strings = None cls.join_string = word_node.get('join') if bool(word_node.get('split')): cls.split_strings = word_node.get('split').split(' ') if ''.join(cls.split_strings) != cls.text: error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ format(word_node.getroottree().docinfo.URL, str(cls.id))\ + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ + 'Text attribute: "{0}".\n'.format(cls.text) raise Exception(error_msg) cls.verified = word_node.get('verified') == 'true'\ if bool(word_node.get('verified')) else None cls.deleted = word_node.get('deleted') == 'true'\ if bool(word_node.get('deleted')) else None cls.edited_text = word_node.get('edited-text') + if cls.edited_text is not None: + cls.clean_edited_text = cls._create_clean_text(cls.edited_text) cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\ if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ] if bool(word_node.get('corrections')): for index in [ int(i) for i in word_node.get('corrections').split(' ') ]: if index < len(cls.word_parts): cls.corrections.append(cls.word_parts[index]) cls.earlier_version = None if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0: cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0] for key_value in cls.XML_CORRECTION_DICT.values(): if word_node.get(key_value) == 'true': cls.__dict__[key_value] = True if cls.earlier_version is not None: for word_part in cls.word_parts: for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\ and len(cls.word_parts) <= len(cls.earlier_version.word_parts): try: word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id] except Exception: msg = f'{cls.id} {cls.text}: {word_part.id}' raise Exception(msg) for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls.earlier_version for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\ if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\ else None cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\ if len(word_node.xpath('./' + Box.XML_TAG)) > 0\ else None cls.deletion_paths = [ Path(node=node) for node in word_node.xpath(f'./{WordDeletionPath.XML_TAG}') ] cls.process_flags = word_node.get('process-flags').split(' ')\ if bool(word_node.get('process-flags'))\ else [] return cls @classmethod def join_words(cls, list_of_words, 
add_white_space_between_words=False): """Creates a word from a list of words. [:return:] Word """ if len(list_of_words) > 1: deleted = True in [ word.deleted for word in list_of_words ]\ and len(set([ word.deleted for word in list_of_words ])) == 1 line_number = list_of_words[0].line_number\ if len(set([ word.line_number for word in list_of_words ])) == 1\ else -1 faksimile_positions = [] for word in list_of_words: if len(word.word_parts) > 0: faksimile_positions += word.faksimile_positions index = list_of_words.index(word) list_of_words.remove(word) for part_word in reversed(word.word_parts): list_of_words.insert(index, part_word) new_word_text = ''.join([word.text for word in list_of_words])\ if not add_white_space_between_words\ else ' '.join([word.text for word in list_of_words]) new_word = cls(id=list_of_words[0].id, text=new_word_text, faksimile_positions=faksimile_positions,\ line_number=line_number, deleted=deleted, word_parts=list_of_words) if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]: change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0] new_word.edited_text = new_word.text.replace(change_text, change_text[:-1]) for id, word in enumerate(new_word.word_parts): word.id = id return new_word if len(list_of_words) > 0: return list_of_words[0] else: return None def create_earlier_version(self, root_word=None, id=0): """Create an earlier version of word. """ if root_word is None: root_word = self root_word.set_parent_word_writing_process_id() word_parts = [] non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\ if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ] non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts) if non_single_punctuation_word_parts_length > 0\ and len([ word_part for word_part in non_single_punctuation_word_parts\ if word_part.deleted ])\ == non_single_punctuation_word_parts_length: self.deleted = True for word_part in non_single_punctuation_word_parts: word_part.deleted = False for id, word_part in enumerate(self.word_parts): earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id) if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) elif word_part.overwrites_word is not None\ and ((len(word_part.transkription_positions) > 0\ and word_part.overwrites_word.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style\ != word_part.overwrites_word.transkription_positions[0].style) or word_part.word_box.earlier_version): word_part.overwrites_word.id = word_part.id word_parts.append(word_part.overwrites_word) word_part.isTransformationOfWord = word_part.overwrites_word #print(f'transform: {self.text}') if word_part not in self.corrections: self.corrections.append(word_part) elif root_word.writing_process_id > -1\ and (len(word_part.transkription_positions) > 0\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style.writing_process_id\ == root_word.writing_process_id): word_part.extendsEarlierVersion = True #print('extends') if word_part not in self.corrections: self.corrections.append(word_part) else: if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) 
if word_part not in self.corrections: self.corrections.append(word_part) else: #print(f'default: {self.text}') word_parts.append(earlierWordPart) text = ''.join([ word.text for word in word_parts ])\ if len(word_parts) > 0\ else self.text if len(word_parts) == 1: self.transkription_positions += word_parts[0].transkription_positions self.faksimile_positions += word_parts[0].faksimile_positions word_parts = [] new_transkription_positions = copy.deepcopy(self.transkription_positions) if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None: writing_process_id = self.transkription_positions[0].style.writing_process_id for new_tp in new_transkription_positions: new_tp.style.writing_process_id = writing_process_id return Word(id=id, text=text, transkription_positions=new_transkription_positions,\ faksimile_positions=self.faksimile_positions, line_number=self.line_number,\ word_parts=word_parts) def create_correction_history(self, page=None, box_style=None): """Create correction history. """ if self.word_box is not None: manuscript = self.transkription_positions[0].style.manuscript\ if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None\ else None style = Style() if box_style is not None: style = box_style if page is not None: style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript) for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]: style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions) for transkription_position in transkription_positions: transkription_position.style = style self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\ line_number=self.line_number) for word_part in self.word_parts: word_part.create_correction_history(page=page, box_style=box_style) if len(self.word_parts) > 0: earlier_version = self.create_earlier_version() extending_words = self._get_parts_with_property_key('extendsEarlierVersion') if len(extending_words) > 0: for word in extending_words: word.isExtensionOfWord = earlier_version if self.has_mixed_status('deleted', include_parts=True): self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ]) if len(self.corrections) > 0: self.earlier_version = earlier_version @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. 
        :return: Word
        """
        if word_node is not None: # init word from xml node
            id = int(word_node.get('id'))
            line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
            text = word_node.get('text')
            deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
            transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
            faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
            word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
                    if len(word_node.findall('.//' + Word.DATA)) > 0\
                    else [ item.attrib for item in word_node.findall('.//part')]
            return Word(id=id, text=text, deleted=deleted, line_number=line_number,\
                    transkription_positions=transkription_positions,\
                    faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
        elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
            WIDTH = 5
            TOPCORRECTION = 2.0
            FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
            height = height
            x = round(float(word_part_objs[0]['x']), 3)
            if(page is not None and bool(page.style_dict)):
                HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
                style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
                biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
                height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
                TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
                if endSign is not None and '%' in endSign:
                    lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
                            for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
                            if bool(page.style_dict[key].get('font-size'))]
                    lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
                    endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
            elif endSign is not None and '%' in endSign:
                endX = float(endX) + WIDTH
            bottom = round(float(word_part_objs[0]['y']), 3)
            y = round(bottom - height + TOPCORRECTION, 3)
            width = round(float(endX) - x, 3)
            transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
            text = ''.join([ dict['text'] for dict in word_part_objs])
            line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
            word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
                    word_part_objs=word_part_objs)
            word.debug_msg = debug_msg
            return word
        else:
            error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
            raise Exception('Error: {}'.format(error_msg))

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
""" dictionary = super(Word,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\ cardinality=1, cardinality_restriction='minCardinality',\ name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\ name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\ name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\ comment='Word has been deleted by the author using a deletion path.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comment', EditorComment,\ name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\ name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\ name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')) + dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('clean_edited_text', str,\ + name='hasCleanEditedText', label='word has an edited text without punctuation',\ + comment='Word has a text without punctuation that is edited automatically by removing deleted parts or hyphens.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\ name='isClarificationOfWord', label='word is a clarification of word',\ comment='The author has used this part of the word in order to clarify the appearance of that word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\ name='isDeletionOfWord', label='word is a deletion of word',\ comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\ name='isExtensionOfWord', label='word is a extension of word',\ comment='The author has used this part of a word in order to extend an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\ name='isTransformationOfWord', label='word is a transformation of word',\ comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\ name='overwritesWord', label='word overwrites word',\ comment='The author has used this word in order to overwrite that word.')) # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING, # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class. 
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\
                name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\
                subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING))
        super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\
                name='isCorrectionOfWord', label='word is a correction of word',\
                comment='The author has used this word in order to correct that word.')
        for key in cls.XML_CORRECTION_DICT.keys():
            correction_dict = dictionary[cls.PROPERTIES_KEY].get(key)
            correction_dict.update(super_property_dictionary)
            dictionary[cls.PROPERTIES_KEY].update({key: correction_dict})
        return cls.return_dictionary_after_updating_super_classes(dictionary)

    def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
        """Returns true if transkription_positions have mixed status concerning the property_key in their __dict__.
        """
        if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions):
            return False
        if len(self.word_parts) > 0 and include_parts:
            if concerns_word:
                if False in set(property_key in word.__dict__.keys() for word in self.word_parts):
                    return False
                return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1
            else:
                return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\
                        if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1
        return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1

    def init_word(self, page):
        """Initialize word with objects from page.
        """
        super(Word,self).init_word(page)
        if self.writing_process_id > -1:
            self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ]
        writing_processes = self.writing_processes
        for word_part in self.word_parts:
            word_part.init_word(page)
            self.lines += word_part.lines
            self.writing_processes += word_part.writing_processes
        self.lines = [ line for line in set(self.lines) ]
        self.writing_processes = [ wp for wp in set(self.writing_processes)]
        if self.overwrites_word is not None:
            self.overwrites_word.init_word(page)
        if self.earlier_version is not None:
            if self.earlier_version.writing_process_id == -1:
                self.earlier_version.writing_process_id = self.writing_process_id-1
            if self.earlier_version.line_number == -1:
                self.earlier_version.line_number = self.line_number
            self.earlier_version.init_word(page)
        self.deletion_paths = [ page.get_word_deletion_path(path) for path in self.deletion_paths if path.path is not None ]
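    # Editor's note (not part of revision 110): a worked example of what
    # join() below does to the word text; the sample words are invented.
    #
    #     w = Word(text='Fort'); w.join(Word(text='setzung'))
    #     w.text                                                # -> 'Fortsetzung'
    #     w = Word(text='setzung')
    #     w.join(Word(text='Fort'), append_at_end_of_new_word=False)
    #     w.text                                                # -> 'Fortsetzung'
    #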
""" if append_at_end_of_new_word: self.text = self.text + other_word.text\ if not add_white_space_between_words\ else self.text + ' ' + other_word.text for position in other_word.transkription_positions: position.id = str(len(self.transkription_positions)) self.transkription_positions.append(position) for position in other_word.faksimile_positions: position.id = str(len(self.faksimile_positions)) self.faksimile_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 index = 0 for position in other_word.faksimile_positions: self.faksimile_positions.insert(indexposition) index += 1 while index < len(self.faksimile_positions): self.faksimile_positions[index].id = str(index) index += 1 self.simplify_transkription_positions() def partition_according_to_deletion(self): """Partition a word according to its transkription_positions' deletion status ->split word and add partial words as its parts. """ if self.has_mixed_status('deleted'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.deleted != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) for tp in transkription_positions: newWord.deletion_paths += tp._deletion_paths self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.deleted if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) for tp in transkription_positions: newWord.deletion_paths += tp._deletion_paths self.word_parts.append(newWord) self.transkription_positions = [] self.line_number = -1 self.deleted = False elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion') elif not self.deleted\ and len(self.transkription_positions) > 0\ and self.transkription_positions[0].deleted: self.deleted = True for tp in self.transkription_positions: self.deletion_paths += tp._deletion_paths def partition_according_to_writing_process_id(self): """Partition a word according to its transkription_positions' writing_process_ids ->split word and add partial words as its parts. 
""" if self.belongs_to_multiple_writing_processes(): last_writing_process_id = -1 transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.writing_process_id != last_writing_process_id\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_writing_process_id = transkription_position.writing_process_id if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id') if self.belongs_to_multiple_writing_processes(include_parts=True): self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0] elif len(self.transkription_positions) > 0: self.writing_process_id = self.transkription_positions[0].writing_process_id def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False): """Determines whether word is over a word box. """ word_over_box = None if len(self.word_parts) > 0: for word in self.word_parts: current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None)) if current_word is not None and current_word.word_box is not None: word_over_box = current_word else: new_tp_dict = {} for index, transkription_position in enumerate(self.transkription_positions): if previous_word_has_box and index == 0: if len(transkription_position.positional_word_parts) > 0: transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2 #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}') else: transkription_position.left += 1 word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) containing_boxes = [ box_path for box_path in box_paths\ if word_path.is_partially_contained_by(box_path)\ or box_path.do_paths_intersect(word_path) ] if len(containing_boxes) > 0: if previous_word_has_box: print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}') self._set_box_to_transkription_position(containing_boxes[0], word_path,\ transkription_position, new_tp_dict, tr_xmin) box_paths.remove(containing_boxes[0]) for replace_tp in new_tp_dict.keys(): for tp in new_tp_dict.get(replace_tp): self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp) self.transkription_positions.remove(replace_tp) word_over_box = self._get_partial_word_over_box() update_transkription_position_ids(self) return word_over_box def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def set_writing_process_id_to_transkription_positions(self, page): """Determines the writing process id of the transkription_positions. 
""" for transkription_position in self.transkription_positions: if len(transkription_position.positional_word_parts) > 0: for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): if font_key in page.fontsizekey2stage_mapping.keys(): transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) def simplify_transkription_positions(self): """Merge transkription_positions if possible. """ index = len(self.transkription_positions)-1 while index > 0\ and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]: current_tp = self.transkription_positions[index] index -= 1 previous_tp = self.transkription_positions[index] if previous_tp.is_mergebale_with(current_tp): positional_word_parts = previous_tp.positional_word_parts positional_word_parts += current_tp.positional_word_parts transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id) if len(transkription_positions) == 1: transkription_positions[0].writing_process_id = previous_tp.writing_process_id\ if previous_tp.writing_process_id != -1\ else current_tp.writing_process_id self.transkription_positions.pop(index+1) self.transkription_positions[index] = transkription_positions[0] #print(self.text, len(self.transkription_positions)) def split(self, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. """ previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) previousWord.faksimile_positions = self.faksimile_positions current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split') next_text = 
''.join([ pwp.text for pwp in next_pwps ]) nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) nextWord.faksimile_positions = self.faksimile_positions all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) currentWord.faksimile_positions = self.faksimile_positions return previousWord, currentWord, nextWord def split_according_to_status(self, status, splits_are_parts=False): """Split a word according to its transkription_positions' text. :return: a list of new word.Word """ new_words = [] if self.has_mixed_status(status): last_status = None transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.__dict__[status] != last_status\ and len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.__dict__[status] if len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) if splits_are_parts: self.word_parts += new_words if len(self.word_parts) > 0: self.transkription_positions = [] return new_words def undo_partitioning(self): """Undo partitioning. """ if len(self.word_parts) > 0: for word_part in self.word_parts: word_part.undo_partitioning() if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]): self.transkription_positions += word_part.transkription_positions self.earlier_version = None self.edited_text = None self.word_box = None self.word_parts = [] self.corrections = [] self.earlier_versions = [] self.box_paths = [] def _create_new_word(self, transkription_positions, status, new_id=0): """Create a new word from self and transkription_positions. """ newWord = Word(id=new_id, transkription_positions=transkription_positions) for key in self.COPY_PROPERTY_KEY: if key != status and key in self.__dict__.keys(): newWord.__dict__[key] = self.__dict__[key] if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys(): newWord.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(transkription_positions[0].__dict__[status]) else: newWord.__dict__[status] = transkription_positions[0].__dict__[status] return newWord def _get_parts_with_property_key(self, property_key): """Return a list of word_parts with property == property_key. """ word_parts = [] for word_part in self.word_parts: if property_key in word_part.__dict__.keys(): word_parts.append(word_part) else: word_parts += word_part._get_parts_with_property_key(property_key) return word_parts def _get_partial_word_over_box(self): """Partition a word according to its transkription_positions' has_box ->split word and add partial words as its parts. 
        :return: word over box or self
        """
        word_over_box = None
        if self.has_mixed_status('has_box'):
            transkription_positions = []
            last_word_box = None
            for transkription_position in self.transkription_positions:
                if transkription_position.has_box != last_word_box\
                        and len(transkription_positions) > 0:
                    newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                            transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                    self.word_parts.append(newWord)
                    if last_word_box is not None:
                        word_over_box = newWord
                        word_over_box.word_box = last_word_box
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_word_box = transkription_position.has_box
            if len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_word_box is not None:
                    word_over_box = newWord
                    word_over_box.word_box = last_word_box
            self.transkription_positions = []
        elif len(self.word_parts) > 0:
            #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box')
            for word_part in self.word_parts:
                if word_over_box is None:
                    word_over_box = word_part._get_partial_word_over_box()
                else:
                    break
        elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
            word_over_box = self
            word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
        return word_over_box

    def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
        """Set box_path to transkription_position that is contained by box_path.
            Create new transkription_positions by splitting old ones if necessary and add them to new_transkription_positions_dictionary.
        """
        if box_path.contains_path(word_path):
            transkription_position.has_box = box_path
        elif box_path.contains_start_of_path(word_path):
            split_position = box_path.path.bbox()[1] - tr_xmin
            new_tps = transkription_position.split(split_position)
            if len(new_tps) == 2:
                new_tps[0].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                transkription_position.has_box = box_path
        elif box_path.contains_end_of_path(word_path):
            split_position = box_path.path.bbox()[0] - tr_xmin
            new_tps = transkription_position.split(split_position)
            if len(new_tps) == 2:
                new_tps[1].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                transkription_position.has_box = box_path
        else: # box_path is in the middle of word_path
            split_position1 = box_path.path.bbox()[0] - tr_xmin
            split_position2 = box_path.path.bbox()[1] - tr_xmin
            new_tps = transkription_position.split(split_position1, split_position2)
            if len(new_tps) >= 2:
                new_tps[1].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                transkription_position.has_box = box_path
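# Editor's note: a self-contained sketch, not part of the diff, of the four
# overlap cases _set_box_to_transkription_position distinguishes, reduced to
# x-intervals; the real code compares svgpathtools bounding boxes. It assumes
# box and word intersect, as in the caller. All coordinates are invented.
def classify_box_overlap(box, word):
    """Return how the interval `box` (xmin, xmax) covers the interval `word`."""
    if box[0] <= word[0] and box[1] >= word[1]:
        return 'contains path'      # whole position gets the box
    if box[0] <= word[0] < box[1] < word[1]:
        return 'contains start'     # split once, box on the first part
    if word[0] < box[0] < word[1] <= box[1]:
        return 'contains end'       # split once, box on the second part
    return 'in the middle'          # split twice, box on the middle part

assert classify_box_overlap((0, 10), (2, 8)) == 'contains path'
assert classify_box_overlap((0, 5), (2, 8)) == 'contains start'
assert classify_box_overlap((6, 12), (2, 8)) == 'contains end'
assert classify_box_overlap((4, 6), (2, 8)) == 'in the middle'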
""" try: return mypath1.path.intersect(mypath2.path, justonemode=True)\ or mypath1.is_partially_contained_by(mypath2) except AssertionError: return False Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 109) +++ svgscripts/datatypes/page.py (revision 110) @@ -1,428 +1,430 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile, basename from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import re import sys import warnings from .box import Box from .color import Color from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .faksimile_position import FaksimilePosition +from .imprint import Imprint from .lineNumber import LineNumber from .line import Line from .mark_foreign_hands import MarkForeignHands from .matrix import Matrix from .path import Path from .positional_word_part import PositionalWordPart from .super_page import SuperPage from .style import Style from .text_connection_mark import TextConnectionMark from .text_field import TextField from .transkriptionField import TranskriptionField from .writing_process import WritingProcess from .word import Word from .word_deletion_path import WordDeletionPath from .word_insertion_mark import WordInsertionMark sys.path.append('py2ttl') from class_spec import SemanticClass sys.path.append('shared_util') from main_util import extract_paths_on_tf, get_paths_near_position FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK class Page(SemanticClass,SuperPage): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. faksimile_image: FaksimileImage. faksimile_svgFile: svg file containing information about word positions. 
""" UNITTESTING = False def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_paths_near_words=False, warn=False, number=None): if xml_source_file is not None: super(Page,self).__init__(xml_source_file) self.update_property_dictionary('faksimile_image', faksimile_image) self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile) self.init_all_properties() self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.faksimile_text_field = None self.svg_text_field = None self.init_node_objects() self.warn = warn self.add_deletion_paths_to_words(add_paths_near_words) else: self.page_tree = None self.number = number def add_deletion_paths_to_words(self, add_paths_near_words=False): """Add deletion paths to words. """ words = [ word for word in self.words if (len(word.word_parts) == 0 and word.deleted and len(word.deletion_paths) == 0)\ or 'add_paths_near_words' in word.process_flags ] words += [ word for word in self.words\ if len(word.word_parts) > 0 and True in\ [ (wp.deleted and len(wp.deletion_paths) == 0) for wp in word.word_parts ]] if len(words) > 0 and ((self.svg_file is not None and isfile(self.svg_file))\ or (self.source is not None and isfile(self.source))): svg_file = self.svg_file if self.svg_file is not None else self.source transkription_field = TranskriptionField(svg_file) tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0 tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0 word_deletion_paths = self.word_deletion_paths index = 0 dp_updated = False while index < len(words): word = words[index] word.add_deletion_paths(word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) if len(word.deletion_paths) > 0 or True in [ len(w.deletion_paths) > 0 for w in word.word_parts ]: deletion_paths = word.deletion_paths for wp in word.word_parts: deletion_paths += wp.deletion_paths for deletion_path in deletion_paths: if deletion_path not in self.word_deletion_paths: self.word_deletion_paths.append(deletion_path) elif not dp_updated: word_deletion_paths = extract_paths_on_tf(self) dp_updated = True index -= 1 if add_paths_near_words\ and ('add_paths_near_words' in word.process_flags\ or ((word.deleted and len(word.deletion_paths) == 0)\ or True in [ (w.deleted and len(w.deletion_paths) == 0) for w in word.word_parts ])): if not dp_updated\ and 'add_paths_near_words' in word.process_flags: word_deletion_paths = extract_paths_on_tf(self) dp_updated = True transform = None tp = None target_word = word paths_near_word = [] if word.deleted and len(word.transkription_positions) > 0: transform = word.transkription_positions[0].transform for tp in word.transkription_positions: word.deletion_paths_near_word += get_paths_near_position(tp, word_deletion_paths) elif len(word.word_parts) > 0: for wp in word.word_parts: if wp.deleted and len(wp.transkription_positions) > 0: target_word = wp for tp in wp.transkription_positions: wp.deletion_paths_near_word = get_paths_near_position(tp, word_deletion_paths) if self.warn and (word.deleted and len(word.deletion_paths) == 0): warnings.warn(\ f'WARNING: {self.title} {self.number}: {word.id} on {word.line_number}, {word.text} has no deletion paths! {target_word.deletion_paths_near_word}, {transform}') index += 1 @classmethod def create_cls(cls, xml_source_file=None, create_dummy_page=False, page_node=None): """Create a Page. 
""" if not create_dummy_page: return cls(xml_source_file) else: m = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file) if m is not None and len(m.groups()) > 3: number = m.group(3) else: number = basename(xml_source_file).replace('.xml','') return cls(number=number) @classmethod def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None): """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT. [optional: instantiation depends on the fulfilment of a status_contains and/or on the selection of some words by a word_selection_function]. """ source_tree = ET.parse(xml_file) if source_tree.getroot().find('metadata/type').text == cls.FILE_TYPE_SVG_WORD_POSITION: page = cls(xml_file) if word_selection_function is None or len(word_selection_function(page.words)) > 0: return [ page ] else: return [] elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: pages = [] xpath = '//page/@output' if status_contains != '' and status_not_contain != '': xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain) elif status_contains != '': xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains) elif status_not_contain != '': xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain) for xml_source_file in source_tree.xpath(xpath): if isfile(xml_source_file): pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function) return pages else: return [] @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'number': { 'class': str, 'cardinality': 1}} properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE)) properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\ name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\ comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD)) properties.update(cls.create_semantic_property_dictionary('orientation', str)) properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE)) properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\ name='pageIsOnSVGTextField', label='page is on svg text field',\ comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD)) - for key in [ 'lines', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']: + for key in [ 'imprints', 'lines', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']: properties.update(cls.create_semantic_property_dictionary(key, list)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def get_word_deletion_path(self, path=None, d_attribute=None) ->WordDeletionPath: """Return a word deletion path that belongs to page. 
""" if path is None and d_attribute is None: raise Exception('ERROR: get_word_deletion_path needs a path or a d_attribute!') if d_attribute is None: d_attribute = path.d_attribute page_paths = [ dpath for dpath in self.word_deletion_paths if dpath.d_attribute == d_attribute ] if len(page_paths) > 0: return page_paths[0] else: dpath = WordDeletionPath.create_cls(self, path=path, d_attribute=d_attribute) if dpath is not None: dpath.id = len(self.word_deletion_paths) self.word_deletion_paths.append(dpath) dpath.attach_object_to_tree(self.page_tree) return dpath def init_node_objects(self): """Initialize all node objects. """ self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ] self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ] self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] + self.imprints = [ Imprint.create_cls_from_node(imprint_node, self.lines) for imprint_node in self.page_tree.getroot().xpath('//' + Imprint.XML_TAG) ] self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] self.word_deletion_paths = [ WordDeletionPath.create_cls(self, node=node) for node in self.page_tree.xpath('./' + WordDeletionPath.XML_TAG) ] if self.faksimile_image is not None and self.faksimile_image.text_field is not None: self.faksimile_text_field = self.faksimile_image.text_field if self.svg_image is not None and self.svg_image.text_field is not None: self.svg_text_field = self.svg_image.text_field for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks: simple_word.init_word(self) for wim in self.word_insertion_marks: if wim.line_number > -1: wim.line = [ line for line in self.lines if line.id == wim.line_number ][0] def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]): """Update word ids and attach them to page.page_tree. 
""" if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None): """Update the data source of page. """ if faksimile_svgFile is not None: self.faksimile_svgFile = faksimile_svgFile data_node = self.page_tree.xpath('.//data-source')[0]\ if len(self.page_tree.xpath('.//data-source')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'data-source') data_node.set('file', self.faksimile_svgFile) if xml_correction_file is not None: data_node.set('xml-corrected-words', xml_correction_file) def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True): """Determines the width of the area where the line numbers are written in the page.source file. 
""" THRESHOLD = 0.4 if svg_tree is None: svg_tree = ET.parse(self.source) if len(self.line_numbers) > 1: line_number = self.line_numbers[9]\ if transkription_field.is_page_verso() and len(self.line_numbers) > 8\ else self.line_numbers[1] ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\ if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\ and LineNumber.IS_A_LINE_NUMBER(item)\ and LineNumber(raw_text_node=item).id == line_number.id ] if len(ln_nodes) > 0: matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform')) if transkription_field.is_page_verso(): transkription_field.add_line_number_area_width(matrix.getX()) elif self.svg_file is not None and isfile(self.svg_file): svg_path_tree = ET.parse(self.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } svg_x = matrix.getX() svg_y = self.line_numbers[1].bottom + transkription_field.ymin\ if set_to_text_field_zero\ else self.line_numbers[1].bottom use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) if len(d_strings) > 0 and d_strings[0] != '': path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin transkription_field.add_line_number_area_width(matrix.getX() + width) def update_page_type(self, transkription_field=None): """Adds a source to page and attaches it to page_tree. """ if self.number.endswith('r')\ or self.number.endswith('v'): self.page_type = Page.PAGE_VERSO\ if self.number.endswith('v')\ else Page.PAGE_RECTO else: if transkription_field is None: if self.source is None or not isfile(self.source): raise FileNotFoundError('Page does not have a source!') transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index) self.page_type = Page.PAGE_VERSO\ if transkription_field.is_page_verso()\ else Page.PAGE_RECTO self.page_tree.getroot().set('pageType', self.page_type) def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False): """Update styles of words and add them to their transkription_positions. Args: add_to_parents: Add styles also to word (and if not None to manuscript). partition_according_to_styles: Partition word if its transkription_positions have different styles. 
""" style_dictionary = {} if words is None: words = self.words for word in words: if len(word.word_parts) > 0: self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\ add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles) for transkription_position in word.transkription_positions: if len(transkription_position.positional_word_parts) > 0: style_class = transkription_position.positional_word_parts[0].style_class writing_process_id = -1 for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]: writing_process_id = self.fontsizekey2stage_mapping.get(font_key) style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id) if create_css: if style_dictionary.get((style_class_key, word.deleted)) is None: color = None if len(word.deletion_paths) > 0: if word.deletion_paths[0].style_class is not None\ and word.deletion_paths[0].style_class != ''\ and self.style_dict.get(word.deletion_paths[0].style_class) is not None: color = Color.create_cls_from_style_object(self.style_dict.get(word.deletion_paths[0].style_class)) else: color = Color() style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\ create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] ) transkription_position.style = style_dictionary[(style_class_key, word.deleted)] #print(style_dictionary[(style_class_key, word.deleted)]) else: if style_dictionary.get(style_class_key) is None: style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css) style_dictionary[style_class_key].writing_process_id = style_class_key[1] transkription_position.style = style_dictionary[style_class_key] if add_to_parents and transkription_position.style not in word.styles: word.styles.append(transkription_position.style) if partition_according_to_styles: word.split_according_to_status('style', splits_are_parts=True) if manuscript is not None\ and add_to_parents: manuscript.update_styles(*style_dictionary.values()) def __eq__(self, other): """Returns true if self is qualitatively identical to other. """ if other is None: return False if self.page_tree is None and other.page_tree is None: return self.number == other.number if self.page_tree is None or other.page_tree is None: return False return self.page_tree.docinfo.URL == other.page_tree.docinfo.URL def __hash__(self): """Return a hash value for self. """ try: if self.page_tree is None: return hash(self.number) except AttributeError: print(self) return hash(self.number) return hash(self.page_tree.docinfo.URL) Index: svgscripts/datatypes/imprint.py =================================================================== --- svgscripts/datatypes/imprint.py (revision 0) +++ svgscripts/datatypes/imprint.py (revision 110) @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" This program can be used to extract imprints from a svg file. +""" +# Copyright (C) University of Basel 2021 {{{1 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+import re
+import sys
+from os import listdir, sep, path
+from os.path import isfile, isdir, dirname
+import lxml.etree as ET
+import warnings
+
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+from .attachable_object import AttachableObject
+from .atypical_writing import AtypicalWriting
+from .clarification import Clarification
+from .editor_correction import EditorCorrection
+from .line_continuation import LineContinuation
+from .matrix import Matrix
+from .standoff_tag import StandoffTag
+from .text import Text
+from .transkriptionField import TranskriptionField
+from .uncertain_decipherment import UncertainDecipherment
+from .footnotes import FootnoteColumns
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+from xml_conform_dictionary import XMLConformDictionary
+
+UNITTESTING = False
+DEBUG = False
+
+class Imprint(SemanticClass,AttachableObject):
+    """This class represents an imprint, i.e. the reference to the printed version of the corresponding manuscript.
+    """
+    START_END_LINE_PATTERN = re.compile(r'^(\d+)(-)(\d+)(:\s*)(.*)')
+    COMMA_LINE_PATTERN = re.compile(r'^(\d+)(,)(\d+)(-)(\d+)(:\s*)(.*)')
+    LINE_PATTERN = re.compile(r'^(((\d+,)*\d+-)*\d+)(:\s)(.*)')
+    XML_TAG = 'imprint'
+    DEBUG = False
+
+    def __init__(self, reference=None, lines=None, line_list_string='', id=0):
+        self.id = id
+        self.reference = reference
+        self.lines = lines if lines is not None else []
+        self.line_list_string = line_list_string
+
+    def attach_object_to_tree(self, target_tree):
+        """Attach object to tree.
+        """
+        obj_node = self.get_or_create_node_with_id(target_tree)
+        obj_node.set('reference', self.reference)
+        if self.line_list_string != '':
+            obj_node.set('line-list-string', self.line_list_string)
+
+    @classmethod
+    def create_cls_from_node(cls, node, lines):
+        """Initialize a cls from node.
+
+        [:return:] cls
+        """
+        reference = node.get('reference')
+        line_list_string = node.get('line-list-string')\
+                if bool(node.get('line-list-string')) else ''
+        return cls(reference=reference, lines=get_lines(lines, line_list_string), line_list_string=line_list_string)
+
+    @classmethod
+    def extract_cls(cls, lines, raw_node, namespaces, id=0):
+        """Return an instance of cls by extracting information from raw svg node.
+        """
+        raw_string = ''.join(raw_node.xpath('./ns:tspan/text()', namespaces=namespaces))
+        line_match = re.match(cls.LINE_PATTERN, raw_string)
+        if line_match is not None:
+            return cls(line_match.groups()[-1], line_list_string=line_match.groups()[0], id=id)
+        else:
+            return cls(raw_string, id=id)
+
+    @classmethod
+    def get_semantic_dictionary(cls):
+        """ Creates a semantic dictionary as specified by SemanticClass.
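+
+        A sketch of how the dictionary is typically consumed (superclasses
+        may contribute further properties via the update below):
+
+            >>> properties = Imprint.get_semantic_dictionary()[Imprint.PROPERTIES_KEY]
+            >>> 'reference' in properties and 'lines' in properties
+            True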
+ """ + properties = {} + properties.update(cls.create_semantic_property_dictionary('reference', str,\ + name='imprintHasReference', label='imprint refers to the signature of the printed version of the manuscript')) + properties.update(cls.create_semantic_property_dictionary('lines', list,\ + name='imprintRefersToLines', label='the printed version of the manuscript concerns this list of lines')) + dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties } + return cls.return_dictionary_after_updating_super_classes(dictionary) + +def get_lines(lines, line_list_string='') ->list: + """Returns a list of lines that correspond to the lines that are imprinted + """ + # TODO create tln:partOfPageTextUnit for each line range + if line_list_string == '': + return [] + relevant_lines = [] + if re.match(r'(.*\d+)(,)(\d+.*)', line_list_string): + for line_list_sub_string in line_list_string.split(','): + relevant_lines += get_lines(lines, line_list_string=line_list_sub_string) + return relevant_lines + multi_line_match = re.match(r'(\d+)(-)(\d+)', line_list_string) + single_line_match = re.match(r'^\d+$', line_list_string) + if multi_line_match is not None: + start_segment = int(multi_line_match.groups()[0]) + end_segment = int(multi_line_match.groups()[2]) + return [ line for line in lines if line.id >= start_segment and line.id <= end_segment ] + elif single_line_match is not None: + return [ line for line in lines if line.id == int(single_line_match.group()) ] + return relevant_lines + +def extract_imprints(page, transkription_field=None, svg_tree=None) ->list: + """Returns a list of imprints. + """ + if page.marginals_source is not None: + svg_tree = ET.parse(page.marginals_source) + if transkription_field is None: + transkription_field = TranskriptionField(page.source) + if svg_tree is None and page.source is not None: + svg_tree = ET.parse(page.source) + if transkription_field is None: + transkription_field = TranskriptionField(svg_tree.docinfo.URL) + nodes_beneath_tf = [ item for item in filter(lambda node: Matrix.IS_BENEATH_TF(Matrix(transform_matrix_string=node.get('transform')), transkription_field),\ + svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] + if len(nodes_beneath_tf) == 0: + return [] + x = Matrix(transform_matrix_string=nodes_beneath_tf[0].get('transform')).getX() + namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } + imprints = [ Imprint.extract_cls(page.lines, node, namespaces, id=i) for (i, node) in enumerate([ node for node in svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap)\ + if Matrix(transform_matrix_string=node.get('transform')).getX() == x ]) ] + return imprints + + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) Index: svgscripts/datatypes/footnotes.py =================================================================== --- svgscripts/datatypes/footnotes.py (revision 109) +++ svgscripts/datatypes/footnotes.py (revision 110) @@ -1,347 +1,347 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract footnotes from a svg file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}

import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
import warnings

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

from .atypical_writing import AtypicalWriting
from .clarification import Clarification
from .editor_correction import EditorCorrection
from .line_continuation import LineContinuation
from .matrix import Matrix
from .standoff_tag import StandoffTag
from .text import Text
from .transkriptionField import TranskriptionField
from .uncertain_decipherment import UncertainDecipherment

UNITTESTING = False
DEBUG = False

class FootnoteColumns:
    """This class represents footnote columns.
    """
    REFERENCE_PATTERN = re.compile('.*(\d+-)*[0-9]+:')
    EXTENDED_REFERENCE_PATTERN = re.compile('.*(\d+(-|/))*[0-9]+:')
    REFERENCE_GROUP = re.compile('(.*\D)((\d+-)*[0-9]+:)')
    EXCEPTION = re.compile('((\d+/)+[0-9]+:)')

    def __init__(self, nsmap, nodes, bottom_values, style_dict, debug=False, skip_after=-1.0):
        self.bottom_values = bottom_values
        self.footnote_columns = []
        self.footnote_keys = {}
        self.index = 0
        self.nodes = nodes
        self.nsmap = nsmap
        self.skip_after = skip_after
        self.style_dict = style_dict
        self.debug = debug
        self._init_columns()

    def _init_columns(self):
        """Initialize footnote column positions by creating lists in self.footnote_columns
        and adding the positions as keys to self.footnote_keys, with the corresponding
        indices in self.footnote_columns as their values.
        """
        first_line_fn_nodes = sorted([ item for item in self.nodes\
                if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == round(self.bottom_values[0], 1)\
                and Matrix(transform_matrix_string=item.get('transform')).getX() > self.skip_after],\
                key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX())
        current_nodes = []
        for node in first_line_fn_nodes:
            matrix = Matrix(transform_matrix_string=node.get('transform'))
            if len(node.getchildren()) > 0:
                for tspan in node.findall('tspan', self.nsmap):
                    x = matrix.add2X(float(tspan.get('x')))
                    current_nodes.append({ 'x': x, 'text': tspan.text })
            elif node.text is not None:
                x = matrix.getX()
                current_nodes.append({ 'x': x, 'text': node.text })
            if re.match(self.EXTENDED_REFERENCE_PATTERN,\
                    ''.join([ item.get('text') for item in current_nodes])):
                current_nodes = self._remove_unused_texts(current_nodes)
                self.footnote_columns.append([])
                self.footnote_keys.update({ round(current_nodes[0].get('x')): len(self.footnote_columns)-1 })
                current_nodes = []
        if len(self.footnote_keys) == 0:
            raise Exception('ERROR: there are no footnote_keys')

    def _remove_unused_texts(self, nodes):
        """Remove tspan that contain text that is not a line reference.
""" threshold = 100 node_text = ''.join([ item.get('text') for item in nodes]) match = re.match(self.REFERENCE_GROUP, node_text) if match is not None and match.group(1) is not None\ and not re.match(self.EXCEPTION, node_text): unused_text = '' index = 0 for item in nodes: unused_text += item.get('text') if match.group(1).startswith(unused_text): index += 1 else: break if len(nodes) > index+1: counter = 0 has_gap = False for item in nodes[index:]: if len(nodes) > index+counter+1\ and nodes[index+counter+1].get('x')-nodes[index+counter].get('x') > threshold: index += counter+1 has_gap = True break counter += 1 if has_gap: return nodes[index+1:] return nodes[index:] return nodes def append(self, footnote): """Append footnote to a column """ self.footnote_columns[self.index].append(footnote) @classmethod def create_cls(cls, style_dict=None, page=None, transkription_field=None, svg_tree=None, svg_file=None, marginals_on_extra_page=False, skip_after=-1.0): """Returns all footnotes as a list of Text. """ if page is not None and page.source is not None and svg_file is None: svg_file = page.source\ if page.marginals_source is None\ else page.marginals_source if transkription_field is None and svg_file is not None: multipage_index = -1\ if page is None\ else page.multipage_index transkription_field = TranskriptionField(svg_file, multipage_index=multipage_index) if svg_tree is None and svg_file is not None: svg_tree = ET.parse(svg_file) if style_dict is None and page is not None: style_dict = StandoffTag.create_relevant_style_dictionary(page) if page is not None and page.marginals_source is not None: marginals_on_extra_page = True svg_tree = ET.parse(page.marginals_source) nodes_in_footnote_area = cls.EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field, marginals_on_extra_page=marginals_on_extra_page) bottom_values = cls.GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area) if len(bottom_values) == 0: return None else: return cls(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, style_dict, skip_after=skip_after) def extract_footnotes(self, contains_string='', contains_strings=None) -> list: """Returns all footnotes as a list of Text. """ left_value = -1 for bottom_value in self.bottom_values: nodes_on_line = sorted([ item for item in self.nodes\ if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == bottom_value\ and Matrix(transform_matrix_string=item.get('transform')).getX() > self.skip_after\ ],\ key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX()) footnote = None matrix = None for node in nodes_on_line: matrix = Matrix(transform_matrix_string=node.get('transform')) footnote, left_value = self._process_content_and_markup(node, footnote, matrix) if footnote is not None: self.append(footnote) footnotes = self.toList() if contains_strings is not None: footnotes = [ footnote for footnote in footnotes if True in [ contains_string in footnote.content for contains_string in contains_strings] ] if contains_string != '': footnotes = [ footnote for footnote in footnotes if contains_string in footnote.content ] return footnotes def get_index(self, left_value) -> int: """Return index of column for left value. """ index = -1 if round(left_value) in self.footnote_keys.keys(): index = self.footnote_keys[round(left_value)] else: for key, value in self.footnote_keys.items(): if abs(key - round(left_value)) < 2: index = value break return index def register_index(self, left_value): """Register index for next column to be used. 
""" index = self.get_index(left_value) if index > -1: self.index = index else: error_value = round(left_value) msg = f'Left value not part of columns: {error_value} -> {self.footnote_keys}' raise Exception(msg) def toList(self): """Return footnotes as a list of Text. """ footnotes = [] for footnote_list in self.footnote_columns: for footnote in footnote_list: if re.match(self.REFERENCE_PATTERN, footnote.content): footnotes.append(footnote) elif len(footnotes) > 0: footnotes[-1].join(footnote) else: print([ footnote.content for footnote in self.footnote_columns[1]]) print(self.footnote_keys) raise Exception(f'List of footnotes empty and footnote "{footnote.content}" does not match {self.REFERENCE_PATTERN.pattern}!') return footnotes def _process_content_and_markup(self, node, footnote, matrix): """Process content and markup of node. [:return:] (footnote: Text, left_value: float) """ startIndex = 0 next_text = node.text left_value = matrix.getX() items = [ item for item in node.findall('tspan', self.nsmap)] if len(items) > 0: next_text = ''.join([ item.text for item in items]) left_value = matrix.add2X(float(items[0].get('x'))) elif bool(node.get('x')): left_value = matrix.add2X(float(node.get('x'))) if footnote != None and\ ((re.match(r'.*[0-9]+:', next_text)\ and re.match(r'.*[0-9]+:', footnote.content)\ and not re.match(r'.*\d-', footnote.content))\ or (self.get_index(left_value) > -1\ and self.get_index(left_value) != self.index)): if DEBUG and re.match(r'.*[0-9]+:', next_text)\ and not re.match(r'.*[0-9]+:', footnote.content): print(footnote, next_text) self.append(footnote) footnote = None if len(items) > 0: for item in items: footnote, left_value = self._process_content_and_markup(item, footnote, matrix) else: if footnote is None: footnote = Text(content=next_text) try: self.register_index(left_value) except Exception: print(self.footnote_columns) raise Exception(f'{footnote}') else: startIndex = footnote.append(next_text) if bool(node.get('class')): - standoff_markups = StandoffTag.create_cls(startIndex, len(footnote.content), node.get('class'), style_dict=self.style_dict) + standoff_markups = StandoffTag.create_cls(startIndex, len(footnote.content)-1, node.get('class'), style_dict=self.style_dict) if len(standoff_markups) > 0: if len(footnote.standoff_markups) > 0: standoff_markups = footnote.standoff_markups[-1].join_list(standoff_markups) if len(standoff_markups) > 0: footnote.standoff_markups += standoff_markups return footnote, left_value @staticmethod def EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field=None, marginals_on_extra_page=False) ->list: """Return a list of nodes that are in footnote area. 
""" if transkription_field is None and svg_tree is not None: transkription_field = TranskriptionField(svg_tree.docinfo.URL) nodes_in_footnote_area = [ item for item in filter(lambda node: Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, transkription_field,\ marginals_on_extra_page=marginals_on_extra_page),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] for node in nodes_in_footnote_area: if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, marginals_on_extra_page=marginals_on_extra_page): for child in node.getchildren(): if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, x=float(child.get('x')), marginals_on_extra_page=marginals_on_extra_page): node.remove(child) return nodes_in_footnote_area @staticmethod def GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area) ->list: """Return sorted list of unique bottom values. """ return sorted([ bottom_value for bottom_value in set(round(Matrix(transform_matrix_string=item.get('transform')).getY(),1) for item in nodes_in_footnote_area) ]) def extract_footnotes_as_strings(transkription_field=None, svg_tree=None, svg_file=None, contains_string='', marginals_extra=False): """Returns all footnotes as a list of strings. """ if transkription_field is None and svg_file is not None: transkription_field = TranskriptionField(svg_file) if svg_tree is None and svg_file is not None: svg_tree = ET.parse(svg_file) footnotes = [] nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ]) for bottom_value in bottom_values: nodes_on_line = [ item for item in nodes_in_footnote_area if Matrix(transform_matrix_string=item.get('transform')).getY() == bottom_value ] nodes_on_line = sorted(nodes_on_line, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX()) footnote_string = '' for node in nodes_on_line: if len(node.getchildren()) == 0: if footnote_string != '' and re.match(r'.*[0-9]+:', node.text): footnotes.append(footnote_string) footnote_string = node.text else: footnote_string += node.text else: next_string = ''.join([ item.text for item in node.findall('tspan', svg_tree.getroot().nsmap)]) if footnote_string != '' and re.match(r'.*[0-9]+:', next_string): footnotes.append(footnote_string) footnote_string = next_string else: footnote_string += next_string footnotes.append(footnote_string) if contains_string != '': footnotes = [ footnote_string for footnote_string in footnotes if contains_string in footnote_string ] return footnotes def extract_footnotes(page, transkription_field=None, svg_tree=None, svg_file=None, contains_string='', contains_strings=None, skip_after=-1.0) ->list: """Returns all footnotes as a list of Text. 
""" marginals_on_extra_page = False if page.marginals_source is not None: marginals_on_extra_page = True svg_tree = ET.parse(page.marginals_source) if transkription_field is None: transkription_field = TranskriptionField(page.source) footnote_columns = FootnoteColumns.create_cls(page=page, transkription_field=transkription_field,\ svg_tree=svg_tree, svg_file=svg_file, marginals_on_extra_page=marginals_on_extra_page, skip_after=skip_after) if footnote_columns is None: return [] return footnote_columns.extract_footnotes(contains_string=contains_string, contains_strings=contains_strings) if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/datatypes/simple_word.py =================================================================== --- svgscripts/datatypes/simple_word.py (revision 109) +++ svgscripts/datatypes/simple_word.py (revision 110) @@ -1,124 +1,139 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent a simple word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc from lxml import etree as ET +import re import sys from .line import Line from .faksimile_position import FaksimilePosition from .transkription_position import TranskriptionPosition from .word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass class SimpleWord(SemanticClass, metaclass=abc.ABCMeta): """ This class represents a simple word. """ + PUNCTUATION_PATTERN = re.compile('(^[\.\?,\!;:\-_–()“„]|[\.\?,\!;:\-_–()“„]$)') + FIND_PUNCTUATION_PATTERN = re.compile('(^[\.\?,\!;:\-_–()“„]|.*[\.\?,\!;:\-_–()“„]$)') XML_TAG = 'simple-word' XML_SUB_TAG = 'content' def __init__(self, id=0, line_number=-1, line=None, text='', deleted=False, transkription_positions=None, faksimile_positions=None): self.id = id self.text = text + self.clean_text = self._create_clean_text(text) self.line_number = line_number self.lines = [] if line is not None: self.lines.append(line) self.transkription_positions = transkription_positions if transkription_positions is not None else [] self.faksimile_positions = faksimile_positions if faksimile_positions is not None else [] def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. 
""" if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() if len(target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)) > 0: word_node = target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)[0] word_node.getparent().remove(word_node) word_node = ET.SubElement(target_tree, self.XML_TAG, attrib={'id': str(self.id)}) word_node.set('text', self.text) if self.line_number > -1: word_node.set('line-number', str(self.line_number)) for id, transkription_position in enumerate(self.transkription_positions): transkription_position.id = id transkription_position.attach_object_to_tree(word_node) for faksimile_position in self.faksimile_positions: faksimile_position.attach_object_to_tree(word_node) return word_node + def _create_clean_text(self, text: str) ->str: + """Creates a text without any punctuation chars. + """ + if len(text) < 2\ + or (len(text) < 3 and re.match(self.FIND_PUNCTUATION_PATTERN, text[0]) is None)\ + or re.match(self.FIND_PUNCTUATION_PATTERN, text) is None: + return text + return self._create_clean_text(re.sub(self.PUNCTUATION_PATTERN, '', text)) + @classmethod def create_cls(cls, word_node): """Creates a cls from a (lxml.Element) node. [:return:] cls """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else -1 text = word_node.get('text') transkription_positions = [ TranskriptionPosition(id=id, node=node) for id, node in enumerate(word_node.findall('./' + WordPosition.TRANSKRIPTION)) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('./' + WordPosition.FAKSIMILE) ] return cls(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) else: error_msg = 'word_node has not been defined' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'lines': {cls.CLASS_KEY: Line,\ cls.CARDINALITY: 1,\ cls.CARDINALITY_RESTRICTION: 'minCardinality',\ cls.PROPERTY_NAME: 'wordBelongsToLine',\ cls.PROPERTY_LABEL: 'word belongs to a line',\ cls.PROPERTY_COMMENT: 'Relating a word to a line.'}} properties.update(cls.create_semantic_property_dictionary('transkription_positions', TranskriptionPosition,\ name='hasTranskriptionPosition', cardinality=1, cardinality_restriction='minCardinality')) properties.update(cls.create_semantic_property_dictionary('faksimile_positions', FaksimilePosition,\ name='hasFaksimilePosition', cardinality=1, cardinality_restriction='minCardinality')) properties.update(cls.create_semantic_property_dictionary('text', str, cardinality=1,\ subPropertyOf=cls.HOMOTYPIC_HAS_TEXT_URL_STRING)) + properties.update(cls.create_semantic_property_dictionary('clean_text', str, cardinality=1,\ + name='hasCleanText', label='text without punctuation', comment='text of word without punctuation except abbrevations')) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def init_word(self, page): """Initialize word with objects from page. 
""" if self.line_number > -1: self.lines += [ line for line in page.lines if line.id == self.line_number ] elif 'word_parts' in self.__dict__.keys() and len(self.word_parts) > 0: self.lines += [ line for line in page.lines if line.id in [ wp.line_number for wp in self.word_parts ] ] Index: svgscripts/datatypes/super_page.py =================================================================== --- svgscripts/datatypes/super_page.py (revision 109) +++ svgscripts/datatypes/super_page.py (revision 110) @@ -1,295 +1,296 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a super page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile, basename, dirname from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import sys import warnings from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .mark_foreign_hands import MarkForeignHands from .text_connection_mark import TextConnectionMark from .text_field import TextField from .writing_process import WritingProcess class SuperPage: """ This super class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. 
""" FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition' FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile' ADD2Y = 7 PAGE_RECTO = 'recto' PAGE_VERSO = 'verso' STATUS_MERGED_OK = 'faksimile merged' STATUS_POSTMERGED_OK = 'words processed' UNITTESTING = False XML_TAG = 'page' def __init__(self, xml_file, title=None, page_number='', orientation='North', multipage_index=-1, page_type=PAGE_VERSO, should_xml_file_exist=False): self.properties_dictionary = {\ 'faksimile_image': (FaksimileImage.XML_TAG, None, FaksimileImage),\ 'faksimile_svgFile': ('data-source/@file', None, str),\ 'multipage_index': ('page/@multipage-index', multipage_index, int),\ 'marginals_source': ('page/@marginals-source', None, str),\ 'number': ('page/@number', str(page_number), str),\ 'orientation': ('page/@orientation', orientation, str),\ 'page_type': ('page/@pageType', page_type, str),\ 'pdfFile': ('pdf/@file', None, str),\ 'source': ('page/@source', None, str),\ 'svg_file': ('svg/@file', None, str),\ 'svg_image': (SVGImage.XML_TAG, None, SVGImage),\ 'text_field': (FaksimileImage.XML_TAG + '/' + TextField.XML_TAG, None, TextField),\ 'title': ('page/@title', title, str),\ } self.bak_file = None self.online_properties = [] + self.imprints = [] self.line_numbers = [] self.lines = [] self.mark_foreign_hands = [] self.page_tree = None self.sonderzeichen_list = [] self.style_dict = {} self.text_connection_marks = [] self.word_deletion_paths = [] self.word_insertion_marks = [] self.words = [] self.writing_processes = [] self.xml_file = xml_file if not self.is_page_source_xml_file(): msg = f'ERROR: xml_source_file {self.xml_file} is not of type "{FILE_TYPE_SVG_WORD_POSITION}"' raise Exception(msg) self._init_tree(should_xml_file_exist=should_xml_file_exist) def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None): """Adds a list of classes that are sonderzeichen and a style dictionary to page. 
""" self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else [] self.letterspacing_list = letterspacing_list if letterspacing_list is not None else [] self.style_dict = style_dict if style_dict is not None else {} if style_node is not None: self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') } self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ] self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('letterspacing-list')) ] elif bool(self.style_dict): for node in self.page_tree.xpath('//style'): node.getparent().remove(node) style_node = ET.SubElement(self.page_tree.getroot(), 'style') if len(self.sonderzeichen_list) > 0: style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list)) if len(self.letterspacing_list) > 0: style_node.set('letterspacing-list', ' '.join(self.letterspacing_list)) for key in self.style_dict.keys(): self.style_dict[key]['name'] = key ET.SubElement(style_node, 'class', attrib=self.style_dict[key]) fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value } fontsizes = sorted(fontsize_dict.values(), reverse=True) # create a mapping between fontsizes and word stages self.fontsizekey2stage_mapping = {} for fontsize_key, value in fontsize_dict.items(): if value >= fontsizes[0]-1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION }) elif value <= fontsizes[-1]+1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION }) else: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION }) def get_biggest_fontSize4styles(self, style_set={}): """Returns biggest font size from style_dict for a set of style class names. [:returns:] (float) biggest font size OR 1 if style_dict is empty """ if bool(self.style_dict): sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True) return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1 else: return 1 def get_line_number(self, y): """Returns line number id for element at y. [:return:] (int) line number id or -1 """ if len(self.line_numbers) > 0: result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ] return result_list[0] if len(result_list) > 0 else -1 else: return -1 def init_all_properties(self, overwrite=False): """Initialize all properties. """ for property_key in self.properties_dictionary.keys(): if property_key not in self.online_properties: self.init_property(property_key, overwrite=overwrite) def init_property(self, property_key, value=None, overwrite=False): """Initialize all properties. 
        Args:
            property_key: key of property in self.__dict__
            value: new value to set to property
            overwrite: whether or not to update values from xml_file (default: read only)
        """
        if value is None:
            if property_key not in self.online_properties:
                xpath, value, cls = self.properties_dictionary.get(property_key)
                if len(self.page_tree.xpath('//' + xpath)) > 0:
                    value = self.page_tree.xpath('//' + xpath)[0]
                if value is not None:
                    if cls.__module__ == 'builtins':
                        self.update_tree(value, xpath)
                        self.__dict__.update({property_key: cls(value)})
                    else:
                        value = cls(node=value)\
                                if type(value) != cls\
                                else value
                        self.__dict__.update({property_key: value})
                        self.__dict__.get(property_key).attach_object_to_tree(self.page_tree)
                else:
                    self.__dict__.update({property_key: value})
                self.online_properties.append(property_key)
        elif overwrite or property_key not in self.online_properties:
            xpath, default_value, cls = self.properties_dictionary.get(property_key)
            if cls.__module__ == 'builtins':
                self.__dict__.update({property_key: cls(value)})
                self.update_tree(value, xpath)
            else:
                self.__dict__.update({property_key: value})
                self.__dict__.get(property_key).attach_object_to_tree(self.page_tree)
            self.online_properties.append(property_key)

    def is_locked(self):
        """Return true if page is locked.
        """
        return len(self.page_tree.xpath('//metadata/lock')) > 0

    def is_page_source_xml_file(self, source_tree=None):
        """Return true if xml_file is of type FILE_TYPE_SVG_WORD_POSITION.
        """
        if not isfile(self.xml_file):
            return True
        if source_tree is None:
            source_tree = ET.parse(self.xml_file)
        return source_tree.getroot().find('metadata/type').text == self.FILE_TYPE_SVG_WORD_POSITION

    def lock(self, reference_file, message=''):
        """Lock tree such that ids of words etc. correspond to ids in reference_file,
        optionally add a message that will be shown.
        """
        if not self.is_locked():
            metadata = self.page_tree.xpath('./metadata')[0]\
                    if len(self.page_tree.xpath('./metadata')) > 0\
                    else ET.SubElement(self.page_tree.getroot(), 'metadata')
            lock = ET.SubElement(metadata, 'lock')
            ET.SubElement(lock, 'reference-file').text = reference_file
            if message != '':
                ET.SubElement(lock, 'message').text = message

    def unlock(self):
        """Remove the lock from the tree so that ids of words etc. can be updated again.
        """
        if self.is_locked():
            lock = self.page_tree.xpath('//metadata/lock')[0]
            lock.getparent().remove(lock)

    def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
        """Update word ids and attach them to page.page_tree.
""" if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_property_dictionary(self, property_key, default_value): """Update properties_dictionary. """ content = self.properties_dictionary.get(property_key) if content is not None: self.properties_dictionary.update({property_key: (content[0], default_value, content[2])}) else: msg = f'ERROR: properties_dictionary does not contain a key {property_key}!' raise Exception(msg) def update_tree(self, value, xpath): """Update tree. """ node_name = dirname(xpath) node = self.page_tree.xpath('//' + node_name)[0]\ if len(self.page_tree.xpath('//' + node_name)) > 0\ else ET.SubElement(self.page_tree.getroot(), node_name) node.set(basename(xpath).replace('@', ''), str(value)) def _init_tree(self, should_xml_file_exist=False): """Initialize page_tree from xml_file if it exists. """ if isfile(self.xml_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(self.xml_file, parser) elif not should_xml_file_exist: self.page_tree = ET.ElementTree(ET.Element('page')) self.page_tree.docinfo.URL = self.xml_file else: msg = f'ERROR: xml_source_file {self.xml_file} does not exist!' 
raise FileNotFoundError(msg) Index: tests_svgscripts/test_util.py =================================================================== --- tests_svgscripts/test_util.py (revision 109) +++ tests_svgscripts/test_util.py (revision 110) @@ -1,256 +1,260 @@ import unittest from os import sep, path, remove, listdir from os.path import isdir, isfile, dirname, basename import shutil import sys import lxml.etree as ET import sys import tempfile import warnings sys.path.append('svgscripts') import util from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT from datatypes.faksimile import FaksimilePage from datatypes.page import Page from datatypes.page_creator import PageCreator from datatypes.positional_word_part import PositionalWordPart from datatypes.text_field import TextField from datatypes.transkriptionField import TranskriptionField from datatypes.word_position import WordPosition from datatypes.word import Word sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT sys.path.append('fixes') from fix_old_data import save_page class TestCopy(unittest.TestCase): def setUp(self): util.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_dir = DATADIR self.faksimile_dir = DATADIR + sep + 'faksimile_svg' self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg' self.image = DATADIR + sep + 'image.jpg' self.svg_testrecord = DATADIR + sep + 'TESTRECORD.svg' self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' self.Mp_XIV_page420 = DATADIR + sep + 'Mp_XIV_page420.xml' self.tmp_dir = tempfile.mkdtemp() def test_copy(self): tmp_image = self.tmp_dir + sep + basename(self.image) target_file = 'asdf.svg' shutil.copy(self.image, self.tmp_dir) util.copy_faksimile_svg_file(target_file, faksimile_source_file=self.faksimile_file,\ target_directory=self.tmp_dir, local_image_path=tmp_image) self.assertEqual(isfile(self.tmp_dir + sep + target_file), True) util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_file,\ target_directory=self.tmp_dir, local_image_path=tmp_image) self.assertEqual(isfile(self.tmp_dir + sep + basename(self.faksimile_file)), True) with self.assertRaises(Exception): util.copy_faksimile_svg_file() with self.assertRaises(Exception): util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_source_file) def test_copy_xml(self): old_page = Page(self.xml_file) xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) self.assertEqual(isfile(xml_file), True) page = Page(xml_file) self.assertEqual(len(page.words), len(old_page.words)) self.assertEqual(len(page.line_numbers), 0) def test_create_highlighted_svg_file(self): target_file = self.tmp_dir + sep + basename(self.faksimile_file) tmp_image = self.tmp_dir + sep + basename(self.image) faksimile_tree = ET.parse(self.faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } node_ids = ['rect947', 'rect951', 'rect953', 'rect955', 'rect959', 'rect961', 'rect963'] highlight_color = 'blue' util.create_highlighted_svg_file(faksimile_tree, node_ids, target_directory=self.tmp_dir, highlight_color=highlight_color, namespaces=namespaces) self.assertEqual(isfile(target_file), True) new_tree = ET.parse(target_file) for node in new_tree.xpath('//ns:rect[@fill="{0}"]|//ns:path[@fill="{0}"]'.format(highlight_color), namespaces=namespaces): node_ids.remove(node.get('id')) self.assertEqual(len(node_ids), 0) def 
test_get_empty_node_ids(self): faksimile_tree = ET.parse(self.faksimile_file) faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] empty_node_ids = util.get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page) self.assertEqual('rect1085' in empty_node_ids, True) def test_record_changes(self): new_tree = ET.parse(self.faksimile_file) old_tree = ET.parse(self.faksimile_file) empty_node_id = 'rect1085' title_node_id = 'test001' namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() } node = new_tree.xpath('//ns:rect[@id="{0}"]'.format(empty_node_id), namespaces=namespaces)[0] title = ET.SubElement(node, 'title', attrib={ 'id': title_node_id }) title.text = 'test' new_file = self.tmp_dir + sep + 'new.svg' old_file = self.tmp_dir + sep + 'old.svg' util.copy_faksimile_svg_file(target_file=new_file, faksimile_tree=new_tree) util.copy_faksimile_svg_file(target_file=old_file, faksimile_tree=old_tree) util.record_changes(old_file, new_file, [ empty_node_id ], namespaces=namespaces) test_tree = ET.parse(old_file) self.assertEqual(len(test_tree.xpath('//ns:rect[@id="{0}"]/ns:title[@id="{1}"]'.format(empty_node_id, title_node_id), namespaces=namespaces)), 1) def test_replace_chars(self): page = Page(self.xml_file) faksimile_tree = ET.parse(self.faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } word_position = WordPosition(id='rect1159', text='„Gedächtniß"') wps, texts = util.replace_chars(page.words, [ word_position ]) self.assertEqual(texts[0].endswith('“'), True) self.assertEqual(wps[0].text.endswith('“'), True) word_position = WordPosition(id='rect1173', text='-') wps, texts = util.replace_chars(page.words, [ word_position ]) self.assertEqual(wps[0].text.endswith('–'), True) def test_mismatch_words(self): page = Page(self.xml_file) faksimile_tree = ET.parse(self.faksimile_file) faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] page = Page('xml/N_VII_1_page174.xml') faksimile_tree = ET.parse('faksimile_svg/N-VII-1,173et174.svg') faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] self.assertEqual('-' in [ tp.text for tp in faksimile_page.word_positions], True) wps, texts = util.replace_chars(page.words,faksimile_page.word_positions) self.assertEqual('–' in texts, True) self.assertEqual(len([ faksimile_position for faksimile_position in wps\ if faksimile_position.text == '–' ]), 4) mismatching_words, mismatching_faksimile_positions = util.get_mismatching_ids(page.words, faksimile_page.word_positions) self.assertEqual(len([word for word in mismatching_words if word.text.endswith('“') ]), 0) self.assertEqual(len([word for word in mismatching_words if word.text.endswith('–') ]), 0) def test_process_warnings(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter('default') warnings.warn('Test1: asdf') warnings.warn('Test2: asdf') status = util.process_warnings4status(w, ['Test1', 'Test2' ], 'asdf', 'OK', status_prefix='with warnings') #print(status) self.assertTrue('Test1' in status.split(':')) self.assertTrue('Test2' in status.split(':')) @unittest.skip('test uses external program, has been tested') def test_show_files(self): list_of_files = [ self.test_dir + sep + file for file in listdir(self.test_dir) if file.endswith('pdf') ][0:2] util.ExternalViewer.show_files(single_file=self.faksimile_file, list_of_files=list_of_files) def test_record_changes_to_page(self): page = 
        util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 1 ])
        old_length = len(page.words)
        self.assertEqual(page.words[1].text, 'asdf')
        self.assertEqual(page.words[1].transkription_positions[0].width, 353)
        page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 13 ])
        self.assertEqual(page.words[13].text, 'er')
        self.assertEqual(page.words[14].text, '=')
        self.assertEqual(len(page.words), old_length+1)
        page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 64 ])
        self.assertEqual(page.words[64].text, 'Simplifications-apparat')
        self.assertEqual(len(page.words[64].transkription_positions), 3)
        self.assertEqual(len(page.words), old_length-1)

    @unittest.skipUnless(__name__ == "__main__", 'tests all words')
    def test_extended__record_changes_to_page(self):
        page = Page(self.xml_file)
        old_length = len(page.words)
        page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord)
        self.assertEqual(page.words[1].text, 'asdf')
        self.assertEqual(page.words[13].text, 'er')
        self.assertEqual(page.words[14].text, '=')
        self.assertEqual(page.words[65].text, 'Simplifications-apparat')
        self.assertEqual(len(page.words), old_length)

    def test_copy_faksimile_update_image_location(self):
        test_dir = self.tmp_dir #FAKSIMILE_LOCATION + '/Myriam/Fertig/'
        util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir)
        with self.assertWarns(UserWarning):
            util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir)

    def test_record_changes_on_xml(self):
        old_page = Page(self.xml_file)
        xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
        tree = ET.parse(xml_file)
        node = tree.xpath('//word[@id="135"]')[0]
        counter = 0
        while node.get('text') != 'gar' and counter < 5:
            counter += 1
            nextnode = node.getnext()
            node.set('text', node.get('text') + nextnode.get('text'))
            for element in nextnode.getchildren():
                node.append(element)
            nextnode.getparent().remove(nextnode)
        write_pretty(xml_element_tree=tree, file_name=xml_file,\
                script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
        new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
        self.assertEqual(len(new_page.words), len(old_page.words)-2)
        self.assertEqual(len([ word for word in new_page.words if word.text == 'gar']), 1)
        old_page = Page(self.xml_file)
        xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
        tree = ET.parse(xml_file)
        node = tree.xpath('//word[@id="138"]')[0]
        counter = 0
        while node.get('text') != 'nichtvorkommt.' and counter < 5:
            counter += 1
            nextnode = node.getnext()
            node.set('text', node.get('text') + nextnode.get('text'))
            for element in nextnode.getchildren():
                node.append(element)
            nextnode.getparent().remove(nextnode)
        node.set('split', 'nicht vorkommt.')
        write_pretty(xml_element_tree=tree, file_name=xml_file,\
                script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
        joined_page = Page(xml_file)
        self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.']), 1)
        self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.'][0].split_strings), 2)
        self.assertEqual(len(joined_page.words), len(old_page.words)-1)
        new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
        self.assertEqual(len(new_page.words), len(old_page.words))
        self.assertEqual(len([word for word in new_page.words if word.text == 'vorkommt.']), 1)
        self.assertEqual(len([word for word in old_page.words if word.text == 'nicht']),\
                len([word for word in new_page.words if word.text == 'nicht']))
        xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
        tree = ET.parse(xml_file)
        old_page = Page(xml_file)
        nodes = tree.xpath('//word[@id>="85" and @id<="87"]')
        self.assertEqual(len(nodes), 3)
        prevWordText = nodes[0].get('text')
        nodes[0].set('join', prevWordText + 'z')
        nodes[1].set('split', 'z u')
        lastWordText = nodes[2].get('text')
        nodes[2].set('join', 'u' + lastWordText)
        write_pretty(xml_element_tree=tree, file_name=xml_file,\
                script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
        joined_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
        self.assertEqual(len(joined_page.words), len(old_page.words)-1)

    def test_reset_tp_with_matrix(self):
        page = Page(self.Mp_XIV_page420)
        util.reset_tp_with_matrix(page.words[0].transkription_positions)
        self.assertTrue(page.words[0].transkription_positions[0].left > 0 and page.words[0].transkription_positions[0].top > -5)
        transformed_words = [w for w in page.words if (len(w.transkription_positions) > 0 and w.transkription_positions[0].transform is not None) ]
        util.reset_tp_with_matrix(transformed_words[0].transkription_positions)
        self.assertEqual(transformed_words[0].transkription_positions[0].left, 0)
        self.assertTrue(transformed_words[0].transkription_positions[0].top < 0)

    def test_back_up(self):
        test_dir = self.tmp_dir
        page = Page(self.xml_file)
        target_file_name = util.back_up(page, self.xml_file, bak_dir=test_dir)
        self.assertEqual(isfile(target_file_name), True)
        svg_tree = ET.parse(page.svg_file)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        util.back_up_svg_file(svg_tree, namespaces)
+        """
+        page = Page('xml/Mp_XV_page79r.xml')
+        util.back_up(page, page.xml_file)
+        """

    def tearDown(self):
        shutil.rmtree(self.tmp_dir, ignore_errors=True)
        pass

if __name__ == "__main__":
    unittest.main()
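The join/split round trip exercised by test_record_changes_on_xml relies on a small convention in the word position files: a merged word node carries the concatenated 'text' of its parts, and a 'split' attribute records where record_changes_on_xml_file_to_page should cut it apart again. A minimal stand-alone sketch of the merge step (node ids and texts here are made up):

import lxml.etree as ET

tree = ET.ElementTree(ET.fromstring('<page><word id="0" text="nicht"/><word id="1" text="vorkommt."/></page>'))
node = tree.xpath('//word[@id="0"]')[0]
nextnode = node.getnext()
# Merge the two nodes: concatenate the texts, adopt the children, drop the second node.
node.set('text', node.get('text') + nextnode.get('text'))
for element in nextnode.getchildren():
    node.append(element)
nextnode.getparent().remove(nextnode)
# Record how the merged text should be re-split on the way back.
node.set('split', 'nicht vorkommt.')
assert len(tree.xpath('//word')) == 1
assert tree.xpath('//word')[0].get('text') == 'nichtvorkommt.'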
Index: tests_svgscripts/test_simple_word.py
===================================================================
--- tests_svgscripts/test_simple_word.py	(revision 109)
+++ tests_svgscripts/test_simple_word.py	(revision 110)
@@ -1,36 +1,44 @@
import unittest
from os import sep, path
from os.path import dirname, isdir
import lxml.etree as ET
+import re
import sys
sys.path.append('svgscripts')
from datatypes.matrix import Matrix
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.simple_word import SimpleWord
from datatypes.mark_foreign_hands import MarkForeignHands
from
datatypes.word import Word class TestSimpleWord(unittest.TestCase): def test_get_semanticAndDataDict(self): dictionary = SimpleWord.get_semantic_dictionary() #print(dictionary) def test_create_cls_from_word(self): word = Word(text='test') mark = MarkForeignHands.create_cls_from_word(word) self.assertEqual(mark.text, word.text) self.assertEqual(type(mark), MarkForeignHands) def test_attach(self): word = SimpleWord() word.transkription_positions.append(TranskriptionPosition(id=0)) word.transkription_positions.append(TranskriptionPosition(id=0)) tree = ET.Element('page') word.attach_word_to_tree(tree) self.assertEqual(len(tree.xpath('//' + TranskriptionPosition.XML_TAG)), 2) + def test_clean_text(self): + word = SimpleWord() + self.assertEqual(word._create_clean_text('-asdf'), 'asdf') + self.assertEqual(word._create_clean_text('(-asdf)'), 'asdf') + self.assertEqual(word._create_clean_text('(a.)'), 'a.') + self.assertEqual(word._create_clean_text('.verhehlen'), 'verhehlen') + if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_imprint.py =================================================================== --- tests_svgscripts/test_imprint.py (revision 0) +++ tests_svgscripts/test_imprint.py (revision 110) @@ -0,0 +1,57 @@ +import unittest +from os import sep, path, remove +from os.path import isdir, isfile, dirname +import shutil +import sys +import lxml.etree as ET +import warnings +import sys + +sys.path.append('svgscripts') + +import datatypes.imprint +from datatypes.imprint import Imprint, extract_imprints, get_lines, UNITTESTING, DEBUG +from datatypes.matrix import Matrix +from datatypes.page import Page +from datatypes.positional_word_part import PositionalWordPart +from datatypes.transkriptionField import TranskriptionField + +class TestExtractImprint(unittest.TestCase): + def setUp(self): + datatypes.imprint.UNITTESTING = True + DATADIR = path.dirname(__file__) + sep + 'test_data' + self.test_page = Page.create_cls(DATADIR + sep + 'Mp_XV_page79v.xml') + self.test_page.source = self.test_page.page_tree.docinfo.URL.replace('.xml', '.svg') + + def test_extract_footnotes(self): + imprints = extract_imprints(self.test_page) + self.assertEqual(len(imprints), 4) + """ + for imprint in imprints: + print(imprint.reference, imprint.start_line, imprint.end_line) + """ + + def test_attach(self): + imprints = extract_imprints(self.test_page) + tree = ET.ElementTree(ET.Element('asdf')) + for imprint in imprints: + imprint.attach_object_to_tree(tree) + tree.xpath('//asdf')[0].set('test', 'This is a Test.') + #print(ET.dump(tree.getroot())) + + def test_init_from_node(self): + for imprint in extract_imprints(self.test_page): + imprint.attach_object_to_tree(self.test_page.page_tree) + imprints = [ Imprint.create_cls_from_node(node, self.test_page.lines) for node in self.test_page.page_tree.xpath('//' + Imprint.XML_TAG) ] + self.assertEqual(len(imprints), 4) + """ + Imprint.DEBUG = True + page = Page('xml/Mp_XV_page81v.xml') + line_list_string = '21-24,30-36,65-68' + for imprint in page.imprints: + print(imprint.reference) + for line in imprint.lines: print(line.id) + """ + +if __name__ == "__main__": + unittest.main() Index: tests_svgscripts/test_process_footnotes.py =================================================================== --- tests_svgscripts/test_process_footnotes.py (revision 109) +++ tests_svgscripts/test_process_footnotes.py (revision 110) @@ -1,46 +1,54 @@ import unittest from os import sep, path, remove from os.path import isdir, isfile, dirname import shutil import 
sys import lxml.etree as ET import warnings import sys sys.path.append('svgscripts') from datatypes.footnotes import extract_footnotes +from datatypes.imprint import Imprint from datatypes.page import Page import process_footnotes -from process_footnotes import categorize_footnotes, main +from process_footnotes import categorize_footnotes, main, save_imprints class TestExtractFootnotes(unittest.TestCase): def setUp(self): process_footnotes.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_footnote = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.test_footnote_verso = DATADIR + sep + 'N_VII_1_xp5_4_page5.svg' self.test_footnote_recto = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg' self.test_footnote_multi = DATADIR + sep + 'N_VII_1_xp5_4_page13.svg' self.test_footnote_multi_xml = DATADIR + sep + 'N_VII_1_page013.xml' self.test_categorize_footnote = DATADIR + sep + 'N_VII_1_page006.xml' def test_categorize_footnotes(self): page = Page(self.test_categorize_footnote) footnotes = extract_footnotes(page, svg_file=self.test_footnote_recto) categorize_footnotes(page, footnotes) words_with_comments = [ word for word in page.words if word.editor_comment is not None ] self.assertEqual(len(words_with_comments), 4) lines_with_comments = [ line for line in page.lines if len(line.editor_comments) > 0 ] self.assertEqual(len(lines_with_comments), 1) page = Page('xml/W_II_1_page141.xml') footnotes = extract_footnotes(page) categorize_footnotes(page, footnotes, debug=True) words_with_comments = [ word for word in page.words if word.editor_comment is not None ] + def test_save_imprints(self): + page = Page(self.test_categorize_footnote) + save_imprints(page) + self.assertEqual(len(page.page_tree.xpath('//' + Imprint.XML_TAG)), 2) + #print(ET.dump(page.page_tree.getroot())) + + def test_main(self): self.assertEqual(main(['xml/N_VII_1_page005.xml']), 0) if __name__ == "__main__": unittest.main() Index: fixes/fix_old_data.py =================================================================== --- fixes/fix_old_data.py (revision 109) +++ fixes/fix_old_data.py (revision 110) @@ -1,540 +1,551 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to fix old data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see https://www.gnu.org/licenses/. 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
sys.path.append('svgscripts')
from convert_wordPositions import HTMLConverter
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
+from datatypes.imprint import Imprint
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.matrix import Matrix
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.positional_word_part import PositionalWordPart
from datatypes.path import Path
from datatypes.svg_image import SVGImage # assumed module path; needed by _fix_old_transkription_positions below
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, update_transkription_position_ids
from join_faksimileAndTranskription import sort_words
from util import back_up, back_up_svg_file, copy_faksimile_svg_file, reset_tp_with_matrix
from process_files import update_svgposfile_status
+from process_footnotes import save_imprints
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary, get_manuscript_files

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

UNITTESTING = False
MAX_SVG_XY_THRESHOLD = 10

#TODO: fix all svg graphical files: change xlink:href to href!!!!

def convert_old_matrix(tp, xmin, ymin) -> (Matrix, float, float):
    """Return new matrix, x and y for old transkription_position.
    """
    matrix = tp.transform.clone_transformation_matrix()
    matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3)
    matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3)
    x = round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)\
            if tp.left > 0\
            else 0
    y = round((tp.height-1.5)*-1, 3)
    return matrix, x, y

def save_page(page, attach_first=False, backup=False, script_name=None):
    """Write page to xml file
    """
    if backup:
        back_up(page, page.xml_file)
    if attach_first:
        page.update_and_attach_words2tree()
    if script_name is None:
        script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}'
    write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
            script_name=script_name, file_type=FILE_TYPE_SVG_WORD_POSITION)

def page_already_changed(page) -> bool:
    """Return whether page has already been changed by a function.
    """
    return len(\
            page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}:{inspect.currentframe().f_back.f_code.co_name}"]')\
            ) > 0

def fix_faksimile_line_position(page, redo=False) -> bool:
    """Create a faksimile line position.
""" if not redo and page_already_changed(page): return False; update_faksimile_line_positions(page) if not UNITTESTING: save_page(page) return True def check_faksimile_positions(page, redo=False) -> bool: """Check faksimile line position. """ if len(page.page_tree.xpath('//data-source/@file')) > 0: svg_file = page.page_tree.xpath('//data-source/@file')[0] svg_tree = ET.parse(svg_file) positions_are_equal_counter = 0 page_changed = False for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree): if page.title == faksimile_page.title\ and page.number == faksimile_page.page_number: #print([fp.id for fp in faksimile_page.word_positions ]) for word in page.words: for fp in word.faksimile_positions: rect_fps = [ rfp for rfp in faksimile_page.word_positions if rfp.id == fp.id ] if len(rect_fps) > 0: rfp = rect_fps[0] if fp.left != rfp.left or fp.top != rfp.top: #print(f'{fp.id}: {fp.left}/{rfp.left} {fp.top}/{rfp.top}') fp.left = rfp.left fp.top = rfp.top fp.bottom = fp.top + rfp.height word.attach_word_to_tree(page.page_tree) page_changed = True else: positions_are_equal_counter += 1 print(f'{positions_are_equal_counter}/{len(page.words)} are equal') if page_changed and not UNITTESTING: save_page(page) return page_changed def fix_faksimile_positions(page, redo=False) -> bool: """Set faksimile positions to absolute values. [:return:] fixed """ if not redo and len(page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}"]')) > 0: return False x_min = page.text_field.xmin y_min = page.text_field.ymin for word in page.words: for fp in word.faksimile_positions: fp.left = fp.left + x_min fp.top = fp.top + y_min fp.bottom = fp.bottom + y_min word.attach_word_to_tree(page.page_tree) if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) return True def _fix_tp_of_word(page, word, text_field): """Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top """ for tp in word.transkription_positions: tp.left += text_field.left tp.top += text_field.top reset_tp_with_matrix(word.transkription_positions) if type(word) == Word: words_in_word = word.word_parts + [ item for item in word.__dict__.items() if type(item) == Word ] for wp in words_in_word: _fix_tp_of_word(page, wp, text_field) def fix_tp_with_matrix(page, redo=False) -> bool: """Fix transkription positions with rotation matrix ->set left to 0 and top to -5. 
    [:return:] fixed
    """
    xmin = 0 if page.svg_image is None or page.svg_image.text_field is None else page.svg_image.text_field.left
    ymin = 0 if page.svg_image is None or page.svg_image.text_field is None else page.svg_image.text_field.top
    for word in page.words:
        reset_tp_with_matrix(word.transkription_positions, tr_xmin=xmin, tr_ymin=ymin)
        for wp in word.word_parts:
            reset_tp_with_matrix(wp.transkription_positions, tr_xmin=xmin, tr_ymin=ymin)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page, attach_first=True)
    return True

def _fix_old_transkription_positions(page, redo=False) -> bool:
    """Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top
    [:return:] fixed
    """
    if page.svg_image is None\
            or page.svg_image.text_field is None:
        if page.svg_image is None:
            if page.svg_file is not None:
                transkription_field = TranskriptionField(page.svg_file)
                width = round(transkription_field.documentWidth, 3)
                height = round(transkription_field.documentHeight, 3)
                page.svg_image = SVGImage(file_name=page.svg_file, width=width,\
                        height=height, text_field=transkription_field.convert_to_text_field())
                page.svg_image.attach_object_to_tree(page.page_tree)
            else:
                raise Exception(f'ERROR page {page.page_tree.docinfo.URL} does not have a svg_file!')
        elif page.svg_image.text_field is None:
            page.svg_image.text_field = TranskriptionField(page.svg_image.file_name).convert_to_text_field()
            page.svg_image.attach_object_to_tree(page.page_tree)
        for line_number in page.line_numbers:
            line_number.top += page.svg_image.text_field.top
            line_number.bottom += page.svg_image.text_field.top
            line_number.attach_object_to_tree(page.page_tree)
        for word in page.words:
            _fix_tp_of_word(page, word, page.svg_image.text_field)
        for mark in page.mark_foreign_hands:
            _fix_tp_of_word(page, mark, page.svg_image.text_field)
        for tcm in page.text_connection_marks:
            _fix_tp_of_word(page, tcm, page.svg_image.text_field)
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, attach_first=True)
        return True
    return False

def _fix_old_pwps(page, old_tps):
    """Adjust positional_word_parts to corrected transkription_positions.
    """
    for tp in old_tps:
        for pwp in tp.xpath(f'./{PositionalWordPart.XML_TAG}'):
            left = float(pwp.get('left'))
            top = float(pwp.get('top'))
            bottom = float(pwp.get('bottom'))
            pwp.set('left', str(left + page.svg_image.text_field.left))
            pwp.set('top', str(top + page.svg_image.text_field.top))
            pwp.set('bottom', str(bottom + page.svg_image.text_field.top))
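# The shift applied in _fix_old_pwps above is a plain translation: old files
# store positional word parts relative to the text field, new files store them
# relative to 0,0. A minimal stand-alone sketch of the same arithmetic on a
# fake node (tag name and values are made up):
#
#     pwp = ET.fromstring('<positional-word-part left="10.0" top="20.0" bottom="30.0"/>')
#     text_field_left, text_field_top = 37.8, 24.5
#     pwp.set('left', str(float(pwp.get('left')) + text_field_left))     # -> '47.8'
#     pwp.set('top', str(float(pwp.get('top')) + text_field_top))        # -> '44.5'
#     pwp.set('bottom', str(float(pwp.get('bottom')) + text_field_top))  # -> '54.5'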
""" for tp in old_tps: heighest_pwp = sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('height')), reverse=True)[0] toppest_pwp = sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('top')))[0] new_height = float(tp.get('height')) + abs(float(heighest_pwp.get('top'))-float(toppest_pwp.get('top'))) tp.set('height', str(new_height)) def fix_transkription_positions(page, redo=False) -> bool: """Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top [:return:] fixed """ THRESHOLD = 10 if page.svg_image is not None\ and page.svg_image.text_field is None: if not _fix_old_transkription_positions(page): return False _fix_old_pwps(page, [ pwp.getparent() for pwp in page.page_tree.xpath(f'//{PositionalWordPart.XML_TAG}[@id="0"]')\ if abs(float(pwp.get('left')) - float(pwp.getparent().get('left'))) > THRESHOLD ]) _fix_quotation_mark_tps(page, [ tp for tp in page.page_tree.xpath(f'//{TranskriptionPosition.XML_TAG}')\ if len(tp.xpath(f'./{PositionalWordPart.XML_TAG}')) > 0\ and sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('height')), reverse=True)[0]\ != sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('top')))[0] ]) if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') save_page(page) return True def fix_styles(page, redo=False): """Remove unused styles from tree. """ if len(page.page_tree.xpath('//style')) > 1: for node in page.page_tree.xpath('//style')[1:]: node.getparent().remove(node) if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') save_page(page) return True +def fix_imprints(page, redo=False): + """Remove unused styles from tree. + """ + if len(page.page_tree.xpath('//' + Imprint.XML_TAG)) == 0: + save_imprints(page) + return True + def merge_transkription_positions(page, redo=False) -> bool: """Fix transkription positions of merged words [:return:] fixed """ if not isdir(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR)\ or not isfile(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)): return False merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)) sync_dictionary = sync_words_linewise(merged_page.words, page.words, merged_page.line_numbers) words = [] for source_word in merged_page.words: words.append(source_word) if bool(sync_dictionary.get(source_word)): _sync_transkriptions_with_words(source_word, sync_dictionary) if source_word.text != ''.join([ t.get_text() for t in source_word.transkription_positions ]): text = ''.join([ t.get_text() for t in source_word.transkription_positions ]) print(f'{source_word.line_number}: {source_word.text} has transkription_positions with text "{text}".') response = input('Change? 
            if not response.startswith('n'):
                new_sync_dictionary = sync_words_linewise(merged_page.words, page.words,\
                        [ line for line in merged_page.line_numbers if line.id == source_word.line_number ], force_sync_on_word=source_word)
                if bool(new_sync_dictionary.get(source_word)):
                    _sync_transkriptions_with_words(source_word, new_sync_dictionary)
                else:
                    raise Exception(f'Could not find source_word {source_word.text} in {new_sync_dictionary}!')
    page.words = words
    page.update_and_attach_words2tree()
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page)
    return True

def fix_graphical_svg_file(page, redo=False) -> bool:
    """Fix glyphs of word for which there is a /changed-word in page.page_tree
    """
    svg_tree = ET.parse(page.svg_file)
    transkription_field = TranskriptionField(page.source)
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    back_up_svg_file(svg_tree, namespaces=namespaces)
    tr_xmin = transkription_field.xmin if (page.svg_image is None or page.svg_image.text_field is None) else 0
    tr_ymin = transkription_field.ymin if (page.svg_image is None or page.svg_image.text_field is None) else 0
    for deleted_word_node in page.page_tree.xpath('//deleted-word'):
        deleted_word = Word.create_cls(deleted_word_node)
        _run_function_on_nodes_for_word(svg_tree, namespaces, deleted_word, tr_xmin, tr_ymin, _set_node_attribute_to, 'visibility', 'hidden')
    for changed_word_node in page.page_tree.xpath('//changed-word'):
        changed_word = Word.create_cls(changed_word_node)
        try:
            word = [ word for word in page.words if word.id == changed_word.id and word.text == changed_word.text ][0]
            left_difference = word.transkription_positions[0].left - changed_word.transkription_positions[0].left
            _run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin, _add_value2attribute, 'x', left_difference)
        except IndexError:
            warnings.warn(f'There is no word for changed_word {changed_word.id}: "{changed_word.text}" in {page.page_tree.docinfo.URL}!')
    copy_faksimile_svg_file(target_file=page.svg_file, faksimile_tree=svg_tree, namespaces=namespaces)
    return True

def _add_value2attribute(node, attribute, value):
    """Add left_difference to x of node.
    """
    node.set(attribute, str(float(node.get(attribute)) + value))
    node.set('changed', 'true')

def _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=0.1) -> list:
    """Return nodes with symbol_id and x = svg_x and y = svg_y.
    """
    nodes = [ node for node in svg_tree.xpath(\
            f'//ns:use[@xlink:href="#{symbol_id}" and @x > {svg_x-threshold} and @x < {svg_x+threshold} and @y > {svg_y-threshold} and @y < {svg_y+threshold} ]',\
            namespaces=namespaces) if not bool(node.get('changed')) ]
    if len(nodes) == 0 and threshold < MAX_SVG_XY_THRESHOLD:
        return _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=threshold+1)
    return nodes

def _run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin, function_on_node, attribute, value):
    """Run function on nodes for words.
    """
    for tp in word.transkription_positions:
        for pwp in tp.positional_word_parts:
            symbol_id = pwp.symbol_id
            svg_x = pwp.left + tr_xmin
            svg_y = pwp.bottom + tr_ymin
            nodes = _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y)
            if len(nodes) > 0:
                node = nodes[0]
                function_on_node(node, attribute, value)

def _set_node_attribute_to(node, attribute, value):
    """Set attribute of node to value.
    """
    node.set(attribute, str(value))
    node.set('changed', 'true')
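# _get_nodes_with_symbol_id above widens its search window when nothing is
# found: it starts with a threshold of 0.1 and retries with threshold+1 until
# MAX_SVG_XY_THRESHOLD (10) is reached. A minimal sketch of that back-off, with
# the xpath lookup abstracted into a hypothetical match(threshold) callable:
def _example_threshold_backoff(match, threshold=0.1, max_threshold=MAX_SVG_XY_THRESHOLD):
    """Illustrative only: return the first non-empty result of match(threshold),
    widening the threshold by 1 on each retry up to max_threshold.
    """
    nodes = match(threshold)
    if len(nodes) == 0 and threshold < max_threshold:
        return _example_threshold_backoff(match, threshold=threshold+1, max_threshold=max_threshold)
    return nodes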
""" node.set(attribute, str(value)) node.set('changed', 'true') def sync_words_linewise(source_words, target_words, lines, force_sync_on_word=None) -> dict: """Sync words an create a dictionary with source_words as keys, refering to a list of corresponding words. """ result_dict = {} for word in target_words + source_words: word.processed = False for line in lines: source_words_on_line = sorted([ word for word in source_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left) target_words_on_line = sorted([ word for word in target_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left) if len(target_words_on_line) == len(source_words_on_line): _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word) elif len(source_words_on_line) < len(target_words_on_line): _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word) else: print('okey dokey') return result_dict def _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict): """Force sync on word. """ unprocessed_target_words = [t_word for t_word in target_words_on_line if not t_word.processed] if len(unprocessed_target_words) > 0: print([ (i, t_word.text) for i, t_word in enumerate(unprocessed_target_words)]) response = input(f'Please specify indices of words to sync {force_sync_on_word.text} with: [default:0-{len(unprocessed_target_words)-1}]>') indices = [ i for i in range(0, len(unprocessed_target_words)) ] if re.match(r'\d+-\d+', response): index_strings = response.split('-') indices = [ i for i in range(int(index_strings[0]), int(index_strings[1])+1) ] elif response != '': indices = [ int(i) for i in response.split(' ') ] target_words = [] for i in indices: target_words.append(unprocessed_target_words[i]) result_dict.update({ force_sync_on_word: target_words }) else: raise Exception(f'There are no unprocessed target_words for {force_sync_on_word.text} on line {force_sync_on_word.line_number}!') def _sync_transkriptions_with_words(word, sync_dictionary): """Sync transkription_positions of word with syncronized words. """ word.transkription_positions = [] for target_word in sync_dictionary[word]: word.transkription_positions += target_word.transkription_positions def _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None): """Sync if there are more target words. 
""" current_source_word = None for target_word in target_words_on_line: if current_source_word is not None\ and current_source_word.text.startswith(''.join([ w.text for w in result_dict[current_source_word]]) + target_word.text): result_dict[current_source_word].append(target_word) target_word.processed = True if current_source_word.text == ''.join([ w.text for w in result_dict[current_source_word]]): current_source_word = None elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ]) > 0: source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ][0] target_word.processed = True source_word.processed = True result_dict.update({ source_word: [ target_word ] }) elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ]) > 0: current_source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ][0] current_source_word.processed = True target_word.processed = True result_dict.update({ current_source_word: [ target_word ] }) else: msg = f'On line {target_word.line_number}: target_word "{target_word.text}" does not have a sibling in {[ s.text for s in source_words_on_line if not s.processed ]}' warnings.warn(msg) if force_sync_on_word is not None: _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict) def _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None): """Sync same length """ for i, word in enumerate(source_words_on_line): if word.text == target_words_on_line[i].text: word.processed = True target_words_on_line[i].processed = True result_dict.update({ word: [ target_words_on_line[i] ] }) elif len([ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ]) > 0: target_word = [ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ][0] word.processed = True target_word.processed = True result_dict.update({ word: [ target_word ] }) else: msg = f'On line {word.line_number}: source_word "{word.text}" does not have a sibling in {[ s.text for s in target_words_on_line]}' warnings.warn(msg) if force_sync_on_word is not None: _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to fix old data. svgscripts/fix_old_data.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help -c|--check-faksimile-positions check whether faksimile positions have been updated + -i|--fix-imprints add imprints to page -l|--faksimile-line-position create faksimile line positions -p|--faksimile-positions fix old faksimile positions -r|--redo rerun -s|--fix-graphical-svg fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file. -S|--fix-styles fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file. 
-t|--transkription-positions fix old transkription positions -M|--matrix fix old transkription positions with transform matrix :return: exit code (int) """ function_list = [] function_dict = create_function_dictionary(['-c', '--check-faksimile-positions'], check_faksimile_positions) function_dict = create_function_dictionary(['-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict) function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict) function_dict = create_function_dictionary(['-m', '--merge-positions'], merge_transkription_positions, function_dictionary=function_dict) function_dict = create_function_dictionary(['-s', '--fix-graphical-svg'], fix_graphical_svg_file, function_dictionary=function_dict) function_dict = create_function_dictionary(['-M', '--matrix'], fix_tp_with_matrix, function_dictionary=function_dict) function_dict = create_function_dictionary(['-t', '--transkription-positions'], fix_transkription_positions, function_dictionary=function_dict) - function_dict = create_function_dictionary(['default', '-S', '--fix-styles'], fix_styles, function_dictionary=function_dict) + function_dict = create_function_dictionary(['-S', '--fix-styles'], fix_styles, function_dictionary=function_dict) + function_dict = create_function_dictionary(['default', '-i', '--fix-imprints'], fix_imprints, function_dictionary=function_dict) redo = False; try: - opts, args = getopt.getopt(argv, "hcplrmsStM", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position",\ - "redo", "merge-positions", "fix-graphical-svg", "fix-styles", "transkription-positions", 'matrix' ]) + opts, args = getopt.getopt(argv, "hcplrmsStMi", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position",\ + "redo", "merge-positions", "fix-graphical-svg", "fix-styles", "transkription-positions", 'matrix', 'fix-imprints' ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-r', '--redo'): redo = True; elif opt in function_dict.keys(): function_list.append(function_dict[opt]) if len(function_list) == 0: function_list.append(function_dict['default']) if len(args) < 1: usage() return 2 exit_status = 0 for xml_file in get_manuscript_files(args): if isfile(xml_file): counters = { f.__name__: 0 for f in function_list } for current_function in function_list: status_contains = STATUS_MERGED_OK if 'faksimile' in current_function.__name__ else 'OK' for page in Page.get_pages_from_xml_file(xml_file, status_contains=status_contains): if not UNITTESTING: print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' 
+ Style.RESET_ALL) back_up(page, page.xml_file) counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0 if not UNITTESTING: for function_name, counter in counters.items(): print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]') else: raise FileNotFoundError('File {} does not exist!'.format(xml_file)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: py2ttl/py2ttl_ontology.py =================================================================== --- py2ttl/py2ttl_ontology.py (revision 109) +++ py2ttl/py2ttl_ontology.py (revision 110) @@ -1,369 +1,371 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert py classes that are subclasses of class_spec.SemanticClass to a owl ontology in turtle format. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} +from datetime import datetime import getopt import importlib import importlib.util import inspect import lxml.etree as ET from os import sep, path, listdir from os.path import isfile, isdir, dirname, basename from progress.bar import Bar from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD import re import requests import sys import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from class_spec import SemanticClass, UnSemanticClass from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL, PROJECT_ONTOLOGY_FILE_URL from data_handler import RDFDataHandler sys.path.append('shared_util') from myxmlwriter import dict2xml __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class Py2TTLOntologyConverter: """This class can be used convert semantic_dictionaries to a owl ontology in turtle format. 
""" UNITTESTING = False INFERRED_SUB_CLASS = RDFS.subClassOf * '*' def __init__(self, project_ontology_file=None): self.class_uri_dict = {} self.uri_mapping4cls_and_properties = {} self.project_graph = Graph() self.base_uriref = URIRef(PROJECT_URL) self.project_name = PROJECT_NAME self.ns = { self.base_uriref + '#': self.project_name } if project_ontology_file is not None and isfile(project_ontology_file): if project_ontology_file == PROJECT_ONTOLOGY_FILE: r = requests.get(PROJECT_ONTOLOGY_FILE_URL) with open(project_ontology_file, 'wb') as f: f.write(r.content) print(f'{project_ontology_file} updated from github repository') self.project_graph.parse(project_ontology_file, format="turtle") if len(self.project_graph) > 0: self.base_uriref = self.project_graph.value(predicate=RDF.type, object=OWL.Ontology, any=False) self.ns = { uriref: ns for ns, uriref in self.project_graph.namespace_manager.namespaces() } self.project_name = self.ns.get(self.base_uriref + '#') self.project_graph.bind(self.project_name, self.base_uriref + '#') + self.project_graph.add((self.base_uriref, OWL.versionInfo, Literal(datetime.now().strftime('%Y-%m-%d')))) self.uri_mapping4cls_and_properties.update({ 'ontology': { 'project_name': self.project_name, 'project_uri': self.base_uriref + '#' }}) self.uri_mapping4cls_and_properties.update({ 'classes': {} }) def addClass2Graph(self, cls, semantic_dict=None) -> (URIRef, type): """Add a class to project_graph. :return: (cls_uri (URIRef), super_cls (cls)) """ if semantic_dict is None: semantic_dict = cls.get_semantic_dictionary() comment, label = self.get_comment_label(cls) cls_uri = URIRef(self.base_uriref + '#' + cls.__name__) self.project_graph.add((cls_uri, RDF.type, OWL.Class)) self.project_graph.add((cls_uri, RDFS.isDefinedBy, self.base_uriref)) if comment != '': self.project_graph.add((cls_uri, RDFS.comment, Literal(comment, lang='en'))) if label != '': self.project_graph.add((cls_uri, RDFS.label, Literal(label, lang='en'))) super_uri = None super_cls = None if bool(semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.TYPE)): super_cls = semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.TYPE) super_uri = self.createClassAndProperties(super_cls) if super_uri is not None: self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri)) if SemanticClass.SUBCLASS_OF in semantic_dict[SemanticClass.CLASS_KEY].keys()\ and len(semantic_dict[SemanticClass.CLASS_KEY][SemanticClass.SUBCLASS_OF]) > 0: for super_uri_string in semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.SUBCLASS_OF): super_uri = URIRef(super_uri_string) if not (cls_uri, self.INFERRED_SUB_CLASS, super_uri) in self.project_graph: self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri)) return cls_uri, super_cls def addProperty2Graph(self, property_uri, domain_uri, range_uri, info_dict, property_type=OWL.ObjectProperty): """Add a property to self.project_graph. 
""" label = 'has ' + property_uri.split('#')[1].replace('has','')\ if SemanticClass.PROPERTY_LABEL not in info_dict.keys() else info_dict[SemanticClass.PROPERTY_LABEL] self.project_graph.add((property_uri, RDF.type, property_type)) self.project_graph.add((property_uri, RDFS.isDefinedBy, self.base_uriref)) self.project_graph.add((property_uri, RDFS.domain, domain_uri)) self.project_graph.add((property_uri, RDFS.range, range_uri)) if SemanticClass.PROPERTY_COMMENT in info_dict.keys(): comment = info_dict[SemanticClass.PROPERTY_COMMENT] self.project_graph.add((property_uri, RDFS.comment, Literal(comment, lang='en'))) self.project_graph.add((property_uri, RDFS.label, Literal(label, lang='en'))) if SemanticClass.CARDINALITY in info_dict.keys()\ and info_dict[SemanticClass.CARDINALITY] > 0: self.addRestriction2Class(domain_uri, property_uri, info_dict) def addRestriction2Class(self, cls_uri, property_uri, info_dict): """Adds restriction on property_uri to class cls_uri. """ if SemanticClass.CARDINALITY in info_dict.keys()\ and info_dict[SemanticClass.CARDINALITY] > 0: if (cls_uri, None, None) not in self.project_graph: warnings.warn('{} not in graph!'.format(cls_uri)) restriction = BNode() cardinality_restriction = URIRef(OWL + info_dict[SemanticClass.CARDINALITY_RESTRICTION])\ if SemanticClass.CARDINALITY_RESTRICTION in info_dict.keys()\ else OWL.cardinality cardinality = info_dict[SemanticClass.CARDINALITY] self.project_graph.add((cls_uri, RDFS.subClassOf, restriction)) self.project_graph.add((restriction, RDF.type, OWL.Restriction)) self.project_graph.add((restriction, OWL.onProperty, property_uri)) self.project_graph.add((restriction, cardinality_restriction, Literal(str(cardinality), datatype=XSD.nonNegativeInteger))) def create_ontology(self, datatypes_dir, target_ontology_file): """Convert all classes contained in datatypes_dir that are subclasses of class_spec.SemanticClass to rdf. :return: exit code (int) """ if isdir(datatypes_dir): semantic_classes = self.get_semantic_classes(datatypes_dir) if not Py2TTLOntologyConverter.UNITTESTING: bar = Bar('creating classes and properties', max=len(semantic_classes)) for cls in semantic_classes: self.createClassAndProperties(cls) not bool(Py2TTLOntologyConverter.UNITTESTING) and bar.next() not bool(Py2TTLOntologyConverter.UNITTESTING) and bar.finish() self.uri_mapping4cls_and_properties['ontology'].update({'ontology_file': target_ontology_file}) f = open(target_ontology_file, 'wb+') f.write(self.project_graph.serialize(format="turtle")) f.close() if not Py2TTLOntologyConverter.UNITTESTING: xml_file = 'mapping_file4' + datatypes_dir.replace(sep, '.') + '2' + target_ontology_file.replace('.' + sep, '').replace(sep, '.').replace('.ttl', '.xml') dict2xml(self.uri_mapping4cls_and_properties, xml_file) else: print('Error: dir {} does not exist!'.format(datatypes_dir)) usage return 1 return 0 def createClassAndProperties(self, cls): """Creates a owl:Class and some owl:ObjectProperty from semantic_dictionary of a python class. 
""" if not cls.__name__ in self.class_uri_dict: self.class_uri_dict.update({cls.__name__: cls}) semantic_dict = cls.get_semantic_dictionary() cls_uri, super_cls = self.addClass2Graph(cls, semantic_dict) uri_mapping4properties = {} for property_key in self._get_semantic_dictionary_keys_super_first(semantic_dict['properties']): super_semantic_dict = {} if super_cls is None else super_cls.get_semantic_dictionary() if len(super_semantic_dict) == 0 or not bool(super_semantic_dict['properties'].get(property_key)): property_dict4key = semantic_dict['properties'].get(property_key) property_cls = property_dict4key.get('class') subject_uri, property_uri = self.createProperty(cls_uri, property_key, property_cls, property_dict4key) uri_mapping4properties.update({ property_key: property_uri }) elif bool(self.uri_mapping4cls_and_properties.get('classes').get(super_cls.__name__).get('properties').get(property_key)): property_uri = self.uri_mapping4cls_and_properties['classes'][super_cls.__name__]['properties'][property_key] uri_mapping4properties.update({ property_key: property_uri}) self.uri_mapping4cls_and_properties.get('classes').update({ cls.__name__: { 'class_uri': cls_uri, 'properties': uri_mapping4properties }}) return URIRef(self.base_uriref + '#' + cls.__name__) def createProperty(self, domain_uri, property_name, range_cls, info_dict) -> (URIRef, URIRef): """Creates a owl:ObjectProperty. :return: tuple of domain_uri (rdflib.URIRef) and property_uri (rdflib.URIRef) of created property """ name = self.createPropertyName(property_name=property_name)\ if SemanticClass.PROPERTY_NAME not in info_dict.keys() else info_dict[SemanticClass.PROPERTY_NAME] property_uri = URIRef(self.base_uriref + '#' + name) inferredSubClass = RDFS.subClassOf * '*' range_uri = URIRef(self.base_uriref + '#' + range_cls.__name__) super_property_uri = None if SemanticClass.SUBPROPERTYOF in info_dict.keys(): super_property_uri = URIRef(info_dict[SemanticClass.SUBPROPERTYOF]) elif SemanticClass.SUPER_PROPERTY in info_dict.keys(): domain_uri, super_property_uri = self.createProperty(domain_uri,\ info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME),\ range_cls, info_dict[SemanticClass.SUPER_PROPERTY]) if (property_uri, None, None) not in self.project_graph: property_type = OWL.ObjectProperty if range_cls.__module__ == 'builtins': if range_cls != list: property_type = OWL.DatatypeProperty range_uri = RDFDataHandler.SIMPLE_DATA_TYPE_MAPPING.get(range_cls) if range_uri == XSD.string and property_name == 'URL': range_uri = XSD.anyURI self.addProperty2Graph(property_uri, domain_uri, range_uri, info_dict, property_type=property_type) elif not True in [\ (domain_uri, inferredSubClass, o) in self.project_graph\ for o in self.project_graph.objects(property_uri, RDFS.domain)\ ]: # if domain_uri is NOT a subclass of a cls specified by RDFS.domain if SemanticClass.CARDINALITY in info_dict.keys()\ and info_dict[SemanticClass.CARDINALITY] > 0: self.addRestriction2Class(domain_uri, property_uri, info_dict) self.project_graph.add((property_uri, RDFS.domain, domain_uri)) if super_property_uri is not None\ and (property_uri, RDFS.subPropertyOf, super_property_uri) not in self.project_graph: self.project_graph.add((property_uri, RDFS.subPropertyOf, super_property_uri)) return domain_uri, property_uri def createPropertyName(self, property_name=None, subject_uri=None, object_uri=None, connector='BelongsTo', prefix='has'): """Returns a property name. 
""" if property_name is not None: property_name = ''.join([ property_name.split('_')[0].lower() ] + [ text.capitalize() for text in property_name.split('_')[1:] ]) return prefix + property_name[0].upper() + property_name[1:] if property_name[0].islower()\ else prefix + property_name elif subject_uri is not None: property_name = subject_uri.split('#')[1] + self.createPropertyName(object_uri=object_uri, prefix=connector) return property_name[0].lower() + property_name[1:] elif object_uri is not None: return prefix + object_uri.split('#')[1] else: return prefix def get_comment_label(self, cls): """Returns comment and label from cls __doc__. """ comment = cls.__doc__.replace('\n','').lstrip() label = cls.__name__ if '.' in cls.__doc__: comment = [ text for text in cls.__doc__.split('\n') if text != '' ][0].lstrip() if '@label' in cls.__doc__: m = re.search('(@label[:]*\s)(.*[\.]*)', cls.__doc__) label_tag, label = m.groups() elif re.search('([A-Z][a-z]+)', label): m = re.search('([A-Z]\w+)([A-Z]\w+)', label) label = ' '.join([ text.lower() for text in re.split(r'([A-Z][a-z]+)', label) if text != '' ]) return comment, label def get_semantic_classes(self, datatypes_dir): """Returns a list of all classes that are contained in datatypes_dir that are subclasses of class_spec.SemanticClass. :return: a list of (str_name, class) """ base_dir = dirname(dirname(__file__)) sys.path.append(base_dir) root_modul_name = datatypes_dir.replace('/','.') files = [ file.replace('.py','') for file in listdir(datatypes_dir) if file.endswith('.py') and not file.startswith('test_') and not file.startswith('_')] all_modules = [] for name in files: all_modules.append(importlib.import_module('{}.{}'.format(root_modul_name, name))) all_classes = [] for modul in all_modules: all_classes += inspect.getmembers(modul, inspect.isclass) #all_classes = sorted(set(all_classes)) all_classes = sorted(set(all_classes), key=lambda current_class: current_class[0]) semantic_classes = [ cls for name, cls in all_classes if issubclass(cls, SemanticClass)\ and not issubclass(cls, UnSemanticClass)\ and not (cls == SemanticClass)] return semantic_classes def _get_builtin_cls_keys(self, property_dict): """Returns a list of keys for classes that are builtin. """ builtin_cls_keys = [] for key in property_dict.keys(): property_cls = property_dict.get(key).get('class')\ if type(property_dict.get(key)) is dict\ else property_dict.get(key)[0] if type(property_cls) != dict\ and property_cls.__module__ == 'builtins': builtin_cls_keys.append(key) return builtin_cls_keys def _get_semantic_dictionary_keys_super_first(self, property_dict): """Sorts the keys of the property part of a semantic dictionary and returns the keys for super classes before keys of subclasses. :return: a sorted list of keys. 
""" builtin_cls_keys = self._get_builtin_cls_keys(property_dict) complex_cls_keys = [] for key in [ key for key in property_dict.keys()\ if key not in builtin_cls_keys ]: current_cls = property_dict.get(key).get('class') key_inserted = False for index, cls_key in enumerate(complex_cls_keys): potential_sub_cls = property_dict.get(cls_key).get('class') if issubclass(potential_sub_cls, current_cls): complex_cls_keys.insert(index, key) key_inserted = True break if not key_inserted: complex_cls_keys.append(key) return builtin_cls_keys + complex_cls_keys def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert py classes that are subclasses of class_spec.SemanticClass to owl:Class and its properties to owl:ObjectProperty. py2ttl/py2ttl_ontology.py [OPTIONS ] [optional] directory containing datatypes that are subclasses of class_spec.SemanticClass. Overwrites DATATYPES_DIR in py2ttl/config.py. OPTIONS: -h|--help: show help -s|--source=source_ontology_file source ontology ttl file, option overwrites PROJECT_ONTOLOGY_FILE in py2ttl/config.py -t|--target=target_ontology_file target ontology ttl file, default: 'PROJECT_PREFIX-ontology_autogenerated.ttl' :return: exit code (int) """ check_config_files_exist() datatypes_dir = get_datatypes_dir() source_ontology_file = PROJECT_ONTOLOGY_FILE target_ontology_file = '' try: opts, args = getopt.getopt(argv, "hs:t:", ["help","source=", "target="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-t', '--target'): target_ontology_file = arg elif opt in ('-s', '--source'): source_ontology_file = arg converter = Py2TTLOntologyConverter(project_ontology_file=source_ontology_file) if len(args) > 0: datatypes_dir = args[0] if target_ontology_file == '': target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, converter.project_name) return converter.create_ontology(datatypes_dir, target_ontology_file) if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: py2ttl/xml_conform_dictionary.py =================================================================== --- py2ttl/xml_conform_dictionary.py (revision 109) +++ py2ttl/xml_conform_dictionary.py (revision 110) @@ -1,121 +1,124 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This represents a xml conform dictionary of data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import inspect import re import warnings from class_spec import SemanticClass class XMLConformDictionary: """ This represents a xml conform dictionary of data. 
""" def __init__(self): self.attachables = [] self.builtins = {} self.builtin_list = {} def attach_data_to_tree(self, node): """Attach data to node """ for xml_key, value in self.builtins.items(): node.set(xml_key, value) for xml_key, value_list in self.builtin_list.items(): node.set(xml_key, ' '.join([ str(i) for i in value_list])) for attachable in self.attachables: attachable.attach_object_to_tree(node) @classmethod def create_cls_from_data_object(cls, data_object): """Create a XMLConformDictionary. """ if not issubclass(type(data_object), SemanticClass): msg = f'{type(data_object)} is not a subclass of {SemanticClass}' raise TypeError(msg) property_d = data_object.get_semantic_dictionary()[data_object.PROPERTIES_KEY] xml_d = cls() for key in property_d.keys(): value = data_object.__dict__.get(key) if value is not None and (type(value) != list or len(value) > 0): semantic_type = property_d[key][data_object.CLASS_KEY]\ if type(property_d[key]) is dict\ else property_d[key][0] if type(value) != list and semantic_type.__module__ == 'builtins': if semantic_type == bool: + if value != True and value != False: + msg = f'Value "{value}" for key "{key}" is not of type "bool"' + raise TypeError(msg) xml_d.builtins.update({key.replace('_','-'): str(value).lower()}) else: xml_d.builtins.update({key.replace('_','-'): str(value)}) elif semantic_type.__module__ != 'builtins': if type(value) != list: xml_d.attachables.append(value) else: for item in value: xml_d.attachables.append(item) else: xml_d.builtin_list.update({key.replace('_','-'): value}) return xml_d @staticmethod def CREATE_INSTANCEOF_CLASS_FROM_NODE(semantic_class, node): """Create a instance of semantic_class from node. """ if not issubclass(semantic_class, SemanticClass): msg = f'{semantic_class} is not a subclass of {SemanticClass}' raise TypeError(msg) property_d = semantic_class.get_semantic_dictionary()[semantic_class.PROPERTIES_KEY] class_instance = semantic_class() for key in property_d.keys(): semantic_type = property_d[key][semantic_class.CLASS_KEY]\ if type(property_d[key]) is dict\ else property_d[key][0] if semantic_type.__module__ == 'builtins' and semantic_type != list: value = node.get(key.replace('_','-')) if semantic_type == bool: class_instance.__dict__.update({key: (value == 'true')}) elif semantic_type != str: if re.match(r'(.*)(\s)', value): class_instance.__dict__.update({key: [ semantic_type(item) for item in value.split(' ')] }) else: class_instance.__dict__.update({key: semantic_type(value)}) else: class_instance.__dict__.update({key: value}) else: attachables = [] for sub_node in node.xpath(semantic_type.XML_TAG): sub_instance = semantic_type.create_cls_from_node(sub_node)\ if 'create_cls_from_node' in semantic_type.__dict__\ else XMLConformDictionary.CREATE_INSTANCEOF_CLASS_FROM_NODE(semantic_type, sub_node) attachables.append(sub_instance) if len(attachables) > 0: if len(attachables) > 1: class_instance.__dict__.update({key: attachables}) else: class_instance.__dict__.update({key: attachables[0]}) return class_instance Index: tests_py2ttl/test_data/mapping_dict.xml =================================================================== --- tests_py2ttl/test_data/mapping_dict.xml (revision 109) +++ tests_py2ttl/test_data/mapping_dict.xml (revision 110) @@ -1,391 +1,408 @@ tln http://www.nie.org/ontology/nietzsche# ./tln-ontology_autogenerated.ttl http://www.nie.org/ontology/nietzsche#ManuscriptUnity http://www.nie.org/ontology/nietzsche#hasTitle http://www.nie.org/ontology/nietzsche#hasManuscriptType 
Index: tests_py2ttl/test_data/mapping_dict.xml
===================================================================
--- tests_py2ttl/test_data/mapping_dict.xml (revision 109)
+++ tests_py2ttl/test_data/mapping_dict.xml (revision 110)
@@ -1,391 +1,408 @@
tln
http://www.nie.org/ontology/nietzsche#
./tln-ontology_autogenerated.ttl
http://www.nie.org/ontology/nietzsche#ManuscriptUnity
    http://www.nie.org/ontology/nietzsche#hasTitle
    http://www.nie.org/ontology/nietzsche#hasManuscriptType
    http://www.nie.org/ontology/nietzsche#hasPages
    http://www.nie.org/ontology/nietzsche#hasDescription
http://www.nie.org/ontology/nietzsche#ArchivalManuscriptUnity
    http://www.nie.org/ontology/nietzsche#hasTitle
    http://www.nie.org/ontology/nietzsche#hasManuscriptType
    http://www.nie.org/ontology/nietzsche#hasPages
    http://www.nie.org/ontology/nietzsche#hasStyles
+   http://www.nie.org/ontology/nietzsche#hasGsaSignature
    http://www.nie.org/ontology/nietzsche#hasDescription
    http://www.nie.org/ontology/nietzsche#partsBelongToReconstructedKonvolut
    http://www.nie.org/ontology/nietzsche#hasEarlierDescriptions
http://www.nie.org/ontology/nietzsche#EditorComment
    http://www.nie.org/ontology/nietzsche#isUncertain
    http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#AtypicalWriting
    http://www.nie.org/ontology/nietzsche#isUncertain
    http://www.nie.org/ontology/nietzsche#hasComment
    http://www.nie.org/ontology/nietzsche#atypicalWritingHasText
http://www.nie.org/ontology/nietzsche#Path
    http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#Box
    http://www.nie.org/ontology/nietzsche#hasDAttribute
    http://www.nie.org/ontology/nietzsche#hasEarlierText
http://www.nie.org/ontology/nietzsche#Clarification
    http://www.nie.org/ontology/nietzsche#isUncertain
    http://www.nie.org/ontology/nietzsche#hasComment
    http://www.nie.org/ontology/nietzsche#clarificationHasText
http://www.nie.org/ontology/nietzsche#Color
    http://www.nie.org/ontology/nietzsche#colorHasName
    http://www.nie.org/ontology/nietzsche#hasHexadecimalValue
http://www.nie.org/ontology/nietzsche#Text
    http://www.nie.org/ontology/nietzsche#textHasContent
    http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#Description
    http://www.nie.org/ontology/nietzsche#textHasContent
    http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#EarlierDescription
    http://www.nie.org/ontology/nietzsche#textHasContent
    http://www.nie.org/ontology/nietzsche#hasAuthor
    http://www.nie.org/ontology/nietzsche#hasCitation
    http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#EditorCorrection
    http://www.nie.org/ontology/nietzsche#isUncertain
    http://www.nie.org/ontology/nietzsche#hasComment
    http://www.nie.org/ontology/nietzsche#hasCorrectionText
http://www.nie.org/ontology/nietzsche#Image
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasFileName
    http://www.nie.org/ontology/nietzsche#hasTransform
    http://www.nie.org/ontology/nietzsche#hasPrimaryurl
    http://www.nie.org/ontology/nietzsche#hasSecondaryurl
    http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#FaksimileImage
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasFileName
    http://www.nie.org/ontology/nietzsche#hasTransform
    http://www.nie.org/ontology/nietzsche#hasPrimaryurl
    http://www.nie.org/ontology/nietzsche#hasSecondaryurl
    http://www.nie.org/ontology/nietzsche#hasApiurl
    http://www.nie.org/ontology/nietzsche#hasThumburl
    http://www.nie.org/ontology/nietzsche#hasMediumurl
    http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#PositionalObject
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#WordPosition
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#FaksimilePosition
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
+
+http://www.nie.org/ontology/nietzsche#Imprint
+
+   http://www.nie.org/ontology/nietzsche#imprintHasReference
+   http://www.nie.org/ontology/nietzsche#imprintRefersToLines
+
+
http://www.nie.org/ontology/nietzsche#Line
    http://www.nie.org/ontology/nietzsche#lineHasNumber
    http://www.nie.org/ontology/nietzsche#lineHasBottomValueOnTranskription
    http://www.nie.org/ontology/nietzsche#lineHasTopValueOnTranskription
    http://www.nie.org/ontology/nietzsche#lineHasInnerBottomValueOnFaksimile
    http://www.nie.org/ontology/nietzsche#lineHasInnerTopValueOnFaksimile
    http://www.nie.org/ontology/nietzsche#lineHasOuterBottomValueOnFaksimile
    http://www.nie.org/ontology/nietzsche#lineHasOuterTopValueOnFaksimile
    http://www.nie.org/ontology/nietzsche#isMainLine
    http://www.nie.org/ontology/nietzsche#lineHasEditorComment
http://www.nie.org/ontology/nietzsche#LineContinuation
    http://www.nie.org/ontology/nietzsche#isUncertain
    http://www.nie.org/ontology/nietzsche#hasComment
    http://www.nie.org/ontology/nietzsche#isLineAContinuationTo
    http://www.nie.org/ontology/nietzsche#lineContinuationHasReference
http://www.nie.org/ontology/nietzsche#SimpleWord
    http://www.nie.org/ontology/nietzsche#hasText
+   http://www.nie.org/ontology/nietzsche#hasCleanText
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#SpecialWord
    http://www.nie.org/ontology/nietzsche#hasText
+   http://www.nie.org/ontology/nietzsche#hasCleanText
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#MarkForeignHands
    http://www.nie.org/ontology/nietzsche#hasText
+   http://www.nie.org/ontology/nietzsche#hasCleanText
    http://www.nie.org/ontology/nietzsche#penOfForeignHands
+   http://www.nie.org/ontology/nietzsche#resolutionOfAbbreviation
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
    http://www.nie.org/ontology/nietzsche#textOfForeignHands
http://www.nie.org/ontology/nietzsche#Page
    http://www.nie.org/ontology/nietzsche#hasNumber
    http://www.nie.org/ontology/nietzsche#hasOrientation
+   http://www.nie.org/ontology/nietzsche#hasImprints
    http://www.nie.org/ontology/nietzsche#hasLines
    http://www.nie.org/ontology/nietzsche#hasMarkForeignHands
    http://www.nie.org/ontology/nietzsche#hasWords
    http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
    http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
    http://www.nie.org/ontology/nietzsche#hasFaksimileImage
    http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField
    http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField
    http://www.nie.org/ontology/nietzsche#hasSvgImage
http://www.nie.org/ontology/nietzsche#NonExistentPage
    http://www.nie.org/ontology/nietzsche#hasNumber
    http://www.nie.org/ontology/nietzsche#hasOrientation
+   http://www.nie.org/ontology/nietzsche#hasImprints
    http://www.nie.org/ontology/nietzsche#hasLines
    http://www.nie.org/ontology/nietzsche#hasMarkForeignHands
    http://www.nie.org/ontology/nietzsche#hasWords
    http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
    http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
    http://www.nie.org/ontology/nietzsche#hasStatus
    http://www.nie.org/ontology/nietzsche#hasFaksimileImage
    http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField
    http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField
    http://www.nie.org/ontology/nietzsche#hasSvgImage
http://www.nie.org/ontology/nietzsche#ReconstructedKonvolut
    http://www.nie.org/ontology/nietzsche#hasTitle
    http://www.nie.org/ontology/nietzsche#hasManuscriptType
    http://www.nie.org/ontology/nietzsche#hasPages
    http://www.nie.org/ontology/nietzsche#hasDescription
http://www.nie.org/ontology/nietzsche#Reference
    http://www.nie.org/ontology/nietzsche#firstLineOfReference
    http://www.nie.org/ontology/nietzsche#lastLineOfReference
    http://www.nie.org/ontology/nietzsche#wordReference
    http://www.nie.org/ontology/nietzsche#IsUncertain
    http://www.nie.org/ontology/nietzsche#hasTitle
    http://www.nie.org/ontology/nietzsche#hasPageNumber
http://www.nie.org/ontology/nietzsche#SVGImage
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasFileName
    http://www.nie.org/ontology/nietzsche#hasTransform
    http://www.nie.org/ontology/nietzsche#hasPrimaryurl
    http://www.nie.org/ontology/nietzsche#hasSecondaryurl
    http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#StandoffTag
    http://www.nie.org/ontology/nietzsche#standoffTagHasStartIndex
    http://www.nie.org/ontology/nietzsche#standoffTagHasEndIndex
    http://www.nie.org/ontology/nietzsche#standoffTagHasCSS
http://www.nie.org/ontology/nietzsche#TextConnectionMark
    http://www.nie.org/ontology/nietzsche#hasText
+   http://www.nie.org/ontology/nietzsche#hasCleanText
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
    http://www.nie.org/ontology/nietzsche#textConnectionMarkHasTextSource
http://www.nie.org/ontology/nietzsche#TextField
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#TranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#UncertainDecipherment
    http://www.nie.org/ontology/nietzsche#isUncertain
    http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#Word
    http://www.nie.org/ontology/nietzsche#hasText
+   http://www.nie.org/ontology/nietzsche#hasCleanText
    http://www.nie.org/ontology/nietzsche#hasEditedText
+   http://www.nie.org/ontology/nietzsche#hasCleanEditedText
    http://www.nie.org/ontology/nietzsche#wordHasWordParts
    http://www.nie.org/ontology/nietzsche#wordBelongsToLine
    http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
    http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
    http://www.nie.org/ontology/nietzsche#wordHasStyle
    http://www.nie.org/ontology/nietzsche#overwritesWord
    http://www.nie.org/ontology/nietzsche#isTransformationOfWord
    http://www.nie.org/ontology/nietzsche#isExtensionOfWord
    http://www.nie.org/ontology/nietzsche#isDeletionOfWord
    http://www.nie.org/ontology/nietzsche#isClarificationOfWord
    http://www.nie.org/ontology/nietzsche#wordHasEarlierVersion
    http://www.nie.org/ontology/nietzsche#wordHasCorrection
    http://www.nie.org/ontology/nietzsche#wordIsDeletedByPath
    http://www.nie.org/ontology/nietzsche#wordHasEditorComment
http://www.nie.org/ontology/nietzsche#WordDeletionPath
    http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#WordInsertionMark
    http://www.nie.org/ontology/nietzsche#hasHeight
    http://www.nie.org/ontology/nietzsche#hasWidth
    http://www.nie.org/ontology/nietzsche#hasLeft
    http://www.nie.org/ontology/nietzsche#hasTop
    http://www.nie.org/ontology/nietzsche#hasBottom
    http://www.nie.org/ontology/nietzsche#hasTransform
    http://www.nie.org/ontology/nietzsche#hasMarkType
    http://www.nie.org/ontology/nietzsche#hasSymbolId
    http://www.nie.org/ontology/nietzsche#hasNextWord
    http://www.nie.org/ontology/nietzsche#hasPreviousWord
    http://www.nie.org/ontology/nietzsche#wordInsertionMarkBelongsToLine
xml-dictionary
-2021-02-26 15:46:21
+2021-08-23 09:52:15
Index: Friedrich-Nietzsche-late-work-ontology.ttl
===================================================================
--- Friedrich-Nietzsche-late-work-ontology.ttl (revision 109)
+++ Friedrich-Nietzsche-late-work-ontology.ttl (revision 110)
@@ -1,160 +1,167 @@
@prefix dct: .
-@prefix document: .
+@prefix document: .
@prefix homotypic: .
@prefix stoff: .
@prefix text: .
@prefix owl: .
@prefix rdfs: .
@prefix rdf: .
@prefix skos: .
@prefix xsd: .
@prefix tln: .

a owl:Ontology;
    dct:license ;
    dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en;
    dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsche's late work."""@en;
    dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en;
    dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en;
    dct:publisher "Basel University, Switzerland"@en.

tln:TextGenesis a owl:Class ;
    rdfs:label "identifies a genetic order of text versions"@en ;
    rdfs:comment "Identifies a genetic order of text versions, i.e. groups text units as earlier and later versions of each other."@en ;
    rdfs:isDefinedBy .

tln:IdentifiedTextVersion a owl:Class ;
    rdfs:label "identifies a list of text unities as a text version"@en ;
    rdfs:comment "Identification of a list of text unities (e.g. pages or parts of pages) as a text version for which there is an earlier or later version."@en ;
    rdfs:isDefinedBy .
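Class declarations like tln:TextGenesis above can also be produced programmatically. A minimal sketch with rdflib, which is an illustrative assumption here, not necessarily what the project's py2ttl converter uses internally:

# Sketch: emit an owl:Class declaration for tln:TextGenesis with rdflib.
from rdflib import Graph, Literal, Namespace
from rdflib.namespace import OWL, RDF, RDFS

TLN = Namespace('http://www.nie.org/ontology/nietzsche#')

g = Graph()
g.bind('tln', TLN)
g.add((TLN.TextGenesis, RDF.type, OWL.Class))
g.add((TLN.TextGenesis, RDFS.label,
       Literal('identifies a genetic order of text versions', lang='en')))
print(g.serialize(format='turtle'))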
tln:PartOfPageTextUnit a owl:Class ;
    rdfs:label "identifies a part of a page as a text unity"@en ;
    rdfs:comment "Identification of a part of a page as a text unity."@en ;
    rdfs:isDefinedBy ;
    rdfs:subClassOf [ a owl:Restriction ;
            owl:cardinality "1"^^xsd:nonNegativeInteger ;
            owl:onProperty tln:belongsToPage ],
        [ a owl:Restriction ;
            owl:cardinality "1"^^xsd:nonNegativeInteger ;
            owl:onProperty tln:startLine ],
        [ a owl:Restriction ;
            owl:cardinality "1"^^xsd:nonNegativeInteger ;
            owl:onProperty tln:endLine ] .

tln:ExternalTextUnit a owl:Class ;
    rdfs:label "a text unit that has been published externally to the digital edition"@en ;
    rdfs:comment "A text unit that has been published externally to the digital edition."@en ;
    rdfs:isDefinedBy ;
    rdfs:subClassOf tln:IdentifiedTextVersion .

tln:Page a owl:Class ;
    rdfs:subClassOf document:Page .

tln:belongsToPage a owl:ObjectProperty ;
    rdfs:label "relates a part of a page with the page it is a part of"@en ;
    rdfs:comment "Relates a part of a page with the page it is a part of."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:PartOfPageTextUnit ;
    rdfs:range tln:Page .

tln:startLine a owl:ObjectProperty ;
    rdfs:label "relates a part of a page with the line it starts with"@en ;
    rdfs:comment "Relates a part of a page with the line it starts with."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:PartOfPageTextUnit ;
    rdfs:range tln:Line .

tln:endLine a owl:ObjectProperty ;
    rdfs:label "relates a part of a page with the line it ends with"@en ;
    rdfs:comment "Relates a part of a page with the line it ends with."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:PartOfPageTextUnit ;
    rdfs:range tln:Line .

tln:identifiesAsVersion a owl:ObjectProperty ;
    rdfs:label "groups a list of text unities together as an identified text version"@en ;
    rdfs:comment "Groups a list of text unities together as an identified text version for which there is an earlier or later version."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:IdentifiedTextVersion ;
    rdfs:range rdf:List .

tln:hasGeneticOrder a owl:ObjectProperty ;
    rdfs:label "relates a list of text versions to an identified genetic order"@en ;
    rdfs:comment "Relates a list of text versions to an identified genetic order. The position in the list determines the version of a text unit."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:TextGenesis ;
    rdfs:range rdf:List .

tln:textUnitHasTitle a owl:ObjectProperty ;
    rdfs:label "relates an externally published text unit with a title"@en ;
    rdfs:comment "Relates an externally published text unit with a title by which it can be identified."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:ExternalTextUnit ;
    rdfs:range xsd:string .

tln:textUnitHasUrl a owl:ObjectProperty ;
    rdfs:label "relates an externally published text unit with a URL"@en ;
    rdfs:comment "Relates an externally published text unit with a URL by which it can be visited."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:ExternalTextUnit ;
    rdfs:range xsd:anyURI .

tln:hasImage a owl:ObjectProperty ;
    rdfs:label "relates a page to an image"@en ;
    rdfs:comment "Relates a page to an image that has a textfield that specifies the area where the writing that constitutes the page can be found."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:Page ;
    rdfs:range tln:Image .

tln:hasUrl a owl:DatatypeProperty ;
    rdfs:label "has Url"@en ;
    rdfs:domain tln:Image ;
    rdfs:isDefinedBy ;
    rdfs:range xsd:anyURI .
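The anonymous owl:Restriction pattern that tln:PartOfPageTextUnit uses above (each restriction is a blank node attached via rdfs:subClassOf) looks like this when built in code. Again an rdflib sketch under the assumption that rdflib is acceptable tooling:

# Sketch: restrict tln:belongsToPage to exactly one value on PartOfPageTextUnit.
from rdflib import BNode, Graph, Literal, Namespace
from rdflib.namespace import OWL, RDF, RDFS, XSD

TLN = Namespace('http://www.nie.org/ontology/nietzsche#')
g = Graph()
restriction = BNode()  # the anonymous [ a owl:Restriction ; ... ] node
g.add((restriction, RDF.type, OWL.Restriction))
g.add((restriction, OWL.cardinality, Literal('1', datatype=XSD.nonNegativeInteger)))
g.add((restriction, OWL.onProperty, TLN.belongsToPage))
g.add((TLN.PartOfPageTextUnit, RDFS.subClassOf, restriction))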
-tln:inheritOverwritesWord a owl:ObjectProperty ;
-    rdfs:subPropertyOf tln:overwritesWord;
-    rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ;
-    rdfs:comment "The author has used this word in order to overwrite that word."@en ;
-    rdfs:isDefinedBy ;
-    owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ).
+#tln:inheritOverwritesWord a owl:ObjectProperty ;
+#    rdfs:subPropertyOf tln:overwritesWord;
+#    rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ;
+#    rdfs:comment "The author has used this word in order to overwrite that word."@en ;
+#    rdfs:isDefinedBy ;
+#    owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ).
+
+tln:hasStandoffMarkup4PartThatOverwritesWord a owl:ObjectProperty ;
+    rdfs:label "word has standoff markup for the part that overwrites a word"@en ;
+    rdfs:comment "Word has standoff markup that highlights the part of its text that overwrites a word."@en ;
+    rdfs:isDefinedBy ;
+    rdfs:domain tln:Word ;
+    rdfs:range stoff:StandoffMarkup .

tln:lineContinuesOn a owl:ObjectProperty ;
    rdfs:label "writing from subject line continues on object line"@en ;
    rdfs:comment "The writing that ends on the subject line continues on the object line."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:Line ;
    rdfs:range tln:Line .

tln:pageIsOnTextField a owl:ObjectProperty ;
    rdfs:label "page is on text field"@en ;
    rdfs:comment "The writing that is referred to as subject can be found on the object."@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:Page ;
    rdfs:range tln:TextField .

tln:writingContinuesWithWord a owl:ObjectProperty ;
    rdfs:label "writing continues with next word"@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:Word ;
    rdfs:range tln:Word .

tln:selectableWordProperty a owl:ObjectProperty ;
    rdfs:label "a property of a word for which it can be selected"@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:Word .

tln:cardinalityGreaterOne a rdf:Property ;
    rdfs:label "whether a tln:selectableWordProperty can have a cardinality greater than one"@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:selectableWordProperty ;
    rdfs:range xsd:boolean .

tln:suggestedMaxCardinality a rdf:Property ;
    rdfs:label "the suggested max cardinality of a tln:selectableWordProperty on a word"@en ;
    rdfs:isDefinedBy ;
    rdfs:domain tln:selectableWordProperty ;
    rdfs:range xsd:nonNegativeInteger .
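What the now-disabled owl:propertyChainAxiom expressed: if a word W has a correction C (tln:wordHasCorrection) and C overwrites a word O (tln:overwritesWord), a reasoner would infer that W overwrites O. A minimal sketch of that inference in plain Python, with a hypothetical Word class standing in for the reasoner and the ontology's terms:

# Sketch: the wordHasCorrection o overwritesWord -> overwritesWord chain.
class Word:
    def __init__(self, text, correction=None, overwrites=None):
        self.text = text
        self.correction = correction    # tln:wordHasCorrection
        self.overwrites = overwrites    # tln:overwritesWord

overwritten = Word('alt')
correction = Word('neu', overwrites=overwritten)
word = Word('neu', correction=correction)

# materialize the chained property on the corrected word
if word.correction is not None and word.correction.overwrites is not None:
    word.overwrites = word.correction.overwrites
assert word.overwrites is overwritten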