Index: shared_util/myxmlwriter.py =================================================================== --- shared_util/myxmlwriter.py (revision 75) +++ shared_util/myxmlwriter.py (revision 76) @@ -1,184 +1,203 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to pretty-write a xml string to a xml file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import inspect import xml.dom.minidom as MD import xml.etree.ElementTree as ET import lxml.etree as LET from datetime import datetime from rdflib import URIRef +from os import makedirs +from os.path import sep, basename, dirname import sys import warnings sys.path.append('svgscripts') from datatypes.page import FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" FILE_TYPE_SVG_WORD_POSITION = FILE_TYPE_SVG_WORD_POSITION FILE_TYPE_XML_MANUSCRIPT = FILE_TYPE_XML_MANUSCRIPT FILE_TYPE_XML_DICT = 'xml-dictionary' def attach_dict_to_xml_node(dictionary, xml_node): """Create a xml tree from a dictionary. """ for key in dictionary.keys(): elem_type = type(dictionary[key]) if elem_type != dict: node = LET.SubElement(xml_node, key, attrib={'type': elem_type.__name__}) node.text = str(dictionary[key]) else: attach_dict_to_xml_node(dictionary[key], LET.SubElement(xml_node, key)) def dict2xml(dictionary, target_file_name): """Write dict 2 xml. """ xml_tree = LET.ElementTree(LET.Element('root')) attach_dict_to_xml_node(dictionary, LET.SubElement(xml_tree.getroot(), 'dict')) write_pretty(xml_element_tree=xml_tree, file_name=target_file_name,\ script_name=inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_XML_DICT) def get_dictionary_from_node(node): """Return dictionary from node. :return: dict """ new_dict = {} if len(node.getchildren()) > 0: new_dict.update({ node.tag : {} }) for child_node in node.getchildren(): new_dict.get(node.tag).update(get_dictionary_from_node(child_node)) else: elem_cls = eval(node.get('type')) if bool(node.get('type')) else str value = elem_cls(node.text) if bool(node.text) else None new_dict.update({ node.tag: value }) return new_dict def lock_xml_tree(xml_element_tree, **locker_dict): """Lock xml_element_tree. 
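    A minimal usage sketch (hedged: the file name is hypothetical; both
    locker_dict keys, 'reference_file' and 'message', are optional):

        xml_tree = LET.parse('some_page.xml')
        lock_xml_tree(xml_tree, reference_file='asdf.txt', message='locked for processing')
        test_lock(xml_tree)   # -> True, warns that the file is locked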
""" if xml_element_tree is not None and not test_lock(xml_element_tree, silent=True): message = locker_dict.get('message') if bool(locker_dict.get('message')) else '' reference_file = locker_dict.get('reference_file') if bool(locker_dict.get('reference_file')) else '' metadata = xml_element_tree.xpath('./metadata')[0]\ if len(xml_element_tree.xpath('./metadata')) > 0\ else LET.SubElement(xml_element_tree.getroot(), 'metadata') lock = LET.SubElement(metadata, 'lock') LET.SubElement(lock, 'reference-file').text = reference_file if message != '': LET.SubElement(lock, 'message').text = message def parse_xml_of_type(xml_source_file, file_type): """Return a xml_tree from xml_source_file is file is of type file_type. """ - xml_tree = LET.parse(xml_source_file) + parser = LET.XMLParser(remove_blank_text=True) + xml_tree = LET.parse(xml_source_file, parser) if not xml_has_type(file_type, xml_tree=xml_tree): msg = 'File {} is not of type {}!'.format(xml_source_file, file_type) raise Exception(msg) return xml_tree def test_lock(xml_element_tree=None, silent=False): """Test if xml_element_tree is locked and print a message. :return: True if locked """ if xml_element_tree is None: return False if len(xml_element_tree.findall('./metadata/lock')) > 0: reference_file = xml_element_tree.findall('./metadata/lock/reference-file') message = xml_element_tree.findall('./metadata/lock/message') if not silent: warning_msg = 'File {0} is locked!'.format(xml_element_tree.docinfo.URL) if len(reference_file) > 0: warning_msg = warning_msg.replace('!', ' ') + 'on {0}.'.format(reference_file[0].text) if len(message) > 0: warning_msg = warning_msg + '\n{0}'.format(message[0].text) warnings.warn(warning_msg) return True return False def update_metadata(xml_element_tree, script_name, file_type=None): """Updates metadata of xml tree. """ if len(xml_element_tree.getroot().findall('./metadata')) > 0: if len(xml_element_tree.getroot().find('./metadata').findall('./modifiedBy[@script="{}"]'.format(script_name))) == 0: LET.SubElement(xml_element_tree.getroot().find('./metadata'), 'modifiedBy', attrib={'script': script_name}) xml_element_tree.getroot().find('./metadata').findall('./modifiedBy[@script="{}"]'.format(script_name))[0].text = \ datetime.now().strftime('%Y-%m-%d %H:%M:%S') else: metadata = LET.SubElement(xml_element_tree.getroot(), 'metadata') if file_type is not None: LET.SubElement(metadata, 'type').text = file_type createdBy = LET.SubElement(metadata, 'createdBy') LET.SubElement(createdBy, 'script').text = script_name LET.SubElement(createdBy, 'date').text = datetime.now().strftime('%Y-%m-%d %H:%M:%S') -def write_pretty(xml_string=None, xml_element_tree=None, file_name=None, script_name=None, file_type=None, **locker_dict): +def write_backup(xml_element_tree: LET.ElementTree, file_type=None, bak_dir='./bak') -> str: + """Back up a xml_source_file. 
+ + :return: target_file_name + """ + date_string = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') + makedirs(bak_dir, exist_ok=True) + target_file_name = bak_dir + sep + basename(xml_element_tree.docinfo.URL) + '_' + date_string + reference_file = xml_element_tree.docinfo.URL + write_pretty(xml_element_tree=xml_element_tree, file_name=target_file_name,\ + script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, reference_file),\ + file_type=file_type) + return target_file_name + +def write_pretty(xml_string=None, xml_element_tree=None, file_name=None, script_name=None, backup=False, file_type=None, **locker_dict): """Writes a xml string pretty to a file. """ if not bool(xml_string) and not bool(xml_element_tree): raise Exception("write_pretty needs a string or a xml.ElementTree!") if not test_lock(xml_element_tree): if len(locker_dict) > 0 and bool(locker_dict.get('reference_file')): lock_xml_tree(xml_element_tree, **locker_dict) if script_name is not None and xml_element_tree is not None: update_metadata(xml_element_tree, script_name, file_type=file_type) if file_name is None and xml_element_tree is not None\ and xml_element_tree.docinfo is not None and xml_element_tree.docinfo.URL is not None: file_name = xml_element_tree.docinfo.URL if file_name is None: raise Exception("write_pretty needs a file_name or a xml.ElementTree with a docinfo.URL!") + if backup and xml_element_tree is not None: + write_backup(xml_element_tree, file_type=file_type) dom = MD.parseString(xml_string) if(bool(xml_string)) else MD.parseString(ET.tostring(xml_element_tree.getroot())) f = open(file_name, "w") dom.writexml(f, addindent="\t", newl='\n', encoding='utf-8') f.close() def xml2dict(xml_source_file): """Create dict from xml_source_file of Type FILE_TYPE_XML_DICT. :return: dict """ new_dict = {} xml_tree = LET.parse(xml_source_file) if xml_has_type(FILE_TYPE_XML_DICT, xml_tree=xml_tree)\ and len(xml_tree.xpath('/root/dict')) > 0: for node in xml_tree.xpath('/root/dict')[0].getchildren(): new_dict.update(get_dictionary_from_node(node)) else: msg = 'File {} is not of type {}!'.format(xml_source_file, FILE_TYPE_XML_DICT) raise Exception(msg) return new_dict def xml_has_type(file_type, xml_source_file=None, xml_tree=None): """Return true if xml_source_file/xml_tree has file type == file_type. 
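    A usage sketch (hedged: the file name is hypothetical; the file has to
    contain a <metadata><type> entry for this to return True):

        if xml_has_type(FILE_TYPE_XML_DICT, xml_source_file='dict_file.xml'):
            my_dict = xml2dict('dict_file.xml')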
""" if xml_tree is None and xml_source_file is None: return False if xml_tree is None: xml_tree = LET.parse(xml_source_file) if len(xml_tree.xpath('//metadata/type/text()')) < 1: return False return xml_tree.xpath('//metadata/type/text()')[0] == file_type Index: tests_shared_util/test_myxmlwriter.py =================================================================== --- tests_shared_util/test_myxmlwriter.py (revision 75) +++ tests_shared_util/test_myxmlwriter.py (revision 76) @@ -1,103 +1,104 @@ import unittest import os from os.path import isfile, isdir, dirname, sep, realpath from datetime import datetime import shutil import tempfile import xml.etree.ElementTree as ET import lxml.etree as LET from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD from xmldiff import main import sys sys.path.append('svgscripts') from datatypes.page import Page sys.path.append('shared_util') try: from myxmlwriter import attach_dict_to_xml_node, dict2xml, lock_xml_tree, update_metadata, write_pretty, test_lock, xml_has_type,\ FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_DICT, get_dictionary_from_node, xml2dict, parse_xml_of_type except ImportError: sys.path.append(dirname(dirname(realpath(__file__)))) from shared_util.myxmlwriter import attach_dict_to_xml_node, dict2xml, lock_xml_tree, update_metadata, write_pretty, test_lock, xml_has_type,\ FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_DICT, get_dictionary_from_node, xml2dict, parse_xml_of_type class TestPrettyWriter(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp() self.title = 'ASDF' DATADIR = dirname(__file__) + sep + 'test_data' self.page = DATADIR + sep + 'N_VII_1_page001.xml' self.mydict = { 'asdf': { 'b': { 'a': 1, 'b': 'c' , 'c': URIRef('adf')}},\ 'str': 'test' } def test_attach_dict_to_xml_node(self): xml_tree = LET.Element('root') attach_dict_to_xml_node(self.mydict, LET.SubElement(xml_tree, 'dict')) #print(LET.dump(xml_tree)) self.assertEqual(xml_tree.xpath('//asdf/b/a/@type')[0], 'int') self.assertEqual(xml_tree.xpath('//asdf/b/b/@type')[0], 'str') self.assertEqual(xml_tree.xpath('//asdf/b/c/@type')[0], URIRef.__name__) def test_dict2xml(self): test_file = self.test_dir + sep + 'new_test.xml' dict2xml(self.mydict, test_file) self.assertEqual(isfile(test_file), True) def test_get_dictionary_from_node(self): test_file = self.test_dir + sep + 'source.xml' dict2xml(self.mydict, test_file) xml_tree = LET.parse(test_file) self.assertEqual(len(xml_tree.xpath('/root/dict')[0].getchildren()), len(self.mydict.keys())) for index, key in enumerate(self.mydict.keys()): mydict = get_dictionary_from_node(xml_tree.xpath('/root/dict')[0].getchildren()[index]) self.assertEqual(key in mydict.keys(), True) if type(self.mydict[key]) == dict: self.assertEqual(mydict[key].keys(), self.mydict[key].keys()) def test_update_metadata(self): test_tree = LET.ElementTree(LET.Element('page', attrib={"title": self.title})) update_metadata(test_tree, __file__) self.assertEqual(test_tree.find('./metadata').find('./createdBy').find('./script').text, __file__) update_metadata(test_tree, __file__) self.assertEqual(len(test_tree.find('./metadata').findall('./modifiedBy[@script="{}"]'.format(__file__))), 1) update_metadata(test_tree, __file__) self.assertEqual(len(test_tree.find('./metadata').findall('./modifiedBy[@script="{}"]'.format(__file__))), 1) + def test_write_pretty(self): et_file = self.test_dir + os.sep + 'et_file.xml' pretty_file = self.test_dir + os.sep + 'pretty_file.xml' manuscript_tree = ET.ElementTree(ET.Element('page', 
attrib={"title": self.title})) metadata = ET.SubElement(manuscript_tree.getroot(), 'metadata') ET.SubElement(metadata, 'type').text = 'xmlManuscriptFile' createdBy = ET.SubElement(metadata, 'createdBy') manuscript_tree.write(et_file, xml_declaration=True, encoding='utf-8') write_pretty(xml_string=ET.tostring(manuscript_tree.getroot()), file_name=pretty_file) self.assertEqual(main.diff_files(et_file, pretty_file), []) write_pretty(xml_element_tree=manuscript_tree, file_name=pretty_file) self.assertEqual(main.diff_files(et_file, pretty_file), []) def test_lock(self): page = Page(self.page) locker_dict = { 'reference_file': 'asdf.txt', 'message': 'locked on this file'} lock_xml_tree(page.page_tree, **locker_dict) self.assertEqual(page.is_locked(), True) #test_lock(page.page_tree) def test_xml2dict(self): test_file = self.test_dir + sep + 'source.xml' dict2xml(self.mydict, test_file) mydict = xml2dict(test_file) self.assertEqual(mydict, self.mydict) def test_xml_has_type(self): self.assertEqual(xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=self.page), True) self.assertEqual(xml_has_type(FILE_TYPE_XML_DICT, xml_source_file=self.page), False) with self.assertRaises(Exception): parse_xml_of_type(self.page, FILE_TYPE_XML_DICT) def tearDown(self): isdir(self.test_dir) and shutil.rmtree(self.test_dir) if __name__ == "__main__": unittest.main() Index: svgscripts/datatypes/manuscript.py =================================================================== --- svgscripts/datatypes/manuscript.py (revision 75) +++ svgscripts/datatypes/manuscript.py (revision 76) @@ -1,98 +1,129 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent an archival unity of manuscript pages, i.e. workbooks, notebooks, folders of handwritten pages. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .page import Page, FILE_TYPE_XML_MANUSCRIPT, FILE_TYPE_SVG_WORD_POSITION +from .color import Color sys.path.append('py2ttl') from class_spec import SemanticClass sys.path.append('shared_util') -from myxmlwriter import parse_xml_of_type, xml_has_type +from myxmlwriter import parse_xml_of_type, write_pretty, xml_has_type class ArchivalManuscriptUnity(SemanticClass): """ This class represents an archival unity of manuscript pages (workbooks, notebooks and portfolios of handwritten pages). 
@label archival unity of manuscript pages Args: title title of archival unity manuscript_type type of manuscript: 'Arbeitsheft', 'Notizheft', 'Mappe' + manuscript_tree lxml.ElementTree """ XML_TAG = 'manuscript' - #RDFS_SUBCLASSOF_LIST = [ 'http://www.knora.org/ontology/0068/nietzsche#Manuscript' ] # DEPRECATED + XML_COLORS_TAG = 'colors' + UNITTESTING = False - def __init__(self, title='', manuscript_type=''): + def __init__(self, title='', manuscript_type='', manuscript_tree=None): self.title = title + self.manuscript_tree = manuscript_tree self.manuscript_type = manuscript_type self.pages = [] + self.colors = [] def get_name_and_id(self): """Return an identification for object as 2-tuple. """ return '', self.title.replace(' ', '_') @classmethod - def get_semantic_dictionary(cls): - """ Creates a semantic dictionary as specified by SemanticClass. - """ - dictionary = {} - class_dict = cls.get_class_dictionary() - properties = {} - properties.update(cls.create_semantic_property_dictionary('title', str, 1)) - properties.update(cls.create_semantic_property_dictionary('manuscript_type', str, 1)) - properties.update(cls.create_semantic_property_dictionary('pages', list)) - dictionary.update({cls.CLASS_KEY: class_dict}) - dictionary.update({cls.PROPERTIES_KEY: properties}) - return cls.return_dictionary_after_updating_super_classes(dictionary) - - @classmethod def create_cls(cls, xml_manuscript_file, page_status_list=None, page_xpath=''): """Create an instance of ArchivalManuscriptUnity from a xml file of type FILE_TYPE_XML_MANUSCRIPT. :return: ArchivalManuscriptUnity """ manuscript_tree = parse_xml_of_type(xml_manuscript_file, FILE_TYPE_XML_MANUSCRIPT) title = manuscript_tree.getroot().get('title') if bool(manuscript_tree.getroot().get('title')) else '' manuscript_type = manuscript_tree.getroot().get('type') if bool(manuscript_tree.getroot().get('type')) else '' - manuscript = cls(title=title, manuscript_type=manuscript_type) + manuscript = cls(title=title, manuscript_type=manuscript_type, manuscript_tree=manuscript_tree) + manuscript.colors = [ Color.create_cls(node=color_node) for color_node in manuscript_tree.xpath('.//' + cls.XML_COLORS_TAG + '/' + Color.XML_TAG) ] if page_xpath == '': page_status = '' if page_status_list is not None\ and type(page_status_list) is list\ and len(page_status_list) > 0: page_status = '[' + ' and '.join([ f'contains(@status, "{status}")' for status in page_status_list ]) + ']' page_xpath = f'//pages/page{page_status}/@output' manuscript.pages = [ Page(page_source)\ for page_source in manuscript_tree.xpath(page_xpath)\ if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ] return manuscript + def get_color(self, hex_color) -> Color: + """Return color if it exists or None. + """ + if hex_color in [ color.hex_color for color in self.colors ]: + return [ color for color in self.colors if color.hex_color == hex_color ][0] + return None + + @classmethod + def get_semantic_dictionary(cls): + """ Creates a semantic dictionary as specified by SemanticClass. 
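+        The dictionary maps cls.CLASS_KEY to the class description and
+        cls.PROPERTIES_KEY to the property entries for 'title',
+        'manuscript_type' and 'pages'.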
+ """ + dictionary = {} + class_dict = cls.get_class_dictionary() + properties = {} + properties.update(cls.create_semantic_property_dictionary('title', str, 1)) + properties.update(cls.create_semantic_property_dictionary('manuscript_type', str, 1)) + properties.update(cls.create_semantic_property_dictionary('pages', list)) + dictionary.update({cls.CLASS_KEY: class_dict}) + dictionary.update({cls.PROPERTIES_KEY: properties}) + return cls.return_dictionary_after_updating_super_classes(dictionary) + + def update_colors(self, color): + """Update manuscript colors if color is not contained. + """ + if self.get_color(color.hex_color) is None: + self.colors.append(color) + if self.manuscript_tree is not None: + if len(self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)) > 0: + self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)[0].getparent().remove(self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)[0]) + colors_node = ET.SubElement(self.manuscript_tree.getroot(), self.XML_COLORS_TAG) + for color in self.colors: + color.attach_object_to_tree(colors_node) + if not self.UNITTESTING: + write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_tree.docinfo.URL,\ + script_name=__file__, backup=True,\ + file_type=FILE_TYPE_XML_MANUSCRIPT) + + + Index: svgscripts/datatypes/transkription_position.py =================================================================== --- svgscripts/datatypes/transkription_position.py (revision 75) +++ svgscripts/datatypes/transkription_position.py (revision 76) @@ -1,189 +1,199 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a transkription word position. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .debug_message import DebugMessage from .image import SVGImage from .positional_word_part import PositionalWordPart from .word_position import WordPosition from .matrix import Matrix sys.path.append('py2ttl') from class_spec import SemanticClass class TranskriptionPosition(WordPosition): """ This class represents the position of a word on the transkription as it is displayed by a svg image. @label position of a word on the topological transkription Args: id (int): word id matrix (datatypes.Matrix): matrix containing information about transformation. 
            height (float): height of word
            width (float): width of word
            x (float): x position of word
            y (float): y position of word
            positional_word_parts a list of (datatypes.positional_word_part) PositionalWordPart
            debug_message a (datatypes.debug_message) DebugMessage
    """
    ADD2X = 0.15
    ADD2TOP = 1.0
    ADD2BOTTOM = 0.2
    HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
    XML_TAG = WordPosition.TRANSKRIPTION

    def __init__(self, id=0, node=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, positional_word_parts=None, debug_message=None):
        super(TranskriptionPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION)
        self.positional_word_parts = positional_word_parts if positional_word_parts is not None else []
        self.debug_message = debug_message
        self.deleted = False
        self.has_box = None
        self.svg_image = None
        if node is not None:
            self.debug_message = DebugMessage(node=node.xpath('.//' + DebugMessage.XML_TAG)[0])\
                    if len(node.xpath('.//' + DebugMessage.XML_TAG)) > 0 else None
            self.positional_word_parts = [ PositionalWordPart(node=pwp_node) for pwp_node in node.xpath('.//' + PositionalWordPart.XML_TAG) ]
        self.attachable_objects += self.positional_word_parts
        if self.debug_message is not None:
            self.attachable_objects.append(self.debug_message)

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.
        """
        dictionary = super(TranskriptionPosition,cls).get_semantic_dictionary()
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('svg_image', SVGImage, cardinality=1,\
            name='isOnSvgImage', label='transkription position is on svg image'))
        return cls.return_dictionary_after_updating_super_classes(dictionary)

    def get_text(self):
        """Returns the concatenated text of all positional_word_parts.
        """
        return ''.join([pwp.text for pwp in self.positional_word_parts])

+    def is_mergebale_with(self, other) -> bool:
+        """Return whether self and other have the same writing_process_id or style.
+        """
+        if self.writing_process_id == other.writing_process_id:
+            return True
+        # the parentheses matter: without them, "or" would let an empty
+        # positional_word_parts list slip through to the index access below
+        if (self.writing_process_id == -1 or other.writing_process_id == -1)\
+        and (len(self.positional_word_parts) > 0 and len(other.positional_word_parts) > 0):
+            return self.positional_word_parts[0].style_class == other.positional_word_parts[0].style_class
+        return False
+
    def split(self, split_position, second_split=-1):
        """Split a transkription_position in two at split_position.
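        A sketch of the expected call patterns (hedged: split positions are x
        values in the coordinate system of the positional word parts; the
        resulting list may contain more elements if styles or matrices differ):

            parts = tp.split(100.0)          # [left, right]
            parts = tp.split(100.0, 150.0)   # [left, middle, right]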
:return: a list of the new transkription_positions """ transkription_positions = [] left_pwp = [ pwp for pwp in self.positional_word_parts if pwp.left + pwp.width < split_position ] transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(left_pwp, transkription_position_id=self.id) if second_split == -1: right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id)) else: middle_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp.left + pwp.width < second_split ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(middle_pwp, transkription_position_id=str(next_id)) right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp not in middle_pwp ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id)) return transkription_positions def update_positional_word_parts(self, positional_word_parts): """Update positional_word_parts. """ if len(self.positional_word_parts) > 0 and self.positional_word_parts in self.attachable_objects: self.attachable_objects.remove(self.positional_word_parts) self.positional_word_parts = positional_word_parts self.attachable_objects += self.positional_word_parts @staticmethod def CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=None, debug_msg_string=None, transkription_position_id=0): """Creates a list of TranskriptionPosition from a list of (datatypes.positional_word_part) PositionalWordPart. 
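        Parts are grouped greedily: a new TranskriptionPosition is started as
        soon as the transform matrix conversion factors or the style class of a
        positional word part differ from those of the first part of the group.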
[:return:] a list of (datatypes.transkription_position) TranskriptionPosition """ TOPCORRECTION = 1 debug_message = DebugMessage(message=debug_msg_string)\ if debug_msg_string is not None else debug_message transkription_positions = [] if len(positional_word_parts) < 1: return [] matrix = positional_word_parts[0].transform index = 0 matrices_differ = False style_class = positional_word_parts[0].style_class styles_differ = False while index < len(positional_word_parts) and not matrices_differ and not styles_differ: if Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix, positional_word_parts[index].transform): matrices_differ = True elif style_class != positional_word_parts[index].style_class: styles_differ = True else: index += 1 if (matrices_differ or styles_differ) and index < len(positional_word_parts): debug_msg_string = 'matrices differ' if matrices_differ else 'styles differ' transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts[index:], debug_msg_string=debug_msg_string, transkription_position_id=int(transkription_position_id)+1) positional_word_parts = positional_word_parts[:index] height = [ pwp.height for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.height, reverse=True)][0] + 2*TOPCORRECTION x = positional_word_parts[0].left - TranskriptionPosition.ADD2X y = [ pwp.top for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.top)][0] - TOPCORRECTION width = positional_word_parts[len(positional_word_parts)-1].left - x\ + positional_word_parts[len(positional_word_parts)-1].width + TranskriptionPosition.ADD2X for pwp_index, pwp in enumerate(positional_word_parts): pwp.id = pwp_index transkription_positions.insert(0, TranskriptionPosition(id=transkription_position_id, height=height, width=width, x=x, y=y, matrix=matrix,\ positional_word_parts=positional_word_parts, debug_message=debug_message)) return transkription_positions @staticmethod def CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=None, debug_msg_string=None, transkription_field=None): """Creates a list of TranskriptionPosition from word_part_objs (i.e. a list of dictionaries with the keys: text, x, y, matrix, class). 
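        A sketch of the expected input (hedged: keys as listed above, values
        are illustrative):

            word_part_objs = [ {'text': 'a', 'x': '10.0', 'y': '20.0', 'class': 'st5 st6', 'matrix': None} ]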
[:return:] a list of (datatypes.transkription_position) TranskriptionPosition """ positional_word_parts = [] debug_message = DebugMessage(message=debug_msg_string)\ if debug_msg_string is not None else None if page.svg_file is not None and isfile(page.svg_file): svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } xmin = 0.0 ymin = 0.0 if transkription_field is not None: xmin = transkription_field.xmin ymin = transkription_field.ymin for part_obj in word_part_objs: positional_word_parts += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(\ part_obj, svg_path_tree, namespaces, page, start_id=len(positional_word_parts),\ xmin=xmin, ymin=ymin) else: positional_word_parts = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) if len(positional_word_parts) > 0: return TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=debug_message) else: return [ TranskriptionPosition(matrix=matrix, debug_message=debug_message) ] Index: svgscripts/datatypes/word.py =================================================================== --- svgscripts/datatypes/word.py (revision 75) +++ svgscripts/datatypes/word.py (revision 76) @@ -1,698 +1,701 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import inspect from lxml import etree as ET from operator import attrgetter import sys import warnings from .box import Box from .matrix import Matrix from .path import Path from .simple_word import SimpleWord from .word_position import WordPosition from .transkription_position import TranskriptionPosition from .writing_process import WritingProcess def execute_function_on_parts(word_parts, func_name): """Execute function on parts and add those parts instead of original word to word_parts. :return: new word_parts, output from func """ copy_parts = word_parts[:] for word in word_parts: output = eval('word.{0}()'.format(func_name)) if len(word.word_parts) > 0: for part_word in word.word_parts: copy_parts.insert(copy_parts.index(word), part_word) copy_parts.remove(word) word.word_parts = [] return copy_parts, output def update_transkription_position_ids(word): """Update transkription_position' ids according to index. 
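    Duplicate word part ids are renumbered first; the transkription_positions
    are then re-indexed from left to right and their has_box/deleted markers
    are reset.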
""" word_part_ids = [ wp.id for wp in word.word_parts ] if len(word_part_ids) != len(set(word_part_ids)): for id, wp in enumerate(word.word_parts): wp.id = id for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))): transkription_position.id = index transkription_position.has_box = None transkription_position.deleted = False class Word(SimpleWord): """ This class represents a word. """ DATA = 'debug-data' XML_TAG = 'word' XML_EARLIER_VERSION = 'earlier-version' XML_OVERWRITES = 'overwrites' XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\ 'isDeletionOfWord': 'deletesEarlierPart',\ 'isExtensionOfWord': 'extendsEarlierVersion',\ 'isTransformationOfWord': 'transformsEarlierPart' } def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None): super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.corrections = [] self.deleted = deleted self.debug_container = {} self.debug_msg = None self.earlier_version = earlier_version self.edited_text = None self.isClarificationOfWord = None self.isDeletionOfWord = None self.isExtensionOfWord = None self.isTransformationOfWord = None if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0: self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ]) self.overwrites_word = None self.writing_process_id = writing_process_id self.writing_processes = [] self.word_insertion_mark = None self.word_box = None self.word_parts = word_parts if word_parts is not None else [] self.word_part_objs = word_part_objs if word_part_objs is not None else [] def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. """ word_node = super(Word,self).attach_word_to_tree(target_tree) if self.deleted is not None: word_node.set('deleted', str(self.deleted).lower()) if self.edited_text is not None: word_node.set('edited-text', self.edited_text) if self.writing_process_id > -1: word_node.set('writing-process-id', str(self.writing_process_id)) for word_part in self.word_parts: word_part.attach_word_to_tree(word_node) if self.earlier_version is not None: earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION) self.earlier_version.attach_word_to_tree(earlier_node) if self.overwrites_word is not None\ and len(self.overwrites_word.transkription_positions) > 0: overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES) self.overwrites_word.attach_word_to_tree(overwrite_node) if self.word_box is not None: self.word_box.attach_object_to_tree(word_node) if len(self.corrections) > 0: word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ]))) for key in self.XML_CORRECTION_DICT.keys(): if self.__dict__[key] is not None: word_node.set(self.XML_CORRECTION_DICT[key], 'true') return word_node def belongs_to_multiple_writing_processes(self, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. 
""" if len(self.word_parts) > 0 and include_parts: return len(set(word.writing_process_id for word in self.word_parts)) > 1 return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1 @classmethod def create_cls(cls, word_node): """Creates a word from a (lxml.Element) node. [:return:] Word """ cls = super(Word,cls).create_cls(word_node) cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 cls.split_strings = None + cls.join_string = word_node.get('join') if bool(word_node.get('split')): cls.split_strings = word_node.get('split').split(' ') if ''.join(cls.split_strings) != cls.text: error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ format(word_node.getroottree().docinfo.URL, str(cls.id))\ + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ + 'Text attribute: "{0}".\n'.format(cls.text) raise Exception(error_msg) cls.deleted = word_node.get('deleted') == 'true'\ if bool(word_node.get('deleted')) else None cls.edited_text = word_node.get('edited-text') cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ] if bool(word_node.get('corrections')): for index in [ int(i) for i in word_node.get('corrections').split(' ') ]: if index < len(cls.word_parts): cls.corrections.append(cls.word_parts[index]) cls.earlier_version = None if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0: cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0] for key_value in cls.XML_CORRECTION_DICT.values(): if word_node.get(key_value) == 'true': cls.__dict__[key_value] = True if cls.earlier_version is not None: for word_part in cls.word_parts: for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\ and len(cls.word_parts) <= len(cls.earlier_version.word_parts): word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id] for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls.earlier_version for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\ if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\ else None cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\ if len(word_node.xpath('./' + Box.XML_TAG)) > 0\ else None return cls def create_earlier_version(self, root_word=None, id=0): """Create an earlier version of word. 
""" if root_word is None: root_word = self word_parts = [] for id, word_part in enumerate(self.word_parts): earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id) if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart if word_part not in self.corrections: self.corrections.append(word_part) if not root_word.belongs_to_multiple_writing_processes(include_parts=True)\ or word_part.writing_process_id < root_word.writing_process_id: word_parts.append(earlierWordPart) elif word_part.overwrites_word is not None: word_part.overwrites_word.id = word_part.id word_parts.append(word_part.overwrites_word) word_part.isTransformationOfWord = word_part.overwrites_word if word_part not in self.corrections: self.corrections.append(word_part) else: word_part.extendsEarlierVersion = True if word_part not in self.corrections: self.corrections.append(word_part) text = ''.join([ word.text for word in word_parts ])\ if len(word_parts) > 0\ else self.text if len(word_parts) == 1: self.transkription_positions += word_parts[0].transkription_positions self.faksimile_positions += word_parts[0].faksimile_positions word_parts = [] return Word(id=id, text=text, transkription_positions=self.transkription_positions,\ faksimile_positions=self.faksimile_positions, line_number=self.line_number,\ writing_process_id=self.writing_process_id-1, word_parts=word_parts) def create_correction_history(self): """Create correction history. """ if self.word_box is not None: writing_process_id = self.writing_process_id-1\ if self.writing_process_id > 0\ else self.writing_process_id transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions) self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\ writing_process_id=writing_process_id, line_number=self.line_number) for word_part in self.word_parts: word_part.create_correction_history() if len(self.word_parts) > 0: earlier_version = self.create_earlier_version() extending_words = self._get_parts_with_property_key('extendsEarlierVersion') if len(extending_words) > 0: for word in extending_words: word.isExtensionOfWord = earlier_version if self.has_mixed_status('deleted', include_parts=True): self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ]) if len(self.corrections) > 0: self.earlier_version = earlier_version @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. 
        [:return:] Word
        """
        if word_node is not None: # init word from xml node
            id = int(word_node.get('id'))
            line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
            text = word_node.get('text')
            deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
            transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
            faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
            word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
                    if len(word_node.findall('.//' + Word.DATA)) > 0\
                    else [ item.attrib for item in word_node.findall('.//part')]
            return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
                    faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
        elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
            WIDTH = 5
            TOPCORRECTION = 2.0
            FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
            height = height
            x = round(float(word_part_objs[0]['x']), 3)
            if(page is not None and bool(page.style_dict)):
                HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
                style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
                biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
                height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
                TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
                if endSign is not None and '%' in endSign:
                    lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
                            for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
                            if bool(page.style_dict[key].get('font-size'))]
                    lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
                    endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
                elif endSign is not None: # other end signs get a fixed extra width
                    endX = float(endX) + WIDTH
            bottom = round(float(word_part_objs[0]['y']), 3)
            y = round(bottom - height + TOPCORRECTION, 3)
            width = round(float(endX) - x, 3)
            transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
            text = ''.join([ dict['text'] for dict in word_part_objs])
            line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
            word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
            word.debug_msg = debug_msg
            return word
        else:
            error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
            raise Exception('Error: {}'.format(error_msg))

    def get_earlier_versions(self, reference_writing_process_id=1):
        """Return earlier version of self or its word_parts.
        """
        earlier_versions = []
        if self.earlier_version is not None:
            self.earlier_version.line_number = self.line_number
            self.earlier_version.writing_process_id = self.writing_process_id-1
            earlier_versions.append(self.earlier_version)
        for word in self.word_parts:
            earlier_versions += word.get_earlier_versions(reference_writing_process_id=reference_writing_process_id)
        return earlier_versions

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
""" dictionary = super(Word,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\ name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deleted', bool,\ name='isWordDeleted', label='has word been deleted', comment='Word has been deleted by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\ name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\ name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\ name='isClarificationOfWord', label='word is a clarification of word',\ comment='The author has used this part of the word in order to clarify the appearance of that word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\ name='isDeletionOfWord', label='word is a deletion of word',\ comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\ name='isExtensionOfWord', label='word is a extension of word',\ comment='The author has used this part of a word in order to extend an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\ name='isTransformationOfWord', label='word is a transformation of word',\ comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\ name='overwritesWord', label='word overwrites word',\ comment='The author has used this word in order to overwrite that word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\ name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\ subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)) super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\ name='isCorrectionOfWord', label='word is a correction of word',\ comment='The author has used this word in order to correct that word.') for key in cls.XML_CORRECTION_DICT.keys(): correction_dict = dictionary[cls.PROPERTIES_KEY].get(key) correction_dict.update(super_property_dictionary) dictionary[cls.PROPERTIES_KEY].update({key: correction_dict}) return cls.return_dictionary_after_updating_super_classes(dictionary) def has_mixed_status(self, property_key, include_parts=False): """Returns true if transkription_positions have mixed status concerning the property_key in their __dict__. 
""" if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions): return False if len(self.word_parts) > 0 and include_parts: if False in set(property_key in word.__dict__.keys() for word in self.word_parts): return False return len(set(word.deleted for word in self.word_parts)) > 1 return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1 def init_word(self, page): """Initialize word with objects from page. """ super(Word,self).init_word(page) if self.writing_process_id > -1: self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ] writing_processes = self.writing_processes for word_part in self.word_parts: word_part.init_word(page) self.lines += word_part.lines self.writing_processes += word_part.writing_processes self.lines = [ line for line in set(self.lines) ] self.writing_processes = [ wp for wp in set(self.writing_processes)] if self.overwrites_word is not None: self.overwrites_word.init_word(page) if self.earlier_version is not None: if self.earlier_version.writing_process_id == -1: self.earlier_version.writing_process_id = self.writing_process_id-1 if self.earlier_version.line_number == -1: self.earlier_version.line_number = self.line_number self.earlier_version.init_word(page) def join(self, other_word, append_at_end_of_new_word=True): """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. """ if append_at_end_of_new_word: self.text = self.text + other_word.text for position in other_word.transkription_positions: position.id = str(len(self.transkription_positions)) self.transkription_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 self.simplify_transkription_positions() def partition_according_to_deletion(self): """Partition a word according to its transkription_positions' deletion status ->split word and add partial words as its parts. 
""" if self.has_mixed_status('deleted'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.deleted != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.deleted if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] self.line_number = -1 self.deleted = False elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion') elif not self.deleted\ and len(self.transkription_positions) > 0\ and self.transkription_positions[0].deleted: self.deleted = True def partition_according_to_writing_process_id(self): """Partition a word according to its transkription_positions' writing_process_ids ->split word and add partial words as its parts. """ if self.belongs_to_multiple_writing_processes(): last_writing_process_id = -1 transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.writing_process_id != last_writing_process_id\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_writing_process_id = transkription_position.writing_process_id if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id') if self.belongs_to_multiple_writing_processes(include_parts=True): self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0] elif len(self.transkription_positions) > 0: self.writing_process_id = self.transkription_positions[0].writing_process_id def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0): """Determines whether word is over a word box. 
""" word_over_box = None if len(self.word_parts) > 0: for word in self.word_parts: if word_over_box is None: current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) if current_word is not None and current_word.word_box is not None: word_over_box = current_word else: new_tp_dict = {} for transkription_position in self.transkription_positions: word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) containing_boxes = [ box_path for box_path in box_paths\ if word_path.is_partially_contained_by(box_path)\ or box_path.do_paths_intersect(word_path) ] if len(containing_boxes) > 0: self._set_box_to_transkription_position(containing_boxes[0], word_path,\ transkription_position, new_tp_dict, tr_xmin) for replace_tp in new_tp_dict.keys(): for tp in new_tp_dict.get(replace_tp): self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp) self.transkription_positions.remove(replace_tp) word_over_box = self._get_partial_word_over_box() update_transkription_position_ids(self) return word_over_box def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def set_writing_process_id_to_transkription_positions(self, page): """Determines the writing process id of the transkription_positions. """ for transkription_position in self.transkription_positions: if len(transkription_position.positional_word_parts) > 0: for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): if font_key in page.fontsizekey2stage_mapping.keys(): transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) def simplify_transkription_positions(self): """Merge transkription_positions if possible. """ index = len(self.transkription_positions)-1 while index > 0\ and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]: current_tp = self.transkription_positions[index] index -= 1 previous_tp = self.transkription_positions[index] - if previous_tp.writing_process_id == current_tp.writing_process_id: + if previous_tp.is_mergebale_with(current_tp): positional_word_parts = previous_tp.positional_word_parts positional_word_parts += current_tp.positional_word_parts transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id) if len(transkription_positions) == 1: - transkription_positions[0].writing_process_id = previous_tp.writing_process_id + transkription_positions[0].writing_process_id = previous_tp.writing_process_id\ + if previous_tp.writing_process_id != -1\ + else current_tp.writing_process_id self.transkription_positions.pop(index+1) self.transkription_positions[index] = transkription_positions[0] #print(self.text, len(self.transkription_positions)) def split(self, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. 
""" previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split') next_text = ''.join([ pwp.text for pwp in next_pwps ]) nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) return previousWord, currentWord, nextWord def split_according_to_status(self, status): """Split a word according to its transkription_positions' text. 
        :return: a list of new word.Word
        """
        new_words = []
        if self.has_mixed_status(status):
            last_status = None
            transkription_positions = []
            copy_keys = [ 'line_number', 'text', 'deleted', 'writing_process_id' ]
            for transkription_position in self.transkription_positions:
                if transkription_position.__dict__[status] != last_status\
                        and len(transkription_positions) > 0:
                    newWord = Word(id=self.id+len(new_words), transkription_positions=transkription_positions)
                    for key in copy_keys:
                        if key != status and key in self.__dict__.keys():
                            newWord.__dict__[key] = self.__dict__[key]
                    newWord.__dict__[status] = transkription_positions[0].__dict__[status]
                    new_words.append(newWord)
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_status = transkription_position.__dict__[status]
            if len(transkription_positions) > 0:
                newWord = Word(id=self.id+len(new_words), transkription_positions=transkription_positions)
                for key in copy_keys:
                    if key != status and key in self.__dict__.keys():
                        newWord.__dict__[key] = self.__dict__[key]
                newWord.__dict__[status] = transkription_positions[0].__dict__[status]
                new_words.append(newWord)
        return new_words

    def undo_partitioning(self):
        """Undo partitioning.
        """
        if len(self.word_parts) > 0:
            for word_part in self.word_parts:
                word_part.undo_partitioning()
                if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]):
                    self.transkription_positions += word_part.transkription_positions
            self.earlier_version = None
            self.word_box = None
            self.word_parts = []
            self.earlier_versions = []
            self.box_paths = []

    def _get_parts_with_property_key(self, property_key):
        """Return a list of word_parts with property == property_key.
        """
        word_parts = []
        for word_part in self.word_parts:
            if property_key in word_part.__dict__.keys():
                word_parts.append(word_part)
            else:
                word_parts += word_part._get_parts_with_property_key(property_key)
        return word_parts

    def _get_partial_word_over_box(self):
        """Partition a word according to its transkription_positions' has_box: split the word and add the partial words as its parts.
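
        Illustrative sketch (hypothetical positions): if only some transkription
        positions carry a has_box, the word is partitioned into word_parts and
        the part over the box is returned with its word_box set:

            word_over_box = word._get_partial_word_over_box()
            # word.word_parts holds the partial words; word_over_box.word_box is the box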
        :return: word over box or self
        """
        word_over_box = None
        if self.has_mixed_status('has_box'):
            transkription_positions = []
            last_word_box = None
            for transkription_position in self.transkription_positions:
                if transkription_position.has_box != last_word_box\
                        and len(transkription_positions) > 0:
                    newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                            transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                    self.word_parts.append(newWord)
                    if last_word_box is not None:
                        word_over_box = newWord
                        word_over_box.word_box = last_word_box
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_word_box = transkription_position.has_box
            if len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_word_box is not None:
                    word_over_box = newWord
                    word_over_box.word_box = last_word_box
            self.transkription_positions = []
        elif len(self.word_parts) > 0:
            self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box')
        elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
            word_over_box = self
            word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
        return word_over_box

    def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
        """Set box_path to transkription_position that is contained by box_path.

        Create new transkription_positions by splitting old ones if necessary and add them to new_transkription_positions_dictionary.
        """
        if box_path.contains_path(word_path):
            transkription_position.has_box = box_path
        elif box_path.contains_start_of_path(word_path):
            split_position = box_path.path.bbox()[1] - tr_xmin
            new_tps = transkription_position.split(split_position)
            if len(new_tps) == 2:
                new_tps[0].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                transkription_position.has_box = box_path
        elif box_path.contains_end_of_path(word_path):
            split_position = box_path.path.bbox()[0] - tr_xmin
            new_tps = transkription_position.split(split_position)
            if len(new_tps) == 2:
                new_tps[1].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                transkription_position.has_box = box_path
        else: # box_path in the middle of word_path
            split_position1 = box_path.path.bbox()[0] - tr_xmin
            split_position2 = box_path.path.bbox()[1] - tr_xmin
            new_tps = transkription_position.split(split_position1, split_position2)
            if len(new_tps) >= 2:
                new_tps[1].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                transkription_position.has_box = box_path
Index: svgscripts/datatypes/color.py
===================================================================
--- svgscripts/datatypes/color.py (revision 0)
+++ svgscripts/datatypes/color.py (revision 76)
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This class can be used to represent a color.
+""" +# Copyright (C) University of Basel 2019 {{{1 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see 1}}} + +__author__ = "Christian Steiner" +__maintainer__ = __author__ +__copyright__ = 'University of Basel' +__email__ = "christian.steiner@unibas.ch" +__status__ = "Development" +__license__ = "GPL v3" +__version__ = "0.0.1" + +from lxml import etree as ET +from matplotlib import colors +import re +import sys +import webcolors + +from .attachable_object import AttachableObject + +sys.path.append('py2ttl') +from class_spec import SemanticClass + +class Color(SemanticClass,AttachableObject): + """ + This class represents a color. + + Args: + + """ + alias = { 'teal': 'blue', 'olive': 'green' } + XML_TAG = 'color' + NAME_INDEX = 0 + HEX_INDEX = 1 + stringKeys = [ 'name', 'hex_color' ] + + def __init__(self, color_name='black', hex_color='#000000', rgb_color=(0, 0, 0)): + self.name = color_name + self.hex_color = hex_color + self.rgb_color = rgb_color + + def attach_object_to_tree(self, target_tree): + """Attach object to tree. + """ + if target_tree.__class__.__name__ == '_ElementTree': + target_tree = target_tree.getroot() + obj_node = target_tree.xpath('.//' + self.XML_TAG + '[@hex="%s"]' % self.hex_color)[0] \ + if(len(target_tree.xpath('.//' + self.XML_TAG + '[@hex="%s"]' % self.hex_color)) > 0) \ + else ET.SubElement(target_tree, self.XML_TAG) + for key in self.stringKeys: + if self.__dict__[key] is not None: + obj_node.set(key.replace('_','-'), self.__dict__[key]) + + @classmethod + def create_cls(cls, node=None, hex_color='#000000', manuscript=None): + """Creates a Color from a hex_color_string. + + :return: (datatypes.style) Color + """ + if manuscript is not None\ + and manuscript.get_color(hex_color) is not None: + return manuscript.get_color(hex_color) + if node is not None: + color_name = node.get(cls.stringKeys[cls.NAME_INDEX].replace('_','-')) + hex_color = node.get(cls.stringKeys[cls.HEX_INDEX].replace('_','-')) + rgb_color = webcolors.hex_to_rgb(hex_color) + + else: + rgb_color = webcolors.hex_to_rgb(hex_color) + color_name = cls.get_color_name(rgb_color) + color = cls(color_name, hex_color, rgb_color) + if manuscript is not None: + manuscript.update_colors(color) + return color + + @classmethod + def get_closest_color(cls, rgb_color) -> str: + """Return closest color name for a rgb_color triple. **SLOW** + """ + min_colours = {} + for key, name in webcolors.css21_hex_to_names.items(): + color = webcolors.hex_to_rgb(key) + rd = (color[0] - rgb_color[0]) ** 2 + gd = (color[1] - rgb_color[1]) ** 2 + bd = (color[2] - rgb_color[2]) ** 2 + min_colours[(rd + gd + bd)] = cls.alias[name]\ + if name in cls.alias.keys()\ + else name + return min_colours[min(min_colours.keys())] + + @classmethod + def get_color_name(cls, rgb_color) -> str: + """Return the name of a rgb_color triple. 
+ """ + try: + color_name = webcolors.rgb_to_name(rgb_color) + except ValueError: + color_name = cls.get_closest_color(rgb_color) + return color_name + + @classmethod + def get_semantic_dictionary(cls): + """ Creates a semantic dictionary as specified by SemanticClass. + """ + properties = {} + properties.update(cls.create_semantic_property_dictionary('name', str, cardinality=1,\ + name='colorHasName', label='color has name', comment='Connects a color with its name.')) + properties.update(cls.create_semantic_property_dictionary('hex_color', str, cardinality=1,\ + name='hasHexadecimalValue', label='color has a hexadecimal value',\ + comment='Connects a color with its hexadecimal representation. A hexadecimal color is specified with: #RRGGBB.')) + dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties } + return cls.return_dictionary_after_updating_super_classes(dictionary) + + Index: svgscripts/datatypes/style.py =================================================================== --- svgscripts/datatypes/style.py (revision 0) +++ svgscripts/datatypes/style.py (revision 76) @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" This class can be used to represent the style of a word. +""" +# Copyright (C) University of Basel 2019 {{{1 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see 1}}} + +__author__ = "Christian Steiner" +__maintainer__ = __author__ +__copyright__ = 'University of Basel' +__email__ = "christian.steiner@unibas.ch" +__status__ = "Development" +__license__ = "GPL v3" +__version__ = "0.0.1" + +from lxml import etree as ET +import re +import sys + +from .color import Color + +sys.path.append('py2ttl') +from class_spec import SemanticClass + + +class Style(SemanticClass): + """ + This class represents the style of a word. + + Args: + manuscript: a ArchivalManuscriptUnity + + """ + NIETSCHES_FONTS = { 'german': 'deutsche Schreibschrift', 'latin': 'lateinische Schreibschrift' } + COLOR_KEYS = [ 'black', 'red', 'blue', 'green' ] + WRITING_INSTRUMENTS = { (COLOR_KEYS[0], False): 'schwarze Tinte',\ + (COLOR_KEYS[0], True): 'Bleistift',\ + (COLOR_KEYS[1], False): 'braune Tinte',\ + (COLOR_KEYS[1], True): 'Rotstift',\ + (COLOR_KEYS[2], False): 'violette Tinte',\ + (COLOR_KEYS[2], True): 'Blaustift',\ + (COLOR_KEYS[3], False): '„Tinte der letzten Korrektur“'} + + def __init__(self, manuscript=None): + self.color = Color() + self.font_family = 'Weidemann-Book' + self.font = self.NIETSCHES_FONTS['german'] + self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, False)] + self.manuscript = manuscript + self.relevant_key_map = {'font-family': self.set_font, 'fill': self.set_color, 'stroke': self.set_color} + + @classmethod + def create_cls(cls, page, style_string, manuscript=None): + """Creates a Style from a style_string. 
+
+        :return: (datatypes.style) Style
+        """
+        style = cls(manuscript=manuscript)
+        style_dict = { key: key_dict for key, key_dict in page.style_dict.items()\
+                       if any(relevant_key in key_dict.keys() for relevant_key in style.relevant_key_map.keys()) }
+        for style_key in style_string.split(' '):
+            if style_key in style_dict.keys():
+                dictionary = style_dict[style_key]
+                for key, set_function in [ (key, func) for key, func in style.relevant_key_map.items() if key in dictionary.keys() ]:
+                    set_function(dictionary[key])
+        style.process_style_classes()
+        return style
+
+    @classmethod
+    def get_semantic_dictionary(cls):
+        """ Creates a semantic dictionary as specified by SemanticClass.
+        """
+        properties = {}
+        properties.update(cls.create_semantic_property_dictionary('font', str, cardinality=1,\
+            name='styleHasFont', label='style has font', comment='Connects a style with the kind of font Nietzsche used in writing.'))
+        properties.update(cls.create_semantic_property_dictionary('writing_instrument', str, cardinality=1,\
+            name='styleHasWritingInstrument', label='style has writing instrument', comment='Connects a style with the description of a writing instrument.'))
+        properties.update(cls.create_semantic_property_dictionary('color', Color, cardinality=1,\
+            name='styleHasColor', label='style has color', comment='Connects a style with a color.'))
+        dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
+        return cls.return_dictionary_after_updating_super_classes(dictionary)
+
+    def process_style_classes(self):
+        """Infer the writing instrument from font-family and color.
+        """
+        if self.font_family.startswith('NewsGothic'):
+            self.font = self.NIETSCHES_FONTS['latin']
+        if self.color.name in self.COLOR_KEYS:
+            self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, self.font_family.endswith('Bold'))]
+
+    def set_color(self, hex_color):
+        self.color = Color.create_cls(hex_color=hex_color, manuscript=self.manuscript)
+
+    def set_font(self, font_family):
+        self.font_family = font_family
+
Index: svgscripts/util.py
===================================================================
--- svgscripts/util.py (revision 75)
+++ svgscripts/util.py (revision 76)
@@ -1,420 +1,435 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to copy a faksimile svg file with the option of highlighting some word boxes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.
If not, see 1}}} from colorama import Fore, Style from datetime import datetime from functools import cmp_to_key import getopt import inspect import itertools import lxml.etree as ET import re import shutil import signal import string import subprocess from svgpathtools import svg_to_paths import sys import tempfile import os from os import listdir, sep, path, setpgrp, devnull, makedirs from os.path import basename, commonpath, dirname, exists, isfile, isdir, realpath, splitext import warnings import wget import xml.etree.ElementTree as XET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.faksimile import FaksimilePage, get_paths_inside_rect from datatypes.faksimile_image import FaksimileImage from datatypes.lineNumber import LineNumber from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.page import Page from datatypes.page_creator import PageCreator from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import update_transkription_position_ids from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT from process_files import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False HIGHLIGHT_COLOR = 'red' OPACITY = '0.5' class ExternalViewer: """This class can be used to show files with external viewers. """ file_format_viewer_dict = { '.pdf': PDF_READER, '.svg': SVG_EDITOR } @classmethod def show_files(cls, single_file=None, list_of_files=[]): """Opens file(s) with corresponding external viewer(s). """ DEVNULL = None if type(single_file) == list: list_of_files = single_file elif single_file is not None: list_of_files.append(single_file) if len(list_of_files) > 1: DEVNULL = open(devnull, 'wb') process_list = [] list_of_files.reverse() while len(list_of_files) > 0: file2open = list_of_files.pop() viewer = cls.file_format_viewer_dict.get(splitext(file2open)[1]) if viewer is not None: if len(list_of_files) > 0: process_list.append(\ subprocess.Popen([viewer, file2open], stdout=DEVNULL, stderr=DEVNULL, preexec_fn=os.setsid)) else: subprocess.run([viewer, file2open]) for process in process_list: os.killpg(os.getpgid(process.pid), signal.SIGTERM) if DEVNULL is not None: DEVNULL.close() def back_up(page: Page, reference_file, bak_dir='./bak') -> str: """Back up a xml_source_file. :return: target_file_name """ date_string = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') makedirs(bak_dir, exist_ok=True) target_file_name = bak_dir + sep + basename(page.page_tree.docinfo.URL) + '_' + date_string write_pretty(xml_element_tree=page.page_tree, file_name=target_file_name,\ script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, reference_file),\ file_type=FILE_TYPE_SVG_WORD_POSITION) return target_file_name def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, abs_image_path=None, local_image_path=None, namespaces=None): """Copy a faksimile_svg_file to target_file. 
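
    Illustrative sketch (hypothetical paths): either target_file or
    target_directory has to be given, e.g.

        copy_faksimile_svg_file(target_file='new.svg', faksimile_source_file='faksimile.svg',
                                local_image_path='image.jpg')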
""" if faksimile_source_file is None and faksimile_tree is not None: faksimile_source_file = faksimile_tree.docinfo.URL elif faksimile_source_file is None: raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file') if target_file is not None and target_directory is not None: target_file = target_directory + sep + target_file elif target_file is None and target_directory is not None: target_file = target_directory + sep + basename(faksimile_source_file) elif target_file is None: raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory') paths, attributes, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True) for key in [ key for key in svg_attributes.keys() if key.startswith('xmlns:') ]: try: XET.register_namespace(key.replace('xmlns:', ''), svg_attributes[key]) except ValueError: pass XET.register_namespace('', 'http://www.w3.org/2000/svg') if namespaces is None: namespaces = { 'ns': svg_attributes['xmlns'], 'xlink': svg_attributes['xmlns:xlink'],\ 'sodipodi': svg_attributes['xmlns:sodipodi'] } if faksimile_tree is not None: element = XET.fromstring(ET.tostring(faksimile_tree))\ if type(faksimile_tree) == ET._ElementTree\ else XET.fromstring(XET.tostring(faksimile_tree.getroot())) target_tree = XET.ElementTree(element) else: target_tree = XET.parse(faksimile_source_file) if (local_image_path is not None or abs_image_path is not None)\ and len(target_tree.findall('.//ns:image', namespaces=namespaces)) > 0: image_node = target_tree.findall('.//ns:image', namespaces=namespaces)[0] if local_image_path is not None: image_node.set('{%s}href' % namespaces['xlink'], local_image_path) if abs_image_path is not None: image_node.set('{%s}absref' % namespaces['sodipodi'], abs_image_path) target_tree.write(target_file) def copy_faksimile_update_image_location(faksimile_source_file=None, faksimile_tree=None, target_file=None, target_directory=None, overwrite=False): """Copy a faksimile_svg_file to target_file and update image location. 
""" if faksimile_source_file is None and faksimile_tree is not None: faksimile_source_file = faksimile_tree.docinfo.URL elif faksimile_source_file is None: raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file') if target_file is not None and target_directory is not None: target_file = target_directory + sep + target_file elif target_file is None and target_directory is not None: target_file = target_directory + sep + basename(faksimile_source_file) elif target_directory is None and target_file is not None: target_directory = dirname(target_file) elif target_file is None: raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory') source_tree = ET.parse(faksimile_source_file) if faksimile_tree is None else faksimile_tree namespaces = { k if k is not None else 'ns': v for k, v in source_tree.getroot().nsmap.items() } image_nodes = source_tree.xpath('//ns:image', namespaces=namespaces) local_image_path = None abs_image_path = None user_abs_image_path = None if len(image_nodes) > 0: image = FaksimileImage.CREATE_IMAGE(image_nodes[0], source_file=faksimile_source_file) abs_image_path = image.local_path for user_name in USER_ROOT_LOCATION_DICT.keys(): if user_name in target_directory: user_abs_image_path = abs_image_path.replace(FAKSIMILE_LOCATION, USER_ROOT_LOCATION_DICT[user_name]).replace('//','/') break # if target_directory is subdir of FAKSIMILE_LOCATION if realpath(target_directory).startswith(realpath(FAKSIMILE_LOCATION)): common_path = commonpath([ realpath(target_directory), realpath(dirname(image.local_path)) ]) relative_directory = '/'.join(\ [ '..' for d in realpath(target_directory).replace(common_path + '/', '').split('/') ]) local_image_path = relative_directory + realpath(image.local_path).replace(common_path, '') if not isfile(target_directory + sep + local_image_path): local_image_path = None elif abs_image_path is not None: local_image_path = abs_image_path if abs_image_path is not None and not isfile(abs_image_path): wget.download(image.URL, out=dirname(abs_image_path)) if not isfile(target_file) or overwrite: abs_image_path = user_abs_image_path if user_abs_image_path is not None else abs_image_path copy_faksimile_svg_file(target_file=target_file, faksimile_source_file=faksimile_source_file,\ faksimile_tree=faksimile_tree, abs_image_path=abs_image_path,\ local_image_path=local_image_path, namespaces=namespaces) else: msg = 'File {0} not copied to directory {1}, it already contains a file {2}.'.format(faksimile_source_file, target_directory, target_file) warnings.warn(msg) def copy_xml_file_word_pos_only(xml_source_file, target_directory): """Copy word positions of a xml file to target directory. 
:return: (str) xml_target_file """ xml_target_file = target_directory + sep + basename(xml_source_file) source_page = Page(xml_source_file) target_page = PageCreator(xml_target_file, title=source_page.title, page_number=source_page.number, orientation=source_page.orientation) target_page.words = source_page.words target_page.update_and_attach_words2tree() write_pretty(xml_element_tree=target_page.page_tree, file_name=xml_target_file,\ script_name=__file__ + '({})'.format(inspect.currentframe().f_code.co_name), file_type=FILE_TYPE_SVG_WORD_POSITION) return xml_target_file def create_highlighted_svg_file(faksimile_tree, node_ids, target_file=None, target_directory=None, local_image_path=None, namespaces={}, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY): """Highlights the nodes of a faksimile_tree that are specified by the list of node_ids and writes the tree to a file. """ if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } for node in itertools.chain(*[\ faksimile_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces)\ for node_id in node_ids\ ]): node.set('fill', highlight_color) node.set('opacity', opacity) node.set('style', '') copy_faksimile_update_image_location(target_file=target_file, faksimile_tree=faksimile_tree, target_directory=target_directory) def get_empty_node_ids(faksimile_tree, x_min=0.0, x_max=0.0, y_min=0.0, y_max=0.0, text_field_id=None, faksimile_page=None, namespaces={}): """Returns a list of ids of rect and path nodes that do not have a title element. """ THRESHOLD_X = 10 if faksimile_page is not None: x_min = faksimile_page.text_field.xmin + faksimile_page.faksimile_image.x x_max = faksimile_page.text_field.xmax + faksimile_page.faksimile_image.x - THRESHOLD_X y_min = faksimile_page.text_field.ymin + faksimile_page.faksimile_image.y y_max = faksimile_page.text_field.ymax + faksimile_page.faksimile_image.y text_field_id = faksimile_page.text_field.id if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } empyt_node_ids = [] nodes_without_title = faksimile_tree.xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\ x_min, x_max, y_min, y_max, text_field_id), namespaces=namespaces) nodes_without_title += get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]', x_min, x_max, y_min, y_max, text_field_id, namespaces=namespaces) for node_without_title in nodes_without_title: empyt_node_ids.append(node_without_title.get('id')) return empyt_node_ids def get_mismatching_ids(words, faksimile_positions): """ Return the list of mismatching words and the list of mismatching faksimile_positions as a 2-tuple. 
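
    Illustrative sketch (hypothetical data): with word texts [ 'der', 'Wille' ]
    and faksimile texts [ 'der', 'Wille', 'zur' ], the faksimile position of
    'zur' is reported as mismatching while the list of mismatching words stays empty:

        mismatching_words, mismatching_fps = get_mismatching_ids(words, faksimile_positions)
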
""" mismatching_words = [] mismatching_faksimile_positions = [] faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions) word_texts = [ word.text for word in words ] for word_text in set(word_texts): if word_text not in unique_faksimile_words: mismatching_words += [ word for word in words if word.text == word_text ] for faksimile_position_text in unique_faksimile_words: if faksimile_position_text not in set(word_texts): mismatching_faksimile_positions += [ faksimile_position for faksimile_position in faksimile_positions\ if faksimile_position.text == faksimile_position_text ] return mismatching_words, mismatching_faksimile_positions def record_changes(original_svg_file, changed_svg_file, node_ids, namespaces={}): """Copy changes made to changed_svg_file to original_svg_file. """ old_tree = ET.parse(original_svg_file) new_tree = ET.parse(changed_svg_file) if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() } for node_id in node_ids: new_titles = new_tree.xpath('//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=namespaces) old_nodes = old_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces) if len(new_titles) > 0 and len(old_nodes) > 0: if old_nodes[0].find('ns:title', namespaces=namespaces) is not None: old_nodes[0].find('ns:title', namespaces=namespaces).text = new_titles[0].text else: old_title_id_string = new_titles[0].get('id') old_title = ET.SubElement(old_nodes[0], 'title', attrib={ 'id': old_title_id_string }) old_title.text = new_titles[0].text elif len(old_nodes) > 0: for old_node in old_nodes: old_node.getparent().remove(old_node) copy_faksimile_svg_file(target_file=original_svg_file, faksimile_tree=old_tree) def record_changes_on_svg_file_to_page(xml_source_file, svg_file, word_ids=None): """Copy changes made to svg_file to xml_source_file. 
:return: datatypes.page.Page """ svg_tree = ET.parse(svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } transkription_field = TranskriptionField(svg_file) page = Page(xml_source_file) words = [ word for word in page.words if word.id in word_ids ]\ if word_ids is not None else page.words new_page_words = [] for word in words: word_id = 'word_' + str(word.id) + '_' recorded_ids = [] for transkription_position in word.transkription_positions: transkription_position_id = word_id + str(transkription_position.id) tp_nodes = svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]'.format(transkription_position_id), namespaces=namespaces) if len(tp_nodes) > 0: record_changes_to_transkription_position(tp_nodes[0], transkription_position,\ transkription_field.xmin, transkription_field.ymin, namespaces=namespaces) recorded_ids.append(transkription_position_id) extra_nodes = [ node for node in\ svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[contains(@id, "{0}")]'.format(word_id), namespaces=namespaces)\ if node.get('id') not in recorded_ids ] if len(extra_nodes) > 0: for extra_node in extra_nodes: old_ids = [ inkscape_id.replace('#','') for inkscape_id in\ svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]/@inkscape:label'.format(extra_node.get('id')),\ namespaces=namespaces) ] if len(old_ids) > 0 and re.match(r'word_[0-9]+_[0-9]+', old_ids[0]): old_id_list = old_ids[0].split('_') ref_word_id = int(old_id_list[1]) ref_tp_id = old_id_list[2] ref_words = [ word for word in page.words if word.id == ref_word_id ] if len(ref_words) > 0: ref_tps = [ tp for tp in ref_words[0].transkription_positions\ if tp.id == ref_tp_id ] if len(ref_tps) > 0: ref_words[0].transkription_positions.remove(ref_tps[0]) record_changes_to_transkription_position(extra_node,\ ref_tps[0], transkription_field.xmin, transkription_field.ymin, namespaces=namespaces) word.transkription_positions.append(ref_tps[0]) for word in page.words: if word.has_mixed_status('text'): new_page_words += [ word for word in word.split_according_to_status('text') if word.text is not None and word.text != '' ] elif len(word.transkription_positions) > 0: new_text = [ tp.text for tp in word.transkription_positions if tp.text is not None and tp.text != '' ] if len(new_text) > 0: word.text = new_text[0] new_page_words.append(word) page.words = new_page_words page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids) page.unlock() if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\ script_name=__file__ + ' -> ' + inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_SVG_WORD_POSITION) return page def record_changes_on_xml_file_to_page(xml_source_file, xml_file) -> Page: """Copy changes made to xml_file to xml_source_file. 
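
    Illustrative sketch (hypothetical attributes): words of xml_file that carry
    split_strings are split accordingly, and words whose join_string matches the
    concatenation with a neighbouring word are joined back:

        page = record_changes_on_xml_file_to_page('xml/page015.xml', 'xml/page015_changed.xml')
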
:return: datatypes.page.Page """ copy_page = Page(xml_file) page = Page(xml_source_file) page.unlock() back_up(page, xml_file) page.words = [] for word in copy_page.words: if word.split_strings is None\ or len(word.split_strings) == 0: page.words.append(word) else: next_word = word for split_string in word.split_strings: _, new_word, next_word = next_word.split(split_string) page.words.append(new_word) if next_word is not None: page.words.append(next_word) page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids) + remove_words_if_done = [] + for word in page.words: + if 'join_string' in word.__dict__.keys()\ + and word.join_string is not None: + if word.id > 0\ + and page.words[word.id-1].text + word.text == word.join_string: + page.words[word.id-1].join(word) + remove_words_if_done.append(word) + elif word.id < len(page.words)\ + and word.text + page.words[word.id+1].text == word.join_string: + word.join(page.words[word.id+1]) + remove_words_if_done.append(page.words[word.id+1]) + for word in remove_words_if_done: + page.words.remove(word) + page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids) if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\ script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, xml_file), file_type=FILE_TYPE_SVG_WORD_POSITION) return page def record_changes_to_transkription_position(node, transkription_position, xmin=0.0, ymin=0.0, namespaces=None): """Record changes made to node to transkription_position. """ if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in node.nsmap.items() } if bool(node.get('x')): transkription_position.left = float(node.get('x')) - xmin if bool(node.get('y')): transkription_position.top = float(node.get('y')) - ymin if bool(node.get('width')): transkription_position.width = float(node.get('width')) if bool(node.get('height')): transkription_position.height = float(node.get('height')) if len(node.xpath('./ns:title/text()', namespaces=namespaces)) > 0: transkription_position.text = node.xpath('./ns:title/text()', namespaces=namespaces)[0] def replace_chars(words, faksimile_positions, unique_faksimile_words=None): """Return unique_faksimile_words and faksimile_positions, with characters changed according to transcription words. 
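
    Illustrative sketch (hypothetical texts): a faksimile text 'dass' becomes
    'daß' if the transcription contains 'daß' but not 'dass'; '"' -> '“' and
    '-' -> '–' are handled the same way:

        faksimile_positions, unique_words = replace_chars(words, faksimile_positions)
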
""" if unique_faksimile_words is None: unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\ key=lambda text: len(text)) for index, word_text in enumerate(unique_faksimile_words): if len([ word for word in words if word.text == word_text ]) == 0: if re.match(r'.*".*', word_text)\ and len([ word for word in words if word.text == word_text.replace('"', '“') ]) > 0: unique_faksimile_words[index] = word_text.replace('"', '“') elif re.match(r'.*ss.*', word_text)\ and len([ word for word in words if word.text == word_text.replace('ss', 'ß') ]) > 0: unique_faksimile_words[index] = word_text.replace('ss', 'ß') elif re.match(r'.*-.*', word_text)\ and len([ word for word in words if word.text == word_text.replace('-', '–') ]) > 0: unique_faksimile_words[index] = word_text.replace('-', '–') for faksimile_position in [ faksimile_position for faksimile_position in faksimile_positions\ if faksimile_position.text == word_text ]: faksimile_position.text = unique_faksimile_words[index] elif word_text == '-'\ and len([ word for word in words if word.text == '–' ]) > 0: print([ word.text for word in words if word.text == word_text ]) print([ word.text for word in words if word.text == '–' ]) return faksimile_positions, unique_faksimile_words Index: svgscripts/process_files.py =================================================================== --- svgscripts/process_files.py (revision 75) +++ svgscripts/process_files.py (revision 76) @@ -1,358 +1,358 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract information from all text svg files in directory. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style import getopt import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import lxml.etree as ET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from convertPDF2SVG4Web import Converter from datatypes.page_creator import PageCreator from datatypes.transkriptionField import TranskriptionField from extractWordPosition import Extractor sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False class MyErrorHandler: """This class can be used to handle errors executing extractWordPosition.Extractor.extractAndWriteInformation. 
""" ERROR_LOG = 'error_log.xml' def __init__(self): self.tree = ET.ElementTree(ET.Element('error-log')) if isfile(MyErrorHandler.ERROR_LOG): parser = ET.XMLParser(remove_blank_text=True) self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser) def record_error(self, svgfile, pdffile, title, page_number, error=None): """Records an error. """ if len(self.tree.xpath('//error[@title="{0}" and @number="{1}"]'.format(title, page_number))) > 0: error_node = self.tree.xpath('//error[@title="{0}" and @number="{1}"]'.format(title, page_number))[0] else: error_node = ET.SubElement(self.tree.getroot(), 'error', attrib={'title': title, 'number': page_number}) ET.SubElement(error_node, 'svgfile').text = svgfile ET.SubElement(error_node, 'pdffile').text = pdffile if error is not None: error_node.set('type', str(type(error).__name__)) if str(error) != '': error_msg = ET.SubElement(error_node, 'error-msg') error_msg.text = str(error) if str(type(error).__name__) == 'ExpatError': error_msg.text += '->svgfile is empty!' def run(self, title=None, page_number=None, error_type=None): """Run all or some errors [:return:] exit status (int) """ xpath = '//error' if title is not None and page_number is not None: xpath = '//error[@title="{0}" and @number="{1}"]'.format(title, page_number) elif title is not None: xpath = '//error[@title="{0}"]'.format(title) elif page_number is not None: xpath = '//error[@number="{0}"]'.format(page_number) if error_type is not None: xpath = xpath + '[@type="{0}"]'.format(error_type)\ if title is None and page_number is None\ else xpath.replace(']', ' ') + 'and @type="{0}"]'.format(error_type) exit_status = 0 for error in self.tree.xpath(xpath): title = error.get('title') page_number = error.get('number') svgfile = error.xpath('./svgfile/text()')[0]\ if len(error.xpath('./svgfile/text()')) > 0 else None pdffile = error.xpath('./pdffile/text()')[0]\ if len(error.xpath('./pdffile/text()')) > 0 else None if svgfile is not None: converter = Converter(title=title) extractor = Extractor(title=title, compare2pdf=True) status = process_file(converter, extractor, svgfile, pdffile, page_number) if status > 0: exit_status = status if status < 2: error.getparent().remove(error) self.write() return exit_status def write(self): """Writes error log. """ write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG, script_name=__file__, file_type='xmlErrorLog') def is_page_ok(manuscript_file=None, page_number=None): """Returns true if page status is 'OK'. """ if manuscript_file is not None and isfile(manuscript_file): manuscript_tree = ET.parse(manuscript_file) if page_number is not None\ and len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0: return manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('status') == 'OK'\ and isfile(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output')) return False def is_svg_ok(manuscript_file=None, page_number=None): """Returns true if svgfile contains a valid svg graphic location. 
""" if manuscript_file is not None and isfile(manuscript_file): manuscript_tree = ET.parse(manuscript_file) if page_number is not None\ and len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0\ and isfile(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output')): xml_source_tree = ET.parse(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output')) return len(xml_source_tree.xpath('//svg/@file')) > 0 and isfile(xml_source_tree.xpath('//svg/@file')[0]) return False def process_file(converter, extractor, svgfile, pdffile, page_number): """Processes file. [:return:] exit status (int) """ exit_status = 0 path_svg_file = converter.get_file_name(pdffile, page_number=page_number) if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Processing file {} ...'.format(svgfile)) print(Style.RESET_ALL) if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) == 0: transkriptionField = TranskriptionField(path_svg_file) transkriptionField.shrink_svg_to_transkription_field() xml_target_file = extractor.get_file_name(svgfile, page_number) extraction_status = extractor.extractAndWriteInformation(svgfile, xml_target_file=xml_target_file,\ page_number=page_number, pdfFile=pdffile, svg_file=path_svg_file, record_warnings=True) if extraction_status < 2 and extractor.manuscript_file is not None: status = 'OK' if extraction_status == 1: status = extractor.latest_status exit_status = 1 #update_manuscript_file(extractor.manuscript_file, page_number, xml_target_file, status=status) update_svgposfile_status(xml_target_file, manuscript_file=extractor.manuscript_file, status=status) return exit_status def update_graphical_svg(converter, svgfile, pdffile, page_number, xml_target_file): """Create a new graphical svg file and update xml output file. [:return:] exit status (int) """ exit_status = 0 if isfile(xml_target_file): path_svg_file = converter.get_file_name(pdffile, page_number=page_number) if not UNITTESTING: - print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(svgfile)) + print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(path_svg_file)) print(Style.RESET_ALL) if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) == 0: transkriptionField = TranskriptionField(path_svg_file) transkriptionField.shrink_svg_to_transkription_field() page = PageCreator(xml_target_file, svg_file=path_svg_file) write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) else: exit_status = 2 return exit_status def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True): """Updates manuscript file: adds status information about page. 
""" if isfile(manuscript_file): parser = ET.XMLParser(remove_blank_text=True) manuscript_tree = ET.parse(manuscript_file, parser) if len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0: node = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0] old_status = node.get('status') if old_status is None or 'OK' not in old_status.split(':'): node.set('status', status) elif append: if status not in old_status.split(':'): new_status = old_status + ':' + status node.set('status', new_status) else: node.set('status', new_status) if not bool(node.get('output')): node.set('output', file_name) else: pages_node = manuscript_tree.getroot().find('pages')\ if manuscript_tree.getroot().find('pages') is not None\ else ET.SubElement(manuscript_tree.getroot(), 'pages') new_id = len(pages_node.findall('page')) + 1 ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name}) write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT) def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True): """Updates svg position file's status. """ if isfile(file_name): parser = ET.XMLParser(remove_blank_text=True) file_tree = ET.parse(file_name, parser) old_status = file_tree.getroot().get('status') if old_status is None or 'OK' not in old_status.split(':'): file_tree.getroot().set('status', status) elif append: if status not in old_status.split(':'): new_status = old_status + ':' + status file_tree.getroot().set('status', new_status) else: file_tree.getroot().set('status', new_status) write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) if manuscript_file is not None and isfile(manuscript_file): page_number = file_tree.getroot().get('number') update_manuscript_file(manuscript_file, page_number, file_name, status=status) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract information from all text svg files in a directory. svgscripts/process_files.py [OPTIONS] svgscripts/process_files.py [OPTIONS] Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg). Directory containing svg files corresponding to pdf files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg). OPTIONS: -h|--help: show help -e|--run-error Rerun error cases. -g|--check-graphic-svg Check that graphical svg file exists or generate a new svg file. -n|--number=pageNumber Use this with OPTION -e|--run-error in order to specify an error case. -t|--title=title: title of the manuscript to which all files belong. -T|--error-type: error type, use this with OPTION -e|--run-error in order to specify an error case. -s|--svg-target-dir=svg-target-dir target directory for path svg files, i.e. svg files that can be displayed on the web. -x|--xml-target-dir=xml-target-dir target directory for xml files. 
:return: exit code (int) """ title = None xml_target_dir = ".{}xml".format(sep) svg_target_dir = ".{}svg".format(sep) error_handler = MyErrorHandler() number = None rerun_errors = False error_type = None check_graphic_svg_exists = False try: opts, args = getopt.getopt(argv, "hegn:s:t:T:x:", ["help", "run-error", "check-graphic-svg", "number=", "svg-target-dir=", "title=", "error-type=", "xml-target-dir="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-e', '--run-error'): rerun_errors = True elif opt in ('-g', '--check-graphic-svg'): check_graphic_svg_exists = True elif opt in ('-t', '--title'): title = arg elif opt in ('-T', '--error-type'): error_type = arg elif opt in ('-n', '--number'): number = arg elif opt in ('-s', '--svg-target-dir'): svg_target_dir = arg elif opt in ('-x', '--xml-target-dir'): xml_target_dir = arg if rerun_errors: return error_handler.run(title=title, page_number=number, error_type=error_type) if len(args) == 1 and args[0].endswith('.xml'): source_tree = ET.parse(args[0]) if source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: svg_word_file_tree = ET.parse(source_tree.xpath('//page/@output')[0]) svg_dir = dirname(svg_word_file_tree.xpath('//page/@source')[0]) pdf_dir = dirname(svg_word_file_tree.xpath('//page/pdf/@file')[0]) else: print('File {} is not of type {}'.format(args[0], FILE_TYPE_XML_MANUSCRIPT)) usage() return 2 elif len(args) < 1 or\ (len(args) == 1\ and (True not in [ pdffile.endswith('pdf') for pdffile in listdir(args[0]) ]\ or True not in [ svgfile.endswith('svg') for svgfile in listdir(args[0]) ])\ ): print("Please specify both PDFDIR and TEXT_SVG_DIR!") usage() return 2 elif len(args) < 2: pdf_dir, svg_dir = args[0], args[0] elif isdir(args[0]) and isdir(args[1]): pdf_dir, svg_dir = args[0], args[1] if True in [ svgfile.endswith('pdf') for svgfile in listdir(args[1]) ]: pdf_dir, svg_dir = args[1], args[0] else: not_existing = args[0] if not isdir(args[0]) else args[1] print("ERROR directory {} does not exist!".format(not_existing)) return 2 list_of_svg = [ svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg') ] list_of_pdf = [ pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf') ] converter = Converter(target_dir=svg_target_dir, title=title) extractor = Extractor(xml_dir=xml_target_dir, title=title, compare2pdf=True) exit_status = 0 for svgfile in list_of_svg: if svgfile.replace('.svg', '.pdf') in list_of_pdf: title = re.split(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)[1].replace('_', ' ') if extractor.title is None or extractor.title != title: extractor.update_title_and_manuscript(title) if converter.title is None or converter.title != title: converter.title = title.replace(' ', '_') if 'page' in svgfile: page_number = svgfile.replace('.svg','').split('page')[1] else: page_number = svgfile.replace('.svg','').split('_')[len(svgfile.replace('.svg','').split('_'))-1] pdffile = '{}{}{}'.format(pdf_dir, sep, svgfile.replace('.svg', '.pdf')) if not check_graphic_svg_exists and not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number): try: svgfile = '{}{}{}'.format(svg_dir, sep, svgfile) exit_status = process_file(converter, extractor, svgfile, pdffile, page_number) except Exception as err: error_handler.record_error(svgfile, pdffile, title, page_number, error=err) if not UNITTESTING: print(Fore.RED) print('There was an error ->', err) print(Style.RESET_ALL) elif not 
is_svg_ok(manuscript_file=extractor.manuscript_file, page_number=page_number): update_graphical_svg(converter, svgfile, pdffile, page_number, extractor.get_file_name(svgfile, page_number)) error_handler.write() return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/fix_missing_glyphs.py =================================================================== --- svgscripts/fix_missing_glyphs.py (revision 75) +++ svgscripts/fix_missing_glyphs.py (revision 76) @@ -1,192 +1,200 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to fix missing glyphs. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style import getopt import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import lxml.etree as ET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.page import Page, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition +from datatypes.word import Word from process_files import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False -def find_missing_glyph_for_pwp(positional_word_part_node, svg_path_tree, namespaces, xmin=0.0, ymin=0.0): - """Finds missing glyph for node of a PositionalWordPart. +def find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=0.0, ymin=0.0): + """Finds missing glyph for a PositionalWordPart. :return: list of PositionalWordPart """ THRESHOLD = 15.5 - pwp = PositionalWordPart(node=positional_word_part_node) + #pwp = PositionalWordPart(node=positional_word_part_node) word_part_obj = { "x": pwp.left, "y": pwp.top, "text": pwp.text, "matrix": pwp.transform, "class": pwp.style_class } start_id = int(pwp.id) threshold = -0.5 positional_word_parts = [] while threshold < THRESHOLD and len(positional_word_parts) < 1: try: positional_word_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces,\ start_id=start_id, xmin=xmin, ymin=ymin, threshold=threshold, throw_error_if_not_found=True) except Exception: threshold += 0.1 return positional_word_parts -def update_word(page, positional_word_part_node, positional_word_parts): +def update_word(word, old_transkription_position, old_positional_word_part, positional_word_parts): """Updates word according to new positional_word_parts. 
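+
+    Illustrative sketch (mirroring the unit test): replace the first positional
+    word part of the first transkription position by newly found parts:
+
+        new_tp = update_word(word, word.transkription_positions[0],
+                             word.transkription_positions[0].positional_word_parts[0], pwps)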
+ + :return: new transkription_position """ if len(positional_word_parts) > 0: debug_msg_string = 'update word from ' + __file__ - positional_word_part_id = int(positional_word_part_node.get('id')) - transkription_position_id = int(positional_word_part_node.getparent().get('id')) - word_id = int(positional_word_part_node.getparent().getparent().get('id')) - word = page.words[word_id] - transkription_position = word.transkription_positions[transkription_position_id] - transkription_position.positional_word_parts.pop(positional_word_part_id) + old_transkription_position.positional_word_parts.remove(old_positional_word_part) positional_word_parts.reverse() for positional_word_part in positional_word_parts: - transkription_position.positional_word_parts.insert(positional_word_part_id, positional_word_part) - for index, positional_word_part in enumerate(transkription_position.positional_word_parts): + old_transkription_position.positional_word_parts.insert(int(old_positional_word_part.id), positional_word_part) + for index, positional_word_part in enumerate(old_transkription_position.positional_word_parts): positional_word_part.id = index transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ - transkription_position.positional_word_parts, debug_msg_string=debug_msg_string, transkription_position_id=transkription_position_id) - word.transkription_positions.pop(transkription_position_id) + old_transkription_position.positional_word_parts, debug_msg_string=debug_msg_string, transkription_position_id=old_transkription_position.id) + word.transkription_positions.remove(old_transkription_position) transkription_positions.reverse() for new_tp in transkription_positions: - word.transkription_positions.insert(transkription_position_id, new_tp) + word.transkription_positions.insert(int(old_transkription_position.id), new_tp) text = '' for index, tp in enumerate(word.transkription_positions): tp.id = index - tp.writing_process_id = transkription_position.writing_process_id + tp.writing_process_id = old_transkription_position.writing_process_id for pwp in tp.positional_word_parts: text += pwp.text if word.text != text: word.text = text - word.attach_word_to_tree(page.page_tree) + return transkription_positions[0] def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None): """Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION. """ if isfile(svg_word_pos_file): if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Fixing missing glyphs for file {} ... 
'.format(svg_word_pos_file), end='') - #print(Style.RESET_ALL) + print(Style.RESET_ALL) page = Page(svg_word_pos_file) transkription_field = TranskriptionField(page.svg_file) svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) - for positional_word_part_node in page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'): - pwps = find_missing_glyph_for_pwp(positional_word_part_node, svg_path_tree, namespaces, xmin=transkription_field.xmin, ymin=transkription_field.ymin) - update_word(page, positional_word_part_node, pwps) + words_without_glyphs = [ word for word in page.words\ + if len([ tp for tp in word.transkription_positions\ + if len([ pwp for pwp in tp.positional_word_parts if pwp.symbol_id is None]) > 0]) > 0 ] + for word in words_without_glyphs: + for transkription_position in word.transkription_positions: + positional_word_parts = transkription_position.positional_word_parts[:] + for positional_word_part in positional_word_parts: + if positional_word_part.symbol_id is None: + pwps = find_missing_glyph_for_pwp(positional_word_part, svg_path_tree, namespaces, xmin=transkription_field.xmin, ymin=transkription_field.ymin) + new_transkription_position = update_word(word, transkription_position, positional_word_part, pwps) + if new_transkription_position is not None: + transkription_position = new_transkription_position + page.update_and_attach_words2tree() write_pretty(xml_element_tree=page.page_tree, file_name=svg_word_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) page = Page(svg_word_pos_file) new_number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) if not UNITTESTING: result_color = Fore.LIGHTBLUE_EX if new_number_of_missing_glyphs == 0 else Fore.MAGENTA print(result_color + ' {0}/{1}'.format(number_of_missing_glyphs-new_number_of_missing_glyphs, number_of_missing_glyphs), end='') print(Fore.LIGHTBLUE_EX + ' fixed.', end='') print(Style.RESET_ALL) if len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) == 0: update_svgposfile_status(svg_word_pos_file, manuscript_file=manuscript_file, status='OK') def get_filelist_and_manuscript_file(file_a, file_b=None): """Returns a file list and a manuscript file (or None) """ file_list = [] manuscript_file = None source_tree = ET.parse(file_a) if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION\ and len([ word_part for word_part in source_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')]) > 0: # if symbol_ids are missing ... file_list.append(file_a) if file_b is not None: manuscript_file = file_b elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: manuscript_file = file_a if file_b is not None: file_list.append(file_b) else: file_list = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower())) return file_list, manuscript_file def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to fix missing glyphs. svgscripts/fix_missing_glyphs.py [OPTIONS] -File [-File] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. 
OPTIONS: -h|--help: show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): file_b = None if len(args) > 1 and isfile(args[1]): file_b = args[1] file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b) for svg_word_pos_file in file_list: fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file) else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: error_log.xml =================================================================== --- error_log.xml (revision 75) +++ error_log.xml (revision 76) @@ -1,12 +1,12 @@ xmlErrorLog 2019-06-18 18:31:49 2019-08-02 09:46:40 - 2019-10-18 15:05:05 + 2019-12-03 17:56:25 Index: tests_svgscripts/test_fix_missing_glyphs.py =================================================================== --- tests_svgscripts/test_fix_missing_glyphs.py (revision 75) +++ tests_svgscripts/test_fix_missing_glyphs.py (revision 76) @@ -1,79 +1,79 @@ import unittest from os import sep, path, remove from os.path import isdir, isfile, dirname import shutil import sys import lxml.etree as ET import warnings import sys sys.path.append('svgscripts') import fix_missing_glyphs from datatypes.page import Page from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField class TestMissingGlyphs(unittest.TestCase): def setUp(self): fix_missing_glyphs.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.manuscript = DATADIR + sep + 'pdfsvg' + sep + 'W_II_1.xml' self.manuscript_copy = self.manuscript.replace('.', '_copy.') self.svgposfile = DATADIR + sep + 'pdfsvg' + sep + 'W_II_1_page015.xml' self.svgposfile_copy = DATADIR + sep + 'pdfsvg' + sep + 'W_II_1_page015_copy.xml' def test_main(self): argv_fileNotFound = [ 'asdf' ] with self.assertRaises(FileNotFoundError): fix_missing_glyphs.main(argv_fileNotFound) #shutil.copy(self.manuscript, self.manuscript_copy) #shutil.copy(self.svgposfile, self.svgposfile_copy) #self.assertEqual(fix_missing_glyphs.main([self.manuscript_copy]), 0) #shutil.copy(self.svgposfile_copy, self.svgposfile) #remove(self.manuscript_copy) shutil.copy(self.svgposfile, self.svgposfile_copy) self.assertEqual(fix_missing_glyphs.main([self.svgposfile_copy]), 0) remove(self.svgposfile_copy) def test_update_word(self): page = Page(self.svgposfile) pwps = page.words[5].transkription_positions[0].positional_word_parts new_left = 10 old_left = pwps[0].left new_width = pwps[0].width + old_left - new_left pwps[0].left = new_left pwps[0].width = new_width pwps[0].text = 'X' + pwps[0].text original_text = page.words[5].text - pwp_node = page.page_tree.xpath('//word[@id="5"]/transkription-position[@id="0"]/' + PositionalWordPart.XML_TAG + '[@id="0"]')[0] + word = page.words[5] #print(ET.dump(pwp_node)) - fix_missing_glyphs.update_word(page, pwp_node, [ pwps[0] ]) - pwp_node = page.page_tree.xpath('//word[@id="5"]/transkription-position[@id="0"]/' + PositionalWordPart.XML_TAG + '[@id="0"]')[0] + fix_missing_glyphs.update_word(word, word.transkription_positions[0], word.transkription_positions[0].positional_word_parts[0], [ pwps[0] ]) #print(ET.dump(pwp_node.getparent().getparent())) - self.assertEqual(float(pwp_node.get('width')), 
new_width) - self.assertEqual(pwp_node.getparent().getparent().get('text'), 'X' + original_text) + self.assertEqual(word.transkription_positions[0].positional_word_parts[0].width, new_width) + self.assertEqual(word.text, 'X' + original_text) def test_find_missing_glyph_for_pwp(self): page = Page(self.svgposfile) transkription_field = TranskriptionField(page.svg_file) svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } positional_word_part_node = page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')[0]\ if len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) > 0 else None - pwps = fix_missing_glyphs.find_missing_glyph_for_pwp(positional_word_part_node, svg_path_tree, namespaces, xmin=transkription_field.xmin, ymin=transkription_field.ymin) + pwp = PositionalWordPart(node=positional_word_part_node) + pwps = fix_missing_glyphs.find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=transkription_field.xmin, ymin=transkription_field.ymin) self.assertEqual(len(pwps), 2) def test_get_filelist_and_manuscript_file(self): file_list, manuscript_file = fix_missing_glyphs.get_filelist_and_manuscript_file(self.manuscript, self.svgposfile) self.assertEqual(len(file_list), 1) self.assertEqual(file_list[0], self.svgposfile) self.assertEqual(manuscript_file, self.manuscript) file_list, manuscript_file = fix_missing_glyphs.get_filelist_and_manuscript_file(self.svgposfile, self.manuscript) self.assertEqual(len(file_list), 1) self.assertEqual(file_list[0], self.svgposfile) self.assertEqual(manuscript_file, self.manuscript) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_word.py =================================================================== --- tests_svgscripts/test_word.py (revision 75) +++ tests_svgscripts/test_word.py (revision 76) @@ -1,358 +1,364 @@ import unittest from os import sep, path import lxml.etree as ET import sys sys.path.append('svgscripts') from process_words_post_merging import reset_page, update_writing_process_ids from datatypes.box import Box from datatypes.matrix import Matrix import datatypes.page from datatypes.path import Path from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word, execute_function_on_parts, update_transkription_position_ids from datatypes.word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass class Page: def __init__(self): self.svg_file = None def get_line_number(self, input=0): return -1 def get_biggest_fontSize4styles(self, style_set={}): return 7 class TestWord(unittest.TestCase): def setUp(self): DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_file = DATADIR + sep + 'N_VII_1_page009.xml' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }] x = 0 for dict in self.word_part_objs: dict['class'] = 'st22' dict['x'] = x dict['y'] = 11 x += 1 mylist = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true' } word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)')) self.transkription_positions = [ word_position ] self.word_node = ET.Element('word', attrib=mylist) 
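# attach the fixture's transkription position to the word node, then add one 'part' child per character of 'abc'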
word_position.attach_object_to_tree(self.word_node) x = 0 for char in mylist['text']: ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' }) x += 1 def test_Word_with_word_part_objs(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) self.assertEqual(word.id, 0) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') def test_Word_with_word_node(self): word = Word.create_cls(self.word_node) self.assertEqual(word.id, 0) self.assertEqual(word.deleted, True) self.assertEqual(word.transkription_positions[0].bottom, 11) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 1) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') self.assertEqual(word.line_number, 2) self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True) def test_attach_word_to_tree(self): newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) empty_tree = ET.ElementTree(ET.Element('page')) newWord.attach_word_to_tree(empty_tree) for word_node in empty_tree.getroot().xpath('//word'): word = Word.CREATE_WORD(word_node=word_node) self.assertEqual(word.id, 0) self.assertEqual(word.deleted, False) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') def test_create_correction_history(self): box = Box(earlier_text='XYX') word = Word(text='ASDF', transkription_positions=[TranskriptionPosition()]) word.word_box = box word.create_correction_history() self.assertEqual(word.earlier_version is None, True) self.assertEqual(word.overwrites_word is not None, True) partA = Word(text='A', transkription_positions=[TranskriptionPosition()]) partA.word_box = box partB = Word(text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) word.create_correction_history() self.assertEqual(word.earlier_version is None, True) self.assertEqual(word.word_parts[0].overwrites_word is not None, True) partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.edited_text, 'SDF') partA = Word(id=0, text='Test', writing_process_id=0) partB = Word(id=1, text='er', writing_process_id=1) word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] ) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.word_parts[1].extendsEarlierVersion, True) 
self.assertEqual(word.word_parts[1].isExtensionOfWord, word.earlier_version) box = Box(earlier_text='XYX') partA = Word(id=0, text='Test', writing_process_id=0) partB = Word(id=1, text='er', writing_process_id=1) partB.word_box = box word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] ) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.earlier_version.text, 'TestXYX') self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) page = datatypes.page.Page('xml/N_VII_1_page138.xml') reset_page(page) update_writing_process_ids(page) word = page.words[77] word.transkription_positions[0].deleted = True word.partition_according_to_deletion() word.word_parts[1].transkription_positions[0].has_box = Box(earlier_text='b') word_over_box = word._get_partial_word_over_box() update_transkription_position_ids(word) + # TODO: make this work without the next two lines ... word.word_parts[1].writing_process_id = 2 word.writing_process_id = 2 word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.text, 'AufBau') self.assertEqual(word.edited_text, 'Bau') self.assertEqual(word.earlier_version.text, 'Aufbau') self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) self.assertEqual(word.word_parts[1].overwrites_word is not None, True) """ empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) newWord = Word.create_cls(word_node) print(ET.dump(word_node)) """ def test_earlier_version(self): partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) earlier_version = word.create_earlier_version() self.assertEqual(earlier_version is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord, earlier_version.word_parts[0]) partA = Word(id=0, text='Test', writing_process_id=0) partB = Word(id=1, text='er', writing_process_id=1) word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] ) self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True) def test_undo_partitioning(self): tps = [] for i, xy in enumerate([ 3, 4, 5 ]): tps.append(TranskriptionPosition(id=i, x=xy, y=xy, height=10, width=10)) partA = Word(id=0, text='Auf', writing_process_id=1, deleted=True, transkription_positions=[ tps[0]]) partB = Word(id=1, text='B', writing_process_id=2, transkription_positions=[tps[1]]) partC = Word(id=2, text='au', writing_process_id=1,transkription_positions=[tps[2]]) word = Word(text='Aufbau', writing_process_id=2, word_parts=[ partA, partB, partC ] ) word.undo_partitioning() self.assertEqual(len(word.transkription_positions), len(tps)) self.assertEqual(len(word.word_parts), 0) """ page = datatypes.page.Page('xml/N_VII_1_page138.xml') word = page.words[77] word.undo_partitioning() self.assertEqual(len(word.word_parts), 0) self.assertEqual(len(word.transkription_positions), 3) update_transkription_position_ids(word) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) print(ET.dump(word_node)) """ def test_split(self): page = Page() pwps = 
PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('b') self.assertEqual(previousWord.id, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(currentWord.id, 1) self.assertEqual(nextWord.id, 2) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('bc') self.assertEqual(previousWord.id, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(currentWord.id, 1) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('ab', start_id=10) self.assertEqual(currentWord.id, 10) self.assertEqual(currentWord.text, 'ab') self.assertEqual(currentWord.transkription_positions[0].width, 2.1) self.assertEqual(nextWord.id, 11) self.assertEqual(nextWord.transkription_positions[0].width, 5.2) word_part_objs=[{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\ {'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\ {'text': 'x', 'class':'st22', 'x': 10, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split('Insofer') word_part_objs=[{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split('Insofern') def test_join(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word) self.assertEqual(word.text, 'abc.') other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word, append_at_end_of_new_word=False) self.assertEqual(word.text, '.abc.') """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_get_semanticAndDataDict(self): dictionary = Word.get_semantic_dictionary() info_dict = dictionary['properties'].get('isDeletionOfWord') self.assertEqual(SemanticClass.SUPER_PROPERTY in info_dict.keys(), True) super_info_dict = info_dict[SemanticClass.SUPER_PROPERTY] #print(info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME)) def test_simplify_transkription_positions(self): node_string = """ """ nodeA = ET.fromstring(node_string) node_string = """ """ nodeB = ET.fromstring(node_string) word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ]) self.assertEqual(len(word.transkription_positions), 2) word.simplify_transkription_positions() self.assertEqual(len(word.transkription_positions), 1) + word = Word(text="Si", 
transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ]) + word.transkription_positions[1].writing_process_id = -1 + word.simplify_transkription_positions() + self.assertEqual(len(word.transkription_positions), 1) + self.assertEqual(word.transkription_positions[0].writing_process_id, 0) """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_partition(self): page = datatypes.page.Page(self.test_file) word = page.words[67] self.assertEqual(word.belongs_to_multiple_writing_processes(), True) word.partition_according_to_writing_process_id() self.assertEqual(len(word.word_parts), 3) self.assertEqual(word.belongs_to_multiple_writing_processes(), False) self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) newWord = Word.create_cls(word_node) self.assertEqual(len(newWord.word_parts), 3) #print(ET.dump(empty_tree.getroot())) def test_partition_deletion(self): page = datatypes.page.Page(self.test_file) word = page.words[67] for transkription_position in word.transkription_positions: transkription_position.deleted = transkription_position.writing_process_id == 1 self.assertEqual(word.has_mixed_status('deleted'), True) word.partition_according_to_deletion() self.assertEqual(len(word.word_parts), 3) self.assertEqual(word.has_mixed_status('deleted'), False) self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True) page = datatypes.page.Page(self.test_file) word = page.words[67] word.partition_according_to_writing_process_id() #print([(word.text, word.deleted) for word in word.word_parts]) word.word_parts[1].transkription_positions[1].deleted = True word.partition_according_to_deletion() self.assertEqual(len(word.word_parts), 4) #print([(word.text, word.deleted) for word in word.word_parts]) partA = Word(text='A', deleted=True) partB = Word(text='SDF', deleted=False) word = Word(text='ASDF', word_parts=[ partA, partB]) self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True) def test_execute_function_on_parts(self): page = datatypes.page.Page(self.test_file) word_parts = [ page.words[67], page.words[68] ] word_parts, none = execute_function_on_parts(word_parts, 'partition_according_to_writing_process_id') self.assertEqual(len(word_parts) == 4, True) def test_process_word_boxes(self): page = datatypes.page.Page(self.pdf_xml) page.source = self.pdf_xml_source for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() tr = TranskriptionField(page.source) box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\ 'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\ 'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\ 'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\ 'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315'] box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ] indices = [30, 277, 288, 297, 321] empty_tree = ET.ElementTree(ET.Element('page')) for word_id, index in enumerate(indices): word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin) 
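# each of the five box paths should yield a word over a box for the word at the corresponding index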
self.assertEqual(word_over_box is not None, True) def test_split_according_to_status(self): page = datatypes.page.Page(self.test_file) word = page.words[67] for transkription_position in word.transkription_positions: transkription_position.text = 'asdf'\ if transkription_position.writing_process_id == 1\ else word.text self.assertEqual(word.has_mixed_status('text'), True) new_words = word.split_according_to_status('text') self.assertEqual(len(new_words) > 1, True) self.assertEqual(new_words[0].id, word.id) self.assertEqual(new_words[0].deleted, word.deleted) self.assertEqual(new_words[1].id, word.id+1) #print([ word.text for word in new_words ]) def test__get_partial_word_over_box(self): word = Word(text='test', transkription_positions=[ TranskriptionPosition(id=0), TranskriptionPosition(id=1) ]) word.transkription_positions[0].has_box = Box(earlier_text='asdf') word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 2) partA = Word(id=0, text='A', transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition(), TranskriptionPosition(id=1)]) partB.transkription_positions[0].has_box = Box(earlier_text='asdf') word = Word(text='ASDF', word_parts=[ partA, partB]) word._get_partial_word_over_box() if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_color.py =================================================================== --- tests_svgscripts/test_color.py (revision 0) +++ tests_svgscripts/test_color.py (revision 76) @@ -0,0 +1,66 @@ +import unittest +from os import sep, path +from os.path import dirname, basename, isfile, isdir +import lxml.etree as ET +import sys + +sys.path.append('svgscripts') +from datatypes.color import Color +from datatypes.manuscript import ArchivalManuscriptUnity + +class GColor: # Gnome supported + END = "\x1b[0m" + # If Foreground is False that means color effect on Background + def RGB(R, G, B): # R: 0-255 , G: 0-255 , B: 0-255 + FB_G = 38 # Effect on foreground + return "\x1b[" + str(FB_G) + ";2;" + str(R) + ";" + str(G) + ";" + str(B) + "m" + +class TestColor(unittest.TestCase): + def setUp(self): + DATADIR = dirname(__file__) + sep + 'test_data' + if not isdir(DATADIR): + DATADIR = dirname(dirname(__file__)) + sep + 'test_data' + self.test_file = DATADIR + sep + 'test.xml' + self.test_svg_file = DATADIR + sep + 'test421.svg' + self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' + self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' + self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml' + self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' + self.test_page = DATADIR + sep + 'N_VII_1_page001.xml' + self.test_manuscript = DATADIR + sep + 'N_VII_1.xml' + + def test_attach_object_to_tree(self): + color = Color(color_name='blue', hex_color='#009CDE') + empty_tree = ET.ElementTree(ET.Element('page')) + color.attach_object_to_tree(empty_tree) + color_nodes = empty_tree.xpath('.//' + Color.XML_TAG) + self.assertEqual(len(color_nodes), 1) + color = Color.create_cls(node=color_nodes[0]) + self.assertEqual(color.hex_color, '#009CDE') + + def test_create_cls(self): + manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript) + manuscript.UNITTESTING = True + color = Color.create_cls() + self.assertEqual(color.name, 'black') + color = Color.create_cls(hex_color='#009CDE', manuscript=manuscript) + self.assertEqual(color.name, 'blue') + self.assertEqual(color in manuscript.colors, True) + self.assertEqual(manuscript.get_color(color.hex_color), color) + 
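+ # create the same color again; it should still resolve to the registered name 'blue'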
color = Color.create_cls(hex_color='#009CDE', manuscript=manuscript) + self.assertEqual(color.name, 'blue') + """ + color_string = "#000000, #009CDE, #1D1D1B, #4CA32F, #93CDF1, #9D9D9C, #ADD8F5, #B2B2B2, #C6C6C6, #CD1719, #CED5CE, #DADADA, #DC0714, #DC0814, #F0977A, #F0F0F0, #F8F9F8, #FF6600, #FFFFFF".replace(',','') + color_list = color_string.split(' ') + for hex_color in color_list: + color = Color.create_cls(hex_color=hex_color) + print(GColor.RGB(*color.rgb_color), color.name, GColor.END) + """ + + def test_get_semantic_dictionary(self): + dictionary = Color.get_semantic_dictionary() + #print(dictionary) + + +if __name__ == "__main__": + unittest.main() Index: tests_svgscripts/test_data/N_VII_1_page006.xml =================================================================== --- tests_svgscripts/test_data/N_VII_1_page006.xml (revision 75) +++ tests_svgscripts/test_data/N_VII_1_page006.xml (revision 76) @@ -1,1276 +1,1276 @@ svgWordPosition 2019-08-02 15:17:37 2019-08-02 15:17:37 2019-08-02 15:30:59 2019-08-02 15:30:59 - 2019-11-28 16:42:09 + 2019-12-04 09:58:57 Index: tests_svgscripts/test_data/N_VII_1.xml =================================================================== --- tests_svgscripts/test_data/N_VII_1.xml (revision 75) +++ tests_svgscripts/test_data/N_VII_1.xml (revision 76) @@ -1,171 +1,176 @@ xmlManuscriptFile 2019-08-02 15:28:57 2019-08-02 15:31:25 + 2019-12-04 08:23:33 + 2019-12-04 08:23:33 H. J. Mette In schwarzen Lederdeckel gebundenes Oktavheft (10,5x17), 194 durchweg beschriebene Seiten; Studien aus der Umwertungszeit, die zum Teil für das <i>Jenseits </i>verwandt worden sind. BAW 1, XCVI M. Montinari Oktavheft, 10,5x17. 194 Seiten. Schwarze und violette Tinte, sowie Bleistift. Überwiegend deutsche Schrift. Von hinten nach vorn beschrieben. Alte Signatur: N XLIII. KGW VII 4/2, 632 Oktavheft. Schwarzer Ledereinband mit Goldprägung (vorn und hinten senkrechte Linie, parallel zum Rücken; vorn rechts unten Initialen „F. N.“, Einzelstempel) und umlaufender Blindlinie. Am hinteren Deckel lederne Stifthülse. Buchblock stellenweise gelockert. Vorsätze aus Moiré-Papier. 194 Seiten, 10,8x17,3, unliniiert. April bis Juni 1885 von KGW VII Fragmentgruppe 34 16. April 1885 bis Anfang Juni 1885 der identifizierten Briefentwürfe - + KGW VII 34[1-256] M. Montinari (zu 34[257]): „dieses und die beiden folgenden Fragmente 34[258.259] wurden von N in einen Brief von Paul Lanzky von Anfang Juni 1885 (KGB III 4, S. 28, Nr. 281) eingetragen.“ KGW VII 4/2, 374. Vorderer Deckel Vorsatz Rekto Vorsatz Verso (kaschiert) 1 1 194 Lage, 6 Blatt Vorsatz - 11 1 11 Einzelblatt 12-13 12 13 Lage, 4 Blatt 14-21 14 21 Lage, 8 Blatt 22-37 22 37 Lage, 8 Blatt 38-53 38 53 Lage, 8 Blatt 54-69 54 69 Lage, 8 Blatt 70-85 70 85 Lage, 8 Blatt 86-101 86 101 Lage, 8 Blatt 102-117 102 117 Lage, 8 Blatt 118-133 118 133 Lage, 8 Blatt 134-149 134 149 Lage, 8 Blatt 150-165 150 165 Lage, 8 Blatt 166-181 166 181 Lage, 8 Blatt 182 - Vorsatz 182 194 Vorsatz Rekto (kaschiert) 194 1 194 Vorsatz Verso Hinterer Deckel 1885-4-1 1885-6-28 KGW IX 1 2001 Bearbeitet von Marie-Luise Haase, Michael Kohlenbach, Johannes Neininger, Wolfert von Rahden, Thomas Riebe und René Stockmar unter Mitarbeit von Dirk Setton. 
Marie-Luise Haase und Michael Kohlenbach 71/209 N XLIII + + + Index: tests_svgscripts/test_style.py =================================================================== --- tests_svgscripts/test_style.py (revision 0) +++ tests_svgscripts/test_style.py (revision 76) @@ -0,0 +1,55 @@ +import unittest +from os import sep, path +from os.path import dirname, basename, isfile, isdir +import lxml.etree as ET +import sys + +sys.path.append('svgscripts') +from datatypes.color import Color +from datatypes.page import Page +from datatypes.style import Style + +class TestStyle(unittest.TestCase): + def setUp(self): + DATADIR = dirname(__file__) + sep + 'test_data' + if not isdir(DATADIR): + DATADIR = dirname(dirname(__file__)) + sep + 'test_data' + self.test_file = DATADIR + sep + 'test.xml' + self.test_svg_file = DATADIR + sep + 'test421.svg' + self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' + self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' + self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml' + self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' + self.test_page = DATADIR + sep + 'N_VII_1_page001.xml' + self.test_manuscript = DATADIR + sep + 'N_VII_1.xml' + + def test_create_cls(self): + page = Page(self.test_page) + style_string = "st11 st10 st5" + style = Style.create_cls(page, style_string) + self.assertEqual(style.font_family, 'Weidemann-Book') + self.assertEqual(style.color.hex_color, "#DADADA") + self.assertEqual(style.writing_instrument, 'schwarze Tinte') + style_string = "st11 st10" + style = Style.create_cls(page, style_string) + self.assertEqual(style.font_family, 'Weidemann-Book') + self.assertEqual(style.color.name, "black") + self.assertEqual(style.writing_instrument, 'schwarze Tinte') + + def test_process_style_classes(self): + style = Style() + style.color = Color.create_cls(hex_color='#009CDE') + style.process_style_classes() + self.assertEqual(style.writing_instrument, 'violette Tinte') + self.assertEqual(style.font, 'deutsche Schreibschrift') + style.font_family = "NewsGothicBT-Bold" + style.process_style_classes() + self.assertEqual(style.writing_instrument, 'Blaustift') + self.assertEqual(style.font, 'lateinische Schreibschrift') + + def test_get_semantic_dictionary(self): + dictionary = Style.get_semantic_dictionary() + #print(dictionary) + +if __name__ == "__main__": + unittest.main() Index: tests_svgscripts/test_util.py =================================================================== --- tests_svgscripts/test_util.py (revision 75) +++ tests_svgscripts/test_util.py (revision 76) @@ -1,217 +1,229 @@ import unittest from os import sep, path, remove, listdir from os.path import isdir, isfile, dirname, basename import shutil import sys import lxml.etree as ET import sys import tempfile import warnings sys.path.append('svgscripts') import util from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT from datatypes.faksimile import FaksimilePage from datatypes.page import Page from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.word_position import WordPosition +from datatypes.word import Word sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT class TestCopy(unittest.TestCase): def setUp(self): util.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_dir = DATADIR self.faksimile_dir = DATADIR + sep + 'faksimile_svg' self.faksimile_file = 
self.faksimile_dir + sep + 'N-VII-1,5et6.svg' self.image = DATADIR + sep + 'image.jpg' self.svg_testrecord = DATADIR + sep + 'TESTRECORD.svg' self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' self.tmp_dir = tempfile.mkdtemp() def test_copy(self): tmp_image = self.tmp_dir + sep + basename(self.image) target_file = 'asdf.svg' shutil.copy(self.image, self.tmp_dir) util.copy_faksimile_svg_file(target_file, faksimile_source_file=self.faksimile_file,\ target_directory=self.tmp_dir, local_image_path=tmp_image) self.assertEqual(isfile(self.tmp_dir + sep + target_file), True) util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_file,\ target_directory=self.tmp_dir, local_image_path=tmp_image) self.assertEqual(isfile(self.tmp_dir + sep + basename(self.faksimile_file)), True) with self.assertRaises(Exception): util.copy_faksimile_svg_file() with self.assertRaises(Exception): util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_source_file) def test_copy_xml(self): old_page = Page(self.xml_file) xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) self.assertEqual(isfile(xml_file), True) page = Page(xml_file) self.assertEqual(len(page.words), len(old_page.words)) self.assertEqual(len(page.line_numbers), 0) def test_create_highlighted_svg_file(self): target_file = self.tmp_dir + sep + basename(self.faksimile_file) tmp_image = self.tmp_dir + sep + basename(self.image) faksimile_tree = ET.parse(self.faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } node_ids = ['rect947', 'rect951', 'rect953', 'rect955', 'rect959', 'rect961', 'rect963'] highlight_color = 'blue' util.create_highlighted_svg_file(faksimile_tree, node_ids, target_directory=self.tmp_dir, highlight_color=highlight_color, namespaces=namespaces) self.assertEqual(isfile(target_file), True) new_tree = ET.parse(target_file) for node in new_tree.xpath('//ns:rect[@fill="{0}"]|//ns:path[@fill="{0}"]'.format(highlight_color), namespaces=namespaces): node_ids.remove(node.get('id')) self.assertEqual(len(node_ids), 0) def test_get_empty_node_ids(self): faksimile_tree = ET.parse(self.faksimile_file) faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] empty_node_ids = util.get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page) self.assertEqual('rect1085' in empty_node_ids, True) def test_record_changes(self): new_tree = ET.parse(self.faksimile_file) old_tree = ET.parse(self.faksimile_file) empty_node_id = 'rect1085' title_node_id = 'test001' namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() } node = new_tree.xpath('//ns:rect[@id="{0}"]'.format(empty_node_id), namespaces=namespaces)[0] title = ET.SubElement(node, 'title', attrib={ 'id': title_node_id }) title.text = 'test' new_file = self.tmp_dir + sep + 'new.svg' old_file = self.tmp_dir + sep + 'old.svg' util.copy_faksimile_svg_file(target_file=new_file, faksimile_tree=new_tree) util.copy_faksimile_svg_file(target_file=old_file, faksimile_tree=old_tree) util.record_changes(old_file, new_file, [ empty_node_id ], namespaces=namespaces) test_tree = ET.parse(old_file) self.assertEqual(len(test_tree.xpath('//ns:rect[@id="{0}"]/ns:title[@id="{1}"]'.format(empty_node_id, title_node_id), namespaces=namespaces)), 1) def test_replace_chars(self): page = Page(self.xml_file) faksimile_tree = ET.parse(self.faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } word_position = 
WordPosition(id='rect1159', text='„Gedächtniß"') wps, texts = util.replace_chars(page.words, [ word_position ]) self.assertEqual(texts[0].endswith('“'), True) self.assertEqual(wps[0].text.endswith('“'), True) word_position = WordPosition(id='rect1173', text='-') wps, texts = util.replace_chars(page.words, [ word_position ]) self.assertEqual(wps[0].text.endswith('–'), True) def test_mismatch_words(self): page = Page(self.xml_file) faksimile_tree = ET.parse(self.faksimile_file) faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] page = Page('xml/N_VII_1_page174.xml') faksimile_tree = ET.parse('faksimile_svg/N-VII-1,173et174.svg') faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] self.assertEqual('-' in [ tp.text for tp in faksimile_page.word_positions], True) wps, texts = util.replace_chars(page.words,faksimile_page.word_positions) self.assertEqual('–' in texts, True) self.assertEqual(len([ faksimile_position for faksimile_position in wps\ if faksimile_position.text == '–' ]), 4) mismatching_words, mismatching_faksimile_positions = util.get_mismatching_ids(page.words, faksimile_page.word_positions) self.assertEqual(len([word for word in mismatching_words if word.text.endswith('“') ]), 0) self.assertEqual(len([word for word in mismatching_words if word.text.endswith('–') ]), 0) @unittest.skip('test uses external program, has been tested') def test_show_files(self): list_of_files = [ self.test_dir + sep + file for file in listdir(self.test_dir) if file.endswith('pdf') ][0:2] util.ExternalViewer.show_files(single_file=self.faksimile_file, list_of_files=list_of_files) def test_record_changes_to_page(self): page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 1 ]) old_length = len(page.words) self.assertEqual(page.words[1].text, 'asdf') self.assertEqual(page.words[1].transkription_positions[0].width, 353) page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 13 ]) self.assertEqual(page.words[13].text, 'er') self.assertEqual(page.words[14].text, '=') self.assertEqual(len(page.words), old_length+1) page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 64 ]) self.assertEqual(page.words[64].text, 'Simplifications-apparat') self.assertEqual(len(page.words[64].transkription_positions), 3) self.assertEqual(len(page.words), old_length-1) @unittest.skipUnless(__name__ == "__main__", 'tests all words') def test_extended__record_changes_to_page(self): page = Page(self.xml_file) old_length = len(page.words) page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord) self.assertEqual(page.words[1].text, 'asdf') self.assertEqual(page.words[13].text, 'er') self.assertEqual(page.words[14].text, '=') self.assertEqual(page.words[65].text, 'Simplifications-apparat') self.assertEqual(len(page.words), old_length) def test_copy_faksimile_update_image_location(self): test_dir = self.tmp_dir #FAKSIMILE_LOCATION + '/Myriam/Fertig/' util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir) with self.assertWarns(UserWarning): util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir) def test_record_changes_on_xml(self): old_page = Page(self.xml_file) xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) tree = ET.parse(xml_file) node = tree.xpath('//word[@id="135"]')[0] counter =0 while node.get('text') != 'gar' or counter > 5: counter += 1 nextnode = node.getnext() node.set('text', 
node.get('text') + nextnode.get('text')) for element in nextnode.getchildren(): node.append(element) nextnode.getparent().remove(nextnode) write_pretty(xml_element_tree=tree, file_name=xml_file,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file) self.assertEqual(len(new_page.words), len(old_page.words)-2) self.assertEqual(len([ word for word in new_page.words if word.text == 'gar']), 1) old_page = Page(self.xml_file) xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) tree = ET.parse(xml_file) node = tree.xpath('//word[@id="138"]')[0] counter =0 while node.get('text') != 'nichtvorkommt.' or counter > 5: counter += 1 nextnode = node.getnext() node.set('text', node.get('text') + nextnode.get('text')) for element in nextnode.getchildren(): node.append(element) nextnode.getparent().remove(nextnode) node.set('split', 'nicht vorkommt.') write_pretty(xml_element_tree=tree, file_name=xml_file,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) joined_page = Page(xml_file) self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.']), 1) self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.'][0].split_strings), 2) self.assertEqual(len(joined_page.words), len(old_page.words)-1) new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file) self.assertEqual(len(new_page.words), len(old_page.words)) self.assertEqual(len([word for word in new_page.words if word.text == 'vorkommt.']), 1) self.assertEqual(len([word for word in old_page.words if word.text == 'nicht']),\ len([word for word in new_page.words if word.text == 'nicht'])) - #print(ET.dump(node)) - #for node in [ node for node in new_page.page_tree.xpath('//word') if node.get('text') == 'gar' ]: - # print(ET.dump(node)) + xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) + tree = ET.parse(xml_file) + old_page = Page(xml_file) + nodes = tree.xpath('//word[@id>="85" and @id<="87"]') + self.assertEqual(len(nodes), 3) + prevWordText = nodes[0].get('text') + nodes[0].set('join', prevWordText + 'z') + nodes[1].set('split', 'z u') + lastWordText = nodes[2].get('text') + nodes[2].set('join', 'u' + lastWordText) + write_pretty(xml_element_tree=tree, file_name=xml_file,\ + script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) + joined_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file) + self.assertEqual(len(joined_page.words), len(old_page.words)-1) def test_back_up(self): test_dir = self.tmp_dir page = Page(self.xml_file) target_file_name = util.back_up(page, self.xml_file, bak_dir=test_dir) self.assertEqual(isfile(target_file_name), True) def tearDown(self): shutil.rmtree(self.tmp_dir, ignore_errors=True) pass if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_convert_wordPositions.py =================================================================== --- tests_svgscripts/test_convert_wordPositions.py (revision 75) +++ tests_svgscripts/test_convert_wordPositions.py (revision 76) @@ -1,55 +1,63 @@ import unittest from os import sep, path, remove import lxml.etree as ET import lxml.html import sys sys.path.append('svgscripts') import convert_wordPositions from convert_wordPositions import Converter, SVGConverter, HTMLConverter from datatypes.page import Page from datatypes.page_creator import PageCreator +from datatypes.transkription_position import TranskriptionPosition class 
TestConverter(unittest.TestCase): def setUp(self): DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_file = DATADIR + sep + 'test.xml' self.test_svg_file = DATADIR + sep + 'test421.svg' self.outputfile_txt = 'test.txt' self.outputfile_html = 'test.html' self.outputfile_svg = 'test.svg' def test_main(self): argv = ['-t', '-s', self.test_svg_file, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) argv = ['-t', '-s', self.test_svg_file, '-o', self.outputfile_txt, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) self.assertEqual(path.isfile(self.outputfile_txt), True) argv = ['-t', '-s', self.test_svg_file, '-o', self.outputfile_html, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) self.assertEqual(path.isfile(self.outputfile_html), True) html_tree = lxml.html.parse(self.outputfile_html) self.assertEqual(html_tree.getroot().tag, 'html') argv = ['-t', '-s', self.test_svg_file, '-o', self.outputfile_svg, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) self.assertEqual(path.isfile(self.outputfile_svg), True) svg_tree = ET.parse(self.outputfile_svg) self.assertEqual(svg_tree.getroot().tag, '{http://www.w3.org/2000/svg}svg') def test_create_converter(self): page = PageCreator(self.test_file, svg_file=self.test_svg_file) converter = Converter.CREATE_CONVERTER(page, False, 'SVG') self.assertEqual(isinstance(converter, SVGConverter), True) converter = Converter.CREATE_CONVERTER(page, False, 'HTML') self.assertEqual(isinstance(converter, HTMLConverter), True) converter = Converter.CREATE_CONVERTER(page, False) self.assertEqual(isinstance(converter, Converter), True) + def test_get_transkription_positions(self): + tp = [ TranskriptionPosition(), TranskriptionPosition(), TranskriptionPosition() ] + page = PageCreator(self.test_file, svg_file=self.test_svg_file) + converter = Converter.CREATE_CONVERTER(page, False, 'SVG') + converter._get_transkription_positions(tp, stage_version='1+') + + def tearDown(self): bool(path.isfile(self.outputfile_txt)) and remove(self.outputfile_txt) bool(path.isfile(self.outputfile_html)) and remove(self.outputfile_html) bool(path.isfile(self.outputfile_svg)) and remove(self.outputfile_svg) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_manuscript.py =================================================================== --- tests_svgscripts/test_manuscript.py (revision 75) +++ tests_svgscripts/test_manuscript.py (revision 76) @@ -1,38 +1,54 @@ import unittest from os import sep, path from os.path import basename, dirname, isfile import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.manuscript import ArchivalManuscriptUnity +from datatypes.color import Color class TestArchivalManuscriptUnity(unittest.TestCase): def setUp(self): + ArchivalManuscriptUnity.UNITTESTING = True DATADIR = dirname(__file__) + sep + 'test_data' self.test_manuscript = DATADIR + sep + 'N_VII_1.xml' def test_init(self): title = 'Test I 1' manuscript = ArchivalManuscriptUnity(title=title) self.assertEqual(manuscript.title, title) def test_get_semanticAndDataDict(self): semantic_dict = ArchivalManuscriptUnity.get_semantic_dictionary() #print(semantic_dict) def test_create_cls(self): manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript) self.assertEqual(manuscript.title, basename(self.test_manuscript).replace('.xml','').replace('_', ' ')) self.assertEqual(manuscript.manuscript_type, 'Notizheft') self.assertEqual(len(manuscript.pages), 3) manuscript 
= ArchivalManuscriptUnity.create_cls(self.test_manuscript, page_status_list=['faksimile merged']) self.assertEqual(len(manuscript.pages), 2) manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript, page_status_list=['faksimile merged', 'words processed']) self.assertEqual(len(manuscript.pages), 1) manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript, page_xpath='//pages/page/@output') self.assertEqual(len(manuscript.pages), 3) + def test_get_color(self): + color = Color() + manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript) + self.assertEqual(manuscript.get_color(color.hex_color) is not None, True) + self.assertEqual(manuscript.get_color("#F7F6F5") is None, True) + + def test_update_colors(self): + color = Color() + manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript) + manuscript.update_colors(color) + self.assertEqual(len(manuscript.colors), 1) + #print(ET.dump(manuscript.manuscript_tree.getroot())) + + if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_transkription_position.py =================================================================== --- tests_svgscripts/test_transkription_position.py (revision 75) +++ tests_svgscripts/test_transkription_position.py (revision 76) @@ -1,102 +1,115 @@ import unittest from os import sep, path from os.path import dirname, isdir, isfile import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.debug_message import DebugMessage from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.page_creator import PageCreator from datatypes.positional_word_part import PositionalWordPart from datatypes.transkription_position import TranskriptionPosition from datatypes.transkriptionField import TranskriptionField from datatypes.word_position import WordPosition class TestTranskriptionPosition(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.test_svg_file = DATADIR + sep + 'W_I_8_page125_web.svg' self.test_xml = DATADIR + sep + 'W_I_8_page125.xml' self.dir = DATADIR def test_init(self): dmsg = DebugMessage(message='test') word_position = TranskriptionPosition(id=1, height=10, width=10, x=0, y=10, debug_message=dmsg) self.assertEqual(word_position.tag, WordPosition.TRANSKRIPTION) self.assertEqual(word_position.id, '1') self.assertEqual(word_position.debug_message.message, 'test') self.assertEqual(word_position.height, 10) self.assertEqual(word_position.top, 10) self.assertEqual(word_position.bottom, 20) self.assertEqual(word_position.left, 0) self.assertEqual(word_position.isOnTranskription(), True) self.assertEqual(word_position.isOnFaksimile(), False) def test_attach_object_to_tree(self): matrix = Matrix('matrix(0 0 0 0 0 0)') dmsg = DebugMessage(message='test') pwps = [ PositionalWordPart(text='test') ] word_position = TranskriptionPosition(id=1, height=10, width=10, x=0, y=10, matrix=matrix, debug_message=dmsg, positional_word_parts=pwps) empty_tree = ET.ElementTree(ET.Element('page')) word_position.attach_object_to_tree(empty_tree) #print(ET.dump(empty_tree.getroot())) for node in empty_tree.getroot().xpath('//' + word_position.tag): self.assertEqual(node.get('id'), '1') self.assertEqual(node.get('bottom'), '20') self.assertEqual(node.get('transform'), matrix.toString()) self.assertEqual(node.get('writing-process-id'), '-1') word_position = TranskriptionPosition(node=empty_tree.getroot().find('.//' + 
word_position.tag)) self.assertEqual(word_position.height, 10) self.assertEqual(word_position.debug_message is not None, True) self.assertEqual(word_position.debug_message.message, 'test') self.assertEqual(len(word_position.positional_word_parts), 1) def test_CREATE_TRANSKRIPTION_POSITION_LIST(self): page = PageCreator(self.test_xml, svg_file=self.test_svg_file) tf = TranskriptionField(page.svg_file) word_part_objs = [{'text': 'es', 'class': 'st5 st6', 'x': 258.148, 'y': '8.5' }] transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf) self.assertEqual(transkription_positions[0].top, 3.829) self.assertEqual(transkription_positions[0].height, 5.672) word_part_objs = [{'text': 'Meine', 'class': 'st5 st8', 'x': 8.504, 'y': 70.5 }] transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf) self.assertEqual(transkription_positions[0].height, 9.125) self.assertEqual(transkription_positions[0].top, 62.376) self.assertEqual(transkription_positions[0].bottom, 71.501) def test_CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(self): page = PageCreator(self.test_xml, svg_file=self.test_svg_file) tf = TranskriptionField(page.svg_file) word_part_objs = [{'text': 'Meine', 'class': 'st5 st8', 'x': 8.504, 'y': 70.5 }] transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf) transkription_positions[0].positional_word_parts[2].transform = Matrix('rotate(20)') transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(transkription_positions[0].positional_word_parts) self.assertEqual(len(transkription_positions), 3) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf) transkription_positions[0].positional_word_parts[0].style_class = 'st5 st10' transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(transkription_positions[0].positional_word_parts) self.assertEqual(len(transkription_positions), 2) def test_get_semantic_dictionary(self): dictionary = TranskriptionPosition.get_semantic_dictionary() #print(dictionary) #self.assertEqual(TranskriptionPosition.XML_TAG in dictionary['properties'].get('writing_process_id').get('xpath'), True) - + + def test_is_mergeable_with(self): + page = PageCreator(self.test_xml, svg_file=self.test_svg_file) + tf = TranskriptionField(page.svg_file) + word_part_objs = [{'text': 'Meine', 'class': 'st5 st8', 'x': 8.504, 'y': 70.5 }] + transkription_positionA = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf)[0] + transkription_positionB = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf)[0] + self.assertEqual(transkription_positionA.is_mergebale_with(transkription_positionB), True) + transkription_positionA.writing_process_id = 0 + transkription_positionB.writing_process_id = 1 + self.assertEqual(transkription_positionA.is_mergebale_with(transkription_positionB), False) + transkription_positionB.writing_process_id = -1 + self.assertEqual(transkription_positionA.is_mergebale_with(transkription_positionB), True) + def test_split(self): page = PageCreator(self.test_xml, svg_file=self.test_svg_file) tf = TranskriptionField(page.svg_file) word_part_objs = [{'text': 'Meine', 'class': 'st5 st8', 'x': 8.504, 'y': 70.5 }] 
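# create a single transkription position and split it at its horizontal midpoint; a second split with two offsets should yield three positions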
transkription_position = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf)[0] tr_positions = transkription_position.split(transkription_position.left + transkription_position.width/2) self.assertEqual(tr_positions[0] is not None, True) self.assertEqual(tr_positions[1] is not None, True) transkription_position = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf)[0] tr_positions = transkription_position.split(transkription_position.left + transkription_position.width/2, transkription_position.left + transkription_position.width - 4) self.assertEqual(len(tr_positions), 3) if __name__ == "__main__": unittest.main() Index: py2ttl/class_spec.py =================================================================== --- py2ttl/class_spec.py (revision 75) +++ py2ttl/class_spec.py (revision 76) @@ -1,197 +1,199 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This is an abstract class for all classes that are semantically relevant. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc import inspect import warnings class UnSemanticClass: """ Subclasses of this class are not semantically relevant, even if their superclasses are. """ pass class SemanticClass(metaclass=abc.ABCMeta): """ This is an abstract class for all classes that are semantically relevant. """ HAS_PART = 'has_part' HAS_SEQNUM = 'has_seqnum' SINGLE_VALUE = 1 LIST = -99 CLASS_KEY = 'class' CARDINALITY = "cardinality" CARDINALITY_RESTRICTION = "cardinality_restriction" HAS_HOMOTYPIC_PARTS_URL_STRING = 'http://www.nie.org/ontology/homotypic#hasHomotypicParts' PROPERTY_NAME = "name" PROPERTY_LABEL = "label" PROPERTY_COMMENT = "comment" PROPERTIES_KEY = "properties" SUBCLASS_OF = "rdfs:subClassOf" SUBPROPERTYOF = "subPropertyOf" SUPER_CLASSES_DICT = { 'http://www.nie.org/ontology/homotypic': 'HomotypicEntity' } SUPER_PROPERTY = "super_property" THIS = "this" TYPE = "type" @classmethod def create_semantic_property_dictionary(cls, property_key, class_type, cardinality=0, cardinality_restriction='cardinality', name='', label='', comment='', subPropertyOf='') -> dict: """Create a semantic property dictionary. 
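Illustrative call (hypothetical property key and name, not repository code): create_semantic_property_dictionary('title', str, cardinality=1, name='hasTitle') returns { 'title': { 'class': str, 'cardinality': 1, 'cardinality_restriction': 'cardinality', 'name': 'hasTitle' } }.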
:return: semantic property dictionary (dict) """ property_content = { SemanticClass.CLASS_KEY: class_type } if cardinality > 0: property_content.update({ SemanticClass.CARDINALITY: cardinality}) property_content.update({ SemanticClass.CARDINALITY_RESTRICTION: cardinality_restriction}) if name != '': property_content.update({ SemanticClass.PROPERTY_NAME: name}) if label != '': property_content.update({ SemanticClass.PROPERTY_LABEL: label}) if comment != '': property_content.update({ SemanticClass.PROPERTY_COMMENT: comment}) if subPropertyOf != '': property_content.update({ SemanticClass.SUBPROPERTYOF: subPropertyOf}) return { property_key: property_content } @classmethod def get_class_dictionary(cls): """Creates and returns a class_dictionary with the keys cls.THIS [, cls.SUBCLASS_OF, cls.TYPE]. """ class_dict = {cls.THIS: cls } if cls.__dict__.get('OWL_EQUIVALENTCLASSES') and len(cls.OWL_EQUIVALENTCLASSES) > 0: class_dict.update({'owl:equivalentClass': cls.OWL_EQUIVALENTCLASSES }) if cls.__dict__.get('RDFS_SUBCLASSOF_LIST') and len(cls.RDFS_SUBCLASSOF_LIST) > 0: class_dict.update({cls.SUBCLASS_OF: cls.RDFS_SUBCLASSOF_LIST }) else: direct_super_class = inspect.getclasstree([cls],unique=True)[0][0] if issubclass(direct_super_class, SemanticClass) and direct_super_class != SemanticClass: class_dict.update({cls.TYPE: direct_super_class}) return class_dict def get_name_and_id(self): """Return an identification for object as 2-tuple. """ id = 0 if 'id' in self.__dict__.keys(): id = self.id elif 'number' in self.__dict__.keys(): id = self.number elif 'title' in self.__dict__.keys(): id = self.title.replace(' ', '_') return type(self).__name__, id def _get_list_of_type(self, list_type): """Return list of type == list_type if list is not empty. """ list_of_type = [] for object_list in [ list_obj for list_obj in self.__dict__.values()\ if type(list_obj) == list ]: if len(object_list) > 0 and type(object_list[0]) == list_type: return object_list return list_of_type def get_object_from_list_with_id(self, object_type, object_id): """Return object from list if object has id == object_id, None if not found. """ list_with_object = [ item for item in self._get_list_of_type(object_type)\ if item.id == object_id ] if len(list_with_object) > 0: return list_with_object[0] return None @classmethod def get_cls_hasPart_objectCls_dictionaries(cls, object_cls, xpath, object_seqnum_xpath=None, cardinality=0, cardinality_restriction='minCardinality'): """Return a dictionary containing the information for creating a class that can act as an intermediary between cls and a number of object_cls if object_cls has a position in a sequence of object_classes that belong to cls. 
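For example (illustrative names only): with cls=Page, object_cls=Word and xpath='//word', the intermediary class is named 'WordPart', the generated property names are 'wordPartHasWord', 'wordHasSeqNum' and 'pageHasWordPart', and object_seqnum_xpath defaults to '//word/@id'.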
""" part_name = object_cls.__name__ + 'Part' has_part_name = object_cls.__name__.lower() + 'PartHas' + object_cls.__name__ has_seqnum_name = object_cls.__name__.lower() + 'HasSeqNum' if object_seqnum_xpath is None: object_seqnum_xpath = xpath + '/@id' object_part_dictionary = { 'class': object_cls, 'cardinality': 1, 'xpath': xpath,\ 'name': has_part_name, 'label': '{0} has a {1}'.format(part_name, object_cls.__name__),\ 'comment': '{0} has a part, that is a {1}'.format(part_name, object_cls.__name__)} object_seqnum_dictionary = { 'class': int, 'cardinality': 1, 'xpath': object_seqnum_xpath,\ 'name': has_seqnum_name, 'label': '{0} has a sequence number'.format(part_name),\ 'comment': '{0} has a part, that stands in a sequence with this number'.format(part_name, object_cls.__name__)} object_dictionary = { 'class_name': part_name, SemanticClass.HAS_PART: object_part_dictionary, SemanticClass.HAS_SEQNUM: object_seqnum_dictionary,\ 'label': '{0} part'.format(object_cls.__name__.lower()),\ 'comment': 'This class servers as a intermediary between {0} and {1}. {0} has some {1} in a specific sequence.'.format(cls.__name__, object_cls.__name__)} dictionary = { 'flag': 'ordered_list' , 'class': object_dictionary, 'cardinality': cardinality, 'cardinality_restriction': cardinality_restriction, 'xpath': xpath,\ 'name': cls.__name__.lower() + 'Has' + part_name, 'label': '{0} has a part that connects it with a {1}'.format(cls.__name__, object_cls.__name__),\ 'comment': '{0} has a part that connects it with a {1}, that has a position in a sequence of {1}'.format(cls.__name__, object_cls.__name__)} return dictionary @classmethod @abc.abstractmethod def get_semantic_dictionary(cls): """Creates a semantic dictionary with cls.CLASS_KEY and cls.PROPERTIES_KEY as its keys. The class-key points to a class_dictionary with the keys: cls.THIS [, cls.SUBCLASS_OF, cls.TYPE]. Create initial dictionary using cls.get_class_dictionary(): dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: {} } The properties_key points to a properties_dictionary with semantically relevant keys of self.__dict__ as keys. Use cls.create_semantic_property_dictionary(...) in order to add a property dictionary for each property as follows: dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary(property_key, ...)) Return dictionary by using: cls.return_dictionary_after_updating_super_classes(dictionary) """ pass @classmethod def return_dictionary_after_updating_super_classes(cls, dictionary): """Return semantic dictionary after updating super classes if necessary. 
""" + if cls.PROPERTIES_KEY not in dictionary.keys(): + return dictionary subproperty_base_uri_set = set( value.get(cls.SUBPROPERTYOF).split('#')[0]\ for value in dictionary[cls.PROPERTIES_KEY].values()\ if bool(value.get(cls.SUBPROPERTYOF)) ) for sub_property_base in subproperty_base_uri_set: if bool(cls.SUPER_CLASSES_DICT.get(sub_property_base))\ and (\ cls.SUBCLASS_OF not in dictionary[cls.CLASS_KEY].keys()\ or len(dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]) == 0\ or sub_property_base + '#' + cls.SUPER_CLASSES_DICT.get(sub_property_base) not in dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]\ ): subclass_list = dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]\ if cls.SUBCLASS_OF in dictionary[cls.CLASS_KEY].keys()\ and len(dictionary[cls.CLASS_KEY].get(cls.SUBCLASS_OF)) > 0\ else [] subclass_list.append(sub_property_base + '#' + cls.SUPER_CLASSES_DICT.get(sub_property_base)) dictionary[cls.CLASS_KEY].update({cls.SUBCLASS_OF: subclass_list}) return dictionary