#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This is an abstract class for all classes that are semantically relevant.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

import abc
import inspect
import warnings


class UnSemanticClass:
    """ Subclasses of this class are not semantically relevant, even if their superclasses are.
    """
    pass


class SemanticClass(metaclass=abc.ABCMeta):
    """ This is an abstract class for all classes that are semantically relevant.
    """
    # Dictionary keys used when building semantic dictionaries.
    HAS_PART = 'has_part'
    HAS_SEQNUM = 'has_seqnum'
    SINGLE_VALUE = 1
    LIST = -99
    CLASS_KEY = 'class'
    CARDINALITY = "cardinality"
    CARDINALITY_RESTRICTION = "cardinality_restriction"
    HAS_HOMOTYPIC_PARTS_URL_STRING = 'http://www.nie.org/ontology/homotypic#hasHomotypicParts'
    HOMOTYPIC_HAS_TEXT_URL_STRING = 'http://www.nie.org/ontology/homotypic#hasText'
    STOFF_STYLE_HAS_CSS_URL_STRING = 'http://www.nie.org/ontology/standoff#styleHasCSS'
    PROPERTY_NAME = "name"
    PROPERTY_LABEL = "label"
    PROPERTY_COMMENT = "comment"
    PROPERTIES_KEY = "properties"
    SUBCLASS_OF = "rdfs:subClassOf"
    SUBPROPERTYOF = "subPropertyOf"
    # Maps a super-property base URI to the identifier of the class that owns
    # the super property (used by return_dictionary_after_updating_super_classes).
    SUPER_CLASSES_DICT = { 'http://www.nie.org/ontology/homotypic': 'HomotypicEntity', 'http://www.nie.org/ontology/standoff': 'Style' }
    SUPER_PROPERTY = "super_property"
    THIS = "this"
    TYPE = "type"

    @classmethod
    def create_semantic_property_dictionary(cls, property_key, class_type, cardinality=0, cardinality_restriction='cardinality', name='', label='', comment='', subPropertyOf='') -> dict:
        """Create a semantic property dictionary.

        Here is how to make a subproperty:

        Pass the IRI of the super property as subPropertyOf=IRI,
        be sure that base_uri of IRI (as key) and Class identifier of super class (as value) are in cls.SUPER_CLASSES_DICT,
        then call cls.return_dictionary_after_updating_super_classes -> it will subclass the class that owns the subproperty
        to the super class.

        :param property_key: key under which the property dictionary is stored
        :param class_type: python type/class of the property's range
        :param cardinality: if > 0, a cardinality restriction is recorded
        :param cardinality_restriction: OWL restriction name (only used when cardinality > 0)
        :return: semantic property dictionary (dict)
        """
        property_content = { SemanticClass.CLASS_KEY: class_type }
        if cardinality > 0:
            property_content.update({ SemanticClass.CARDINALITY: cardinality})
            property_content.update({ SemanticClass.CARDINALITY_RESTRICTION: cardinality_restriction})
        if name != '':
            property_content.update({ SemanticClass.PROPERTY_NAME: name})
        if label != '':
            property_content.update({ SemanticClass.PROPERTY_LABEL: label})
        if comment != '':
            property_content.update({ SemanticClass.PROPERTY_COMMENT: comment})
        if subPropertyOf != '':
            property_content.update({ SemanticClass.SUBPROPERTYOF: subPropertyOf})
        return { property_key: property_content }

    @classmethod
    def get_class_dictionary(cls):
        """Creates and returns a class_dictionary with the keys cls.THIS [, cls.SUBCLASS_OF, cls.TYPE].
        """
        class_dict = {cls.THIS: cls }
        if cls.__dict__.get('OWL_EQUIVALENTCLASSES') and len(cls.OWL_EQUIVALENTCLASSES) > 0:
            class_dict.update({'owl:equivalentClass': cls.OWL_EQUIVALENTCLASSES })
        if cls.__dict__.get('RDFS_SUBCLASSOF_LIST') and len(cls.RDFS_SUBCLASSOF_LIST) > 0:
            class_dict.update({cls.SUBCLASS_OF: cls.RDFS_SUBCLASSOF_LIST })
        else:
            # Fall back to the direct python superclass as the rdf type,
            # but only if it is itself semantically relevant.
            direct_super_class = inspect.getclasstree([cls],unique=True)[0][0]
            if issubclass(direct_super_class, SemanticClass) and direct_super_class != SemanticClass:
                class_dict.update({cls.TYPE: direct_super_class})
        return class_dict

    def get_name_and_id(self):
        """Return an identification for object as 2-tuple (class name, id).

        Uses self.id, self.number or self.title (in that order) if present,
        otherwise 0.
        """
        id = 0
        if 'id' in self.__dict__.keys():
            id = self.id
        elif 'number' in self.__dict__.keys():
            id = self.number
        elif 'title' in self.__dict__.keys():
            id = self.title.replace(' ', '_')
        return type(self).__name__, id

    def _get_list_of_type(self, list_type):
        """Return list of type == list_type if list is not empty.

        Scans all list-valued attributes of self; the first non-empty list
        whose first element is of list_type is returned, otherwise [].
        """
        list_of_type = []
        for object_list in [ list_obj for list_obj in self.__dict__.values()\
                             if type(list_obj) == list ]:
            if len(object_list) > 0 and type(object_list[0]) == list_type:
                return object_list
        return list_of_type

    def get_object_from_list_with_id(self, object_type, object_id):
        """Return object from list if object has id == object_id,
        None if not found.
        """
        list_with_object = [ item for item in self._get_list_of_type(object_type)\
                             if item.id == object_id ]
        if len(list_with_object) > 0:
            return list_with_object[0]
        return None

    @classmethod
    def get_cls_hasPart_objectCls_dictionaries(cls, object_cls, xpath, object_seqnum_xpath=None, cardinality=0, cardinality_restriction='minCardinality'):
        """Return a dictionary containing the information for creating a class that can act
        as an intermediary between cls and a number of object_cls if object_cls has
        a position in a sequence of object_classes that belong to cls.
        """
        part_name = object_cls.__name__ + 'Part'
        has_part_name = object_cls.__name__.lower() + 'PartHas' + object_cls.__name__
        has_seqnum_name = object_cls.__name__.lower() + 'HasSeqNum'
        if object_seqnum_xpath is None:
            # default: the sequence number comes from the element's @id
            object_seqnum_xpath = xpath + '/@id'
        object_part_dictionary = { 'class': object_cls, 'cardinality': 1, 'xpath': xpath,\
            'name': has_part_name, 'label': '{0} has a {1}'.format(part_name, object_cls.__name__),\
            'comment': '{0} has a part, that is a {1}'.format(part_name, object_cls.__name__)}
        object_seqnum_dictionary = { 'class': int, 'cardinality': 1, 'xpath': object_seqnum_xpath,\
            'name': has_seqnum_name, 'label': '{0} has a sequence number'.format(part_name),\
            'comment': '{0} has a part, that stands in a sequence with this number'.format(part_name, object_cls.__name__)}
        object_dictionary = { 'class_name': part_name, SemanticClass.HAS_PART: object_part_dictionary, SemanticClass.HAS_SEQNUM: object_seqnum_dictionary,\
            'label': '{0} part'.format(object_cls.__name__.lower()),\
            'comment': 'This class servers as a intermediary between {0} and {1}. {0} has some {1} in a specific sequence.'.format(cls.__name__, object_cls.__name__)}
        dictionary = { 'flag': 'ordered_list' , 'class': object_dictionary, 'cardinality': cardinality, 'cardinality_restriction': cardinality_restriction, 'xpath': xpath,\
            'name': cls.__name__.lower() + 'Has' + part_name, 'label': '{0} has a part that connects it with a {1}'.format(cls.__name__, object_cls.__name__),\
            'comment': '{0} has a part that connects it with a {1}, that has a position in a sequence of {1}'.format(cls.__name__, object_cls.__name__)}
        return dictionary

    @classmethod
    @abc.abstractmethod
    def get_semantic_dictionary(cls):
        """Creates a semantic dictionary with cls.CLASS_KEY and cls.PROPERTIES_KEY as its keys.

        The class-key points to a class_dictionary with the keys:
        cls.THIS [, cls.SUBCLASS_OF, cls.TYPE].

        Create initial dictionary using cls.get_class_dictionary():
        dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: {} }

        The properties_key points to a properties_dictionary with semantically relevant
        keys of self.__dict__ as keys.
        Use cls.create_semantic_property_dictionary(...) in order to add a property dictionary
        for each property as follows:
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary(property_key, ...))

        Return dictionary by using:
        cls.return_dictionary_after_updating_super_classes(dictionary)
        """
        pass

    @classmethod
    def return_dictionary_after_updating_super_classes(cls, dictionary):
        """Return semantic dictionary after updating super classes if necessary.

        For every subPropertyOf entry whose base uri is a key of
        cls.SUPER_CLASSES_DICT, the corresponding super class uri is appended
        to the class dictionary's SUBCLASS_OF list (unless already present).
        """
        if cls.PROPERTIES_KEY not in dictionary.keys():
            return dictionary
        # base uris (part before '#') of all declared sub properties
        subproperty_base_uri_set = set( value.get(cls.SUBPROPERTYOF).split('#')[0]\
                for value in dictionary[cls.PROPERTIES_KEY].values()\
                if bool(value.get(cls.SUBPROPERTYOF)) )
        for sub_property_base in subproperty_base_uri_set:
            if bool(cls.SUPER_CLASSES_DICT.get(sub_property_base))\
               and (\
                    cls.SUBCLASS_OF not in dictionary[cls.CLASS_KEY].keys()\
                    or len(dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]) == 0\
                    or sub_property_base + '#' + cls.SUPER_CLASSES_DICT.get(sub_property_base) not in dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]\
               ):
                subclass_list = dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]\
                        if cls.SUBCLASS_OF in dictionary[cls.CLASS_KEY].keys()\
                        and len(dictionary[cls.CLASS_KEY].get(cls.SUBCLASS_OF)) > 0\
                        else []
                subclass_list.append(sub_property_base + '#' + cls.SUPER_CLASSES_DICT.get(sub_property_base))
                dictionary[cls.CLASS_KEY].update({cls.SUBCLASS_OF: subclass_list})
        return dictionary

    def __repr__(self) -> str:
        """Return a representation of all semantically relevant properties.
        """
        data_string = self.__str__()
        return f'<{data_string}>'

    def __str__(self) -> str:
        """Return a str of all semantically relevant properties.
        """
        name = type(self).__name__
        data = []
        for key in self.get_semantic_dictionary()[self.PROPERTIES_KEY].keys():
            if key in self.__dict__.keys() and\
               (self.__dict__[key] != None or (type(self.__dict__[key]) == list and len(self.__dict__[key]) > 0)):
                data.append(f'{key}: {self.__dict__[key]}')
        data_string = ', '.join(data)
        return f'{name} {data_string}'

# ======================================================================
# (diff continues with py2ttl/data_handler.py, revision 91 -> 92)
# ======================================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to add data to a rdf graph.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD
from rdflib import RDF as ns_rdf
from os.path import isfile
import random
import warnings

from class_spec import SemanticClass
from config import DATA_URL


class RDFDataHandler:
    """ This class can be used to add data to a rdf graph.
    """
    UNITTESTING = False
    # maps python builtins to their XSD/RDF literal datatypes
    SIMPLE_DATA_TYPE_MAPPING = { int: XSD.integer, float: XSD.float, str: XSD.string, bool: XSD.boolean, list: RDF.List }

    def __init__(self, target_file, mapping_dictionary):
        self.target_file = target_file
        self.mapping_dictionary = mapping_dictionary
        self.ontology_graph = Graph()
        self.data_graph = Graph()
        # maps already-serialized data instances to their subject uris,
        # so shared instances are added only once
        self.data_identifier_mapping = {}
        if bool(self.mapping_dictionary.get('ontology')):
            self.project_name = self.mapping_dictionary['ontology'].get('project_name')
            self.project_uri = URIRef(self.mapping_dictionary['ontology'].get('project_uri'))
            ontology_file = self.mapping_dictionary['ontology'].get('ontology_file')
            if bool(ontology_file) and isfile(ontology_file):
                self.ontology_graph.parse(ontology_file, format="turtle")
            self.ns = { uriref: ns for ns, uriref in self.data_graph.namespace_manager.namespaces() }
            self.data_graph.bind(self.project_name, self.project_uri)
            self.data_graph.bind('data', DATA_URL + '#')
        else:
            raise Exception('Error: mapping_dictionary does not contain key "ontology"!')

    def add_data(self, data_instance, identifier_prefix, parent_data_instance=None):
        """Add a data rdf instance of data_instance to the data_graph.

        Recursively serializes SemanticClass-valued properties; builtin-valued
        properties become typed literals (range taken from the ontology graph
        when available).

        :return: (rdflib.URIRef) subject_uri of data instance
        """
        identifier_uri = self.create_identifier_uri(data_instance, identifier_prefix)
        if bool(self.mapping_dictionary['classes'].get(type(data_instance).__name__)):
            class_uri = self.mapping_dictionary['classes'][type(data_instance).__name__]['class_uri']
            self.data_identifier_mapping.update({data_instance: identifier_uri})
            self.data_graph_add((identifier_uri, RDF.type, class_uri))
            semantic_dict = data_instance.get_semantic_dictionary()
            for key, content in semantic_dict['properties'].items():
                if bool(self.mapping_dictionary['classes'][type(data_instance).__name__]['properties'].get(key)):
                    datatype = content.get('class')
                    cardinality = content.get('cardinality')\
                                    if bool(content.get('cardinality')) else 0
                    # skip missing values; -1 acts as "unset" sentinel for ints
                    if data_instance.__dict__.get(key) is not None\
                       and (type(data_instance.__dict__.get(key)) != int or data_instance.__dict__.get(key) != -1):
                        predicate_uri = self.mapping_dictionary['classes'][type(data_instance).__name__]['properties'][key]
                        child_data_instance = data_instance.__dict__.get(key)
                        new_identifier_prefix = identifier_uri[identifier_uri.index('#')+1:]
                        if datatype is list:
                            self.add_ordered_list(child_data_instance, identifier_uri, predicate_uri,\
                                    new_identifier_prefix, data_instance)
                        elif issubclass(datatype, SemanticClass):
                            if type(child_data_instance) is not list:
                                if type(child_data_instance) != datatype:
                                    # value is an id, not an instance: resolve it
                                    # against the parent's list of datatype objects
                                    child_id = child_data_instance
                                    child_data_instance = parent_data_instance.get_object_from_list_with_id(datatype,\
                                            child_id)
                                    if child_data_instance is None:
                                        print(key, content)
                                        msg = 'No child_data_instance found for data_instance {0}: looking for {1} with id {2}'.format(\
                                                type(parent_data_instance), datatype, child_id)
                                        raise Exception(msg)
                                    else:
                                        # cache the resolved instance on data_instance
                                        new_list_name = 'list_of_' + datatype.__name__ + 's'
                                        if new_list_name in data_instance.__dict__.keys():
                                            data_instance.__dict__[new_list_name].append(child_data_instance)
                                        else:
                                            data_instance.__dict__.update({ new_list_name: [ child_data_instance ]})
                                if child_data_instance not in self.data_identifier_mapping.keys():
                                    child_identifier_uri = self.add_data(child_data_instance, new_identifier_prefix,\
                                            parent_data_instance=data_instance)
                                else:
                                    child_identifier_uri = self.data_identifier_mapping[child_data_instance]
                                self.data_graph_add((identifier_uri, predicate_uri, child_identifier_uri))
                            else:
                                for child_item in child_data_instance:
                                    if child_item not in self.data_identifier_mapping.keys():
                                        child_identifier_uri = self.add_data(child_item, new_identifier_prefix,\
                                                parent_data_instance=data_instance)
                                    else:
                                        child_identifier_uri = self.data_identifier_mapping[child_item]
                                    self.data_graph_add((identifier_uri, predicate_uri, child_identifier_uri))
                        else:
                            literal_datatype = RDFDataHandler.SIMPLE_DATA_TYPE_MAPPING[datatype]
                            # prefer the rdfs:range declared in the ontology, if any
                            ontology_datatypes = [ o for o in self.ontology_graph.objects(subject=predicate_uri, predicate=RDFS.range) ]
                            if len(ontology_datatypes) > 0:
                                literal_datatype = ontology_datatypes[0]
                            if type(child_data_instance) is list:
                                for child_item in child_data_instance:
                                    object_literal = Literal(str(child_item), datatype=literal_datatype)
                                    self.data_graph_add((identifier_uri, predicate_uri, object_literal))
                            else:
                                object_literal = Literal(str(child_data_instance), datatype=literal_datatype)
                                self.data_graph_add((identifier_uri, predicate_uri, object_literal))
                else:
                    msg = 'Mapping dictionary for {0} does not contain a entry for {1}!'.format(type(data_instance).__name__, key)
                    raise Exception(msg)
        else:
            msg = 'Mapping dictionary does not contain a entry for {}!'.format(type(data_instance).__name__)
            raise Exception(msg)
        return identifier_uri

    def add_ordered_list(self, data_instance_list, identifier_uri, predicate_uri, identifier_prefix, data_instance):
        """Add a data rdf instance of data_instance to the data_graph
        as an rdf collection (ordered list).
        """
        if len(data_instance_list) > 0:
            child_identifiers = []
            for item in data_instance_list:
                if item not in self.data_identifier_mapping.keys():
                    child_identifiers.append(self.add_data(item, identifier_prefix, data_instance))
                else:
                    child_identifiers.append(self.data_identifier_mapping[item])
            list_node = self.generate_RDF_collection(child_identifiers)
            self.data_graph_add((identifier_uri, predicate_uri, list_node))

    def create_identifier_uri(self, data_instance, identifier_prefix):
        """Return a data identifier uri, guaranteed unique within data_graph
        (random bits are appended on collision).

        :return: (rdflib.URIRef) subject_uri of data instance
        """
        data_type, id = data_instance.get_name_and_id()
        identifier_uri = URIRef(DATA_URL + '#' + identifier_prefix + '_' + data_type + str(id))
        randombit_length = 5
        while (identifier_uri, None, None) in self.data_graph:
            identifier_uri = URIRef(DATA_URL + '#' + identifier_prefix + '_' + data_type + str(random.getrandbits(randombit_length)))
            randombit_length += 1
        return identifier_uri

    def data_graph_add(self, rdf_triple):
        """Add a triple to the graph.
        """
        #not RDFDataHandler.UNITTESTING and print(rdf_triple)
        self.data_graph.add(rdf_triple)

    def generate_RDF_collection(self, vals ) -> BNode:
        """
        Generate an RDF List from vals, returns the head of the list
        @URL:
        @organization: U{World Wide Web Consortium}
        @author: U{Ivan Herman}
        @license: U{W3C(R) SOFTWARE NOTICE AND LICENSE}
        @param graph: RDF graph
        @type graph: RDFLib Graph
        @param vals: array of RDF Resources
        @return: head of the List (an RDF Resource)
        """
        heads = [ BNode() for r in vals ] + [ ns_rdf["nil"] ]
        for i in range(0, len(vals)) :
            self.data_graph_add( (heads[i], ns_rdf["first"], vals[i]) )
            self.data_graph_add( (heads[i], ns_rdf["rest"], heads[i+1]) )
        return heads[0]

    def write(self, output_format="turtle"):
        """Write graph to self.target_file.

        :param output_format: any serialization format rdflib accepts (default: "turtle")
        """
        # Context manager replaces the open/write/close sequence so the file
        # handle is closed even if serialize() raises (original leaked it).
        with open(self.target_file, 'wb+') as f:
            f.write(self.data_graph.serialize(format=output_format))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert py classes that are subclasses of class_spec.SemanticClass to a owl ontology in turtle format.
"""
# Copyright (C) University of Basel 2019 (GPL v3)
import getopt
import importlib
import importlib.util
import inspect
import lxml.etree as ET
from os import sep, path, listdir
from os.path import isfile, isdir, dirname, basename
from progress.bar import Bar
from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD
import re
import sys
import warnings

if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))

from class_spec import SemanticClass, UnSemanticClass
from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL
from data_handler import RDFDataHandler

sys.path.append('shared_util')
from myxmlwriter import dict2xml

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"


class Py2TTLOntologyConverter:
    """This class can be used convert semantic_dictionaries to a owl ontology in turtle format.
    """
    UNITTESTING = False
    # rdflib path expression: zero or more rdfs:subClassOf steps (transitive subclass check)
    INFERRED_SUB_CLASS = RDFS.subClassOf * '*'

    def __init__(self, project_ontology_file=None):
        self.class_uri_dict = {}
        self.uri_mapping4cls_and_properties = {}
        self.project_graph = Graph()
        self.base_uriref = URIRef(PROJECT_URL)
        self.project_name = PROJECT_NAME
        self.ns = { self.base_uriref + '#': self.project_name }
        if project_ontology_file is not None and isfile(project_ontology_file):
            self.project_graph.parse(project_ontology_file, format="turtle")
            if len(self.project_graph) > 0:
                # adopt base uri and project name from the parsed ontology
                self.base_uriref = self.project_graph.value(predicate=RDF.type, object=OWL.Ontology, any=False)
                self.ns = { uriref: ns for ns, uriref in self.project_graph.namespace_manager.namespaces() }
                self.project_name = self.ns.get(self.base_uriref + '#')
        self.project_graph.bind(self.project_name, self.base_uriref + '#')
        self.uri_mapping4cls_and_properties.update({ 'ontology': { 'project_name': self.project_name, 'project_uri': self.base_uriref + '#' }})
        self.uri_mapping4cls_and_properties.update({ 'classes': {} })

    def addClass2Graph(self, cls, semantic_dict=None) -> (URIRef, type):
        """Add a class to project_graph.

        :return: (cls_uri (URIRef), super_cls (cls))
        """
        if semantic_dict is None:
            semantic_dict = cls.get_semantic_dictionary()
        comment, label = self.get_comment_label(cls)
        cls_uri = URIRef(self.base_uriref + '#' + cls.__name__)
        self.project_graph.add((cls_uri, RDF.type, OWL.Class))
        self.project_graph.add((cls_uri, RDFS.isDefinedBy, self.base_uriref))
        if comment != '':
            self.project_graph.add((cls_uri, RDFS.comment, Literal(comment, lang='en')))
        if label != '':
            self.project_graph.add((cls_uri, RDFS.label, Literal(label, lang='en')))
        super_uri = None
        super_cls = None
        if bool(semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.TYPE)):
            super_cls = semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.TYPE)
            super_uri = self.createClassAndProperties(super_cls)
            if super_uri is not None:
                self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri))
        if SemanticClass.SUBCLASS_OF in semantic_dict[SemanticClass.CLASS_KEY].keys()\
           and len(semantic_dict[SemanticClass.CLASS_KEY][SemanticClass.SUBCLASS_OF]) > 0:
            for super_uri_string in semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.SUBCLASS_OF):
                super_uri = URIRef(super_uri_string)
                # only add the edge if it is not already derivable transitively
                if not (cls_uri, self.INFERRED_SUB_CLASS, super_uri) in self.project_graph:
                    self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri))
        return cls_uri, super_cls

    def addProperty2Graph(self, property_uri, domain_uri, range_uri, info_dict, property_type=OWL.ObjectProperty):
        """Add a property to self.project_graph.
        """
        label = 'has ' + property_uri.split('#')[1].replace('has','')\
                if SemanticClass.PROPERTY_LABEL not in info_dict.keys() else info_dict[SemanticClass.PROPERTY_LABEL]
        self.project_graph.add((property_uri, RDF.type, property_type))
        self.project_graph.add((property_uri, RDFS.isDefinedBy, self.base_uriref))
        self.project_graph.add((property_uri, RDFS.domain, domain_uri))
        self.project_graph.add((property_uri, RDFS.range, range_uri))
        if SemanticClass.PROPERTY_COMMENT in info_dict.keys():
            comment = info_dict[SemanticClass.PROPERTY_COMMENT]
            self.project_graph.add((property_uri, RDFS.comment, Literal(comment, lang='en')))
        self.project_graph.add((property_uri, RDFS.label, Literal(label, lang='en')))
        if SemanticClass.CARDINALITY in info_dict.keys()\
           and info_dict[SemanticClass.CARDINALITY] > 0:
            self.addRestriction2Class(domain_uri, property_uri, info_dict)

    def addRestriction2Class(self, cls_uri, property_uri, info_dict):
        """Adds restriction on property_uri to class cls_uri.
        """
        if SemanticClass.CARDINALITY in info_dict.keys()\
           and info_dict[SemanticClass.CARDINALITY] > 0:
            if (cls_uri, None, None) not in self.project_graph:
                warnings.warn('{} not in graph!'.format(cls_uri))
            restriction = BNode()
            cardinality_restriction = URIRef(OWL + info_dict[SemanticClass.CARDINALITY_RESTRICTION])\
                    if SemanticClass.CARDINALITY_RESTRICTION in info_dict.keys()\
                    else OWL.cardinality
            cardinality = info_dict[SemanticClass.CARDINALITY]
            self.project_graph.add((cls_uri, RDFS.subClassOf, restriction))
            self.project_graph.add((restriction, RDF.type, OWL.Restriction))
            self.project_graph.add((restriction, OWL.onProperty, property_uri))
            self.project_graph.add((restriction, cardinality_restriction, Literal(str(cardinality), datatype=XSD.nonNegativeInteger)))

    def create_ontology(self, datatypes_dir, target_ontology_file):
        """Convert all classes contained in datatypes_dir that are subclasses of class_spec.SemanticClass to rdf.

        :return: exit code (int)
        """
        if isdir(datatypes_dir):
            semantic_classes = self.get_semantic_classes(datatypes_dir)
            if not Py2TTLOntologyConverter.UNITTESTING:
                bar = Bar('creating classes and properties', max=len(semantic_classes))
            for cls in semantic_classes:
                self.createClassAndProperties(cls)
                not bool(Py2TTLOntologyConverter.UNITTESTING) and bar.next()
            not bool(Py2TTLOntologyConverter.UNITTESTING) and bar.finish()
            self.uri_mapping4cls_and_properties['ontology'].update({'ontology_file': target_ontology_file})
            # Write out the graph with a context manager so the handle is
            # closed even on serialization errors.
            with open(target_ontology_file, 'wb+') as f:
                f.write(self.project_graph.serialize(format="turtle"))
            if not Py2TTLOntologyConverter.UNITTESTING:
                xml_file = 'mapping_file4' + datatypes_dir.replace(sep, '.') + '2' + target_ontology_file.replace('.' + sep, '').replace(sep, '.').replace('.ttl', '.xml')
                dict2xml(self.uri_mapping4cls_and_properties, xml_file)
        else:
            print('Error: dir {} does not exist!'.format(datatypes_dir))
            usage()  # BUGFIX: was a bare `usage` (no-op name reference), help was never printed
            return 1
        return 0

    def createClassAndProperties(self, cls):
        """Creates a owl:Class and some owl:ObjectProperty from semantic_dictionary of a python class.
        """
        if not cls.__name__ in self.class_uri_dict:
            self.class_uri_dict.update({cls.__name__: cls})
            semantic_dict = cls.get_semantic_dictionary()
            cls_uri, super_cls = self.addClass2Graph(cls, semantic_dict)
            uri_mapping4properties = {}
            for property_key in self._get_semantic_dictionary_keys_super_first(semantic_dict['properties']):
                super_semantic_dict = {} if super_cls is None else super_cls.get_semantic_dictionary()
                if len(super_semantic_dict) == 0 or not bool(super_semantic_dict['properties'].get(property_key)):
                    property_dict4key = semantic_dict['properties'].get(property_key)
                    property_cls = property_dict4key.get('class')
                    subject_uri, property_uri = self.createProperty(cls_uri, property_key, property_cls, property_dict4key)
                    uri_mapping4properties.update({ property_key: property_uri })
                elif bool(self.uri_mapping4cls_and_properties.get('classes').get(super_cls.__name__).get('properties').get(property_key)):
                    # property inherited from the super class: reuse its uri
                    property_uri = self.uri_mapping4cls_and_properties['classes'][super_cls.__name__]['properties'][property_key]
                    uri_mapping4properties.update({ property_key: property_uri})
            self.uri_mapping4cls_and_properties.get('classes').update({ cls.__name__: { 'class_uri': cls_uri, 'properties': uri_mapping4properties }})
        return URIRef(self.base_uriref + '#' + cls.__name__)

    def createProperty(self, domain_uri, property_name, range_cls, info_dict) -> (URIRef, URIRef):
        """Creates a owl:ObjectProperty.

        :return: tuple of domain_uri (rdflib.URIRef) and property_uri (rdflib.URIRef) of created property
        """
        name = self.createPropertyName(property_name=property_name)\
                if SemanticClass.PROPERTY_NAME not in info_dict.keys() else info_dict[SemanticClass.PROPERTY_NAME]
        property_uri = URIRef(self.base_uriref + '#' + name)
        inferredSubClass = RDFS.subClassOf * '*'
        range_uri = URIRef(self.base_uriref + '#' + range_cls.__name__)
        super_property_uri = None
        if SemanticClass.SUBPROPERTYOF in info_dict.keys():
            super_property_uri = URIRef(info_dict[SemanticClass.SUBPROPERTYOF])
        elif SemanticClass.SUPER_PROPERTY in info_dict.keys():
            # create the super property first, then subclass this one under it
            domain_uri, super_property_uri = self.createProperty(domain_uri,\
                    info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME),\
                    range_cls, info_dict[SemanticClass.SUPER_PROPERTY])
        if (property_uri, None, None) not in self.project_graph:
            property_type = OWL.ObjectProperty
            if range_cls.__module__ == 'builtins':
                if range_cls != list:
                    property_type = OWL.DatatypeProperty
                    range_uri = RDFDataHandler.SIMPLE_DATA_TYPE_MAPPING.get(range_cls)
                    if range_uri == XSD.string and property_name == 'URL':
                        range_uri = XSD.anyURI
            self.addProperty2Graph(property_uri, domain_uri, range_uri, info_dict, property_type=property_type)
        elif not True in [\
                (domain_uri, inferredSubClass, o) in self.project_graph\
                for o in self.project_graph.objects(property_uri, RDFS.domain)\
             ]:
            # if domain_uri is NOT a subclass of a cls specified by RDFS.domain
            if SemanticClass.CARDINALITY in info_dict.keys()\
               and info_dict[SemanticClass.CARDINALITY] > 0:
                self.addRestriction2Class(domain_uri, property_uri, info_dict)
            self.project_graph.add((property_uri, RDFS.domain, domain_uri))
        if super_property_uri is not None\
           and (property_uri, RDFS.subPropertyOf, super_property_uri) not in self.project_graph:
            self.project_graph.add((property_uri, RDFS.subPropertyOf, super_property_uri))
        return domain_uri, property_uri

    def createPropertyName(self, property_name=None, subject_uri=None, object_uri=None, connector='BelongsTo', prefix='has'):
        """Returns a property name (camelCased, with the given prefix).
        """
        if property_name is not None:
            property_name = ''.join([ property_name.split('_')[0].lower() ] + [ text.capitalize() for text in property_name.split('_')[1:] ])
            return prefix + property_name[0].upper() + property_name[1:] if property_name[0].islower()\
                    else prefix + property_name
        elif subject_uri is not None:
            property_name = subject_uri.split('#')[1] + self.createPropertyName(object_uri=object_uri, prefix=connector)
            return property_name[0].lower() + property_name[1:]
        elif object_uri is not None:
            return prefix + object_uri.split('#')[1]
        else:
            return prefix

    def get_comment_label(self, cls):
        """Returns comment and label from cls __doc__.
        """
        comment = cls.__doc__.replace('\n','').lstrip()
        label = cls.__name__
        if '.' in cls.__doc__:
            comment = [ text for text in cls.__doc__.split('\n') if text != '' ][0].lstrip()
        if '@label' in cls.__doc__:
            # raw strings for the regexes: '\s' / '\.' in plain strings are
            # deprecated escape sequences
            m = re.search(r'(@label[:]*\s)(.*[\.]*)', cls.__doc__)
            label_tag, label = m.groups()
        elif re.search(r'([A-Z][a-z]+)', label):
            # split CamelCase class name into a lowercase label
            # (removed an unused `m = re.search(...)` assignment here)
            label = ' '.join([ text.lower() for text in re.split(r'([A-Z][a-z]+)', label) if text != '' ])
        return comment, label

    def get_semantic_classes(self, datatypes_dir):
        """Returns a list of all classes that are contained in datatypes_dir that are subclasses of class_spec.SemanticClass.

        :return: a list of (str_name, class)
        """
        base_dir = dirname(dirname(__file__))
        sys.path.append(base_dir)
        root_modul_name = datatypes_dir.replace('/','.')
        files = [ file.replace('.py','') for file in listdir(datatypes_dir) if file.endswith('.py') and not file.startswith('test_') and not file.startswith('_')]
        all_modules = []
        for name in files:
            all_modules.append(importlib.import_module('{}.{}'.format(root_modul_name, name)))
        all_classes = []
        for modul in all_modules:
            all_classes += inspect.getmembers(modul, inspect.isclass)
        # sort by class name so the ontology output is deterministic
        all_classes = sorted(set(all_classes), key=lambda current_class: current_class[0])
        semantic_classes = [ cls for name, cls in all_classes if issubclass(cls, SemanticClass)\
                             and not issubclass(cls, UnSemanticClass)\
                             and not (cls == SemanticClass)]
        return semantic_classes

    def _get_builtin_cls_keys(self, property_dict):
        """Returns a list of keys for classes that are builtin.
        """
        builtin_cls_keys = []
        for key in property_dict.keys():
            property_cls = property_dict.get(key).get('class')\
                    if type(property_dict.get(key)) is dict\
                    else property_dict.get(key)[0]
            if type(property_cls) != dict\
               and property_cls.__module__ == 'builtins':
                builtin_cls_keys.append(key)
        return builtin_cls_keys

    def _get_semantic_dictionary_keys_super_first(self, property_dict):
        """Sorts the keys of the property part of a semantic dictionary
        and returns the keys for super classes before keys of subclasses.

        :return: a sorted list of keys.
        """
        builtin_cls_keys = self._get_builtin_cls_keys(property_dict)
        complex_cls_keys = []
        for key in [ key for key in property_dict.keys()\
                     if key not in builtin_cls_keys ]:
            current_cls = property_dict.get(key).get('class')
            key_inserted = False
            for index, cls_key in enumerate(complex_cls_keys):
                potential_sub_cls = property_dict.get(cls_key).get('class')
                if issubclass(potential_sub_cls, current_cls):
                    # insert the super class key before its subclass key
                    complex_cls_keys.insert(index, key)
                    key_inserted = True
                    break
            if not key_inserted:
                complex_cls_keys.append(key)
        return builtin_cls_keys + complex_cls_keys


def usage():
    """prints information on how to use the script
    """
    print(main.__doc__)


def main(argv):
    """This program can be used to convert py classes that are subclasses of class_spec.SemanticClass to owl:Class
    and its properties to owl:ObjectProperty.

    py2ttl/py2ttl_ontology.py [OPTIONS ]

        [optional] directory containing datatypes that are subclasses of class_spec.SemanticClass.
        Overwrites DATATYPES_DIR in py2ttl/config.py.

    OPTIONS:
        -h|--help: show help
        -s|--source=source_ontology_file source ontology ttl file, option overwrites PROJECT_ONTOLOGY_FILE in py2ttl/config.py
        -t|--target=target_ontology_file target ontology ttl file, default: 'PROJECT_PREFIX-ontology_autogenerated.ttl'

    :return: exit code (int)
    """
    check_config_files_exist()
    datatypes_dir = get_datatypes_dir()
    source_ontology_file = PROJECT_ONTOLOGY_FILE
    target_ontology_file = ''
    try:
        opts, args = getopt.getopt(argv, "hs:t:", ["help","source=", "target="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-t', '--target'):
            target_ontology_file = arg
        elif opt in ('-s', '--source'):
            source_ontology_file = arg
    converter = Py2TTLOntologyConverter(project_ontology_file=source_ontology_file)
    if len(args) > 0:
        datatypes_dir = args[0]
    if target_ontology_file == '':
        target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, converter.project_name)
    return converter.create_ontology(datatypes_dir, target_ontology_file)


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

# ======================================================================
# (diff continues with tests_py2ttl/test_class_spec.py; visible fragment
# reproduced below — the test file may continue beyond this view)
# ======================================================================
# import unittest
# from os import sep, path
# import inspect
# from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD
# import sys
# sys.path.append('svgscripts')
# from datatypes.image import Image
# from datatypes.word import Word
# from datatypes.simple_word import SimpleWord
# sys.path.append('py2ttl')
# try:
#     from class_spec import SemanticClass
# except ImportError:
#     sys.path.append(dirname(dirname(realpath(__file__))))
#     from py2ttl.class_spec import SemanticClass
#
# class TestSemanticClassFail(SemanticClass):
#     def __init__(self):
#         pass
#
# class Dummy:
#     def __init__(self, id):
#         self.id = id
#
# class TestSemanticClassOK(SemanticClass):
#     def __init__(self):
#         self.msg = 'Hello World!'
self.id = 0
        self.mylist = [ Dummy(0), Dummy(1), Dummy(2) ]

    @staticmethod
    def get_semantic_dictionary():
        # minimal semantic dictionary: one builtin-typed single-value property
        return {'class': { 'this': TestSemanticClassOK}, 'properties': { 'msg': (str, SemanticClass.SINGLE_VALUE) }}

# fixture: semantic class whose properties are built via the helper factory
class TestSemanticClassB(SemanticClass):
    def __init__(self):
        self.data = [ 1, 2, 3, 4 ]
        self.test = [ TestSemanticClassOK(), TestSemanticClassOK() ]

    @staticmethod
    def get_semantic_dictionary():
        return { 'class': {'this': TestSemanticClassB }, 'properties': TestSemanticClassB.create_semantic_property_dictionary('data', int)}

    def get_super(self):
        # first entry of the (unique) class tree is the direct super class
        return inspect.getclasstree([self.__class__],unique=True)[0][0]

# fixture: subclass without its own semantic dictionary (inherits B's)
class TestSemanticClassC(TestSemanticClassB):
    pass

class TestSemanticClass(unittest.TestCase):
    def test_fail(self):
        # SemanticClass is abstract: instantiation without the required
        # abstract methods must raise TypeError
        with self.assertRaises(TypeError):
            TestSemanticClassFail()

    def test_success(self):
        test = TestSemanticClassOK()
        self.assertEqual(TestSemanticClassOK.get_semantic_dictionary()['properties'], { 'msg': (str, 1) })
        test = TestSemanticClassB()
        self.assertEqual(test.get_semantic_dictionary()['class'].get('this'), TestSemanticClassB)
        dictionary = test.return_dictionary_after_updating_super_classes(TestSemanticClassB.get_semantic_dictionary())

    def test_get_class_dictionary(self):
        # C has no own semantic dictionary -> its class dictionary points to B
        test = TestSemanticClassC()
        self.assertEqual(test.get_class_dictionary().get('type') is not None, True)
        self.assertEqual(test.get_class_dictionary().get('type'), TestSemanticClassB)
        #print(test.create_semantic_property_dictionary('is_true', bool, cardinality=1, name='IsTrue', label='is true', comment='test comment'))

    def test_get_cls_hasPart_objectCls_dictionaries(self):
        dictionary = SemanticClass.get_cls_hasPart_objectCls_dictionaries(SemanticClass, 'asdf/asdf')
        #print(dictionary)

    def test_get_object_from_list_with_id(self):
        # looks up the Dummy with id == 1 from the instance's list attribute
        test = TestSemanticClassOK()
        #mylist = test._get_list_of_type(Dummy)
        d_1 = test.get_object_from_list_with_id(Dummy, 1)
        self.assertEqual(d_1 is not None, True)
        self.assertEqual(d_1.id, 1)

    def test_return_dictionary_after_updating_super_classes(self):
class TestWord(Word): RDFS_SUBCLASSOF_LIST = [ 'http://www.example.com#Test' ] dictionary = TestWord.get_semantic_dictionary() self.assertEqual(TestWord.SUBCLASS_OF in dictionary[TestWord.CLASS_KEY].keys(), True) self.assertEqual(len(dictionary[TestWord.CLASS_KEY][TestWord.SUBCLASS_OF]), 2) def test_repr(self): word = Word() #print(word) if __name__ == "__main__": unittest.main() Index: svgscripts/datatypes/style.py =================================================================== --- svgscripts/datatypes/style.py (revision 91) +++ svgscripts/datatypes/style.py (revision 92) @@ -1,161 +1,193 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent the style of a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy from lxml import etree as ET import re import sys from .color import Color sys.path.append('py2ttl') from class_spec import SemanticClass class Style(SemanticClass): """ This class represents the style of a word. 
Args: manuscript: a ArchivalManuscriptUnity """ NIETSCHES_FONTS = { 'german': 'deutsche Schreibschrift', 'latin': 'lateinische Schreibschrift' } COLOR_KEYS = [ 'black', 'red', 'blue', 'green', 'grey' ] RELEVANT_STYLE_KEYS = [ 'font-family', 'fill', 'stroke' ] + ADDITIONAL_STYLE_KEYS = [ 'font-size' ] WRITING_INSTRUMENTS = { (COLOR_KEYS[0], False): 'schwarze Tinte',\ (COLOR_KEYS[0], True): 'Bleistift',\ (COLOR_KEYS[4], True): 'Bleistift',\ (COLOR_KEYS[1], False): 'braune Tinte',\ (COLOR_KEYS[1], True): 'Rotstift',\ (COLOR_KEYS[2], False): 'violette Tinte',\ (COLOR_KEYS[2], True): 'Blaustift',\ (COLOR_KEYS[3], False): '„Tinte der letzten Korrektur“'} - def __init__(self, manuscript=None, writing_process_id=-1): + def __init__(self, manuscript=None, writing_process_id=-1, extended_styles=False, deleted=False): self.color = Color.create_cls(manuscript=manuscript) + self.css_styles = [] + self.deleted = deleted + self.is_german = True self.font = self.NIETSCHES_FONTS['german'] self.font_family = 'Weidemann-Book' + self.font_size = None self.manuscript = manuscript self.relevant_key_map = {} - for key in self.RELEVANT_STYLE_KEYS: + relevant_style_keys = self.RELEVANT_STYLE_KEYS + self.ADDITIONAL_STYLE_KEYS\ + if extended_styles else self.RELEVANT_STYLE_KEYS + for key in relevant_style_keys: if not key.startswith('font'): self.relevant_key_map.update({key: self.set_color}) elif key == 'font-family': self.relevant_key_map.update({key: self.set_font}) + elif key == 'font-size': + self.relevant_key_map.update({key: self.set_size}) self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, False)] self.writing_process_id = writing_process_id def create_a_copy_wo_writing_process_id(self): new_self = copy.deepcopy(self) new_self.writing_process_id = -1 return new_self def create_a_copy(self, reduce_writing_process_id=False): writing_process_id = self.writing_process_id\ if not reduce_writing_process_id\ else self.writing_process_id-1 copy = 
Style(manuscript=self.manuscript, writing_process_id=writing_process_id) copy.color = self.color copy.font_family = self.font_family copy.process_style_classes() if copy.manuscript is not None: copy.manuscript.update_styles(copy) return copy + def create_css_styles(self): + """Create css styles. + """ + if self.deleted: + self.css_styles.append('text-decoration: line-through;') + if self.font_family.endswith('Bold'): + self.css_styles.append(f'font-weight: bold;') + if self.font_size is not None: + self.css_styles.append(f'font-size: {self.font_size};') + self.css_styles.append(f'color: {self.color.hex_color};') + @classmethod - def create_cls(cls, page, style_string, manuscript=None): + def create_cls(cls, page, style_string, manuscript=None, create_css=False, deleted=False): """Creates a Style from a style_string. :return: (datatypes.style) Style """ - style = cls(manuscript=manuscript) + style = cls(manuscript=manuscript, extended_styles=create_css, deleted=deleted) style_dict = { key: key_dict for key, key_dict in page.style_dict.items()\ if any(relevant_key in key_dict.keys() for relevant_key in style.relevant_key_map.keys()) } for style_key in style_string.split(' '): if style_key in style_dict.keys(): dictionary = style_dict[style_key] for key, set_function in [ (key, func) for key, func in style.relevant_key_map.items() if key in dictionary.keys() ]: if callable(set_function): set_function(dictionary[key]) style.process_style_classes() + if create_css: + style.create_css_styles() return style @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. 
""" properties = {} properties.update(cls.create_semantic_property_dictionary('font', str, cardinality=1,\ name='styleHasFont', label='style has font', comment='Connects a style with the kind of font Nietzsche used in writing.')) properties.update(cls.create_semantic_property_dictionary('writing_instrument', str, cardinality=1,\ name='styleHasWritingInstrument', label='style has writing instrument', comment='Connects a style with the description of a writing instrument.')) properties.update(cls.create_semantic_property_dictionary('color', Color, cardinality=1,\ name='styleHasColor', label='style has color', comment='Connects a style with a color.')) + properties.update(cls.create_semantic_property_dictionary('css_styles', str, cardinality=1,\ + subPropertyOf=cls.STOFF_STYLE_HAS_CSS_URL_STRING,cardinality_restriction='minCardinality',\ + name='styleHasCSS', label='style has css', comment='Connects a style with CSS style.')) dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties } return cls.return_dictionary_after_updating_super_classes(dictionary) - + def process_style_classes(self): """Infere writing instrument from font-family and color. """ if self.font_family.startswith('NewsGothic'): + self.is_german = False self.font = self.NIETSCHES_FONTS['latin'] if self.color.name in self.COLOR_KEYS: self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, self.font_family.endswith('Bold'))] def set_color(self, hex_color: str): self.color = Color.create_cls(hex_color=hex_color, manuscript=self.manuscript) def set_font(self, font_family: str): self.font_family = font_family + def set_size(self, font_size: str): + self.font_size = font_size + @classmethod - def remove_irrelevant_style_keys(cls, style_string, page) -> str: + def remove_irrelevant_style_keys(cls, style_string, page, extended_styles=False) -> str: """Return a style_string without irrelevant style keys. 
""" + relevant_style_keys = cls.RELEVANT_STYLE_KEYS + cls.ADDITIONAL_STYLE_KEYS\ + if extended_styles else cls.RELEVANT_STYLE_KEYS return ' '.join(sorted( style_key for style_key in style_string.split(' ')\ if len(\ [ key for key in page.style_dict[style_key].keys()\ - if key in cls.RELEVANT_STYLE_KEYS ]\ + if key in relevant_style_keys ]\ ) > 0 )) def __eq__(self, other): """Returns true if self is qualitatively identical to other. Reason: For qualities, the idea of numerical identity is silly. """ if other is None: return False return self.color == other.color\ and self.font_family == other.font_family\ - and self.writing_process_id == other.writing_process_id + and self.writing_process_id == other.writing_process_id\ + and self.css_styles == other.css_styles def __hash__(self): """Return a hash value for self. """ return hash((self.color.__hash__, self.font_family, self.writing_process_id)) Index: svgscripts/datatypes/word.py =================================================================== --- svgscripts/datatypes/word.py (revision 91) +++ svgscripts/datatypes/word.py (revision 92) @@ -1,798 +1,800 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy import inspect from lxml import etree as ET from operator import attrgetter import re import string import sys import warnings from .box import Box from .matrix import Matrix from .path import Path from .simple_word import SimpleWord from .style import Style from .word_position import WordPosition from .transkription_position import TranskriptionPosition from .writing_process import WritingProcess SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation) def execute_function_on_parts(word_parts, func_name): """Execute function on parts and add those parts instead of original word to word_parts. :return: new word_parts, output from func """ copy_parts = word_parts[:] for word in word_parts: output = eval('word.{0}()'.format(func_name)) if len(word.word_parts) > 0: for part_word in word.word_parts: copy_parts.insert(copy_parts.index(word), part_word) copy_parts.remove(word) word.word_parts = [] return copy_parts, output def update_transkription_position_ids(word): """Update transkription_position' ids according to index. """ word_part_ids = [ wp.id for wp in word.word_parts ] if len(word_part_ids) != len(set(word_part_ids)): for id, wp in enumerate(word.word_parts): wp.id = id for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))): transkription_position.id = index transkription_position.has_box = None transkription_position.deleted = False class Word(SimpleWord): """ This class represents a word. 
""" COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ] APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' } DATA = 'debug-data' XML_TAG = 'word' XML_EARLIER_VERSION = 'earlier-version' XML_OVERWRITES = 'overwrites' XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\ 'isDeletionOfWord': 'deletesEarlierPart',\ 'isExtensionOfWord': 'extendsEarlierVersion',\ 'isTransformationOfWord': 'transformsEarlierPart' } def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None): super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.corrections = [] self.deleted = deleted self.debug_container = {} self.debug_msg = None self.earlier_version = earlier_version self.edited_text = None self.isClarificationOfWord = None self.isDeletionOfWord = None self.isExtensionOfWord = None self.isTransformationOfWord = None if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0: self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ]) self.overwrites_word = None self.styles = styles\ if styles is not None\ else [] self.verified = None self.writing_process_id = writing_process_id self.writing_processes = [] self.word_insertion_mark = None self.word_box = None self.word_parts = word_parts if word_parts is not None else [] self.word_part_objs = word_part_objs if word_part_objs is not None else [] self.IS_DEBUG_WORD = (self.text == 'Übertreibung' and self.line_number == 8) def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. 
""" word_node = super(Word,self).attach_word_to_tree(target_tree) if self.deleted is not None: word_node.set('deleted', str(self.deleted).lower()) if self.verified is not None: word_node.set('verified', str(self.verified).lower()) if self.edited_text is not None: word_node.set('edited-text', self.edited_text) if self.writing_process_id > -1: word_node.set('writing-process-id', str(self.writing_process_id)) for index, word_part in enumerate(self.word_parts): word_part.id = index word_part.attach_word_to_tree(word_node) if self.earlier_version is not None: earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION) self.earlier_version.attach_word_to_tree(earlier_node) if self.overwrites_word is not None\ and len(self.overwrites_word.transkription_positions) > 0: overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES) self.overwrites_word.attach_word_to_tree(overwrite_node) if self.word_box is not None: self.word_box.attach_object_to_tree(word_node) if len(self.corrections) > 0: word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ]))) for key in self.XML_CORRECTION_DICT.keys(): if self.__dict__[key] is not None: word_node.set(self.XML_CORRECTION_DICT[key], 'true') return word_node def belongs_to_multiple_writing_processes(self, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. """ if len(self.word_parts) > 0 and include_parts: return len(set(word.writing_process_id for word in self.word_parts)) > 1 return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1 def set_parent_word_writing_process_id(self): """Set writing_process_id for parent word. 
""" ids = set(word.transkription_positions[0].style for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None) if len(ids) > 1: self.writing_process_id = max([style.writing_process_id for style in ids]) if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\ for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\ > 1: self.writing_process_id += 1 @classmethod def create_cls(cls, word_node): """Creates a word from a (lxml.Element) node. [:return:] Word """ cls = super(Word,cls).create_cls(word_node) cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 cls.split_strings = None cls.join_string = word_node.get('join') if bool(word_node.get('split')): cls.split_strings = word_node.get('split').split(' ') if ''.join(cls.split_strings) != cls.text: error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ format(word_node.getroottree().docinfo.URL, str(cls.id))\ + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ + 'Text attribute: "{0}".\n'.format(cls.text) raise Exception(error_msg) cls.verified = word_node.get('verified') == 'true'\ if bool(word_node.get('verified')) else None cls.deleted = word_node.get('deleted') == 'true'\ if bool(word_node.get('deleted')) else None cls.edited_text = word_node.get('edited-text') cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ] if bool(word_node.get('corrections')): for index in [ int(i) for i in word_node.get('corrections').split(' ') ]: if index < len(cls.word_parts): cls.corrections.append(cls.word_parts[index]) cls.earlier_version = None if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0: cls.earlier_version = [ cls.create_cls(node) for node in 
word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0] for key_value in cls.XML_CORRECTION_DICT.values(): if word_node.get(key_value) == 'true': cls.__dict__[key_value] = True if cls.earlier_version is not None: for word_part in cls.word_parts: for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\ and len(cls.word_parts) <= len(cls.earlier_version.word_parts): try: word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id] except Exception: msg = f'{cls.id} {cls.text}: {word_part.id}' raise Exception(msg) for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls.earlier_version for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\ if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\ else None cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\ if len(word_node.xpath('./' + Box.XML_TAG)) > 0\ else None return cls def create_earlier_version(self, root_word=None, id=0): """Create an earlier version of word. 
""" if root_word is None: root_word = self root_word.set_parent_word_writing_process_id() word_parts = [] non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\ if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ] non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts) if non_single_punctuation_word_parts_length > 0\ and len([ word_part for word_part in non_single_punctuation_word_parts\ if word_part.deleted ])\ == non_single_punctuation_word_parts_length: self.deleted = True for word_part in non_single_punctuation_word_parts: word_part.deleted = False for id, word_part in enumerate(self.word_parts): earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id) if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) elif word_part.overwrites_word is not None\ and (len(word_part.transkription_positions) > 0\ and word_part.overwrites_word.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style\ != word_part.overwrites_word.transkription_positions[0].style): word_part.overwrites_word.id = word_part.id word_parts.append(word_part.overwrites_word) word_part.isTransformationOfWord = word_part.overwrites_word #print(f'transform: {self.text}') if word_part not in self.corrections: self.corrections.append(word_part) elif root_word.writing_process_id > -1\ and (len(word_part.transkription_positions) > 0\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style.writing_process_id\ == root_word.writing_process_id): word_part.extendsEarlierVersion = True #print('extends') if word_part not in self.corrections: self.corrections.append(word_part) else: if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart 
word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) else: #print(f'default: {self.text}') word_parts.append(earlierWordPart) text = ''.join([ word.text for word in word_parts ])\ if len(word_parts) > 0\ else self.text if len(word_parts) == 1: self.transkription_positions += word_parts[0].transkription_positions self.faksimile_positions += word_parts[0].faksimile_positions word_parts = [] new_transkription_positions = copy.deepcopy(self.transkription_positions) if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None: writing_process_id = self.transkription_positions[0].style.writing_process_id for new_tp in new_transkription_positions: new_tp.style.writing_process_id = writing_process_id return Word(id=id, text=text, transkription_positions=new_transkription_positions,\ faksimile_positions=self.faksimile_positions, line_number=self.line_number,\ word_parts=word_parts) def create_correction_history(self, page=None, box_style=None): """Create correction history. 
""" if self.word_box is not None: manuscript = self.transkription_positions[0].style.manuscript\ if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None\ else None style = Style() if box_style is not None: style = box_style if page is not None: style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript) for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]: style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions) for transkription_position in transkription_positions: transkription_position.style = style self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\ line_number=self.line_number) for word_part in self.word_parts: word_part.create_correction_history(page=page, box_style=box_style) if len(self.word_parts) > 0: earlier_version = self.create_earlier_version() extending_words = self._get_parts_with_property_key('extendsEarlierVersion') if len(extending_words) > 0: for word in extending_words: word.isExtensionOfWord = earlier_version if self.has_mixed_status('deleted', include_parts=True): self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ]) if len(self.corrections) > 0: self.earlier_version = earlier_version @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. 
[:return:] Word """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number text = word_node.get('text') deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true' transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ] word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\ if len(word_node.findall('.//' + Word.DATA)) > 0\ else [ item.attrib for item in word_node.findall('.//part')] return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions, word_part_objs=word_part_objs) elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file WIDTH = 5 TOPCORRECTION = 2.0 FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize height = height x = round(float(word_part_objs[0]['x']), 3) if(page is not None and bool(page.style_dict)): HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' ')) biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set) height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3) TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size if endSign is not None and '%' in endSign: lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\ for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\ if bool(page.style_dict[key].get('font-size'))] lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1 endX = float(endX) + lastCharFontSize * 
FONTWIDTHFACTOR elif endSign is not None and '%' in endSign: endX = float(endX) + WIDTH bottom = round(float(word_part_objs[0]['y']), 3) y = round(bottom - height + TOPCORRECTION, 3) width = round(float(endX) - x, 3) transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ] text = ''.join([ dict['text'] for dict in word_part_objs]) line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs) word.debug_msg = debug_msg return word else: error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = super(Word,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\ cardinality=1, cardinality_restriction='minCardinality',\ name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\ name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deleted', bool,\ name='isWordDeleted', label='has word been deleted', comment='Word has been deleted by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\ name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', 
str,\ name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\ name='isClarificationOfWord', label='word is a clarification of word',\ comment='The author has used this part of the word in order to clarify the appearance of that word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\ name='isDeletionOfWord', label='word is a deletion of word',\ comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\ name='isExtensionOfWord', label='word is a extension of word',\ comment='The author has used this part of a word in order to extend an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\ name='isTransformationOfWord', label='word is a transformation of word',\ comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\ name='overwritesWord', label='word overwrites word',\ comment='The author has used this word in order to overwrite that word.')) + # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING, + # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class. 
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\ name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\ subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)) super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\ name='isCorrectionOfWord', label='word is a correction of word',\ comment='The author has used this word in order to correct that word.') for key in cls.XML_CORRECTION_DICT.keys(): correction_dict = dictionary[cls.PROPERTIES_KEY].get(key) correction_dict.update(super_property_dictionary) dictionary[cls.PROPERTIES_KEY].update({key: correction_dict}) return cls.return_dictionary_after_updating_super_classes(dictionary) def has_mixed_status(self, property_key, include_parts=False, concerns_word=True): """Returns true if transkription_positions have mixed status concerning the property_key in their __dict__. """ if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions): return False if len(self.word_parts) > 0 and include_parts: if concerns_word: if False in set(property_key in word.__dict__.keys() for word in self.word_parts): return False return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1 else: return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\ if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1 return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1 def init_word(self, page): """Initialize word with objects from page. 
""" super(Word,self).init_word(page) if self.writing_process_id > -1: self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ] writing_processes = self.writing_processes for word_part in self.word_parts: word_part.init_word(page) self.lines += word_part.lines self.writing_processes += word_part.writing_processes self.lines = [ line for line in set(self.lines) ] self.writing_processes = [ wp for wp in set(self.writing_processes)] if self.overwrites_word is not None: self.overwrites_word.init_word(page) if self.earlier_version is not None: if self.earlier_version.writing_process_id == -1: self.earlier_version.writing_process_id = self.writing_process_id-1 if self.earlier_version.line_number == -1: self.earlier_version.line_number = self.line_number self.earlier_version.init_word(page) def join(self, other_word, append_at_end_of_new_word=True): """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. """ if append_at_end_of_new_word: self.text = self.text + other_word.text for position in other_word.transkription_positions: position.id = str(len(self.transkription_positions)) self.transkription_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 self.simplify_transkription_positions() def partition_according_to_deletion(self): """Partition a word according to its transkription_positions' deletion status ->split word and add partial words as its parts. 
""" if self.has_mixed_status('deleted'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.deleted != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.deleted if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] self.line_number = -1 self.deleted = False elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion') elif not self.deleted\ and len(self.transkription_positions) > 0\ and self.transkription_positions[0].deleted: self.deleted = True def partition_according_to_writing_process_id(self): """Partition a word according to its transkription_positions' writing_process_ids ->split word and add partial words as its parts. 
""" if self.belongs_to_multiple_writing_processes(): last_writing_process_id = -1 transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.writing_process_id != last_writing_process_id\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_writing_process_id = transkription_position.writing_process_id if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id') if self.belongs_to_multiple_writing_processes(include_parts=True): self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0] elif len(self.transkription_positions) > 0: self.writing_process_id = self.transkription_positions[0].writing_process_id def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False): """Determines whether word is over a word box. 
""" word_over_box = None if len(self.word_parts) > 0: for word in self.word_parts: current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None)) if current_word is not None and current_word.word_box is not None: word_over_box = current_word else: new_tp_dict = {} for index, transkription_position in enumerate(self.transkription_positions): if previous_word_has_box and index == 0: if len(transkription_position.positional_word_parts) > 0: transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2 #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}') else: transkription_position.left += 1 word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) containing_boxes = [ box_path for box_path in box_paths\ if word_path.is_partially_contained_by(box_path)\ or box_path.do_paths_intersect(word_path) ] if len(containing_boxes) > 0: if previous_word_has_box: print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}') self._set_box_to_transkription_position(containing_boxes[0], word_path,\ transkription_position, new_tp_dict, tr_xmin) box_paths.remove(containing_boxes[0]) for replace_tp in new_tp_dict.keys(): for tp in new_tp_dict.get(replace_tp): self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp) self.transkription_positions.remove(replace_tp) word_over_box = self._get_partial_word_over_box() update_transkription_position_ids(self) return word_over_box def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def set_writing_process_id_to_transkription_positions(self, page): """Determines the writing process id of the transkription_positions. 
""" for transkription_position in self.transkription_positions: if len(transkription_position.positional_word_parts) > 0: for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): if font_key in page.fontsizekey2stage_mapping.keys(): transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) def simplify_transkription_positions(self): """Merge transkription_positions if possible. """ index = len(self.transkription_positions)-1 while index > 0\ and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]: current_tp = self.transkription_positions[index] index -= 1 previous_tp = self.transkription_positions[index] if previous_tp.is_mergebale_with(current_tp): positional_word_parts = previous_tp.positional_word_parts positional_word_parts += current_tp.positional_word_parts transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id) if len(transkription_positions) == 1: transkription_positions[0].writing_process_id = previous_tp.writing_process_id\ if previous_tp.writing_process_id != -1\ else current_tp.writing_process_id self.transkription_positions.pop(index+1) self.transkription_positions[index] = transkription_positions[0] #print(self.text, len(self.transkription_positions)) def split(self, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. 
""" previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split') next_text = ''.join([ pwp.text for pwp in next_pwps ]) nextWord = 
Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) return previousWord, currentWord, nextWord def split_according_to_status(self, status, splits_are_parts=False): """Split a word according to its transkription_positions' text. :return: a list of new word.Word """ new_words = [] if self.has_mixed_status(status): last_status = None transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.__dict__[status] != last_status\ and len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.__dict__[status] if len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) if splits_are_parts: self.word_parts += new_words if len(self.word_parts) > 0: self.transkription_positions = [] return new_words def undo_partitioning(self): """Undo partitioning. 
""" if len(self.word_parts) > 0: for word_part in self.word_parts: word_part.undo_partitioning() if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]): self.transkription_positions += word_part.transkription_positions self.earlier_version = None self.edited_text = None self.word_box = None self.word_parts = [] self.corrections = [] self.earlier_versions = [] self.box_paths = [] def _create_new_word(self, transkription_positions, status, new_id=0): """Create a new word from self and transkription_positions. """ newWord = Word(id=new_id, transkription_positions=transkription_positions) for key in self.COPY_PROPERTY_KEY: if key != status and key in self.__dict__.keys(): newWord.__dict__[key] = self.__dict__[key] if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys(): newWord.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(transkription_positions[0].__dict__[status]) else: newWord.__dict__[status] = transkription_positions[0].__dict__[status] return newWord def _get_parts_with_property_key(self, property_key): """Return a list of word_parts with property == property_key. """ word_parts = [] for word_part in self.word_parts: if property_key in word_part.__dict__.keys(): word_parts.append(word_part) else: word_parts += word_part._get_parts_with_property_key(property_key) return word_parts def _get_partial_word_over_box(self): """Partition a word according to its transkription_positions' has_box ->split word and add partial words as its parts. 
        :return: word over box or self
        """
        word_over_box = None
        if self.has_mixed_status('has_box'):
            # group consecutive transkription_positions by their has_box value;
            # a group whose box is not None yields the word_over_box
            transkription_positions = []
            last_word_box = None
            for transkription_position in self.transkription_positions:
                if transkription_position.has_box != last_word_box\
                and len(transkription_positions) > 0:
                    newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                            transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                    self.word_parts.append(newWord)
                    if last_word_box is not None:
                        word_over_box = newWord
                        word_over_box.word_box = last_word_box
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_word_box = transkription_position.has_box
            # flush the last group
            if len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_word_box is not None:
                    word_over_box = newWord
                    word_over_box.word_box = last_word_box
            self.transkription_positions = []
        elif len(self.word_parts) > 0:
            #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box')
            # take the first part that yields a word over a box
            for word_part in self.word_parts:
                if word_over_box is None:
                    word_over_box = word_part._get_partial_word_over_box()
                else:
                    break
        elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
            # uniform status with exactly one boxed position: this word itself is over the box
            word_over_box = self
            word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
        return word_over_box

    def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
        """Set box_path to transkription_position that is contained by box_path.

        Create new transkription_positions by splitting old ones if necessary and add them to new_transkription_positions_dictionary.
""" if box_path.contains_path(word_path): transkription_position.has_box = box_path elif box_path.contains_start_of_path(word_path): split_position = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[0].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path elif box_path.contains_end_of_path(word_path): split_position = box_path.path.bbox()[0] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path else: # box_path in the middle of word_pathz split_position1 = box_path.path.bbox()[0] - tr_xmin split_position2 = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position1, split_position2) if len(new_tps) >= 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 91) +++ svgscripts/datatypes/page.py (revision 92) @@ -1,273 +1,280 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import sys import warnings from .box import Box from .color import Color from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .faksimile_position import FaksimilePosition from .lineNumber import LineNumber from .line import Line from .mark_foreign_hands import MarkForeignHands from .matrix import Matrix from .path import Path from .positional_word_part import PositionalWordPart from .super_page import SuperPage from .style import Style from .text_connection_mark import TextConnectionMark from .text_field import TextField from .transkriptionField import TranskriptionField from .writing_process import WritingProcess from .word import Word from .word_insertion_mark import WordInsertionMark sys.path.append('py2ttl') from class_spec import SemanticClass FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK class Page(SemanticClass,SuperPage): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. 
""" UNITTESTING = False def __init__(self, xml_source_file, faksimile_image=None, faksimile_svgFile=None): super(Page,self).__init__(xml_source_file) self.update_property_dictionary('faksimile_image', faksimile_image) self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile) self.init_all_properties() self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.init_node_objects() @classmethod def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None): """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT. [optional: instantiation depends on the fulfilment of a status_contains and/or on the selection of some words by a word_selection_function]. """ source_tree = ET.parse(xml_file) if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION: page = cls(xml_file) if word_selection_function is None or len(word_selection_function(page.words)) > 0: return [ page ] else: return [] elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: pages = [] xpath = '//page/@output' if status_contains != '' and status_not_contain != '': xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain) elif status_contains != '': xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains) elif status_not_contain != '': xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain) for xml_source_file in source_tree.xpath(xpath): if isfile(xml_source_file): pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function) return pages else: return [] @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. 
""" dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'number': { 'class': str, 'cardinality': 1},\ 'faksimile_image': { 'class': FaksimileImage, 'cardinality': 1},\ 'orientation': { 'class': str, 'cardinality': 1},\ 'svg_image': { 'class': SVGImage, 'cardinality': 1}} properties.update(cls.create_semantic_property_dictionary('text_field', TextField,\ cardinality=1, name='pageIsOnTextField', label='page is on text field',\ comment='Relates a page to the text field on a faksimile image.')) for key in [ 'lines', 'words', 'writing_processes', 'word_deletion_paths', 'word_insertion_marks']: properties.update(cls.create_semantic_property_dictionary(key, list)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def init_node_objects(self): """Initialize all node objects. """ self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ] self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ] self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] self.word_deletion_paths = [ Path(node=node) for node in self.page_tree.xpath('//' + 
Path.WORD_DELETION_PATH_TAG) ] if self.faksimile_image is not None and self.text_field is not None: for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks: simple_word.init_word(self) for wim in self.word_insertion_marks: if wim.line_number > -1: wim.line = [ line for line in self.lines if line.id == wim.line_number ][0] def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]): """Update word ids and attach them to page.page_tree. """ if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None): """Update the data source of page. 
""" if faksimile_svgFile is not None: self.faksimile_svgFile = faksimile_svgFile data_node = self.page_tree.xpath('.//data-source')[0]\ if len(self.page_tree.xpath('.//data-source')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'data-source') data_node.set('file', self.faksimile_svgFile) if xml_correction_file is not None: data_node.set('xml-corrected-words', xml_correction_file) def update_line_number_area(self, transkription_field, svg_tree=None): """Determines the width of the area where the line numbers are written in the page.source file. """ THRESHOLD = 0.4 if svg_tree is None: svg_tree = ET.parse(self.source) if len(self.line_numbers) > 1: line_number = self.line_numbers[9]\ if transkription_field.is_page_verso() and len(self.line_numbers) > 8\ else self.line_numbers[1] ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\ if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\ and LineNumber.IS_A_LINE_NUMBER(item)\ and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ] if len(ln_nodes) > 0: matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform')) if transkription_field.is_page_verso(): transkription_field.add_line_number_area_width(matrix.getX()) elif self.svg_file is not None and isfile(self.svg_file): svg_path_tree = ET.parse(self.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } svg_x = matrix.getX() svg_y = self.line_numbers[1].bottom + transkription_field.ymin use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) if 
len(d_strings) > 0 and d_strings[0] != '': path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin transkription_field.add_line_number_area_width(matrix.getX() + width) def update_page_type(self, transkription_field=None): """Adds a source to page and attaches it to page_tree. """ if transkription_field is None: if self.source is None or not isfile(self.source): raise FileNotFoundError('Page does not have a source!') transkription_field = TranskriptionField(self.source) self.page_type = Page.PAGE_VERSO\ if transkription_field.is_page_verso()\ else Page.PAGE_RECTO self.page_tree.getroot().set('pageType', self.page_type) - def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False): + def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False): """Update styles of words and add them to their transkription_positions. Args: add_to_parents: Add styles also to word (and if not None to manuscript). partition_according_to_styles: Partition word if its transkription_positions have different styles. 
""" style_dictionary = {} if words is None: words = self.words for word in words: if len(word.word_parts) > 0: - self.update_styles(words=word.word_parts, manuscript=manuscript,\ + self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\ add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles) for transkription_position in word.transkription_positions: if len(transkription_position.positional_word_parts) > 0: style_class = transkription_position.positional_word_parts[0].style_class writing_process_id = -1 for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]: writing_process_id = self.fontsizekey2stage_mapping.get(font_key) style_class_key = (Style.remove_irrelevant_style_keys(style_class, self), writing_process_id) - if style_dictionary.get(style_class_key) is None: - style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript) - style_dictionary[style_class_key].writing_process_id = style_class_key[1] - transkription_position.style = style_dictionary[style_class_key] - if add_to_parents and transkription_position.style not in word.styles: - word.styles.append(transkription_position.style) + if create_css: + if style_dictionary.get((style_class_key, word.deleted)) is None: + style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\ + create_css=create_css, deleted=word.deleted ) + transkription_position.style = style_dictionary[(style_class_key, word.deleted)] + #print(style_dictionary[(style_class_key, word.deleted)]) + else: + if style_dictionary.get(style_class_key) is None: + style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css) + style_dictionary[style_class_key].writing_process_id = style_class_key[1] + transkription_position.style = 
style_dictionary[style_class_key] + if add_to_parents and transkription_position.style not in word.styles: + word.styles.append(transkription_position.style) if partition_according_to_styles: word.split_according_to_status('style', splits_are_parts=True) if manuscript is not None\ and add_to_parents: manuscript.update_styles(*style_dictionary.values()) Index: svgscripts/datatypes/simple_word.py =================================================================== --- svgscripts/datatypes/simple_word.py (revision 91) +++ svgscripts/datatypes/simple_word.py (revision 92) @@ -1,123 +1,124 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent a simple word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc from lxml import etree as ET import sys from .line import Line from .faksimile_position import FaksimilePosition from .transkription_position import TranskriptionPosition from .word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass class SimpleWord(SemanticClass, metaclass=abc.ABCMeta): """ This class represents a simple word. 
""" XML_TAG = 'simple-word' XML_SUB_TAG = 'content' def __init__(self, id=0, line_number=-1, line=None, text='', deleted=False, transkription_positions=None, faksimile_positions=None): self.id = id self.text = text self.line_number = line_number self.lines = [] if line is not None: self.lines.append(line) self.transkription_positions = transkription_positions if transkription_positions is not None else [] self.faksimile_positions = faksimile_positions if faksimile_positions is not None else [] def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() if len(target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)) > 0: word_node = target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)[0] word_node.getparent().remove(word_node) word_node = ET.SubElement(target_tree, self.XML_TAG, attrib={'id': str(self.id)}) word_node.set('text', self.text) if self.line_number > -1: word_node.set('line-number', str(self.line_number)) for transkription_position in self.transkription_positions: transkription_position.attach_object_to_tree(word_node) for faksimile_position in self.faksimile_positions: faksimile_position.attach_object_to_tree(word_node) return word_node @classmethod def create_cls(cls, word_node): """Creates a cls from a (lxml.Element) node. 
[:return:] cls """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else -1 text = word_node.get('text') transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('./' + WordPosition.TRANSKRIPTION) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('./' + WordPosition.FAKSIMILE) ] return cls(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) else: error_msg = 'word_node has not been defined' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'lines': {cls.CLASS_KEY: Line,\ cls.CARDINALITY: 1,\ cls.CARDINALITY_RESTRICTION: 'minCardinality',\ cls.PROPERTY_NAME: 'wordBelongsToLine',\ cls.PROPERTY_LABEL: 'word belongs to a line',\ cls.PROPERTY_COMMENT: 'Relating a word to a line.'}} properties.update(cls.create_semantic_property_dictionary('transkription_positions', TranskriptionPosition,\ name='hasTranskriptionPosition', cardinality=1, cardinality_restriction='minCardinality')) properties.update(cls.create_semantic_property_dictionary('faksimile_positions', FaksimilePosition,\ name='hasFaksimilePosition')) #, cardinality=1, cardinality_restriction='minCardinality')) - properties.update(cls.create_semantic_property_dictionary('text', str, cardinality=1)) + properties.update(cls.create_semantic_property_dictionary('text', str, cardinality=1,\ + subPropertyOf=cls.HOMOTYPIC_HAS_TEXT_URL_STRING)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def init_word(self, page): """Initialize word with objects 
from page. """ for transkription_position in self.transkription_positions: transkription_position.svg_image = page.svg_image self.faksimile_positions = FaksimilePosition.create_list_of_cls(self.faksimile_positions, page.faksimile_image, page.text_field) if self.line_number > -1: self.lines += [ line for line in page.lines if line.id == self.line_number ] Index: svgscripts/datatypes/manuscript.py =================================================================== --- svgscripts/datatypes/manuscript.py (revision 91) +++ svgscripts/datatypes/manuscript.py (revision 92) @@ -1,141 +1,142 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent an archival unity of manuscript pages, i.e. workbooks, notebooks, folders of handwritten pages. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .page import Page, FILE_TYPE_XML_MANUSCRIPT, FILE_TYPE_SVG_WORD_POSITION from .color import Color sys.path.append('py2ttl') from class_spec import SemanticClass sys.path.append('shared_util') from myxmlwriter import parse_xml_of_type, write_pretty, xml_has_type class ArchivalManuscriptUnity(SemanticClass): """ This class represents an archival unity of manuscript pages (workbooks, notebooks and portfolios of handwritten pages). @label archival unity of manuscript pages Args: title title of archival unity manuscript_type type of manuscript: 'Arbeitsheft', 'Notizheft', 'Mappe' manuscript_tree lxml.ElementTree """ XML_TAG = 'manuscript' XML_COLORS_TAG = 'colors' TYPE_DICTIONARY = { 'Mp': 'Mappe', 'N': 'Notizheft', 'W': 'Arbeitsheft' } UNITTESTING = False def __init__(self, title='', manuscript_type='', manuscript_tree=None): self.colors = [] self.manuscript_tree = manuscript_tree self.manuscript_type = manuscript_type self.pages = [] self.styles = [] self.title = title if self.manuscript_type == '' and self.title != ''\ and self.title.split(' ')[0] in self.TYPE_DICTIONARY.keys(): self.manuscript_type = self.TYPE_DICTIONARY[self.title.split(' ')[0]] def get_name_and_id(self): """Return an identification for object as 2-tuple. """ return '', self.title.replace(' ', '_') @classmethod def create_cls(cls, xml_manuscript_file, page_status_list=None, page_xpath='', update_page_styles=False): """Create an instance of ArchivalManuscriptUnity from a xml file of type FILE_TYPE_XML_MANUSCRIPT. 
:return: ArchivalManuscriptUnity """ manuscript_tree = parse_xml_of_type(xml_manuscript_file, FILE_TYPE_XML_MANUSCRIPT) title = manuscript_tree.getroot().get('title') if bool(manuscript_tree.getroot().get('title')) else '' manuscript_type = manuscript_tree.getroot().get('type') if bool(manuscript_tree.getroot().get('type')) else '' manuscript = cls(title=title, manuscript_type=manuscript_type, manuscript_tree=manuscript_tree) manuscript.colors = [ Color.create_cls(node=color_node) for color_node in manuscript_tree.xpath('.//' + cls.XML_COLORS_TAG + '/' + Color.XML_TAG) ] if page_xpath == '': page_status = '' if page_status_list is not None\ and type(page_status_list) is list\ and len(page_status_list) > 0: page_status = '[' + ' and '.join([ f'contains(@status, "{status}")' for status in page_status_list ]) + ']' page_xpath = f'//pages/page{page_status}/@output' manuscript.pages = [ Page(page_source)\ for page_source in manuscript_tree.xpath(page_xpath)\ if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ] if update_page_styles: - for page in manuscript.pages: page.update_styles(manuscript=manuscript, add_to_parents=True) + for page in manuscript.pages: page.update_styles(manuscript=manuscript, add_to_parents=True, create_css=True) return manuscript def get_color(self, hex_color) -> Color: """Return color if it exists or None. """ if hex_color in [ color.hex_color for color in self.colors ]: return [ color for color in self.colors if color.hex_color == hex_color ][0] return None @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. 
""" dictionary = {} class_dict = cls.get_class_dictionary() properties = {} properties.update(cls.create_semantic_property_dictionary('title', str, 1)) properties.update(cls.create_semantic_property_dictionary('manuscript_type', str, 1)) properties.update(cls.create_semantic_property_dictionary('styles', list)) properties.update(cls.create_semantic_property_dictionary('pages', list)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def update_colors(self, color): """Update manuscript colors if color is not contained. """ if self.get_color(color.hex_color) is None: self.colors.append(color) if self.manuscript_tree is not None: if len(self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)) > 0: self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)[0].getparent().remove(self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)[0]) colors_node = ET.SubElement(self.manuscript_tree.getroot(), self.XML_COLORS_TAG) for color in self.colors: color.attach_object_to_tree(colors_node) if not self.UNITTESTING: write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_tree.docinfo.URL,\ script_name=__file__, backup=True,\ file_type=FILE_TYPE_XML_MANUSCRIPT) def update_styles(self, *styles): """Update manuscript styles. 
""" for style in styles: if style not in self.styles: + #print(style.css_styles) self.styles.append(style) Index: tests_svgscripts/test_data/N_VII_1_page006.xml =================================================================== --- tests_svgscripts/test_data/N_VII_1_page006.xml (revision 91) +++ tests_svgscripts/test_data/N_VII_1_page006.xml (revision 92) @@ -1,1276 +1,1276 @@ svgWordPosition 2019-08-02 15:17:37 2019-08-02 15:17:37 2019-08-02 15:30:59 2019-08-02 15:30:59 - 2020-03-27 17:38:19 + 2020-05-12 10:14:00 Index: tests_svgscripts/test_style.py =================================================================== --- tests_svgscripts/test_style.py (revision 91) +++ tests_svgscripts/test_style.py (revision 92) @@ -1,84 +1,92 @@ import unittest from os import sep, path from os.path import dirname, basename, isfile, isdir import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.color import Color from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.page import Page from datatypes.style import Style class TestStyle(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.test_file = DATADIR + sep + 'test.xml' self.test_svg_file = DATADIR + sep + 'test421.svg' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml' self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.test_page = DATADIR + sep + 'N_VII_1_page001.xml' self.test_manuscript = DATADIR + sep + 'N_VII_1.xml' def test_create_cls(self): page = Page(self.test_page) style_string = "st11 st10 st5" style = Style.create_cls(page, style_string) self.assertEqual(style.font_family, 'Weidemann-Book') self.assertEqual(style.color.hex_color, "#DADADA") self.assertEqual(style.writing_instrument, 'schwarze Tinte') style_string = "st11 st10" style = 
Style.create_cls(page, style_string) self.assertEqual(style.font_family, 'Weidemann-Book') self.assertEqual(style.color.name, "black") self.assertEqual(style.writing_instrument, 'schwarze Tinte') + style_string = "st11 st3" + style = Style.create_cls(page, style_string, create_css=True) + self.assertEqual(style.font_family, 'Weidemann-Book') + self.assertEqual(style.font_size, '9px') def test_remove_irrelevant_style_keys(self): page = Page(self.test_page) style_string = "st11 st10 st9 st5 st0" self.assertEqual(Style.remove_irrelevant_style_keys(style_string, page), "st11 st5 st9") def test_process_style_classes(self): style = Style() style.color = Color.create_cls(hex_color='#009CDE') style.process_style_classes() self.assertEqual(style.writing_instrument, 'violette Tinte') self.assertEqual(style.font, 'deutsche Schreibschrift') style.font_family = "NewsGothicBT-Bold" style.process_style_classes() self.assertEqual(style.writing_instrument, 'Blaustift') self.assertEqual(style.font, 'lateinische Schreibschrift') + style = Style() + style.font_family = "NewsGothicBT-Bold" + style.process_style_classes() + #print(style.css_styles) def test_get_semantic_dictionary(self): dictionary = Style.get_semantic_dictionary() #print(dictionary) def test_copy(self): manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript) page = Page(self.test_page) page.words = [ page.words[0] ] page.update_styles(manuscript=manuscript, add_to_parents=True) self.assertEqual(len(manuscript.styles), 1) styleA = page.words[0].transkription_positions[0].style styleB = styleA.create_a_copy() self.assertEqual(styleA == styleB, True) styleB = styleA.create_a_copy(reduce_writing_process_id=True) self.assertEqual(styleA != styleB, True) def test_eq(self): page = Page(self.test_page) style_string = "st11 st10 st5" styleA = Style.create_cls(page, style_string) styleB = Style.create_cls(page, style_string) self.assertEqual(styleA == styleB, True) style_string = "st11 st10" styleC = 
Style.create_cls(page, style_string) self.assertEqual(styleA != styleC, True) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_word.py =================================================================== --- tests_svgscripts/test_word.py (revision 91) +++ tests_svgscripts/test_word.py (revision 92) @@ -1,455 +1,456 @@ import unittest from os import sep, path import lxml.etree as ET import sys sys.path.append('svgscripts') from process_words_post_merging import reset_page, update_writing_process_ids from datatypes.box import Box from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.matrix import Matrix import datatypes.page from datatypes.path import Path from datatypes.positional_word_part import PositionalWordPart from datatypes.style import Style from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word, execute_function_on_parts, update_transkription_position_ids from datatypes.word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass class Page: def __init__(self): self.svg_file = None def get_line_number(self, input=0): return -1 def get_biggest_fontSize4styles(self, style_set={}): return 7 class TestWord(unittest.TestCase): TESTCASE = None def setUp(self): DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_file = DATADIR + sep + 'N_VII_1_page009.xml' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }] x = 0 for dict in self.word_part_objs: dict['class'] = 'st22' dict['x'] = x dict['y'] = 11 x += 1 mylist = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true' } word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)')) self.transkription_positions = [ word_position ] 
self.word_node = ET.Element('word', attrib=mylist) word_position.attach_object_to_tree(self.word_node) x = 0 for char in mylist['text']: ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' }) x += 1 def test_Word_with_word_part_objs(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) self.assertEqual(word.id, 0) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') def test_Word_with_word_node(self): word = Word.create_cls(self.word_node) self.assertEqual(word.id, 0) self.assertEqual(word.deleted, True) self.assertEqual(word.transkription_positions[0].bottom, 11) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 1) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') self.assertEqual(word.line_number, 2) self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True) def test_attach_word_to_tree(self): newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) empty_tree = ET.ElementTree(ET.Element('page')) newWord.attach_word_to_tree(empty_tree) for word_node in empty_tree.getroot().xpath('//word'): word = Word.CREATE_WORD(word_node=word_node) self.assertEqual(word.id, 0) self.assertEqual(word.deleted, False) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) 
self.assertEqual(word.text, 'abc') @unittest.skipUnless(TESTCASE is None or TESTCASE == 0, 'Not testing this case') def test_create_correction_history_case0(self): # Case 1: whole word over box box = Box(earlier_text='XYX') word = Word(text='ASDF', transkription_positions=[TranskriptionPosition()]) word.word_box = box word.create_correction_history() self.assertEqual(word.earlier_version is None, True) self.assertEqual(word.overwrites_word is not None, True) @unittest.skipUnless(TESTCASE is None or TESTCASE == 1, 'Not testing this case') def test_create_correction_history_case1(self): # Case 2: part of word over box box = Box(earlier_text='XYX') partA = Word(text='A', transkription_positions=[TranskriptionPosition()]) partA.word_box = box partB = Word(text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) word.create_correction_history() self.assertEqual(word.earlier_version is None, True) self.assertEqual(word.word_parts[0].overwrites_word is not None, True) @unittest.skipUnless(TESTCASE is None or TESTCASE == 2, 'Not testing this case') def test_create_correction_history_case3(self): # Case 3: part of word over box, word under box is part of earlier version box = Box(earlier_text='XYX') tp0 = TranskriptionPosition() tp0.style = Style(writing_process_id=0) tp1 = TranskriptionPosition() tp1.style = Style(writing_process_id=1) partA = Word(id=0, text='Test', transkription_positions=[ tp0]) partB = Word(id=1, text='er', transkription_positions=[ tp1]) partB.word_box = box word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] ) word.create_correction_history(box_style=tp0.style) self.assertEqual(word.text, 'Tester') self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.earlier_version.text, 'TestXYX') self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) @unittest.skipUnless(TESTCASE is None or TESTCASE == 3, 'Not 
testing this case') def test_create_correction_history_case4(self): # Case 4: part of word is deleted partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.edited_text, 'SDF') @unittest.skipUnless(TESTCASE is None or TESTCASE == 4, 'Not testing this case') def test_create_correction_history_case5(self): tp0 = TranskriptionPosition() tp0.style = Style(writing_process_id=0) tp1 = TranskriptionPosition() tp1.style = Style(writing_process_id=1) partA = Word(id=0, text='Test', transkription_positions=[ tp0]) partB = Word(id=1, text='er', transkription_positions=[ tp1]) word = Word(text='Tester', word_parts=[ partA, partB ] ) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.word_parts[1].extendsEarlierVersion, True) self.assertEqual(word.word_parts[1].isExtensionOfWord, word.earlier_version) #@unittest.skipUnless(TESTCASE is None or TESTCASE == 5, 'Not testing this case') #@unittest.skip('case tested, relies on a local xml file') def test_create_correction_history_case_full(self): page = datatypes.page.Page('xml/N_VII_1_page138.xml') manuscript = ArchivalManuscriptUnity() reset_page(page) update_writing_process_ids(page) word = [ word for word in page.words if word.text == 'Verschiedenes' and word.line_number == 4 ][0] wordAufBau = [ word for word in page.words if word.text == 'AufBau' ][0] #page.words = [ word ] page.update_styles(manuscript=manuscript, partition_according_to_styles=True) word.word_parts[0].transkription_positions[0].has_box = Box(earlier_text='v') 
self.assertEqual(len(word.word_parts), 2) word_over_box = word._get_partial_word_over_box() update_transkription_position_ids(word) word.create_correction_history(page) self.assertEqual(word.writing_process_id, 1) self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.earlier_version.text, 'verschiedenes') #print(word.earlier_version.id, [ (w.id, w.text) for w in word.earlier_version.word_parts ]) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) #print(ET.dump(word_node)) """ self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) self.assertEqual(word.word_parts[1].overwrites_word is not None, True) """ word = wordAufBau page.words = [ word ] page.update_styles(manuscript=manuscript, partition_according_to_styles=True) word.word_parts[0].deleted = True word.word_parts[1].transkription_positions[0].has_box = Box(earlier_text='b') self.assertEqual(len(word.word_parts), 3) word_over_box = word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 3) update_transkription_position_ids(word) word.create_correction_history(page) self.assertEqual(word.writing_process_id, 2) self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.text, 'AufBau') self.assertEqual(word.edited_text, 'Bau') self.assertEqual(word.earlier_version.text, 'Aufbau') self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) self.assertEqual(word.word_parts[1].overwrites_word is not None, True) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) #print(ET.dump(word_node)) newWord = Word.create_cls(word_node) #@unittest.skip('') def test_earlier_version(self): partA = Word(id=0, text='A', 
deleted=True, transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) earlier_version = word.create_earlier_version() self.assertEqual(earlier_version is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord, earlier_version.word_parts[0]) def test_undo_partitioning(self): tps = [] for i, xy in enumerate([ 3, 4, 5 ]): tps.append(TranskriptionPosition(id=i, x=xy, y=xy, height=10, width=10)) partA = Word(id=0, text='Auf', writing_process_id=1, deleted=True, transkription_positions=[ tps[0]]) partB = Word(id=1, text='B', writing_process_id=2, transkription_positions=[tps[1]]) partC = Word(id=2, text='au', writing_process_id=1,transkription_positions=[tps[2]]) word = Word(text='Aufbau', writing_process_id=2, word_parts=[ partA, partB, partC ] ) word.undo_partitioning() self.assertEqual(len(word.transkription_positions), len(tps)) self.assertEqual(len(word.word_parts), 0) """ page = datatypes.page.Page('xml/N_VII_1_page138.xml') word = page.words[77] word.undo_partitioning() self.assertEqual(len(word.word_parts), 0) self.assertEqual(len(word.transkription_positions), 3) update_transkription_position_ids(word) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) print(ET.dump(word_node)) """ def test_split(self): page = Page() pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('b') self.assertEqual(previousWord.id, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(currentWord.id, 1) self.assertEqual(nextWord.id, 2) 
word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('bc') self.assertEqual(previousWord.id, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(currentWord.id, 1) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('ab', start_id=10) self.assertEqual(currentWord.id, 10) self.assertEqual(currentWord.text, 'ab') self.assertEqual(currentWord.transkription_positions[0].width, 2.1) self.assertEqual(nextWord.id, 11) self.assertEqual(nextWord.transkription_positions[0].width, 5.2) word_part_objs=[{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\ {'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\ {'text': 'x', 'class':'st22', 'x': 10, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split('Insofer') word_part_objs=[{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split('Insofern') def test_join(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word) self.assertEqual(word.text, 'abc.') other_word = Word.CREATE_WORD(word_part_objs=[{'text': 
'.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word, append_at_end_of_new_word=False) self.assertEqual(word.text, '.abc.') """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_get_semanticAndDataDict(self): dictionary = Word.get_semantic_dictionary() + #print(dictionary) info_dict = dictionary['properties'].get('isDeletionOfWord') self.assertEqual(SemanticClass.SUPER_PROPERTY in info_dict.keys(), True) super_info_dict = info_dict[SemanticClass.SUPER_PROPERTY] #print(info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME)) def test_simplify_transkription_positions(self): node_string = """ """ nodeA = ET.fromstring(node_string) node_string = """ """ nodeB = ET.fromstring(node_string) word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ]) self.assertEqual(len(word.transkription_positions), 2) word.simplify_transkription_positions() self.assertEqual(len(word.transkription_positions), 1) word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ]) word.transkription_positions[1].writing_process_id = -1 word.simplify_transkription_positions() self.assertEqual(len(word.transkription_positions), 1) self.assertEqual(word.transkription_positions[0].writing_process_id, 0) """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_partition(self): page = datatypes.page.Page(self.test_file) word = page.words[67] self.assertEqual(word.belongs_to_multiple_writing_processes(), True) word.partition_according_to_writing_process_id() self.assertEqual(len(word.word_parts), 3) self.assertEqual(word.belongs_to_multiple_writing_processes(), False) self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True) empty_tree = ET.ElementTree(ET.Element('page')) word_node = 
word.attach_word_to_tree(empty_tree) newWord = Word.create_cls(word_node) self.assertEqual(len(newWord.word_parts), 3) #print(ET.dump(empty_tree.getroot())) def test_partition_deletion(self): page = datatypes.page.Page(self.test_file) word = page.words[67] for transkription_position in word.transkription_positions: transkription_position.deleted = transkription_position.writing_process_id == 1 self.assertEqual(word.has_mixed_status('deleted'), True) word.partition_according_to_deletion() self.assertEqual(len(word.word_parts), 3) self.assertEqual(word.has_mixed_status('deleted'), False) self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True) page = datatypes.page.Page(self.test_file) word = page.words[67] word.partition_according_to_writing_process_id() #print([(word.text, word.deleted) for word in word.word_parts]) word.word_parts[1].transkription_positions[1].deleted = True word.partition_according_to_deletion() self.assertEqual(len(word.word_parts), 4) #print([(word.text, word.deleted) for word in word.word_parts]) partA = Word(text='A', deleted=True) partB = Word(text='SDF', deleted=False) word = Word(text='ASDF', word_parts=[ partA, partB]) self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True) def test_execute_function_on_parts(self): page = datatypes.page.Page(self.test_file) word_parts = [ page.words[67], page.words[68] ] word_parts, none = execute_function_on_parts(word_parts, 'partition_according_to_writing_process_id') self.assertEqual(len(word_parts) == 4, True) def test_process_word_boxes(self): page = datatypes.page.Page(self.pdf_xml) page.source = self.pdf_xml_source page.update_styles(partition_according_to_styles=True) tr = TranskriptionField(page.source) box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\ 'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\ 'M 404.193,659.565 L 407.80699999999996,659.565 L 
407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\ 'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\ 'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315'] box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ] indices = [30, 277, 288, 297, 321] for word_id, index in enumerate(indices): word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin) self.assertEqual(word_over_box is not None, True) self.assertEqual(word_over_box == page.words[index] or word_over_box in page.words[index].word_parts, True) #self.assertEqual(word_over_box in page.words[index].word_parts, True) def test_process_word_several_boxesOn1LIne(self): page = datatypes.page.Page(self.pdf_xml) page.source = self.pdf_xml_source for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() tr = TranskriptionField(page.source) box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\ 'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\ 'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\ 'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\ 'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315'] box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ] indices = [30, 277, 288, 297, 321] empty_tree = ET.ElementTree(ET.Element('page')) for word_id, index in enumerate(indices): word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin) self.assertEqual(word_over_box is not None, True) def test_split_according_to_status(self): page = datatypes.page.Page(self.test_file) word = page.words[67] for 
transkription_position in word.transkription_positions: transkription_position.text = 'asdf'\ if transkription_position.writing_process_id == 1\ else word.text self.assertEqual(word.has_mixed_status('text'), True) new_words = word.split_according_to_status('text') #print([word.text for word in new_words ]) self.assertEqual(len(new_words) > 1, True) self.assertEqual(new_words[0].id, word.id) self.assertEqual(new_words[0].deleted, word.deleted) self.assertEqual(new_words[1].id, word.id+1) manuscript = ArchivalManuscriptUnity() page = datatypes.page.Page(self.test_file) word = page.words[67] page.words = [ word ] page.update_styles(manuscript=manuscript) new_words = word.split_according_to_status('style', splits_are_parts=True) self.assertEqual(len(word.word_parts), 3) def test__create_new_word(self): manuscript = ArchivalManuscriptUnity() page = datatypes.page.Page(self.test_file) word = page.words[67] page.words = [ word ] page.update_styles(manuscript=manuscript) newWord = word._create_new_word([ word.transkription_positions[0] ], 'style') for key in Word.COPY_PROPERTY_KEY: self.assertEqual(newWord.__dict__[key], word.__dict__[key]) self.assertEqual(len(newWord.styles), 1) def test__get_partial_word_over_box(self): word = Word(text='test', transkription_positions=[ TranskriptionPosition(id=0), TranskriptionPosition(id=1) ]) word.transkription_positions[0].has_box = Box(earlier_text='asdf') word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 2) partA = Word(id=0, text='A', transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition(), TranskriptionPosition(id=1)]) partB.transkription_positions[0].has_box = Box(earlier_text='asdf') word = Word(text='ASDF', word_parts=[ partA, partB]) word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 2) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_page.py 
import unittest
from os import sep, path
from os.path import isdir, isfile, dirname, basename
import lxml.etree as ET
import sys
# NOTE(review): the original had a duplicate `import sys` here; removed.

sys.path.append('svgscripts')
dir_changed = False
if not isdir('datatypes'):
    # running from the repository root: make the package directory importable
    sys.path.append(dirname(sys.path[0]))
    dir_changed = True

from datatypes.lineNumber import LineNumber
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.style import Style
from datatypes.writing_process import WritingProcess
from datatypes.word import Word


class TestPage(unittest.TestCase):
    """Tests for datatypes.page.Page against the fixture files in test_data/."""

    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        if not isdir(DATADIR):
            DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
        self.test_file = DATADIR + sep + 'test.xml'
        self.test_svg_file = DATADIR + sep + 'test421.svg'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
        self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page001.xml'
        self.test_manuscript = DATADIR + sep + 'N_VII_1.xml'
        self.test_styles_color = DATADIR + sep + 'N_VII_1_page013.xml'

    def test_Page(self):
        """Basic page metadata, style dictionary and font-size -> writing-stage mapping."""
        page = Page(self.test_file)
        self.assertEqual(page.title, 'Mp XIV 1')
        self.assertEqual(page.number, '421')
        self.assertEqual(len(page.sonderzeichen_list), 2)
        self.assertEqual('st21' in page.sonderzeichen_list, True)
        self.assertEqual('st23' in page.sonderzeichen_list, True)
        self.assertEqual(page.style_dict['st0']['fill'], '#F8F9F8')
        # font sizes must strictly decrease from stage 0 to stage 2
        stage0 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 0 ]
        stage1 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 1 ]
        stage2 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 2 ]
        fontStage0 = float(page.style_dict.get(stage0[0]).get('font-size').replace('px',''))
        fontStage1 = float(page.style_dict.get(stage1[0]).get('font-size').replace('px',''))
        fontStage2 = float(page.style_dict.get(stage2[0]).get('font-size').replace('px',''))
        self.assertEqual(fontStage0 > fontStage1, True)
        self.assertEqual(fontStage1 > fontStage2, True)

    def test_get_biggest_fontSize4styles(self):
        page = Page(self.test_file)
        style_set = { 'st12', 'st2', 'st14', 'st13' }
        self.assertEqual(page.get_biggest_fontSize4styles(style_set=style_set), 10)

    def test_get_words(self):
        page = Page(self.test_file)
        words = page.words
        self.assertEqual(len(words), 440)
        self.assertEqual(words[0].text, '$')
        self.assertEqual(words[439].text, 'mußte!')

    def test_get_line_number(self):
        """Line number lookup from the vertical center of a word's transkription position."""
        page = Page(self.test_file)
        self.assertEqual(page.get_line_number( (page.words[0].transkription_positions[0].bottom+page.words[0].transkription_positions[0].top)/2), 1)
        self.assertEqual(page.get_line_number( (page.words[27].transkription_positions[0].bottom+page.words[27].transkription_positions[0].top)/2), 2)
        self.assertEqual(page.get_line_number( (page.words[105].transkription_positions[0].bottom+page.words[105].transkription_positions[0].top)/2), 7)

    def test_update_page_type(self):
        page = Page(self.pdf_xml)
        tf = TranskriptionField(self.pdf_xml_source)
        page.update_page_type(transkription_field=tf)
        self.assertEqual(page.page_type, Page.PAGE_VERSO)
        #page = Page(self.xml_fileB)
        #page.update_page_type()
        #self.assertEqual(page.page_type, Page.PAGE_RECTO)

    def test_update_line_number_area(self):
        """The detected line-number area must be a narrow (0..15) strip on both fixture pages."""
        page = Page(self.xml_file)
        transkription_field = TranskriptionField(page.source)
        page.update_line_number_area(transkription_field)
        self.assertEqual(transkription_field.line_number_area_width > 0, True)
        self.assertEqual(transkription_field.line_number_area_width < 15, True)
        page = Page(self.xml_fileB)
        transkription_field = TranskriptionField(page.source)
        page.update_line_number_area(transkription_field)
        self.assertEqual(transkription_field.line_number_area_width > 0, True)
        self.assertEqual(transkription_field.line_number_area_width < 15, True)

    def test_get_pages_from_xml_file(self):
        """Status filters select the merged / not-yet-postmerged subsets of the manuscript's pages."""
        pages = Page.get_pages_from_xml_file(self.test_manuscript)
        self.assertEqual(len(pages), 4)
        self.assertEqual(pages[0].number, '5')
        self.assertEqual(pages[1].number, '6')
        pages = Page.get_pages_from_xml_file(self.test_manuscript, status_contains=STATUS_MERGED_OK)
        self.assertEqual(len(pages), 2)
        self.assertEqual(pages[0].number, '5')
        pages = Page.get_pages_from_xml_file(self.test_manuscript, status_contains=STATUS_MERGED_OK, status_not_contain=STATUS_POSTMERGED_OK)
        self.assertEqual(len(pages), 1)

    def test_get_semantic_dictionary(self):
        # smoke test: building the semantic dictionary must not raise
        dictionary = Page.get_semantic_dictionary()
        #print(dictionary)

    def test_update_styles(self):
        """update_styles resolves word styles (color, font, instrument), partitions, and can emit CSS."""
        page = Page(self.pdf_xml)
        page.words = [ word for word in page.words if word.text == 'Schopenhauer' ]
        page.update_styles(add_to_parents=True)
        self.assertEqual(len(page.words[0].styles), 1)
        self.assertEqual(page.words[0].styles[0].color.name, 'black')
        self.assertEqual(page.words[0].styles[0].font, Style.NIETSCHES_FONTS['latin'])
        self.assertEqual(page.words[0].styles[0].writing_instrument, Style.WRITING_INSTRUMENTS[('black',False)])
        page = Page(self.test_styles_color)
        page.words = [ word for word in page.words if word.text == 'Versöhnlichkeit' ]
        page.update_styles(add_to_parents=True)
        self.assertEqual(len(page.words[0].styles), 1)
        self.assertEqual(page.words[0].styles[0].color.name, 'green')
        self.assertEqual(page.words[0].styles[0].font, Style.NIETSCHES_FONTS['german'])
        self.assertEqual(page.words[0].styles[0].writing_instrument, Style.WRITING_INSTRUMENTS[('green',False)])
        self.assertEqual(page.words[0].styles[0].writing_process_id, WritingProcess.INSERTION_AND_ADDITION)
        # a word merged from two differently styled words is partitioned into two parts
        page = Page(self.test_styles_color)
        page.words = [ word for word in page.words if word.text == 'Versöhnlichkeit' or word.text == 'gewisse' ]
        self.assertEqual(len(page.words), 2)
        word = page.words[0]
        word.transkription_positions += page.words[1].transkription_positions
        page.words = [ word ]
        page.update_styles(add_to_parents=True, partition_according_to_styles=True)
        self.assertEqual(len(page.words[0].word_parts), 2)
        # with create_css=True every resolved style carries at least one CSS rule
        page = Page(self.test_styles_color)
        page.update_styles(add_to_parents=True, create_css=True)
        for word in page.words:
            self.assertTrue(len(word.styles) > 0)
            for style in word.styles:
                self.assertTrue(len(style.css_styles) > 0)

    def test_lock(self):
        """lock/unlock toggles the page lock and records the reference file in the XML tree."""
        page = Page(self.test_tcm_xml)
        self.assertEqual(page.is_locked(), False)
        page.lock('asdf.txt')
        self.assertEqual(page.is_locked(), True)
        self.assertEqual(page.page_tree.xpath('//lock/reference-file/text()')[0], 'asdf.txt')
        page.unlock()
        self.assertEqual(page.is_locked(), False)

if __name__ == "__main__":
    unittest.main()