Index: py2ttl/convert.py =================================================================== --- py2ttl/convert.py (revision 107) +++ py2ttl/convert.py (revision 108) @@ -1,115 +1,118 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert py objects to ontology and data in turtle format. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/> 1}}} from colorama import Fore, Style import getopt import lxml.etree as ET from os import sep, path, listdir from os.path import isfile, isdir, dirname, basename from progress.bar import Bar import re import sys sys.path.append('svgscripts') from datatypes.archival_manuscript import ArchivalManuscriptUnity if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from class_spec import SemanticClass from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL from py2ttl_data import Py2TTLDataConverter from py2ttl_ontology import Py2TTLOntologyConverter sys.path.append('shared_util') from myxmlwriter import xml2dict -from main_util import get_manuscript_files +from main_util import get_manuscript_files_and_include_status __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" FILE_TYPE_XML_PROJECT = "xmlProjectFile" def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert py objects to an owl:Ontology and rdf data in turtle format. py2ttl/convert.py [OPTIONS] [ ...] xml file of type shared_util.myxmlwriter.FILE_TYPE_XML_MANUSCRIPT. OPTIONS: -h|--help: show help -i|--include-status=STATUS include pages with status = STATUS. STATUS is a ':' separated string of statuses, e.g. 'OK:faksimile merged'. :return: exit code (int) """
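For illustration, a typical invocation might look like the following (the project and manuscript file names are hypothetical):

    python3 py2ttl/convert.py xml/project.xml
    python3 py2ttl/convert.py --include-status='OK:faksimile merged' xml/N_VII_1.xml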
check_config_files_exist() datatypes_dir = get_datatypes_dir() source_ontology_file = PROJECT_ONTOLOGY_FILE target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME) manuscript_file = None page_status_list = [ 'OK', 'faksimile merged' ] try: opts, args = getopt.getopt(argv, "hi:", ["help", "include-status="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-i', '--include-status'): page_status_list = arg.split(':') if len(args) < 1 : usage() return 2 ontology_created = False ontology_converter = Py2TTLOntologyConverter(project_ontology_file=source_ontology_file) output = 2 - for manuscript_file in get_manuscript_files(args): + for manuscript_file, include_status in get_manuscript_files_and_include_status(args): if not isfile(manuscript_file): usage() return 2 if not ontology_created: print(Fore.CYAN + 'Create ontology from "{}" ...'.format(manuscript_file)) if ontology_converter.create_ontology(datatypes_dir, target_ontology_file) == 0: print(Fore.GREEN + '[Ontology file {0} created]'.format(target_ontology_file)) ontology_created = True else: return 2 - print(Fore.CYAN + 'Create data from "{}" ...'.format(manuscript_file)) + current_page_status_list = page_status_list\ + if include_status is None\ + else include_status.split(':') + print(Fore.CYAN + f'Create data from "{manuscript_file}" with status "{current_page_status_list}" ...') data_converter = Py2TTLDataConverter(manuscript_file, mapping_dictionary=ontology_converter.uri_mapping4cls_and_properties) - output = data_converter.convert(page_status_list=page_status_list) + output = data_converter.convert(page_status_list=current_page_status_list) return output if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: shared_util/main_util.py =================================================================== --- shared_util/main_util.py (revision 107) +++ shared_util/main_util.py (revision 108) @@ -1,93 +1,103 @@ import lxml.etree as ET from os.path import isfile, isdir, dirname, basename from svgpathtools import svg2paths2, svg_to_paths import sys sys.path.append('svgscripts') from datatypes.path import Path from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition FILE_TYPE_XML_PROJECT = "xmlProjectFile" def create_function_dictionary(list_of_keys, target_function, function_dictionary=None) -> dict: """Create a function_dictionary """ if function_dictionary is None: function_dictionary = {} for key in list_of_keys: function_dictionary.update({key: target_function}) return function_dictionary def get_manuscript_files(args: list) ->list: """Return a list of manuscript files. If first element is of type FILE_TYPE_XML_PROJECT read from xml file and return as list of filenames. """ if len(args) == 1\ and args[0].endswith('.xml')\ and ET.parse(args[0]).getroot().find('metadata/type').text == FILE_TYPE_XML_PROJECT: return ET.parse(args[0]).xpath('//manuscript[contains(@status, "OK")]/@file') return args +def get_manuscript_files_and_include_status(args: list) ->list: + """Return a list of tuples of manuscript files and an optional include status. If the first element is of type FILE_TYPE_XML_PROJECT, read from the + xml file and return a list of tuples of filename (@file) and include status for manuscript pages (@include). + """ + if len(args) == 1\ + and args[0].endswith('.xml')\ + and ET.parse(args[0]).getroot().find('metadata/type').text == FILE_TYPE_XML_PROJECT: + return [ (node.get('file'), node.get('include')) for node in ET.parse(args[0]).xpath('//manuscript[contains(@status, "OK")]')] + return [ (arg, None) for arg in args ] +
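The project file that feeds get_manuscript_files_and_include_status is not shown in this changeset; a minimal sketch, assuming a root whose metadata/type is xmlProjectFile and manuscript nodes carrying @status, @file and an optional @include:

    import lxml.etree as ET

    # Hypothetical project file; @include overrides the default page-status
    # filter for a single manuscript (structure assumed from the xpath above).
    project_xml = b'''<project>
      <metadata><type>xmlProjectFile</type></metadata>
      <manuscript status="OK" file="xml/N_VII_1.xml"/>
      <manuscript status="OK" file="xml/Mp_XIV.xml" include="OK:faksimile merged"/>
    </project>'''

    root = ET.fromstring(project_xml)
    pairs = [ (node.get('file'), node.get('include'))
              for node in root.xpath('//manuscript[contains(@status, "OK")]') ]
    print(pairs)
    # [('xml/N_VII_1.xml', None), ('xml/Mp_XIV.xml', 'OK:faksimile merged')]

In convert.py above, a None include status falls back to the default page_status_list, while a string is split on ':' per manuscript.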
+ """ + if len(args) == 1\ + and args[0].endswith('.xml')\ + and ET.parse(args[0]).getroot().find('metadata/type').text == FILE_TYPE_XML_PROJECT: + return [ (node.get('file'),node.get('include')) for node in ET.parse(args[0]).xpath('//manuscript[contains(@status, "OK")]')] + return args + def extract_paths_on_tf(page, transkription_field=None, new_style_prefix='tln', outsiders=None, outsider_attributes=None) ->list: """Extract all paths on transkription_field. :return: a list of datatypes.path.Path """ if page.source is not None and isfile(page.source): if transkription_field is None: transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) paths, attributes = svg_to_paths.svg2paths(page.source) allpaths_on_tf = [] for index, path in enumerate(paths): attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and path.bbox()[0] >= transkription_field.xmin\ and path.bbox()[1] <= transkription_field.xmax\ and path.bbox()[2] >= transkription_field.ymin\ and path.bbox()[3] <= transkription_field.ymax: style_class = attribute.get('class') if style_class is None and attribute.get('style') is not None: style_class = create_new_style(page, attribute.get('style'), new_style_prefix=new_style_prefix) allpaths_on_tf.append(Path.create_cls(id=index, path=path, style_class=style_class, page=page)) elif outsiders is not None\ and len(path) > 0\ and path != transkription_field.path: style_class = attribute.get('class') if style_class is None and attribute.get('style') is not None: style_class = create_new_style(page, attribute.get('style'), new_style_prefix=new_style_prefix) outsiders.append(Path.create_cls(id=index, path=path, style_class=style_class, page=page)) outsider_attributes.append(attribute) return allpaths_on_tf else: return [] def create_new_style(page, style_attribute_string, new_style_prefix='tln') ->str: """Create new style, update page and return new style_class. """ style_dict = {} style_class = None for key_content in style_attribute_string.split(';'): if ':' in key_content: key, content = tuple(key_content.split(':')) style_dict.update({ key: content}) if style_dict in page.style_dict.values(): style_class = list(page.style_dict.keys())[list(page.style_dict.values()).index(style_dict)] else: new_style_index = len([ k for k in page.style_dict.keys() if k.startswith(new_style_prefix) ]) style_class = f'{new_style_prefix}{new_style_index}' page.style_dict.update({style_class: style_dict }) page.add_style(sonderzeichen_list=page.sonderzeichen_list, letterspacing_list=page.letterspacing_list,\ style_dict=page.style_dict) return style_class def get_paths_near_position(tp: TranskriptionPosition, paths: list, xmin=0, ymin=0, do_not_include_d_attributes=None) ->list: """Given a transkription position and a list of svgscripts.datatypes.path.Path, return a list of paths near this position. """ tp_x = tp.left + (tp.width/2) + xmin tp_y = tp.top + (tp.height/2) + ymin do_not_include_d_attributes = do_not_include_d_attributes if do_not_include_d_attributes is not None else [] return [ path.d_attribute for path in Path.get_nearest_paths(paths, tp_x, tp_y) if path.d_attribute not in do_not_include_d_attributes ] Index: Friedrich-Nietzsche-late-work-ontology.ttl =================================================================== --- Friedrich-Nietzsche-late-work-ontology.ttl (revision 107) +++ Friedrich-Nietzsche-late-work-ontology.ttl (revision 108) @@ -1,63 +1,143 @@ @prefix dct: . @prefix document: . @prefix homotypic: . @prefix stoff: . 
@prefix text: . @prefix owl: <http://www.w3.org/2002/07/owl#> . @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . @prefix tln: . a owl:Ontology; dct:license ; dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en; dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsche's late work."""@en; dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en; dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en; dct:publisher "Basel University, Switzerland"@en. +tln:TextGenesis a owl:Class ; + rdfs:label "identifies a genetic order of text versions"@en ; + rdfs:comment "Identifies a genetic order of text versions, i.e. groups text units as earlier and later versions of each other."@en ; + rdfs:isDefinedBy . + +tln:IdentifiedTextVersion a owl:Class ; + rdfs:label "identifies a list of text unities as a text version"@en ; + rdfs:comment "Identification of a list of text unities (e.g. pages or parts of pages) as a text version for which there is an earlier or later version."@en ; + rdfs:isDefinedBy . + +tln:PartOfPageTextUnit a owl:Class ; + rdfs:label "identifies a part of a page as a text unity"@en ; + rdfs:comment "Identification of a part of a page as a text unity."@en ; + rdfs:isDefinedBy ; + rdfs:subClassOf [ a owl:Restriction ; + owl:cardinality "1"^^xsd:nonNegativeInteger ; + owl:onProperty tln:belongsToPage ], + [ a owl:Restriction ; + owl:cardinality "1"^^xsd:nonNegativeInteger ; + owl:onProperty tln:startLine ], + [ a owl:Restriction ; + owl:cardinality "1"^^xsd:nonNegativeInteger ; + owl:onProperty tln:endLine ] . + +tln:ExternalTextUnit a owl:Class ; + rdfs:label "a text unit that has been published outside the digital edition"@en ; + rdfs:comment "A text unit that has been published outside the digital edition."@en ; + rdfs:isDefinedBy ; + rdfs:subClassOf tln:IdentifiedTextVersion . tln:Page a owl:Class ; rdfs:subClassOf document:Page . +tln:belongsToPage a owl:ObjectProperty ; + rdfs:label "relates a part of a page with the page it is a part of"@en ; + rdfs:comment "Relates a part of a page with the page it is a part of."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:PartOfPageTextUnit ; + rdfs:range tln:Page. + +tln:startLine a owl:ObjectProperty ; + rdfs:label "relates a part of a page with the line it starts with"@en ; + rdfs:comment "Relates a part of a page with the line it starts with."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:PartOfPageTextUnit ; + rdfs:range tln:Line. + +tln:endLine a owl:ObjectProperty ; + rdfs:label "relates a part of a page with the line it ends with"@en ; + rdfs:comment "Relates a part of a page with the line it ends with."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:PartOfPageTextUnit ; + rdfs:range tln:Line. + +tln:identifiesAsVersion a owl:ObjectProperty ; + rdfs:label "groups a list of text unities together as an identified text version"@en ; + rdfs:comment "Groups a list of text unities together as an identified text version for which there is an earlier or later version."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:IdentifiedTextVersion ; + rdfs:range rdf:List. + +tln:hasGeneticOrder a owl:ObjectProperty ; + rdfs:label "relates a list of text versions to an identified genetic order"@en ; + rdfs:comment "Relates a list of text versions to an identified genetic order.
The position in the list determines the version of a text unit."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:TextGenesis ; + rdfs:range rdf:List. + +tln:textUnitHasTitle a owl:ObjectProperty ; + rdfs:label "relates an externally published text unit with a title"@en ; + rdfs:comment "Relates an externally published text unit with a title by which it can be identified."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:ExternalTextUnit ; + rdfs:range xsd:string . + +tln:textUnitHasUrl a owl:ObjectProperty ; + rdfs:label "relates an externally published text unit with a URL"@en ; + rdfs:comment "Relates an externally published text unit with a URL by which it can be visited."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:ExternalTextUnit ; + rdfs:range xsd:anyURI . + tln:hasImage a owl:ObjectProperty ; rdfs:label "relates a page to an image"@en ; rdfs:comment "relates a page to an image that has a textfield that specifies the area where the writing that constitutes the page can be found."@en ; rdfs:isDefinedBy ; rdfs:domain tln:Page ; rdfs:range tln:Image . tln:hasUrl a owl:DatatypeProperty ; rdfs:label "has Url"@en ; rdfs:domain tln:Image ; rdfs:isDefinedBy ; rdfs:range xsd:anyURI . tln:inheritOverwritesWord a owl:ObjectProperty ; rdfs:subPropertyOf tln:overwritesWord; rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ; rdfs:comment "The author has used this word in order to overwrite that word."@en ; rdfs:isDefinedBy ; owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ). tln:lineContinuesOn a owl:ObjectProperty ; rdfs:label "writing from subject line continues on object line"@en ; rdfs:comment "the writing that ends on subject line continues on object line"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Line ; rdfs:range tln:Line . tln:pageIsOnTextField a owl:ObjectProperty ; rdfs:label "page is on text field"@en ; rdfs:comment "the writing that is referred to as subject can be found on object"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Page ; rdfs:range tln:TextField . tln:writingContinuesWithWord a owl:ObjectProperty ; rdfs:label "writing continues with next word"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Word ; rdfs:range tln:Word . +
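To see the new vocabulary in use, here is a minimal rdflib sketch; the tln namespace IRI and all data IRIs are assumptions, since the prefix IRIs were lost above:

    from rdflib import Graph, Namespace, RDF
    from rdflib.collection import Collection

    TLN = Namespace('http://www.nie.org/ontology/nietzsche#')   # assumed IRI
    DATA = Namespace('http://www.nie.org/data/')                # hypothetical

    g = Graph()
    unit = DATA['W_II_1_page131_unit']   # hypothetical part-of-page text unit
    g.add((unit, RDF.type, TLN.PartOfPageTextUnit))
    g.add((unit, TLN.belongsToPage, DATA['W_II_1_page131']))
    g.add((unit, TLN.startLine, DATA['W_II_1_page131_line1']))
    g.add((unit, TLN.endLine, DATA['W_II_1_page131_line10']))
    version = DATA['textversion1']
    g.add((version, RDF.type, TLN.IdentifiedTextVersion))
    # tln:identifiesAsVersion points at an rdf:List of text unities:
    Collection(g, DATA['textversion1_list'], [unit])
    g.add((version, TLN.identifiesAsVersion, DATA['textversion1_list']))
    print(g.serialize(format='turtle'))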
Index: svgscripts/fix_missing_glyphs.py =================================================================== --- svgscripts/fix_missing_glyphs.py (revision 107) +++ svgscripts/fix_missing_glyphs.py (revision 108) @@ -1,210 +1,213 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to fix missing glyphs. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/> 1}}} from colorama import Fore, Style import getopt import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import lxml.etree as ET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.page import Page, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word from util import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False REMOVE_SVG_WORD_POS_PAGE_ENDING = re.compile(r'_page[0-9]+\w*') def find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=0.0, ymin=0.0): """Finds missing glyph for a PositionalWordPart. :return: list of PositionalWordPart """ THRESHOLD = 15.5 #pwp = PositionalWordPart(node=positional_word_part_node) word_part_obj = { "x": pwp.left, "y": pwp.top, "text": pwp.text, "matrix": pwp.transform, "class": pwp.style_class } start_id = int(pwp.id) threshold = -0.5 positional_word_parts = [] while threshold < THRESHOLD and len(positional_word_parts) < 1: try: positional_word_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces,\ start_id=start_id, xmin=xmin, ymin=ymin, threshold=threshold, throw_error_if_not_found=True) except Exception: threshold += 0.1 return positional_word_parts def update_word(word, old_transkription_position, old_positional_word_part, positional_word_parts): """Updates word according to new positional_word_parts. :return: new transkription_position """ if len(positional_word_parts) > 0: debug_msg_string = 'update word from ' + __file__ old_transkription_position.positional_word_parts.remove(old_positional_word_part) positional_word_parts.reverse() for positional_word_part in positional_word_parts: old_transkription_position.positional_word_parts.insert(int(old_positional_word_part.id), positional_word_part) for index, positional_word_part in enumerate(old_transkription_position.positional_word_parts): positional_word_part.id = index transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ old_transkription_position.positional_word_parts, debug_msg_string=debug_msg_string, transkription_position_id=old_transkription_position.id) word.transkription_positions.remove(old_transkription_position) transkription_positions.reverse() for new_tp in transkription_positions: word.transkription_positions.insert(int(old_transkription_position.id), new_tp) text = '' for index, tp in enumerate(word.transkription_positions): tp.id = index tp.writing_process_id = old_transkription_position.writing_process_id for pwp in tp.positional_word_parts: text += pwp.text if word.text != text: word.text = text return transkription_positions[0] def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None): """Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION. """ if isfile(svg_word_pos_file): if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Fixing missing glyphs for file {} ...
'.format(svg_word_pos_file), end='') print(Style.RESET_ALL) page = Page(svg_word_pos_file) xmin = 0 ymin = 0 if page.svg_image is None or page.svg_image.text_field is None: transkription_field = TranskriptionField(page.svg_file) xmin = transkription_field.xmin ymin = transkription_field.ymin svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) words_without_glyphs = [ word for word in page.words\ if len([ tp for tp in word.transkription_positions\ if len([ pwp for pwp in tp.positional_word_parts if pwp.symbol_id is None]) > 0]) > 0 ] for word in words_without_glyphs: for transkription_position in word.transkription_positions: positional_word_parts = transkription_position.positional_word_parts[:] for positional_word_part in positional_word_parts: if positional_word_part.symbol_id is None: pwps = find_missing_glyph_for_pwp(positional_word_part, svg_path_tree, namespaces, xmin=xmin, ymin=ymin) new_transkription_position = update_word(word, transkription_position, positional_word_part, pwps) if new_transkription_position is not None: transkription_position = new_transkription_position page.update_and_attach_words2tree() write_pretty(xml_element_tree=page.page_tree, file_name=svg_word_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) page = Page(svg_word_pos_file) new_number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) if not UNITTESTING: result_color = Fore.LIGHTBLUE_EX if new_number_of_missing_glyphs == 0 else Fore.MAGENTA print(result_color + ' {0}/{1}'.format(number_of_missing_glyphs-new_number_of_missing_glyphs, number_of_missing_glyphs), end='') print(Fore.LIGHTBLUE_EX + ' fixed.', end='') print(Style.RESET_ALL) if len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) == 0: update_svgposfile_status(svg_word_pos_file, manuscript_file=manuscript_file, status='OK') def get_filelist_and_manuscript_file(file_a, file_b=None): """Returns a file list and a manuscript file (or None) """ file_list = [] manuscript_file = None source_tree = ET.parse(file_a) if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION\ and len([ word_part for word_part in source_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')]) > 0: # if symbol_ids are missing ... file_list.append(file_a) if file_b is not None: manuscript_file = file_b else: manuscript_file = REMOVE_SVG_WORD_POS_PAGE_ENDING.sub('', file_a) if not isfile(manuscript_file): manuscript_file = None elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: manuscript_file = file_a if file_b is not None: file_list.append(file_b) else: file_list = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower())) + if len(file_list) == 0: + file_list = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(PositionalWordPart.WARN_NO_USE_NODE_FOUND)) return file_list, manuscript_file def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to fix missing glyphs. svgscripts/fix_missing_glyphs.py [OPTIONS] -File [-File] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. 
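The new fallback in get_filelist_and_manuscript_file above retries the status query with the original casing when the lowercased variant matches nothing; the pattern in isolation (page data and status marker are made up):

    import lxml.etree as ET

    tree = ET.fromstring(b'<pages><page status="WARNING: no use node found" output="xml/p1.xml"/></pages>')
    status = 'WARNING: no use node found'   # hypothetical status marker
    xpath = '//page[contains(@status, "{}")]/@output'
    file_list = tree.xpath(xpath.format(status.lower()))
    if len(file_list) == 0:   # lowercased variant found nothing: retry as-is
        file_list = tree.xpath(xpath.format(status))
    print(file_list)   # ['xml/p1.xml']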
OPTIONS: -h|--help: show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): file_b = None if len(args) > 1 and isfile(args[1]): file_b = args[1] file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b) for svg_word_pos_file in file_list: + print(svg_word_pos_file) fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file) else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/datatypes/reconstructed_konvolut.py =================================================================== --- svgscripts/datatypes/reconstructed_konvolut.py (revision 107) +++ svgscripts/datatypes/reconstructed_konvolut.py (revision 108) @@ -1,153 +1,154 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a reconstruction of an original manuscript (e.g. a workbook or notebook). """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/> 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc from lxml import etree as ET from os.path import isfile import requests import sys from .description import Description from .faksimile_image import FaksimileImage from .manuscript import ManuscriptUnity from .page import Page, FILE_TYPE_XML_MANUSCRIPT, FILE_TYPE_SVG_WORD_POSITION sys.path.append('shared_util') from myxmlwriter import parse_xml_of_type, write_pretty, xml_has_type class NonExistentPage(Page): """This class represents a page that does not exist as part of the KGW edition. @label non existent page """ NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/fe/facsimile/' def __init__(self, number=None, faksimile_image=None, status=None): + self.page_tree = None self.number = number self.status = status self.faksimile_image = faksimile_image @classmethod def create_cls(cls, page_node, faksimile_image=None): """ Create an instance of NonExistentPage from a page_node :return: NonExistentPage """ number = page_node.get('title') + '_' + page_node.get('number')\ if bool(page_node.get('title'))\ else page_node.get('number') return cls(number=number, status=page_node.get('status'), faksimile_image=faksimile_image) def get_name_and_id(self): """Return an identification for object as 2-tuple. """ return type(self).__name__, self.number.replace(' ', '_') @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass.
""" dictionary = super(NonExistentPage,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('status', str)) return cls.return_dictionary_after_updating_super_classes(dictionary) class ReconstructedKonvolut(ManuscriptUnity): """ This class represents a reconstruction of an original manuscript (e.g. a workbook or notebook). @label reconstruction of an origianl manuscript Args: title title for identification of the reconstruction manuscript_type type of manuscript: 'Arbeitsheft' or 'Notizheft' manuscript_tree lxml.ElementTree """ XML_TAG = 'reconstructed-konvolut' TYPE_DICTIONARY = { 'R_n': 'Notizheft', 'R_w': 'Arbeitsheft' } UNITTESTING = False def __init__(self, title='', manuscript_type='', manuscript_tree=None): super(ReconstructedKonvolut,self).__init__(title=title, manuscript_type=manuscript_type,manuscript_tree=manuscript_tree) @classmethod def create_cls(cls, xml_manuscript_file, page_status_list=None, page_xpath=''): """Create an instance of ReconstructedKonvolut from a xml file of type FILE_TYPE_XML_MANUSCRIPT. :return: ReconstructedKonvolut """ manuscript = super(ReconstructedKonvolut,cls).create_cls(xml_manuscript_file) manuscript_tree = manuscript.manuscript_tree if page_xpath == '': page_status = '' if page_status_list is not None\ and type(page_status_list) is list\ and len(page_status_list) > 0: page_status = '[' + ' and '.join([ f'contains(@status, "{status}")' for status in page_status_list ]) + ']' page_xpath = f'//pages/page{page_status}/@output' included_page_list = [ page_source\ for page_source in manuscript_tree.xpath(page_xpath)\ if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ] for page_node in manuscript_tree.xpath('//pages/page'): if bool(page_node.get('output'))\ and isfile(page_node.get('output'))\ and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_node.get('output')): manuscript.pages.append(Page.create_cls(\ page_node.get('output'), create_dummy_page=(page_node.get('output') not in included_page_list))) else: faksimile_image = get_or_update_faksimile(xml_manuscript_file, page_node) manuscript.pages.append(NonExistentPage.create_cls(page_node, faksimile_image)) manuscript.description = Description.create_cls_from_node(manuscript_tree.xpath(Description.XML_TAG)[0])\ if len(manuscript_tree.xpath(Description.XML_TAG)) > 0\ else None return manuscript def get_or_update_faksimile(xml_source_file, page_node) ->FaksimileImage: """Return the faksimile image of the non existent page. 
""" faksimile_image = None if len(page_node.xpath(f'./{FaksimileImage.XML_TAG}')) > 0: faksimile_image = FaksimileImage(node=page_node.xpath(f'./{FaksimileImage.XML_TAG}')[0]) elif bool(page_node.get('alias')): url = NonExistentPage.NIETZSCHE_SOURCES_URL + page_node.get('alias') faksimile_dict = None try: r = requests.get(url) faksimile_dict = r.json() except Exception: print(f'URL does not work: {url}') if faksimile_dict is not None and len(faksimile_dict) > 0: width = faksimile_dict['imageWidth'] height = faksimile_dict['imageHeight'] file_name = page_node.get('alias') + '.jpg' URL = FaksimileImage.NIETZSCHE_SOURCES_URL + page_node.get('alias') faksimile_image = FaksimileImage(file_name=file_name, URL=URL, height=height, width=width) faksimile_image.attach_object_to_tree(page_node) write_pretty(xml_element_tree=page_node.getroottree(), file_name=xml_source_file, script_name=__file__,\ file_type=FILE_TYPE_XML_MANUSCRIPT, backup=True) return faksimile_image Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 107) +++ svgscripts/datatypes/page.py (revision 108) @@ -1,406 +1,428 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 107) +++ svgscripts/datatypes/page.py (revision 108) @@ -1,406 +1,428 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/> 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile, basename from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import re import sys import warnings from .box import Box from .color import Color from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .faksimile_position import FaksimilePosition from .lineNumber import LineNumber from .line import Line from .mark_foreign_hands import MarkForeignHands from .matrix import Matrix from .path import Path from .positional_word_part import PositionalWordPart from .super_page import SuperPage from .style import Style from .text_connection_mark import TextConnectionMark from .text_field import TextField from .transkriptionField import TranskriptionField from .writing_process import WritingProcess from .word import Word from .word_deletion_path import WordDeletionPath from .word_insertion_mark import WordInsertionMark sys.path.append('py2ttl') from class_spec import SemanticClass sys.path.append('shared_util') from main_util import extract_paths_on_tf, get_paths_near_position FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK class Page(SemanticClass,SuperPage): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. faksimile_image: FaksimileImage. faksimile_svgFile: svg file containing information about word positions. """ UNITTESTING = False def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_paths_near_words=False, warn=False, number=None): if xml_source_file is not None: super(Page,self).__init__(xml_source_file) self.update_property_dictionary('faksimile_image', faksimile_image) self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile) self.init_all_properties() self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.faksimile_text_field = None self.svg_text_field = None self.init_node_objects() self.warn = warn self.add_deletion_paths_to_words(add_paths_near_words) else: + self.page_tree = None self.number = number def add_deletion_paths_to_words(self, add_paths_near_words=False): """Add deletion paths to words.
""" words = [ word for word in self.words if (len(word.word_parts) == 0 and word.deleted and len(word.deletion_paths) == 0)\ or 'add_paths_near_words' in word.process_flags ] words += [ word for word in self.words\ if len(word.word_parts) > 0 and True in\ [ (wp.deleted and len(wp.deletion_paths) == 0) for wp in word.word_parts ]] if len(words) > 0 and ((self.svg_file is not None and isfile(self.svg_file))\ or (self.source is not None and isfile(self.source))): svg_file = self.svg_file if self.svg_file is not None else self.source transkription_field = TranskriptionField(svg_file) tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0 tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0 word_deletion_paths = self.word_deletion_paths index = 0 dp_updated = False while index < len(words): word = words[index] word.add_deletion_paths(word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) if len(word.deletion_paths) > 0 or True in [ len(w.deletion_paths) > 0 for w in word.word_parts ]: deletion_paths = word.deletion_paths for wp in word.word_parts: deletion_paths += wp.deletion_paths for deletion_path in deletion_paths: if deletion_path not in self.word_deletion_paths: self.word_deletion_paths.append(deletion_path) elif not dp_updated: word_deletion_paths = extract_paths_on_tf(self) dp_updated = True index -= 1 if add_paths_near_words\ and ('add_paths_near_words' in word.process_flags\ or ((word.deleted and len(word.deletion_paths) == 0)\ or True in [ (w.deleted and len(w.deletion_paths) == 0) for w in word.word_parts ])): if not dp_updated\ and 'add_paths_near_words' in word.process_flags: word_deletion_paths = extract_paths_on_tf(self) dp_updated = True transform = None tp = None target_word = word paths_near_word = [] if word.deleted and len(word.transkription_positions) > 0: transform = word.transkription_positions[0].transform for tp in word.transkription_positions: word.deletion_paths_near_word += get_paths_near_position(tp, word_deletion_paths) elif len(word.word_parts) > 0: for wp in word.word_parts: if wp.deleted and len(wp.transkription_positions) > 0: target_word = wp for tp in wp.transkription_positions: wp.deletion_paths_near_word = get_paths_near_position(tp, word_deletion_paths) if self.warn and (word.deleted and len(word.deletion_paths) == 0): warnings.warn(\ f'WARNING: {self.title} {self.number}: {word.id} on {word.line_number}, {word.text} has no deletion paths! {target_word.deletion_paths_near_word}, {transform}') index += 1 @classmethod def create_cls(cls, xml_source_file=None, create_dummy_page=False, page_node=None): """Create a Page. """ if not create_dummy_page: return cls(xml_source_file) else: m = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file) if m is not None and len(m.groups()) > 3: number = m.group(3) else: number = basename(xml_source_file).replace('.xml','') return cls(number=number) @classmethod def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None): """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT. [optional: instantiation depends on the fulfilment of a status_contains and/or on the selection of some words by a word_selection_function]. 
""" source_tree = ET.parse(xml_file) if source_tree.getroot().find('metadata/type').text == cls.FILE_TYPE_SVG_WORD_POSITION: page = cls(xml_file) if word_selection_function is None or len(word_selection_function(page.words)) > 0: return [ page ] else: return [] elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: pages = [] xpath = '//page/@output' if status_contains != '' and status_not_contain != '': xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain) elif status_contains != '': xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains) elif status_not_contain != '': xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain) for xml_source_file in source_tree.xpath(xpath): if isfile(xml_source_file): pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function) return pages else: return [] @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'number': { 'class': str, 'cardinality': 1}} properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE)) properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\ name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\ comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD)) properties.update(cls.create_semantic_property_dictionary('orientation', str)) properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE)) properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\ name='pageIsOnSVGTextField', label='page is on svg text field',\ comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD)) for key in [ 'lines', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']: properties.update(cls.create_semantic_property_dictionary(key, list)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def get_word_deletion_path(self, path=None, d_attribute=None) ->WordDeletionPath: """Return a word deletion path that belongs to page. """ if path is None and d_attribute is None: raise Exception('ERROR: get_word_deletion_path needs a path or a d_attribute!') if d_attribute is None: d_attribute = path.d_attribute page_paths = [ dpath for dpath in self.word_deletion_paths if dpath.d_attribute == d_attribute ] if len(page_paths) > 0: return page_paths[0] else: dpath = WordDeletionPath.create_cls(self, path=path, d_attribute=d_attribute) if dpath is not None: dpath.id = len(self.word_deletion_paths) self.word_deletion_paths.append(dpath) dpath.attach_object_to_tree(self.page_tree) return dpath def init_node_objects(self): """Initialize all node objects. 
""" self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ] self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ] self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] self.word_deletion_paths = [ WordDeletionPath.create_cls(self, node=node) for node in self.page_tree.xpath('./' + WordDeletionPath.XML_TAG) ] if self.faksimile_image is not None and self.faksimile_image.text_field is not None: self.faksimile_text_field = self.faksimile_image.text_field if self.svg_image is not None and self.svg_image.text_field is not None: self.svg_text_field = self.svg_image.text_field for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks: simple_word.init_word(self) for wim in self.word_insertion_marks: if wim.line_number > -1: wim.line = [ line for line in self.lines if line.id == wim.line_number ][0] def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]): """Update word ids and attach them to page.page_tree. """ if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None): """Update the data source of page. 
""" if faksimile_svgFile is not None: self.faksimile_svgFile = faksimile_svgFile data_node = self.page_tree.xpath('.//data-source')[0]\ if len(self.page_tree.xpath('.//data-source')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'data-source') data_node.set('file', self.faksimile_svgFile) if xml_correction_file is not None: data_node.set('xml-corrected-words', xml_correction_file) def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True): """Determines the width of the area where the line numbers are written in the page.source file. """ THRESHOLD = 0.4 if svg_tree is None: svg_tree = ET.parse(self.source) if len(self.line_numbers) > 1: line_number = self.line_numbers[9]\ if transkription_field.is_page_verso() and len(self.line_numbers) > 8\ else self.line_numbers[1] ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\ if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\ and LineNumber.IS_A_LINE_NUMBER(item)\ and LineNumber(raw_text_node=item).id == line_number.id ] if len(ln_nodes) > 0: matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform')) if transkription_field.is_page_verso(): transkription_field.add_line_number_area_width(matrix.getX()) elif self.svg_file is not None and isfile(self.svg_file): svg_path_tree = ET.parse(self.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } svg_x = matrix.getX() svg_y = self.line_numbers[1].bottom + transkription_field.ymin\ if set_to_text_field_zero\ else self.line_numbers[1].bottom use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) if len(d_strings) > 0 and d_strings[0] != '': path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin transkription_field.add_line_number_area_width(matrix.getX() + width) def update_page_type(self, transkription_field=None): """Adds a source to page and attaches it to page_tree. """ if self.number.endswith('r')\ or self.number.endswith('v'): self.page_type = Page.PAGE_VERSO\ if self.number.endswith('v')\ else Page.PAGE_RECTO else: if transkription_field is None: if self.source is None or not isfile(self.source): raise FileNotFoundError('Page does not have a source!') transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index) self.page_type = Page.PAGE_VERSO\ if transkription_field.is_page_verso()\ else Page.PAGE_RECTO self.page_tree.getroot().set('pageType', self.page_type) def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False): """Update styles of words and add them to their transkription_positions. Args: add_to_parents: Add styles also to word (and if not None to manuscript). partition_according_to_styles: Partition word if its transkription_positions have different styles. 
""" style_dictionary = {} if words is None: words = self.words for word in words: if len(word.word_parts) > 0: self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\ add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles) for transkription_position in word.transkription_positions: if len(transkription_position.positional_word_parts) > 0: style_class = transkription_position.positional_word_parts[0].style_class writing_process_id = -1 for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]: writing_process_id = self.fontsizekey2stage_mapping.get(font_key) style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id) if create_css: if style_dictionary.get((style_class_key, word.deleted)) is None: color = None if len(word.deletion_paths) > 0: if word.deletion_paths[0].style_class is not None\ and word.deletion_paths[0].style_class != ''\ and self.style_dict.get(word.deletion_paths[0].style_class) is not None: color = Color.create_cls_from_style_object(self.style_dict.get(word.deletion_paths[0].style_class)) else: color = Color() style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\ create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] ) transkription_position.style = style_dictionary[(style_class_key, word.deleted)] #print(style_dictionary[(style_class_key, word.deleted)]) else: if style_dictionary.get(style_class_key) is None: style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css) style_dictionary[style_class_key].writing_process_id = style_class_key[1] transkription_position.style = style_dictionary[style_class_key] if add_to_parents and transkription_position.style not in word.styles: word.styles.append(transkription_position.style) if partition_according_to_styles: word.split_according_to_status('style', splits_are_parts=True) if manuscript is not None\ and add_to_parents: manuscript.update_styles(*style_dictionary.values()) + def __eq__(self, other): + """Returns true if self is qualitatively identical to other. + """ + if other is None: + return False + if self.page_tree is None and other.page_tree is None: + return self.number == other.number + if self.page_tree is None or other.page_tree is None: + return False + return self.page_tree.docinfo.URL == other.page_tree.docinfo.URL + + def __hash__(self): + """Return a hash value for self. + """ + try: + if self.page_tree is None: + return hash(self.number) + except AttributeError: + print(self) + return hash(self.number) + return hash(self.page_tree.docinfo.URL) Index: svgscripts/datatypes/text.py =================================================================== --- svgscripts/datatypes/text.py (revision 107) +++ svgscripts/datatypes/text.py (revision 108) @@ -1,185 +1,219 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a text that may have standoff markup. """ # Copyright (C) University of Basel 2020 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
Index: svgscripts/datatypes/text.py =================================================================== --- svgscripts/datatypes/text.py (revision 107) +++ svgscripts/datatypes/text.py (revision 108) @@ -1,185 +1,219 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a text that may have standoff markup. """ # Copyright (C) University of Basel 2020 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/> 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy from lxml import etree as ET import re import sys from .attachable_object import AttachableObject from .standoff_tag import StandoffTag sys.path.append('py2ttl') from class_spec import SemanticClass + class Text(AttachableObject,SemanticClass): """ This class represents a text that may have standoff markup. """ TAG_PATTERN = re.compile(r'([^<]*)(<[^/]+>)') + #START_TAG_PATTERN = re.compile(r'.*<[a-z]+>') + START_TAG_PATTERN = re.compile(r'[^<]*(?!</[a-z]+>)[^<]*<[a-z]+>') XML_TAG = 'text-with-markup' XML_SUB_TAG = 'text' def __init__(self, content=None, standoff_markups=None, id=0, tag=XML_TAG): self.id = str(id) self.tag = tag self.content = content self.standoff_markups = standoff_markups\ if standoff_markups is not None\ else [] def append(self, content: str) -> int: """Extend text with content. [:return:] startIndex of appended content """ startIndex = len(self.content) self.content += content return startIndex def attach_object_to_tree(self, target_tree): """Attach object to tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree, self.tag) obj_node.set('id', self.id) text_node = ET.SubElement(obj_node, self.XML_SUB_TAG) text_node.text = self.content for index, markup in enumerate(self.standoff_markups): markup.id = str(index) markup.attach_object_to_tree(obj_node) def extract_part(self, text_part, css_filter=';'): """Extract part of text for which text_part matches content. :return: datatypes.text.Text """ if not css_filter.endswith(';'): css_filter += ';' if text_part in self.content: part_start_index = self.content.find(text_part) part_end_index = part_start_index + len(text_part) standoff_markups = [ markup for markup in self.standoff_markups\ if markup.css_string.endswith(css_filter)\ if (markup.startIndex <= part_start_index\ and markup.endIndex > part_start_index)\ or (markup.startIndex >= part_start_index\ and markup.startIndex < part_end_index\ and markup.endIndex <= part_end_index)\ or (markup.startIndex < part_end_index\ and markup.endIndex >= part_end_index)] new_markups = [] for markup in standoff_markups: startIndex = markup.startIndex - part_start_index\ if markup.startIndex > part_start_index else 0 endIndex = markup.endIndex - part_start_index\ if markup.endIndex <= part_end_index\ else len(text_part) new_markups.append(StandoffTag(markup.markup, startIndex, endIndex)) return Text(content=text_part, standoff_markups=new_markups) else: msg = f'ERROR {text_part} is not a part of {self.content}!' raise Exception(msg) def join(self, other): """Join self and other.
""" correction = self.append(' ' + other.content) + 1 for standoff_markup in other.standoff_markups: standoff_markup.startIndex += correction standoff_markup.endIndex += correction self.standoff_markups += other.standoff_markups del other def markup_contains_css_filter(self, css_filter) ->bool: """Returns true if markup contains css_filter. """ if not css_filter.endswith(';'): css_filter += ';' return len([ markup for markup in self.standoff_markups\ if markup.css_string.endswith(css_filter) ]) > 0 @classmethod def create_cls_from_node(cls, node): """Initialize a cls from node. [:return:] cls """ standoff_markups = [ StandoffTag.create_cls_from_node(item) for item in\ node.xpath('./' + '|./'.join(StandoffTag.MARKUP_STYLES)) ] text = node.xpath('./' + cls.XML_SUB_TAG + '/text()')[0]\ if len(node.xpath('./' + cls.XML_SUB_TAG + '/text()')) > 0\ else '' return cls(text, standoff_markups=standoff_markups, id=node.get('id'), tag=node.tag) @classmethod def create_cls_from_html(cls, html): """Creates a Text from a html string. :return: a (datatypes.text) Text """ - standoff_markups = [] + html = html.replace('<', '<').replace('>', '>') + """ tag_matched = re.match(cls.TAG_PATTERN, html) while tag_matched is not None: tag = tag_matched.group(2) tags = [ t for t in tag.split('<') if t != ''] tags.reverse() endTag = ''.join([ '(str, list): + """Extract standoff data and return the html string without tags and a list of standoff data. + """ + standoff_markups = [] + tag_matched = re.match(Text.START_TAG_PATTERN, html) + while tag_matched: + tag = re.sub(r'>.*', '', re.sub(r'^[^<]+<', '', tag_matched.group(0))) + startIndex = html.index(f'<{tag}>') + html = re.sub(rf'<{tag}>', '', html, count=1) + contains_tag_pattern = rf'.*<[a-z]+>.*.*' + if re.match(contains_tag_pattern, html): + html, new_standoff_data = extract_standoff_data(html) + standoff_markups += new_standoff_data + end_tag_pattern = rf'.*.*' + endTag_matched = re.match(end_tag_pattern, html) + if endTag_matched is not None: + endIndex = html.index(f'') + html = html[0:endIndex] + html[endIndex+len(f''):] + if bool(StandoffTag.HTML_TAG_DICTIONARY.get(f'<{tag}>')): + standoff_markups.append(StandoffTag(StandoffTag.HTML_TAG_DICTIONARY[f'<{tag}>'], startIndex, endIndex)) + else: + msg = f'HTML string contains no ending tag for {tag}!' + raise Exception(msg) + tag_matched = re.match(Text.START_TAG_PATTERN, html) + return html, standoff_markups + + + Index: svgscripts/datatypes/standoff_tag.py =================================================================== --- svgscripts/datatypes/standoff_tag.py (revision 107) +++ svgscripts/datatypes/standoff_tag.py (revision 108) @@ -1,151 +1,152 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent the standoff markup of a text. """ # Copyright (C) University of Basel 2020 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
Index: svgscripts/datatypes/standoff_tag.py =================================================================== --- svgscripts/datatypes/standoff_tag.py (revision 107) +++ svgscripts/datatypes/standoff_tag.py (revision 108) @@ -1,151 +1,152 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent the standoff markup of a text. """ # Copyright (C) University of Basel 2020 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/> 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import re import sys from .attachable_object import AttachableObject sys.path.append('py2ttl') from class_spec import SemanticClass class StandoffTag(AttachableObject,SemanticClass): """ This class represents the standoff markup of a text. """ - MARKUP_STYLES = [ 'bold', 'italic', 'delete' ] + MARKUP_STYLES = [ 'bold', 'italic', 'delete', 'underline' ] RDFS_SUBCLASSOF_LIST = ['http://www.nie.org/ontology/standoff#StandoffMarkup'] RELEVANT_STYLE_KEY = 'font-family' RELEVANT_CONTENT_STARTSWITH = 'Frutiger-' RELEVANT_PATTERN = re.compile('.*(Italic|Bold)$') RELEVANT_SUB_PATTERN = re.compile('Frutiger-(Light)*') STOFF_HAS_CSS_URL_STRING = 'http://www.nie.org/ontology/standoff#hasCSS' STOFF_HAS_START_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasStartIndex' STOFF_HAS_END_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasEndIndex' - HTML_TAG_DICTIONARY = { '<i>': 'italic', '<b>': 'bold', '<del>': 'delete' } + HTML_TAG_DICTIONARY = { '<i>': 'italic', '<b>': 'bold', '<del>': 'delete', '<u>': 'underline' } CSS_DICTIONARY = { 'bold': 'font-weight:bold;', 'italic': 'font-style: italic;', + 'underline': 'text-decoration:underline;', 'delete': 'text-decoration:line-through;' } def __init__(self, markup: str, startIndex: int, endIndex: int, id=0): self.id = str(id) self.css_string = self.CSS_DICTIONARY.get(markup) self.markup = markup self.startIndex = startIndex self.endIndex = endIndex def attach_object_to_tree(self, target_tree): """Attach object to tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() obj_node = target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree, self.markup) obj_node.set('id', self.id) obj_node.set('start', str(self.startIndex)) obj_node.set('end', str(self.endIndex)) @classmethod def create_cls(cls, start_index, end_index, style_string, page=None, style_dict=None): """Creates a StandoffTag from a style_string. :return: a list of (datatypes.standoff_tag) StandoffTag """ if page is not None: style_dict = cls.create_relevant_style_dictionary(page) relevant_keys = [ key for key in set(style_string.split(' '))\ if key in style_dict.keys() ] standoff_tags = [] if style_dict is None or len(style_dict) == 0: return standoff_tags for relevant_key in relevant_keys: font_family = style_dict[relevant_key][cls.RELEVANT_STYLE_KEY] if re.match(cls.RELEVANT_PATTERN, font_family): markup = re.sub(cls.RELEVANT_SUB_PATTERN, '', font_family).lower() standoff_tags.append(cls(markup, start_index, end_index)) return standoff_tags @classmethod def create_cls_from_node(cls, node): """Creates a StandoffTag from a node. :return: (datatypes.standoff_tag) StandoffTag """ return cls(node.tag, int(node.get('start')), int(node.get('end')), id=node.get('id')) @classmethod def create_relevant_style_dictionary(cls, page): """Return a style dictionary that contains only relevant keys and contents.
""" return { key: key_dict for key, key_dict in page.style_dict.items()\ if cls.RELEVANT_STYLE_KEY in key_dict.keys()\ and key_dict[cls.RELEVANT_STYLE_KEY].startswith(cls.RELEVANT_CONTENT_STARTSWITH) } @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ properties = {} #properties.update(cls.create_semantic_property_dictionary('markup', str, cardinality=1,\ # name='standoffTagHasMarkup', label='standoff tag has a specific markup', comment='Connects a standoff tag with its markup, e.g. bold or italic')) properties.update(cls.create_semantic_property_dictionary('startIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_START_INDEX,\ name='standoffTagHasStartIndex', label='standoff tag has a start index', comment='Connects a standoff tag with its start index.')) properties.update(cls.create_semantic_property_dictionary('endIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_END_INDEX,\ name='standoffTagHasEndIndex', label='standoff tag has a end index', comment='Connects a standoff tag with its end index.')) properties.update(cls.create_semantic_property_dictionary('css_string', str,\ subPropertyOf=cls.STOFF_HAS_CSS_URL_STRING,\ name='standoffTagHasCSS', label='standoff tag has css', comment='Connects a standoff tag with CSS style.')) dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties } return cls.return_dictionary_after_updating_super_classes(dictionary) def is_joinable(self, other): """Return true if self and other have same markup and self.endIndex == other.startIndex. """ return self.markup == other.markup and self.endIndex == other.startIndex def join(self, other): """Join self with other. """ self.endIndex = other.endIndex def join_list(self, others): """Join all others that are joinable, return remaining others as a list. 
""" unjoinable_others = [] for other in others: if self.is_joinable(other): self.join(other) else: unjoinable_others.append(other) return unjoinable_others Index: tests_svgscripts/test_description.py =================================================================== --- tests_svgscripts/test_description.py (revision 107) +++ tests_svgscripts/test_description.py (revision 108) @@ -1,40 +1,40 @@ import unittest from os import sep, path from os.path import dirname, basename, isfile, isdir import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.page import Page from datatypes.standoff_tag import StandoffTag from datatypes.text import Text from datatypes.description import Description class TestText(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.test_file = DATADIR + sep + 'test.xml' self.test_svg_file = DATADIR + sep + 'test421.svg' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml' self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.test_page = DATADIR + sep + 'N_VII_1_page001.xml' self.test_manuscript = DATADIR + sep + 'N_VII_1.xml' def test_semantic(self): pass #print(Text.get_semantic_dictionary()) def test_create_cls_from_node(self): tree = ET.parse(self.test_manuscript) node = tree.xpath('description/earlierDescription[@id="1"]/manuscriptDescription')[0] description = Description.create_cls_from_node(node) - #print(description.content) self.assertTrue(len(description.standoff_markups) > 0) + #print(description.content, description.standoff_markups) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_text.py =================================================================== --- tests_svgscripts/test_text.py (revision 107) +++ tests_svgscripts/test_text.py (revision 108) @@ -1,91 +1,94 @@ import unittest from os import sep, path from os.path import dirname, basename, isfile, isdir import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.page import Page from datatypes.standoff_tag import StandoffTag from datatypes.text import Text class TestText(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.test_file = DATADIR + sep + 'test.xml' self.test_svg_file = DATADIR + sep + 'test421.svg' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml' self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.test_page = DATADIR + sep + 'N_VII_1_page001.xml' self.test_manuscript = DATADIR + sep + 'N_VII_1.xml' def test_semantic(self): pass #print(Text.get_semantic_dictionary()) def test_attach_to_tree(self): empty_tree = ET.ElementTree(ET.Element('page')) content = 'asdf' standoff_tag = StandoffTag('bold', 0, len(content)-1) standoff_tag2 = StandoffTag('italic', int(len(content)/2), len(content),id='1') text = Text(content, standoff_markups=[ standoff_tag, standoff_tag2 ]) text.attach_object_to_tree(empty_tree) text = Text.create_cls_from_node(empty_tree.xpath('//' + Text.XML_TAG)[0]) self.assertEqual(text.content, content) self.assertEqual(text.id, '0') self.assertEqual(len(text.standoff_markups), 2) #print(ET.dump(empty_tree.getroot())) def test_extract(self): 
Index: tests_svgscripts/test_text.py
===================================================================
--- tests_svgscripts/test_text.py	(revision 107)
+++ tests_svgscripts/test_text.py	(revision 108)
@@ -1,91 +1,94 @@
import unittest
from os import sep, path
from os.path import dirname, basename, isfile, isdir
import lxml.etree as ET
import sys

sys.path.append('svgscripts')
from datatypes.page import Page
from datatypes.standoff_tag import StandoffTag
from datatypes.text import Text

class TestText(unittest.TestCase):
    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        if not isdir(DATADIR):
            DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
        self.test_file = DATADIR + sep + 'test.xml'
        self.test_svg_file = DATADIR + sep + 'test421.svg'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
        self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.test_page = DATADIR + sep + 'N_VII_1_page001.xml'
        self.test_manuscript = DATADIR + sep + 'N_VII_1.xml'

    def test_semantic(self):
        pass
        #print(Text.get_semantic_dictionary())

    def test_attach_to_tree(self):
        empty_tree = ET.ElementTree(ET.Element('page'))
        content = 'asdf'
        standoff_tag = StandoffTag('bold', 0, len(content)-1)
        standoff_tag2 = StandoffTag('italic', int(len(content)/2), len(content),id='1')
        text = Text(content, standoff_markups=[ standoff_tag, standoff_tag2 ])
        text.attach_object_to_tree(empty_tree)
        text = Text.create_cls_from_node(empty_tree.xpath('//' + Text.XML_TAG)[0])
        self.assertEqual(text.content, content)
        self.assertEqual(text.id, '0')
        self.assertEqual(len(text.standoff_markups), 2)
        #print(ET.dump(empty_tree.getroot()))

    def test_extract(self):
        content = 'asdfa'
        standoff_tag = StandoffTag('bold', 0, len(content)-2)
        standoff_tag2 = StandoffTag('italic', int(len(content)/2), len(content)-1,id='1')
        textA = Text(content, standoff_markups=[ standoff_tag, standoff_tag2 ])
        textB = textA.extract_part('sdf')
        self.assertEqual(len(textB.standoff_markups), 2)
        textB = textA.extract_part('sdf', css_filter='bold')
        self.assertEqual(len(textB.standoff_markups), 1)
        """
        content = '26: von „Regel]¿'
        textA = Text(content, standoff_markups=[ StandoffTag('bold', 6, 9)])
        print(textA.extract_part('von', css_filter='bold'))
        print(textA.extract_part('„Regel', css_filter='bold'))
        """

    def test_markup_contains_css_filter(self):
        content = 'asdfa'
        standoff_tag = StandoffTag('bold', 0, len(content)-2)
        standoff_tag2 = StandoffTag('italic', int(len(content)/2), len(content)-1,id='1')
        textA = Text(content, standoff_markups=[ standoff_tag, standoff_tag2 ])
        self.assertTrue(textA.markup_contains_css_filter('bold'))
        self.assertTrue(textA.markup_contains_css_filter('italic'))
        textA.standoff_markups.pop(0)
        self.assertFalse(textA.markup_contains_css_filter('bold'))

    def test_join(self):
        content = 'asdfa'
        standoff_tag = StandoffTag('bold', 0, len(content)-2)
        standoff_tag2 = StandoffTag('italic', int(len(content)/2), len(content)-1,id='1')
        textA = Text(content, standoff_markups=[ standoff_tag, standoff_tag2 ])
        standoff_tag = StandoffTag('bold', 0, len(content)-2)
        standoff_tag2 = StandoffTag('italic', int(len(content)/2), len(content)-1,id='1')
        textB = Text(content, standoff_markups=[ standoff_tag, standoff_tag2 ])
        textA.join(textB)
        self.assertEqual(textA.content, content + ' ' + content)

    def test_create_from_html(self):
        html = 'asdf <b><i><del>test</del></i></b> the best'
        text = Text.create_cls_from_html(html)
        self.assertEqual(len(text.standoff_markups), 3)
        self.assertEqual(text.standoff_markups[0].startIndex, text.standoff_markups[1].startIndex)
        self.assertEqual(text.standoff_markups[0].endIndex, text.standoff_markups[1].endIndex)
        html = 'asdf <b>test</b>'
        text = Text.create_cls_from_html(html)
        self.assertEqual(len(text.standoff_markups), 1)
+        html = 'Quart-, Oktav- und Folioblätter verschiedenen Formats (z. T. von Albert Brenners und Peter Gasts Hand); Entwürfe und Vorstufen aus dem Bereiche des <i>Menschlichen I</i> (die sogenannten <i>Sorrentiner Papiere</i>)'
+        text = Text.create_cls_from_html(html)
+        #print(text)

if __name__ == "__main__":
    unittest.main()
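Note: a manual spot check of the nested-tag case exercised by test_create_from_html; the input follows the reconstruction used in the test above (the original tags were lost to mis-encoding, so the exact nesting order is an assumption):

    import sys
    sys.path.append('svgscripts')
    from datatypes.text import Text

    text = Text.create_cls_from_html('asdf <b><i><del>test</del></i></b> the best')
    for markup in text.standoff_markups:
        print(markup.markup, markup.startIndex, markup.endIndex)
    # expected: 'delete', 'italic' and 'bold', each spanning indices 5..9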