Index: Friedrich-Nietzsche-late-work-ontology.ttl =================================================================== --- Friedrich-Nietzsche-late-work-ontology.ttl (revision 113) +++ Friedrich-Nietzsche-late-work-ontology.ttl (revision 114) @@ -1,167 +1,188 @@ @prefix dct: . @prefix document: . @prefix homotypic: . @prefix stoff: . @prefix text: . @prefix owl: . @prefix rdfs: . @prefix rdf: . @prefix skos: . @prefix xsd: . +@prefix information-carrier: . @prefix tln: . a owl:Ontology; dct:license ; dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en; dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsches late work."""@en; dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en; dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en; dct:publisher "Basel University, Switzerland"@en. +tln:Leaf a owl:Class ; + rdfs:subClassOf information-carrier:Leaf ; + rdfs:label "leaf"@en, "Blatt"@de; + rdfs:comment """Sheet of a writable substance, e.g. paper, parchment, treated plant leaf."""@en ; + rdfs:isDefinedBy . + tln:TextGenesis a owl:Class ; rdfs:label "identifies a genetic order of text versions"@en ; rdfs:comment "Identifies a genetic order of text versions, i.e. groups text units as earlier and later versions of each other."@en ; rdfs:isDefinedBy . tln:IdentifiedTextVersion a owl:Class ; rdfs:label "identifies a list of text unities as a text version"@en ; rdfs:comment "Identification of a list of text unities (e.g. pages or parts of pages) as a text version for which there is an earlier or later version."@en ; rdfs:isDefinedBy . 
tln:PartOfPageTextUnit a owl:Class ; rdfs:label "identifies a part of a page as a text unity"@en ; rdfs:comment "Identification of a part of page as a text unity."@en ; rdfs:isDefinedBy ; rdfs:subClassOf [ a owl:Restriction ; owl:cardinality "1"^^xsd:nonNegativeInteger ; owl:onProperty tln:belongsToPage ], [ a owl:Restriction ; owl:cardinality "1"^^xsd:nonNegativeInteger ; owl:onProperty tln:startLine ], [ a owl:Restriction ; owl:cardinality "1"^^xsd:nonNegativeInteger ; owl:onProperty tln:endLine ] . tln:ExternalTextUnit a owl:Class ; rdfs:label "a list text unit that has been published external to the digital edition"@en ; rdfs:comment "A text unit that has been published external to the digital edition."@en ; rdfs:isDefinedBy ; rdfs:subClassOf tln:IdentifiedTextVersion . tln:Page a owl:Class ; rdfs:subClassOf document:Page . +tln:leafHasDescription a owl:ObjectProperty ; + rdfs:label "relates a leaf to a manuscript description"@en ; + rdfs:comment "Relates a leaf to its manuscript description."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:Leaf ; + rdfs:range tln:ManuscriptDescription. + +tln:isDepictedBy a owl:ObjectProperty ; + rdfs:label "relates a leaf to a faksimile image"@en ; + rdfs:comment "Relates a leaf to the faksimile image that depicts it."@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:Leaf ; + rdfs:range tln:FaksimileImage. + tln:belongsToPage a owl:ObjectProperty ; rdfs:label "relates a part of a page with the page it is a part of"@en ; rdfs:comment "Relates a part of a page with the page it is a part of."@en ; rdfs:isDefinedBy ; rdfs:domain tln:PartOfPageTextUnit ; rdfs:range tln:Page. tln:startLine a owl:ObjectProperty ; rdfs:label "relates a part of a page with the line it starts with"@en ; rdfs:comment "Relates a part of a page with the line it starts with."@en ; rdfs:isDefinedBy ; rdfs:domain tln:PartOfPageTextUnit ; rdfs:range tln:Line. 
tln:endLine a owl:ObjectProperty ; rdfs:label "relates a part of a page with the line it ends with"@en ; rdfs:comment "Relates a part of a page with the line it ends with."@en ; rdfs:isDefinedBy ; rdfs:domain tln:PartOfPageTextUnit ; rdfs:range tln:Line. tln:identifiesAsVersion a owl:ObjectProperty ; rdfs:label "groups a list of text unities together as a identified text version"@en ; rdfs:comment "Groups a list of text unities together as a identified text version for which there is an ealier or later version."@en ; rdfs:isDefinedBy ; rdfs:domain tln:IdentifiedTextVersion ; rdfs:range rdf:List. tln:hasGeneticOrder a owl:ObjectProperty ; rdfs:label "relates a list of text versions to an identified genetic order"@en ; rdfs:comment "Relates a list of text versions to an identified genetic order. The position in the list determines the version of a text unit."@en ; rdfs:isDefinedBy ; rdfs:domain tln:TextGenesis ; rdfs:range rdf:List. tln:textUnitHasTitle a owl:ObjectProperty ; rdfs:label "relates a external published text unit with a title"@en ; rdfs:comment "Relates a external published text unit with a title by which it can be identified."@en ; rdfs:isDefinedBy ; rdfs:domain tln:ExternalTextUnit ; rdfs:range xsd:string . tln:textUnitHasUrl a owl:ObjectProperty ; rdfs:label "relates a external published text unit with a URL"@en ; rdfs:comment "Relates a external published text unit with a URL by which it can be visited."@en ; rdfs:isDefinedBy ; rdfs:domain tln:ExternalTextUnit ; rdfs:range xsd:anyURI . tln:hasImage a owl:ObjectProperty ; rdfs:label "relates a page to a image"@en ; rdfs:comment "relates a page to an image that has a textfield that specifies the area where the writing that constitutes the page can be found."@en ; rdfs:isDefinedBy ; rdfs:domain tln:Page ; rdfs:range tln:Image . tln:hasUrl a owl:DatatypeProperty ; rdfs:label "has Url"@en ; rdfs:domain tln:Image ; rdfs:isDefinedBy ; rdfs:range xsd:anyURI . 
#tln:inheritOverwritesWord a owl:ObjectProperty ; # rdfs:subPropertyOf tln:overwritesWord; # rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ; # rdfs:comment "The author has used this word in order to overwrite that word."@en ; # rdfs:isDefinedBy ; # owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ). tln:hasStandoffMarkup4PartThatOverwritesWord a owl:ObjectProperty ; rdfs:label "word has standoff markup for the part that overwrites a word"@en ; rdfs:comment "word has standoff markup that highlights the part of its text that overwrites a word"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Word ; rdfs:range stoff:StandoffMarkup . tln:lineContinuesOn a owl:ObjectProperty ; rdfs:label "writing from subject line continues on object line"@en ; rdfs:comment "the writing that ends on subject line continues on object line"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Line ; rdfs:range tln:Line . tln:pageIsOnTextField a owl:ObjectProperty ; rdfs:label "page is on text field"@en ; rdfs:comment "the writing that is referred to as subject can be found on object"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Page ; rdfs:range tln:TextField . tln:writingContinuesWithWord a owl:ObjectProperty ; rdfs:label "writing continues with next word"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Word ; rdfs:range tln:Word . tln:selectableWordProperty a owl:ObjectProperty ; rdfs:label "a property of a word for which it can be selected"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Word . tln:cardinalityGreaterOne a rdf:Property ; rdfs:label "whether a tln:selectableWordProperty can have a greater cardinality then one"@en ; rdfs:isDefinedBy ; rdfs:domain tln:selectableWordProperty ; rdfs:range xsd:boolean . tln:suggestedMaxCardinality a rdf:Property ; rdfs:label "the suggested max cardinaltiy of a tln:selectableWordProperty on a word"@en ; rdfs:isDefinedBy ; rdfs:domain tln:selectableWordProperty ; rdfs:range xsd:nonNegativeInteger . 
Index: svgscripts/datatypes/faksimile.py =================================================================== --- svgscripts/datatypes/faksimile.py (revision 113) +++ svgscripts/datatypes/faksimile.py (revision 114) @@ -1,209 +1,226 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a faksimile page. """ -# Copyright (C) University of Basel 2019 {{{1 +# Copyright (C) University of Basel 2022 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" -__version__ = "0.0.1" +__version__ = "0.0.2" import re from lxml import etree as ET from os import path from os.path import isdir, isfile, sep, basename from svgpathtools.parser import parse_path from .faksimile_image import FaksimileImage from .matrix import Matrix from .text_field import TextField from .word_position import WordPosition class FaksimilePage: """ This class represents a faksimile page. Args: xml_target_file (str): name of the xml file to which page info will be written. xml_source_file (str): name of the xml file that will be instantiated. 
""" XML_TAG = 'faksimile-page' def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None): xml_file = xml_source_file if xml_source_file is not None else xml_target_file self.title = title self.page_number = page_number self.xml_file = xml_file if xml_file is not None and isfile(xml_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_file, parser) self.title = self.page_tree.getroot().get('title') self.page_number = self.page_tree.getroot().get('page-number') self.width = float(self.page_tree.getroot().get('width')) if bool(self.page_tree.getroot().get('width')) else 0.0 self.height = float(self.page_tree.getroot().get('height')) if bool(self.page_tree.getroot().get('height')) else 0.0 else: self.page_tree = ET.ElementTree(ET.Element(self.XML_TAG)) if title is not None: self.page_tree.getroot().set('title', title) if page_number is not None: self.page_tree.getroot().set('page-number', str(page_number)) if xml_target_file is not None: self.remove_tags_from_page_tree([WordPosition.FAKSIMILE]) if svg_source_file is not None: self.page_tree.getroot().set('svg-source-file', svg_source_file) if faksimile_image is not None: faksimile_image.attach_object_to_tree(self.page_tree) if text_field is not None: text_field.attach_object_to_tree(self.page_tree) self.svg_source_file = self.page_tree.getroot().get('svg-source-file') self.faksimile_image = FaksimileImage(node=self.page_tree.getroot().find('.//' + FaksimileImage.XML_TAG))\ if len(self.page_tree.getroot().findall('.//' + FaksimileImage.XML_TAG)) > 0 else None self.text_field = TextField(node=self.page_tree.getroot().find('.//' + TextField.XML_TAG))\ if len(self.page_tree.getroot().findall('.//' + TextField.XML_TAG)) > 0 else None self.word_positions = [ WordPosition(node=node) for node in self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE) ]\ if 
len(self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE)) > 0 else [] def append_word_position(self, word_position): """Appends word_position to word_positions and attaches it to page_tree. """ self.word_positions.append(word_position) word_position.attach_object_to_tree(self.page_tree) + def attach_to_tei_facsimile(self, tei_facsimile: ET.Element, xml_id: str, page_number: str): + """Attach this faksimile page as a TEI surface (with one graphic child) to tei_facsimile, replacing all of its existing children. xml_id is written as the namespaced xml:id attribute, page_number as attribute n; requires self.text_field and self.faksimile_image to be set. + """ + for item in tei_facsimile.iterchildren(): tei_facsimile.remove(item) + surface = ET.SubElement(tei_facsimile, 'surface') + surface.set('{http://www.w3.org/XML/1998/namespace}id', xml_id) + surface.set('ulx', str(self.text_field.left)) + surface.set('uly', str(self.text_field.top)) + surface.set('lrx', str(round(self.text_field.left + self.text_field.width, 3))) + surface.set('lry', str(round(self.text_field.top + self.text_field.height, 3))) + surface.set('n', str(page_number)) + graphic = ET.SubElement(surface, 'graphic') + graphic.set('url', self.faksimile_image.nietzsche_source_download) + graphic.set('width', str(self.faksimile_image.width) + 'pt') + graphic.set('height', str(self.faksimile_image.height) + 'pt') + + @classmethod def get_faksimile_pages(cls, svg_file, page_number='', isBlank=False) -> list: """Creates and returns text fields contained in a svg_file as a list. """ svg_tree = ET.parse(svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } return cls.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces, page_number=page_number, isBlank=isBlank) @staticmethod def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='', isBlank=False) -> list: """Creates and returns text fields contained in a svg_tree as a list. 
""" THRESHOLD_X = 10 if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } source_file_name = svg_tree.docinfo.URL image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap), source_file_name) xml_dir = '.{}xml'.format(sep) faksimile_pages = list() title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name)) title = title_string.replace('-', ' ') if re.match(r'.*-\d+[a-z]*$', title_string): title_string = re.sub(r'-\d+[a-z]*$', '', title_string) rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap)\ if rect.get('id', svg_tree.getroot().nsmap).startswith(title_string)\ and rect.get('id', svg_tree.getroot().nsmap).endswith(str(page_number)) ] if isBlank: rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap) ] if len(rect_list) == 0 and page_number != '': return FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces) for text_field_rect in rect_list: tf_x = float(text_field_rect.get('x', svg_tree.getroot().nsmap)) - image.x tf_y = float(text_field_rect.get('y', svg_tree.getroot().nsmap)) - image.y tf_width = float(text_field_rect.get('width', svg_tree.getroot().nsmap)) tf_height = float(text_field_rect.get('height', svg_tree.getroot().nsmap)) tf_matrix = Matrix(transform_matrix_string=text_field_rect.get('transform'))\ if bool(text_field_rect.get('transform'))\ else None id = text_field_rect.get('id', svg_tree.getroot().nsmap) target_file_name = xml_dir + sep + id + '.xml' if isdir(xml_dir) else id + '.xml' page_number = re.sub(r'.*[,_-]', '', id) if page_number.startswith('0'): page_number = page_number.lstrip('0') text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y, matrix=tf_matrix) faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\ title=title, page_number=page_number, 
faksimile_image=image, text_field=text_field) x_min = text_field.xmin + image.x y_min = text_field.ymin + image.y #rect_titles = svg_tree.getroot().xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}"]/ns:title'.format(\ # x_min, text_field.xmax + image.x - THRESHOLD_X, y_min, text_field.ymax + image.y, text_field.id), namespaces=namespaces) rect_titles = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\ y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces) rect_titles += get_paths_inside_rect(svg_tree, '//ns:path/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\ y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces) for rect_title in rect_titles: rect = rect_title.getparent() x, y, height, width = 0.0, 0.0, 0.0, 0.0 if rect.tag.endswith('path'): path = parse_path(rect.get('d')) x, xmax, y, ymax = path.bbox() width = xmax - x height = ymax - y else: x = float(rect.get('x', svg_tree.getroot().nsmap)) y = float(rect.get('y', svg_tree.getroot().nsmap)) height = float(rect.get('height', svg_tree.getroot().nsmap)) width = width=float(rect.get('width', svg_tree.getroot().nsmap)) matrix = None if bool(rect.get('transform')): matrix = Matrix(transform_matrix_string=rect.get('transform')) text = re.sub(r'(\s(?=[-;:.,…?!’–])|(?<=[-;:.,…?!’–])\s)', '', rect_title.text) faksimile_page.append_word_position(\ WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=text, height=height,\ width=width, x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE)) faksimile_pages.append(faksimile_page) return faksimile_pages def remove_tags_from_page_tree(self, list_of_tags_to_remove): """Removes the tags specified in the list from the target tree. 
""" for xpath2remove in list_of_tags_to_remove: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) def get_paths_inside_rect(svg_tree, xpath, x_min, x_max, y_min, y_max, not_id, namespaces={}): """Returns a list of all paths selected by xpath that are inside x_min, x_max, y_min, y_max and do not have id == not_id. """ paths = [] if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } for path_node in svg_tree.xpath(xpath, namespaces=namespaces): append_node = path_node if not path_node.tag.endswith('path') and not path_node.tag.endswith('rect'): path_node = path_node.getparent() x, xmax, y, ymax = -1, -1, -1, -1 init_xy = False if path_node.tag.endswith('rect'): x = float(path_node.get('x')) if bool(path_node.get('x')) else -1 y = float(path_node.get('y')) if bool(path_node.get('y')) else -1 xmax = x + float(path_node.get('width')) if bool(path_node.get('width')) else -1 ymax = y + float(path_node.get('height')) if bool(path_node.get('height')) else -1 init_xy = True elif path_node.tag.endswith('path') and bool(path_node.get('d')) and path_node.get('d') != 0: path = parse_path(path_node.get('d')) x, xmax, y, ymax = path.bbox() init_xy = True if init_xy: if bool(path_node.get('transform')): matrix = Matrix(transform_matrix_string=path_node.get('transform')) x, xmax = matrix.get_new_x(x=x, y=y), matrix.get_new_x(x=xmax, y=ymax) y, ymax = matrix.get_new_y(x=x, y=y), matrix.get_new_y(x=xmax, y=ymax) width = xmax - x height = ymax - y if x > x_min and x < x_max\ and y > y_min and y < y_max\ and path_node.get('id') != not_id: paths.append(append_node) return paths Index: svgscripts/datatypes/faksimile_image.py =================================================================== --- svgscripts/datatypes/faksimile_image.py (revision 113) +++ svgscripts/datatypes/faksimile_image.py (revision 114) @@ -1,127 +1,137 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class 
can be used to represent faksimile images. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import fnmatch from lxml import etree as ET import os from os.path import basename, dirname, isfile, realpath, sep import sys from .image import Image from .matrix import Matrix from .text_field import TextField sys.path.append('svgscripts') from local_config import FAKSIMILE_LOCATION class FaksimileImage(Image): """ This class represents a faksimile image. Args: file_name (str): name of the image file. node (lxml.etree.Element) node, containing information URL (str): URL of image file. 
height (float): height of image width (float): width of image x (float): x y (float): y """ XML_TAG = 'faksimile-image' OLD_NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/' NIETZSCHE_SOURCES_URL = 'https://nietzsche.philhist.unibas.ch/faksimiles/' + NIETZSCHE_SOURCES_ORIGINAL_URL = 'http://www.nietzschesource.org/DFGA/' NIETZSCHE_SOURCES_API_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/' NIETZSCHE_SOURCES_IMAGE_API_URL = 'http://www.nietzschesource.org/DFGAapi/images/DFGA/' + FAKSIMILE_DIR = 'faksimiles/' def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, text_field=None): super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL, local_path=local_path,\ height=height, width=width, matrix=matrix, text_field=text_field, tag=self.XML_TAG) self.x = x self.y = y self.apiURL = None self.thumbURL = None self.mediumURL = None if self.file_name is not None: nsource_page_name = self.file_name.replace('.jpg','') nsource_manuscript_name = nsource_page_name.split(',')[0] self.primaryURL = self.NIETZSCHE_SOURCES_URL + self.file_name self.thumbURL = self.NIETZSCHE_SOURCES_URL + self.file_name.replace('.jpg', '_thumb.jpg') + self.original_thumbURL = None + if node is not None and bool(node.get('thumb')): + self.thumbURL = self.NIETZSCHE_SOURCES_URL + node.get('thumb') + if node is not None and bool(node.get('originalThumb')): + self.original_thumbURL = self.NIETZSCHE_SOURCES_URL + node.get('originalThumb') # self.NIETZSCHE_SOURCES_IMAGE_API_URL + nsource_manuscript_name + '/mini/' + self.file_name self.apiURL = self.NIETZSCHE_SOURCES_API_URL + nsource_page_name self.mediumURL = self.NIETZSCHE_SOURCES_IMAGE_API_URL + nsource_manuscript_name + '/medium/' + self.file_name + self.nietzsche_source = self.NIETZSCHE_SOURCES_ORIGINAL_URL + self.file_name.replace('.jpg','') + self.nietzsche_source_download = self.OLD_NIETZSCHE_SOURCES_URL + 
self.file_name.replace('.jpg','') """ if self.primaryURL is not None and self.primaryURL.startswith(self.NIETZSCHE_SOURCES_API_URL): self.apiURL = self.primaryURL self.primaryURL = self.NIETZSCHE_SOURCES_URL + basename(self.primaryURL) """ def get_image_joined_with_text_field(self, text_field): """Returns a new instance of itself that has a text_field (text_field.TextField). """ return FaksimileImage(file_name=self.file_name, local_path=self.local_path, URL=self.URL, height=self.height,\ width=self.width, x=self.x, y=self.y, text_field=text_field) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = super(FaksimileImage,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('apiURL', str, subPropertyOf=cls.HAS_URL)) + dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('original_thumbURL', str, subPropertyOf=cls.HAS_URL)) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('thumbURL', str, subPropertyOf=cls.HAS_URL)) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('mediumURL', str, subPropertyOf=cls.HAS_URL)) return cls.return_dictionary_after_updating_super_classes(dictionary) @staticmethod def CREATE_IMAGE(image_node, source_file=None): """Instantiates a FaksimileImage from a (lxml.etree.Element) image_node. 
""" namespaces = image_node.nsmap if len(namespaces) == 0: namespaces = { 'xlink': '' } local_path = image_node.get('{%s}href' % namespaces['xlink']) file_name = basename(local_path) if file_name != local_path and source_file is not None: local_path = realpath(dirname(source_file)) + sep + local_path local_path = realpath(local_path) if not isfile(local_path): local_path = None for path, dirs, files in os.walk(os.path.abspath(FAKSIMILE_LOCATION)): for filename in fnmatch.filter(files, file_name): local_path = os.path.join(path, filename) break URL = FaksimileImage.NIETZSCHE_SOURCES_URL + file_name.replace('.jpg','') height = float(image_node.get('height')) if bool(image_node.get('height')) else 0.0 width = float(image_node.get('width')) if bool(image_node.get('width')) else 0.0 x = float(image_node.get('x')) if bool(image_node.get('x')) else 0.0 y = float(image_node.get('y')) if bool(image_node.get('y')) else 0.0 matrix = Matrix(transform_matrix_string=image_node.get('transform'))\ if bool(image_node.get('transform'))\ else None return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y, matrix=matrix) Index: svgscripts/datatypes/matrix.py =================================================================== --- svgscripts/datatypes/matrix.py (revision 113) +++ svgscripts/datatypes/matrix.py (revision 114) @@ -1,348 +1,370 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to transform a svg/text[@transform] matrix-string into a matrix representation. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import re import math class Matrix: """ This class transforms a svg @transform matrix-string into a matrix representation. Args: transform_matrix_string (str) string of the form 'matrix(1.0 0.0 0.0 1.0 0.0 0.0)' or 'rotate(10)' """ A = 0 B = 1 C = 2 D = 3 E = 4 F = 5 XINDEX = 4 YINDEX = 5 MATRIX_LENGTH = 6 DOWN = 1 STRAIGHT = 0 UP = -1 def __init__(self, transform_matrix_string=None, transkription_field=None, matrix_list=[]): self.matrix = [ 0.0 for i in range(Matrix.MATRIX_LENGTH) ] if len(matrix_list) < 6 else matrix_list if transform_matrix_string is not None: m = re.search('(?<=rotate\()[-]*[0-9]+', transform_matrix_string) if m is not None: # transform='rotate(a)' to transform='matrix(cos(a), sin(a), -sin(a), cos(a), 0, 0)' angle = float(m.group(0)) self.matrix[Matrix.A] = round(math.cos(math.radians(angle)), 3) self.matrix[Matrix.B] = round(math.sin(math.radians(angle)), 3) self.matrix[Matrix.C] = round(math.sin(math.radians(angle))*-1, 3) self.matrix[Matrix.D] = round(math.cos(math.radians(angle)), 3) self.matrix[Matrix.E] = 0 self.matrix[Matrix.F] = 0 elif re.search(r'matrix\(\s*([-]*\d+([\.]*\d*(e-\d+)*)*[,\s][\s]*){5}[-]*\d+(\.\d+)*.*\s*\)', transform_matrix_string): #elif re.search(r'matrix\(\s*([-]*\d+(\.\d+(e-\d+)*)*[,\s][\s]*){5}[-]*\d+(\.\d+)*.*\s*\)', transform_matrix_string): #elif re.search(r'matrix\(\s*([-]*[0-9].*\s){5}[-]*[0-9].*\s*\)', transform_matrix_string): # old-> does not 
include comma separated matrix string self.matrix = [ float(i) for i in transform_matrix_string.replace('matrix(','').\ replace(', ', ',').replace(',', ' ').replace(')','').split(' ') ] else: raise Exception('Error: string "{}" is not a valid transform matrix string!'.format(transform_matrix_string)) if transkription_field is not None: self.matrix[Matrix.XINDEX] -= transkription_field.xmin self.matrix[Matrix.YINDEX] -= transkription_field.ymin if(len(self.matrix) < Matrix.MATRIX_LENGTH): raise Exception('Error: string "{}" is not a valid matrix string!'.format(transform_matrix_string)) def add2X(self, add_to_x=0): """Return x-value of matrix (float) + add_to_x. """ return self.matrix[Matrix.XINDEX] + float(add_to_x) def add2Y(self, add_to_y=0): """Return y-value of matrix (float) + add_to_y. """ return self.matrix[Matrix.YINDEX] + float(add_to_y) def getX(self): """Return x-value of matrix (float). """ return self.matrix[Matrix.XINDEX] def getY(self): """Return y-value of matrix (float). """ return self.matrix[Matrix.YINDEX] + def get90DegreeIndex(self) ->int: + """Return 0, 1 or 2 if the matrix is an exact rotation by 90, 180 or 270 degrees respectively, and -1 for every other matrix (no rotation, other angles, scaling, mirroring or the zero-initialised default matrix). + """ + if not self.isRotationMatrix(): + return -1 + elif self.matrix[self.B] == 1.0 and self.matrix[self.C] == -1.0: + return 0 + elif self.matrix[self.A] == -1.0 and self.matrix[self.B] == 0.0 and self.matrix[self.C] == 0.0 and self.matrix[self.D] == -1.0: + return 1 + elif self.matrix[self.B] == -1.0 and self.matrix[self.C] == 1.0: + return 2 + else: + return -1 + + def is_matrix_horizontal(self): """Returns whether matrix is horizontal. [:return:] True/False """ return self.matrix[Matrix.A] == 1 and self.matrix[Matrix.B] == 0 and self.matrix[Matrix.C] == 0 and self.matrix[Matrix.D] == 1 def get_new_x(self, x=0.0, y=0.0): """Returns new position of x. 
:return: (float) x """ top_left_x = x - self.matrix[self.E] if x != 0.0 else 0.0 top_left_y = y - self.matrix[self.F] if y != 0.0 else 0.0 return self.matrix[Matrix.A] * top_left_x + self.matrix[Matrix.C] * top_left_y + self.matrix[self.E] def get_new_y(self, x=0.0, y=0.0): """Returns new position of y. :return: (float) y """ top_left_x = x - self.matrix[self.E] if x != 0.0 else 0.0 top_left_y = y - self.matrix[self.F] if y != 0.0 else 0.0 return self.matrix[Matrix.B] * top_left_x + self.matrix[Matrix.D] * top_left_y + self.matrix[self.F] def get_old_x(self, x=0.0, y=0.0): """Returns old position of x. :return: (float) x """ old_x = (self.matrix[self.D]*x - self.matrix[Matrix.D]*self.matrix[Matrix.E] - self.matrix[Matrix.C]*y + self.matrix[Matrix.C]*self.matrix[Matrix.F])\ /(self.matrix[Matrix.A]*self.matrix[Matrix.D] - self.matrix[Matrix.B]*self.matrix[Matrix.C]) return self.add2X(old_x) def get_transformed_positions(self, x=0.0, y=0.0, width=0.0, height=0.0): """Returns transformed x, y, width and height. 
""" top_left_x = x top_left_y = y top_right_x = x + width top_right_y = y bottom_left_x = x bottom_left_y = y + height bottom_right_x = x + width bottom_right_y = y + height new_x = self.matrix[Matrix.A] * top_left_x + self.matrix[Matrix.C] * top_left_y + self.matrix[self.E] new_y = self.matrix[Matrix.B] * top_left_x + self.matrix[Matrix.D] * top_left_y + self.matrix[self.F] new_top_right_x = self.matrix[Matrix.A] * top_right_x + self.matrix[Matrix.C] * top_right_y + self.matrix[self.E] new_top_right_y = self.matrix[Matrix.B] * top_right_x + self.matrix[Matrix.D] * top_right_y + self.matrix[self.F] new_bottom_left_x = self.matrix[Matrix.A] * bottom_left_x + self.matrix[Matrix.C] * bottom_left_y + self.matrix[self.E] new_bottom_left_y = self.matrix[Matrix.B] * bottom_left_x + self.matrix[Matrix.D] * bottom_left_y + self.matrix[self.F] new_bottom_right_x = self.matrix[Matrix.A] * bottom_right_x + self.matrix[Matrix.C] * bottom_right_y + self.matrix[self.E] new_bottom_right_y = self.matrix[Matrix.B] * bottom_right_x + self.matrix[Matrix.D] * bottom_right_y + self.matrix[self.F] new_width = abs(new_top_right_x - new_x)\ if abs(new_top_right_x - new_x) >= abs(new_bottom_right_x - new_bottom_left_x)\ else abs(new_bottom_right_x - new_bottom_left_x) new_height = abs(new_bottom_left_y - new_y)\ if abs(new_bottom_left_y - new_y) >= abs(new_top_right_y - new_bottom_right_y)\ else abs(new_top_right_y - new_bottom_right_y) return new_x, new_y, new_width, new_height def clone_transformation_matrix(self): """Returns a matrix that contains only the transformation part. [:return:] (Matrix) a clone of this matrix """ return Matrix(matrix_list=self.matrix[0:4]+[0,0]) def isRotationMatrix(self): """Return whether matrix is a rotation matrix. """ return self.matrix[Matrix.A] < 1 or self.matrix[Matrix.B] != 0 def toCSSTransformString(self): """Returns the CSS3 transform string: 'rotate(Xdeg)' where X is the angle. 
""" angle = 0 if self.isRotationMatrix(): angle = int(round(math.degrees(math.asin(self.matrix[Matrix.B])), 0)) if angle == 0: angle = int(round(math.degrees(math.acos(self.matrix[Matrix.A])), 0)) return 'rotate({}deg)'.format(angle) def toString(self): """Returns a transform_matrix_string representation of the matrix. [:returns:] (str) 'matrix(X X X X X X)' """ return 'matrix(' + ' '.join([ str(round(x, 5)) for x in self.matrix ]) + ')' def get_rotation_direction(self): """Get rotation direction of rotation matrix. [:return:] (int) direction code Matrix.UP, Matrix.STRAIGHT, Matrix.DOWN """ if not self.isRotationMatrix(): return self.STRAIGHT else: angle = int(round(math.degrees(math.asin(self.matrix[Matrix.B])), 0)) return self.UP if angle < 0 else self.DOWN + def updateOffset(self, height: float, width: float): + """Update offset of matrix + """ + if self.matrix[self.B] != 0.0 and self.matrix[self.C] != 0.0: + self.matrix[self.E] = (height-width)/2 + self.matrix[self.F] = -1*(height-width)/2 + @staticmethod def IS_BENEATH_TF(matrix, transkription_field): """Returns true if matrix specifies a position beneath transkription_field. """ if matrix.getY() < transkription_field.ymax or matrix.getY() > transkription_field.documentHeight-10: return False if transkription_field.second_field is not None\ and matrix.getY() > transkription_field.second_field.ymin_without_title: return False return True @staticmethod def IS_IN_FOOTNOTE_AREA(transform_matrix_string, transkription_field, x=0.0, marginals_on_extra_page=False): """Returns true if matrix specifies a position that is part of the footnote area. 
text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ matrix = Matrix(transform_matrix_string=transform_matrix_string) if marginals_on_extra_page: return matrix.getY() < transkription_field.documentHeight-10\ and matrix.getY() > transkription_field.documentHeight/4\ and matrix.getX() + x > transkription_field.documentWidth/4\ and not Matrix.IS_IN_MARGIN_FIELD(transform_matrix_string, transkription_field, marginals_on_extra_page=True) if not Matrix.IS_BENEATH_TF(matrix, transkription_field): return False is_part = matrix.getX() + x > transkription_field.xmin\ if transkription_field.is_page_verso()\ else matrix.getX() + x > transkription_field.documentWidth/4 return is_part @staticmethod def NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, transkription_field, marginals_on_extra_page=False): """Returns true if matrix specifies a position that is part of the footnote area. text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ matrix = Matrix(transform_matrix_string=node.get('transform')) x = sorted([ float(x.get('x')) for x in node.getchildren()])[-1]\ if len(node.getchildren()) > 0 else 0.0 if marginals_on_extra_page: return matrix.getY() < transkription_field.documentHeight-10\ and matrix.getY() > transkription_field.documentHeight/4\ and matrix.getX() + x > transkription_field.documentWidth/4\ and not Matrix.IS_IN_MARGIN_FIELD(node.get('transform'), transkription_field, marginals_on_extra_page=True) if not Matrix.IS_BENEATH_TF(matrix, transkription_field): return False is_part = matrix.getX() + x > transkription_field.xmin\ if transkription_field.is_page_verso()\ else matrix.getX() + x > transkription_field.documentWidth/4 return is_part @staticmethod def IS_IN_MARGIN_FIELD(transform_matrix_string, transkription_field, marginals_on_extra_page=False): """Returns true if matrix specifies a position that is part of the margin field. 
text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ line_number_area_width = 15\ if transkription_field.line_number_area_width == 0.0\ else transkription_field.line_number_area_width matrix = Matrix(transform_matrix_string=transform_matrix_string) if matrix.getY() < transkription_field.ymin or matrix.getY() > transkription_field.ymax: return False if marginals_on_extra_page: return matrix.getX() > transkription_field.xmax is_part = matrix.getX() < transkription_field.xmin - line_number_area_width\ if transkription_field.is_page_verso()\ else matrix.getX() > transkription_field.xmax + line_number_area_width return is_part @staticmethod def IS_IN_PLACE_OF_PRINTING_AREA(transform_matrix_string, transkription_field): """Returns true if matrix specifies a position that is part of the area where the places of printing ('Druckorte') are printed. text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ matrix = Matrix(transform_matrix_string=transform_matrix_string) if not Matrix.IS_BENEATH_TF(matrix, transkription_field): return False is_part = matrix.getX() < transkription_field.xmin\ if transkription_field.is_page_verso()\ else matrix.getX() < transkription_field.documentWidth/4 return is_part @staticmethod def IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=None, matrix=None): """Returns true if matrix specifies a position that is part of transkription field. 
text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ if matrix is None and not bool(text_node.get('transform')): return False if matrix is None: matrix = Matrix(transform_matrix_string=text_node.get('transform')) is_part = matrix.getX() > transkription_field.xmin and matrix.getX() < transkription_field.xmax\ and matrix.getY() > transkription_field.ymin and matrix.getY() < transkription_field.ymax #if not is_part and matrix.isRotationMatrix() and len([child.text for child in text_node.getchildren() if not re.match(r'^\s*$', child.text)]) > 0: if not is_part and len([child.text for child in text_node.getchildren() if not re.match(r'^\s*$', child.text)]) > 0: first_tspan_node = [ child for child in text_node.getchildren() if not re.match(r'^\s*$', child.text)][0] x = matrix.add2X(float(first_tspan_node.get('x'))) y = matrix.add2Y(float(first_tspan_node.get('y'))) new_x = matrix.get_new_x(x=x, y=y) new_y = matrix.get_new_y(x=x, y=y) return new_x > transkription_field.xmin and new_x < transkription_field.xmax\ and new_y > transkription_field.ymin and new_y < transkription_field.ymax return is_part @staticmethod def IS_NEARX_TRANSKRIPTION_FIELD(transform_matrix_string, transkription_field, diffx=20.0): """Returns true if matrix specifies a position that is on its x axis near the transkription_field. transform_matrix_string (str): string from which to init Matrix. transkription_field (svgscripts.TranskriptionField) diffx (float): defines threshold for positions that count as near. 
""" matrix = Matrix(transform_matrix_string=transform_matrix_string) MINLEFT = transkription_field.xmin - diffx MAXRIGHT = transkription_field.xmax + diffx return matrix.getY() > transkription_field.ymin and matrix.getY() < transkription_field.ymax\ and ((matrix.getX() > MINLEFT and matrix.getX() < transkription_field.xmin)\ or (matrix.getX() > transkription_field.xmax and matrix.getX() < MAXRIGHT)) @staticmethod def DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b, diff_threshold=0.001): """Returns whether the conversion factors (a-d) differ more than diff_threshold. """ if matrix_a is None or matrix_b is None: return not (matrix_a is None and matrix_b is None) return abs(matrix_a.matrix[Matrix.A] - matrix_b.matrix[Matrix.A]) > diff_threshold\ or abs(matrix_a.matrix[Matrix.B] - matrix_b.matrix[Matrix.B]) > diff_threshold\ or abs(matrix_a.matrix[Matrix.C] - matrix_b.matrix[Matrix.C]) > diff_threshold\ or abs(matrix_a.matrix[Matrix.D] - matrix_b.matrix[Matrix.D]) > diff_threshold def __eq__(self, other): """Return self.matrix == other.matrix. """ if other is None: return False return self.matrix == other.matrix def __hash__(self): """Return hash value. """ return hash((self.matrix[Matrix.E], self.matrix[Matrix.F])) Index: svgscripts/datatypes/manuscript_description.py =================================================================== --- svgscripts/datatypes/manuscript_description.py (revision 0) +++ svgscripts/datatypes/manuscript_description.py (revision 114) @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" This class can be used to represent a manuscript description. +""" +# Copyright (C) University of Basel 2020 {{{1 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
class ManuscriptDescription(Description):
    """
    This class represents a manuscript description, i.e. the description of the folio.
    """
    ROOT_TAG = 'currentDescription'
    XML_TAG = 'description'
    # Group 2 is the keyword that opens the sentence about the original
    # constitution.  Group 1 is greedy, so group 2 is the LAST keyword that
    # follows a full stop.
    ORIGINALLY_PATTERN = re.compile(r'(.*\.\s*)(Der|Als|Aus|Ursprünglich)(\s.*)')

    def __init__(self, content: str, standoff_markups=None, originally=None, id=0):
        """Create a manuscript description.

        content (str) the description text
        standoff_markups passed on to Description
        originally (str) the part of the description that is about the original constitution
        id passed on to Description
        """
        super(ManuscriptDescription,self).__init__(content, standoff_markups=standoff_markups, id=id)
        self.originally = originally

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = super(ManuscriptDescription,cls).get_semantic_dictionary()
        # NOTE(review): the label below contains the typo 'origianl'; it is
        # left unchanged here because the string is emitted at runtime.
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('originally', str,\
                name='leafHasDescriptionAboutOriginalConstitution', label='leaf has description about origianl constitution',\
                comment='There is a description of the original constitution of the corresponding leaf.'))
        return cls.return_dictionary_after_updating_super_classes(dictionary)

    @classmethod
    def create_cls_from_raw_text(cls, text: str):
        """Initialize a cls from raw text.

        If the text matches ORIGINALLY_PATTERN, everything from the matched
        keyword to the end becomes ``originally`` and only the text before it
        is used as content.  Every '+' in the content is replaced by 'x'.

        [:return:] cls
        """
        originally = None
        m = re.match(cls.ORIGINALLY_PATTERN, text)
        if m is not None:
            # Use the position of the matched keyword itself: searching the
            # text for the keyword string would find its FIRST occurrence,
            # which may lie before the one the pattern matched.
            startIndex = m.start(2)
            originally = text[startIndex:]
            # NOTE(review): as it appears here both arguments are a single
            # space; the first may have been another whitespace sequence
            # originally -- confirm against the repository.
            text = text[:startIndex].replace(' ', ' ').strip()
        return cls(text.replace('+', 'x'), originally=originally)
class CSVFileCreator:
    """This class can be used in order to create tei files from the rows of a csv file.

    The constants FOLIO, LABEL, ALIAS, URL and CONTENT are the names of the csv columns.
    """
    UNITTESTING = False
    FOLIO = "Blatt-ID"
    LABEL = "Label"
    ALIAS = "Alias"
    URL = "Nietzschesource-URL"
    CONTENT = "Inhalt"

    def __init__(self, template, svg_dir='./svg', content_dir='./text', target_dir='./tmp'):
        """Parse the tei template and remember the working directories.

        template (str) file name of the tei template
        svg_dir (str) directory of the faksimile svg files
        content_dir (str) directory of the content files
        target_dir (str) directory the tei files are written to
        """
        self.template = template
        self.tei_tree = ET.parse(self.template)
        # The default namespace has no prefix; it is addressed as 'ns' in the xpaths.
        self.namespaces = { k if k is not None else 'ns': v for k, v in self.tei_tree.getroot().nsmap.items() }
        self.namespaces.update({'xml': 'http://www.w3.org/XML/1998/namespace'})
        self.content_dir = content_dir
        self.svg_dir = svg_dir
        self.target_dir = target_dir

    def _add_content(self, body: ET.Element, xml_id: str, page_number: str, contentFiles=None):
        """Replace the children of body by one div with a page break and the content.

        The root element of each file in contentFiles is appended to the div.
        """
        # Iterate over a snapshot, since the children are removed while looping.
        for item in list(body):
            body.remove(item)
        if contentFiles is None:
            contentFiles = []
        div = ET.SubElement(body, 'div')
        pb = ET.SubElement(div, 'pb')
        pb.set('facs', '#' + xml_id)
        pb.set('n', page_number)
        for contentFile in contentFiles:
            contentTree = ET.parse(contentFile)
            self._mark_greek_as_foreign_lang(contentTree.xpath('//hi[@class="greek"]'))
            div.append(contentTree.getroot())

    def _mark_greek_as_foreign_lang(self, greekWords: list):
        """Turn each passed element into a 'foreign' element with xml:lang 'grc'.

        The 'class' attribute is removed if present.
        """
        for greekWord in greekWords:
            greekWord.attrib.pop('class', None)
            greekWord.tag = 'foreign'
            greekWord.attrib[ET.QName(self.namespaces['xml'],'lang')] = 'grc' # see www.iana.org/assignments/language-subtag-registry/language-subtag-registry

    def _get_faksimile_page(self, svg_file) ->FaksimilePage:
        """Return the faksimile page of a faksimile image.

        If there are several pages, the one with the highest text field is
        returned; None if there is no page at all.
        """
        faksimile_pages = FaksimilePage.get_faksimile_pages(svg_file, isBlank=True)
        if len(faksimile_pages) == 1:
            return faksimile_pages[0]
        elif len(faksimile_pages):
            return sorted(faksimile_pages, key=lambda f: f.text_field.height)[-1]
        return None

    def _get_element(self, xpath: str, firstElementOnly=True) ->ET.Element:
        """Return element(s) and warn if xpath results in empty list.

        [:return:] the first element, or the list of all elements if
        firstElementOnly is False; None if nothing has been found.
        """
        elements = self.tei_tree.xpath(xpath, namespaces=self.namespaces)
        if len(elements) == 0:
            msg = f'There are no elements for xpath {xpath}!'
            warnings.warn(msg)
            return None
        if firstElementOnly:
            return elements[0]
        return elements

    def _set_idnos(self, idnos: list, contentDict: dict):
        """Set idno of type URI, SpN, Blatt-ID

        Each idno gets the value that contentDict holds for its 'type'
        attribute; idnos without a (non-empty) value are left untouched.
        """
        for idno in idnos:
            if bool(contentDict.get(idno.get('type'))):
                idno.text = contentDict[idno.get('type')]

    def _write_file(self, target_file: str, faksimile_page: FaksimilePage, folioId: str, label: str, alias: str, url: str, content: str, contentFiles=None) ->int:
        """Write data to target_file

        The parameter url is not used by this method.

        [:return:] the status of write_tei, or 2 if the template lacks one of
        the required elements or if alias contains no comma.
        """
        title = self._get_element('/ns:TEI/ns:teiHeader/ns:fileDesc/ns:titleStmt/ns:title')
        idnos = self._get_element('/ns:TEI/ns:teiHeader/ns:fileDesc/ns:sourceDesc/ns:msDesc/ns:msIdentifier/ns:idno', firstElementOnly=False)
        objectDesc = self._get_element('/ns:TEI/ns:teiHeader/ns:fileDesc/ns:sourceDesc/ns:msDesc/ns:physDesc/ns:objectDesc')
        faksimile = self._get_element('/ns:TEI/ns:facsimile')
        body = self._get_element('/ns:TEI/ns:text/ns:body')
        if title is not None and idnos is not None and objectDesc is not None and faksimile is not None and ',' in alias and body is not None:
            xml_id = label.replace('.jpg','')
            # The page number is the part of the alias after the first comma.
            page_number = alias[alias.index(',')+1:].lstrip()
            title.text = alias
            objectDesc.text = content
            self._set_idnos(idnos, { 'URI': faksimile_page.faksimile_image.nietzsche_source, 'SpN': alias, 'Blatt-ID': folioId })
            faksimile_page.attach_to_tei_facsimile(faksimile, xml_id, page_number)
            self._add_content(body, xml_id, page_number, contentFiles=contentFiles)
            return write_tei(self.tei_tree, target_file, namespaces=self.namespaces)
        return 2

    def create_tei_file(self, folioId, label, alias, url, content) ->int:
        """Create a tei file from csv input.

        [:return:] 0 if the file has been written, 1 if the target file
        exists already, 2 if the svg file or its faksimile page is missing or
        if writing failed.
        """
        target_file = self.target_dir + sep + alias.replace(', ', '_page').replace(' ', '_') + '.xml'
        if isfile(target_file):
            return 1
        svg_file = self.svg_dir + sep + label.replace('.jpg','.svg')
        if not isfile(svg_file):
            return 2
        faksimile_page = self._get_faksimile_page(svg_file)
        if faksimile_page is None:
            msg = f'There is no faksimile_page in {svg_file} for {label}, {alias}!'
            warnings.warn(msg)
            return 2
        contentFiles = []
        if 'GM' in content:
            try:
                contentId = content[content.index('GM'):].replace(' ', '-')\
                        if ',' not in content\
                        else content[content.index('GM'):content.index(',')].replace(' ', '-')
                if isfile(self.content_dir + sep + contentId + '.txt'):
                    contentFiles.append(self.content_dir + sep + contentId + '.txt')
                elif len(contentId.split('-')) > 1 and re.match(r'\d', contentId.split('-')[-2]):
                    # The id ends with a range 'N-M': collect one file per number.
                    split = contentId.split('-')
                    for i in range(int(split[-2]), int(split[-1])+1):
                        contentFile = self.content_dir + sep + contentId[:contentId.index('-'+split[-2])] + '-' + str(i) + '.txt'
                        if isfile(contentFile):
                            contentFiles.append(contentFile)
            except ValueError:
                msg = f'There has been an error with content {content} of {label}, {alias}'
                warnings.warn(msg)
        # Pass the writing status on instead of reporting success unconditionally.
        return self._write_file(target_file, faksimile_page, folioId, label, alias, url, content, contentFiles=contentFiles)
def write_tei(tei_tree: ET.ElementTree, target_file: str, namespaces=None) ->int:
    """Serialize tei_tree and write it to target_file.

    All namespaces are registered first, the prefix 'ns' as default
    namespace.  Nothing is written if CSVFileCreator.UNITTESTING is set.

    [:return:] (int) 0
    """
    if namespaces is None:
        namespaces = { ('ns' if prefix is None else prefix): uri\
                for prefix, uri in tei_tree.getroot().nsmap.items() }
    for prefix in namespaces.keys():
        registered_prefix = '' if prefix == 'ns' else prefix
        XET.register_namespace(registered_prefix, namespaces[prefix])
    serialized_tree = ET.tostring(tei_tree, pretty_print=True)
    target_tree = XET.ElementTree(XET.fromstring(serialized_tree))
    if CSVFileCreator.UNITTESTING:
        return 0
    target_tree.write(target_file, encoding="UTF-8", xml_declaration=True)
    return 0