Index: svgscripts/convert_wordPositions.py
===================================================================
--- svgscripts/convert_wordPositions.py    (revision 98)
+++ svgscripts/convert_wordPositions.py    (revision 99)
@@ -1,530 +1,557 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}

import cairosvg
import getopt
import json
from lxml.html import builder as E
from lxml.html import open_in_browser
import lxml
from pathlib import Path as PathLibPath
from os import sep, listdir, mkdir, path, remove
from os.path import exists, isfile, isdir, dirname
import re
import sys
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET

if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))

from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.writing_process import WritingProcess
from datatypes.word import Word

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

class Converter:
    """The converter super class.
    """

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        self.page = page
        self.non_testing = non_testing
        self.show_word_insertion_mark = show_word_insertion_mark

    def _get_transkription_positions(self, transkription_positions, stage_version=''):
        """Returns the transkription_positions of the indicated stage_version.
        """
        convertable_transkription_positions = transkription_positions
        if stage_version != '':
            convertable_transkription_positions = []
            if re.match(r'^\d$', stage_version):
                writing_process_id = int(stage_version)
                for transkription_position in transkription_positions:
                    if transkription_position.writing_process_id == writing_process_id:
                        convertable_transkription_positions.append(transkription_position)
            elif re.match(r'^\d\+$', stage_version):
                version_range = [ *range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)) ]
                for transkription_position in transkription_positions:
                    if transkription_position.writing_process_id in version_range:
                        convertable_transkription_positions.append(transkription_position)
            elif re.match(r'^\d\-\d$', stage_version):
                start_stop = [ int(i) for i in re.split(r'-', stage_version) ]
                version_range = [ *range(start_stop[0], start_stop[1]+1) ]
                for transkription_position in transkription_positions:
                    if transkription_position.writing_process_id in version_range:
                        convertable_transkription_positions.append(transkription_position)
        return convertable_transkription_positions
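The stage_version argument accepts three forms — a single digit ('1'), an open range ('1+'), and a closed range ('0-2') — which the method above maps to sets of writing_process_ids. A standalone sketch of that parsing, not part of the diff (the helper name and the fixed stage count are illustrative assumptions):

import re

NUM_STAGES = 3  # assumption: len(WritingProcess.VERSION_DESCRIPTION) == 3

def stage_version_to_ids(stage_version: str) -> list:
    """Map '1' -> [1], '1+' -> [1, 2], '0-2' -> [0, 1, 2]."""
    if re.match(r'^\d$', stage_version):
        return [ int(stage_version) ]
    if re.match(r'^\d\+$', stage_version):
        return [ *range(int(stage_version.replace('+', '')), NUM_STAGES) ]
    if re.match(r'^\d\-\d$', stage_version):
        start, stop = ( int(i) for i in stage_version.split('-') )
        return [ *range(start, stop + 1) ]
    return []

assert stage_version_to_ids('1+') == [1, 2]
assert stage_version_to_ids('0-2') == [0, 1, 2]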
""" return highlighted_words if highlighted_words is not None else words def convert(self, output_file=None, stage_version='', highlighted_words=None): """Prints all words. """ first_word_of_line = None out = sys.stdout if output_file is not None: out = open(output_file, 'w') for word in self.page.words: if first_word_of_line is None or first_word_of_line.line_number != word.line_number: out.write('\n') first_word_of_line = word if word.line_number % 2 == 0: out.write(str(word.line_number).zfill(2) + ' ') else: out.write(' ') if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0: if word.text is not None: out.write(word.text + ' ') out.close() return 0 @classmethod def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False, key=''): """Returns a converter of type converter_type. [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None """ cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() } cls_key = converter_type + 'Converter' if bool(cls_dict.get(cls_key)): converter_cls = cls_dict[cls_key] if converter_cls == JSONConverter: return converter_cls(page, non_testing, key=key) return converter_cls(page, non_testing, show_word_insertion_mark) else: return Converter(page, non_testing, show_word_insertion_mark) class JSONConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a json file. """ PY2TS_DICT = { float: 'number', int: 'number', bool: 'boolean', str: 'string' } def __init__(self, page, non_testing=True, key=''): Converter.__init__(self, page, non_testing, False) self.key = key self.interface_output_dir = PathLibPath('ts_interfaces') if not self.interface_output_dir.is_dir(): self.interface_output_dir.mkdir() elif len(list(self.interface_output_dir.glob('*.ts'))) > 0: for ts_file in self.interface_output_dir.glob('*.ts'): remove(ts_file) def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to JSON. """ if output_file is None: output_file = 'output.json' class_dict = {} if self.key != '': object_instance = self.page.__dict__.get(self.key) if object_instance is not None: json_dict = self.add_object2dict(object_instance, class_dict) if type(json_dict) == list: json_dict = { self.key : json_dict } else: print(f'Page initialized from {self.page.page_tree.docinfo.URL} does not have an object at "{self.key}"!') return 2 else: json_dict = self.add_object2dict(self.page, class_dict) json_file = open(output_file, "w+") try: json.dump(json_dict, json_file) except Exception: raise Exception('Error in json.dump') json_file.close() self.create_imports(class_dict) return 0 def add_object2dict(self, object_instance, class_dict): """Add an object to json_dict and generate json data and interfaces. 
    def add_object2dict(self, object_instance, class_dict):
        """Add an object to json_dict and generate json data and interfaces.

        [:return:] json dict or object_instance
        """
        json_dict = {}
        interface_list = []
        object_type = type(object_instance)
        if object_type.__module__ == 'builtins':
            if object_type != list:
                return object_instance
            else:
                items = []
                for item in object_instance:
                    items.append(self.add_object2dict(item, class_dict))
                if len(items) > 0:
                    return { self.key: items }
                else:
                    return { self.key: 'null' }
        semantic_dictionary = object_type.get_semantic_dictionary()
        for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]:
            content = object_instance.__dict__.get(key)
            if content_type == list\
               and content is not None\
               and len(content) > 0\
               and type(content[0]).__module__ != 'builtins':
                content_list = []
                for content_item in content:
                    content_list.append(self.add_object2dict(content_item, class_dict))
                json_dict.update({key: content_list})
                interface_list.append(f'{key}: {type(content[0]).__name__}[];')
            elif content_type.__module__ == 'builtins':
                if content_type != list:
                    ts_type = self.PY2TS_DICT[content_type]\
                              if content_type in self.PY2TS_DICT.keys()\
                              else 'string'
                    interface_list.append(f'{key}: {ts_type};')
                    json_dict.update({key: content})
            else:
                if content is not None and type(content) == list:
                    interface_list.append(f'{key}: {content_type.__name__}[];')
                    content_list = []
                    for content_item in content:
                        content_list.append(self.add_object2dict(content_item, class_dict))
                    json_dict.update({key: content_list})
                else:
                    interface_list.append(f'{key}: {content_type.__name__};')
                    if content is not None:
                        json_dict.update({key: self.add_object2dict(content, class_dict)})
        if object_type not in class_dict.keys():
            class_dict.update({object_type: self.create_interface(object_type.__name__, interface_list)})
        return json_dict

    def create_imports(self, class_dict):
        """Create a ts file importing all generated interfaces.

        [:return:] file_name of the import file
        """
        ts_file = PathLibPath('ts_imports.ts')
        file = open(ts_file, "w+")
        file.write(f'//import all interfaces from {self.interface_output_dir} ' + '\n')
        for interface_name, path_name in class_dict.items():
            file.write('import {' + interface_name.__name__ + '} from \'./' + str(self.interface_output_dir.joinpath(path_name.stem)) + '\';\n')
        file.close()
        return ts_file

    def create_interface(self, class_name, interface_list) -> PathLibPath:
        """Create a ts interface from a list of key and content_types.

        [:return:] file_name of interface
        """
        ts_file = self.interface_output_dir.joinpath(PathLibPath(f'{class_name.lower()}.ts'))
        import_list = [ import_class_name for import_class_name in\
                        [ import_class_name.split(': ')[1].replace(';','').replace('[]','') for import_class_name in interface_list ]\
                        if import_class_name not in set(self.PY2TS_DICT.values()) ]
        file = open(ts_file, "w")
        for import_class_name in set(import_list):
            file.write('import {' + import_class_name + '} from \'./' + import_class_name.lower() + '\';\n')
        file.write(f'export interface {class_name} ' + '{\n')
        for interface_string in interface_list:
            file.write('\t' + interface_string + '\n')
        file.write('}')
        file.close()
        return ts_file
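The generated TypeScript mirrors the Python property types via PY2TS_DICT (float/int → number, bool → boolean, str → string), falling back to string. A toy example of the emitted interface text (the Word fields and the flat property table are illustrative; the real code reads them from get_semantic_dictionary()):

PY2TS_DICT = { float: 'number', int: 'number', bool: 'boolean', str: 'string' }

# Hypothetical property table in the spirit of get_semantic_dictionary()['properties'].
properties = { 'id': int, 'text': str, 'deleted': bool }

lines = [ f'{key}: {PY2TS_DICT.get(py_type, "string")};' for key, py_type in properties.items() ]
print('export interface Word {\n\t' + '\n\t'.join(lines) + '\n}')
# export interface Word {
#     id: number;
#     text: string;
#     deleted: boolean;
# }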
""" BG_COLOR = 'yellow' OPACITY = '0.2' def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY): Converter.__init__(self, page, non_testing, show_word_insertion_mark) self.bg_color = bg_color self.opacity = opacity def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to SVG """ title = self.page.title if(self.page.title is not None) else 'Test Page' title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title svg_file = self.page.svg_file if svg_file is None and self.page.svg_image is not None: svg_file = self.page.svg_image.file_name elif svg_file is None: msg = f'ERROR: xml_source_file {self.page.docinfo.URL} does neither have a svg_file nor a svg_image!' raise Exception(msg) transkription_field = TranskriptionField(svg_file) if bool(transkription_field.get_svg_attributes('xmlns')): ET.register_namespace('', transkription_field.get_svg_attributes('xmlns')) if bool(transkription_field.get_svg_attributes('xmlns:xlink')): ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink')) svg_tree = ET.parse(svg_file) transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'}) colors = [ 'yellow', 'orange' ] if self.bg_color == self.BG_COLOR else [ self.bg_color ] if highlighted_words is not None: colors = ['yellow'] else: highlighted_words = [] color_index = 0 for word in self.page.words: word_id = 'word_' + str(word.id) for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version): transkription_position_id = word_id + '_' + str(transkription_position.id) color = colors[color_index] if word not in highlighted_words else self.bg_color rect_node = ET.SubElement(transkription_node, 'rect',\ attrib={'id': transkription_position_id, 'x': str(transkription_position.left + transkription_field.xmin),\ 'y': str(transkription_position.top + transkription_field.ymin), 'width': str(transkription_position.width),\ 'height': str(transkription_position.height), 'fill': color, 'opacity': self.opacity}) if transkription_position.transform is not None: matrix = transkription_position.transform.clone_transformation_matrix() matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3) matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3) rect_node.set('transform', matrix.toString()) rect_node.set('x', str(round(transkription_position.left - transkription_position.transform.matrix[Matrix.XINDEX], 3))) rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3))) ET.SubElement(rect_node, 'title').text = word.text color_index = (color_index + 1) % len(colors) if output_file is not None: svg_tree.write(output_file) return 0 class HTMLConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file. 
""" CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; } .highlight1 { background-color: pink; opacity: 0.2; } + .highlight2 { background-color: red; opacity: 0.2; } .foreign { background-color: blue; opacity: 0.4; } + .overwritten { background-color: green; opacity: 0.4; } .word-insertion-mark { background-color: orange; opacity: 0.2; } .deleted { background-color: grey; opacity: 0.2; } """ def __init__(self, page, non_testing=True, show_word_insertion_mark=False): Converter.__init__(self, page, non_testing, show_word_insertion_mark) def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to HTML """ title = self.page.title if(self.page.title is not None) else 'Test Page' title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title if stage_version != '': title = title + ', Schreibstufe: ' + stage_version if self.page.svg_image is not None: width = self.page.svg_image.width height = self.page.svg_image.height svg_file = self.page.svg_image.file_name elif self.page.svg_file is not None: svg_file = self.page.svg_file transkription_field = TranskriptionField(svg_file) width = transkription_field.getWidth() height = transkription_field.getHeight() style_content = ' position: relative; width: {}px; height: {}px; background-image: url("{}"); background-size: {}px {}px '\ .format(width, height, path.abspath(svg_file), width, height) style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS) head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style) transkription = E.DIV(id="transkription") counter = 0 for word in self.page.words: highlight_class = 'highlight' + str(counter)\ if not word.deleted else 'deleted' + if highlighted_words is not None\ + and word in highlighted_words: + highlight_class = 'highlight2' earlier_text = '' if word.earlier_version is None else word.earlier_version.text if earlier_text == '' and len(word.word_parts) > 0: earlier_versions = [ word for word in word.word_parts if word.earlier_version is not None ] earlier_text = earlier_versions[0].text if len(earlier_versions) > 0 else '' if earlier_text != '': word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(word.id), str(word.line_number), earlier_text, word.text) - if word.edited_text is not None: - word_title += f'\n{word.edited_text}' else: word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text) + if word.edited_text is not None: + word_title += f'\n>{word.edited_text}' for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version): self._append2transkription(transkription, highlight_class, word_title, transkription_position) + if word.overwrites_word is not None and word.word_box is None: + overwritten_title = f'{word.text} overwrites {word.overwrites_word.text}' + for overwritten_transkription_position in word.overwrites_word.transkription_positions: + self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position) for part_word in word.word_parts: highlight_class = 'highlight' + str(counter)\ if not part_word.deleted else 'deleted' for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version): self._append2transkription(transkription, highlight_class, word_title, part_transkription_position) + if part_word.overwrites_word is not None: + overwritten_title = f'{word.text} overwrites 
+               if part_word.overwrites_word is not None:
+                   overwritten_title = f'{word.text} overwrites {part_word.overwrites_word.text}'
+                   for overwritten_transkription_position in part_word.overwrites_word.transkription_positions:
+                       self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
            counter = (counter + 1) % 2
        word_insertion_mark_class = 'word-insertion-mark'
        counter = 0
        for mark_foreign_hands in self.page.mark_foreign_hands:
            highlight_class = 'foreign'
            title = 'id: {}/line: {}\n{} {}'.format(str(mark_foreign_hands.id), str(word.line_number),\
                    mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen)
            for transkription_position in mark_foreign_hands.transkription_positions:
                self._append2transkription(transkription, highlight_class, title, transkription_position)
        if self.show_word_insertion_mark:
            for word_insertion_mark in self.page.word_insertion_marks:
                wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
                style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                        word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
                link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content)
                transkription.append(link)
        html = E.HTML(head, E.BODY(transkription))
        bool(self.non_testing) and open_in_browser(html)
        if output_file is not None:
            with open(output_file, 'wb') as f:
                f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))
            f.closed
        return 0

    def _append2transkription(self, transkription, highlight_class, title, transkription_position):
        """Append content to transkription-div.
        """
        style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                transkription_position.top, transkription_position.left, transkription_position.width, transkription_position.height)
        if transkription_position.transform is not None:
            style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString())
            transform_origin_x = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1\
                    if (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1 < 0 else 0
            style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height)
        link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content)
        transkription.append(link)

def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR):
    """Creates a pdf file highlighting some words.
    """
    if not pdf_file_name.endswith('pdf'):
        pdf_file_name = pdf_file_name + '.pdf'
    tmp_svg_file = pdf_file_name.replace('.pdf', '.svg')
    create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page, highlighted_words=highlighted_words,\
            svg_file_name=tmp_svg_file, bg_color=bg_color)
    if isfile(tmp_svg_file):
        cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
        remove(tmp_svg_file)
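_append2transkription places each word as an absolutely positioned, semi-transparent anchor on top of the page image, so hovering reveals the word's id, line, and text via the title attribute. A minimal sketch of the same overlay idea with lxml's HTML builder (dimensions invented):

from lxml.html import builder as E
import lxml.html

transkription = E.DIV(id="transkription")  # relatively positioned container
style = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(50, 100, 40, 12)
link = E.A(' ', E.CLASS('highlight0'), title='id: 7/line: 3\nWort', style=style)
transkription.append(link)
print(lxml.html.tostring(transkription, pretty_print=True).decode())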
""" if page is None and xml_source_file is not None: page = Page(xml_source_file) converter = SVGConverter(page, bg_color=bg_color) if not svg_file_name.endswith('svg'): svg_file_name = svg_file_name + '.svg' converter.convert(output_file=svg_file_name, highlighted_words=highlighted_words) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes. svgscripts/convert_wordPositions.py OPTIONS OPTIONS: -h|--help: show help -H|--HTML [default] convert to HTML test file -k|--key=key option for json converter: only convert object == page.__dict__[key] -o|--output=outputFile save output to file outputFile -P|--PDF convert to PDF test file -S|--SVG convert to SVG test file -s|--svg=svgFile: svg web file -T|--TEXT convert to TEXT output - -t|--testing execute in test mode, do not write to file or open browser + -t|--text=text highlight word -w|--word-insertion-mark show word insertion mark on HTML -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. } + -x|--testing execute in test mode, do not write to file or open browser :return: exit code (int) """ convert_to_type = None - svg_file = None - output_file = None + key = '' non_testing = True - show_word_insertion_mark = False + output_file = None page = None + show_word_insertion_mark = False stage_version = '' - key = '' + svg_file = None + text = None try: - opts, args = getopt.getopt(argv, "hk:tHPSTws:o:v:", ["help", "key=", "testing", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version="]) + opts, args = getopt.getopt(argv, "hk:t:HPSTws:o:v:x", ["help", "key=", "text=", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version=", "testing"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 elif opt in ('-v', '--version'): if re.match(r'^(\d|\d\+|\d\-\d)$', arg): stage_version = arg else: raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg)) elif opt in ('-w', '--word-insertion-mark'): show_word_insertion_mark = True elif opt in ('-P', '--PDF'): convert_to_type = 'PDF' elif opt in ('-S', '--SVG'): convert_to_type = 'SVG' elif opt in ('-T', '--TEXT'): convert_to_type = 'TEXT' elif opt in ('-H', '--HTML'): convert_to_type = 'HTML' - elif opt in ('-t', '--testing'): + elif opt in ('-x', '--testing'): non_testing = False elif opt in ('-s', '--svg'): svg_file = arg elif opt in ('-o', '--output'): output_file = arg elif opt in ('-k', '--key'): key = arg + elif opt in ('-t', '--text'): + text = arg + print(arg) if len(args) < 1: usage() return 2 if convert_to_type is None: if output_file is not None and len(re.split(r'\.', output_file)) > 1: output_file_part_list = re.split(r'\.', output_file) convert_to_type = output_file_part_list[len(output_file_part_list)-1].upper() else: convert_to_type = 'HTML' exit_code = 0 for word_position_file in args: if not isfile(word_position_file): print("'{}' does not exist!".format(word_position_file)) return 2 if convert_to_type == 'PDF': if output_file is None: output_file = 'output.pdf' - create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file) + highlighted_words = None + if text is not None: + page = Page(word_position_file) + highlighted_words = [ word for word in page.words if word.text == text ] + 
    for word_position_file in args:
        if not isfile(word_position_file):
            print("'{}' does not exist!".format(word_position_file))
            return 2
        if convert_to_type == 'PDF':
            if output_file is None:
                output_file = 'output.pdf'
-           create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file)
+           highlighted_words = None
+           if text is not None:
+               page = Page(word_position_file)
+               highlighted_words = [ word for word in page.words if word.text == text ]
+           create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file, highlighted_words=highlighted_words)
        else:
            if svg_file is not None:
                if isfile(svg_file):
                    page = PageCreator(word_position_file, svg_file=svg_file)
                else:
                    print("'{}' does not exist!".format(word_position_file))
                    return 2
            else:
                page = Page(word_position_file)
                if page.svg_file is None:
                    print('Please specify a svg file!')
                    usage()
                    return 2
+           highlighted_words = None
+           if text is not None:
+               highlighted_words = [ word for word in page.words if word.text == text ]
+               print([ (word.id, word.text) for word in highlighted_words ])
            converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark, key=key)
-           exit_code = converter.convert(output_file=output_file, stage_version=stage_version)
+           exit_code = converter.convert(output_file=output_file, stage_version=stage_version, highlighted_words=highlighted_words)
    return exit_code

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
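With the repurposed -t/--text option, revision 99 lets callers highlight every occurrence of a word; -x now carries the old testing behaviour. A hypothetical invocation from Python (file and word are invented), equivalent to running `convert_wordPositions.py -t Nietzsche -o out.html page.xml` from the shell:

# Hypothetical call; 'page.xml' must be an existing svgWordPosition file.
import convert_wordPositions

exit_code = convert_wordPositions.main(['-t', 'Nietzsche', '-o', 'out.html', 'page.xml'])
# All words whose text equals 'Nietzsche' get the red .highlight2 class in out.html.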
Index: svgscripts/compare_faksimile_words_line_wise.py
===================================================================
--- svgscripts/compare_faksimile_words_line_wise.py    (revision 98)
+++ svgscripts/compare_faksimile_words_line_wise.py    (revision 99)
@@ -1,624 +1,626 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}

from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path
from progress.bar import Bar
import warnings

if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))

from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.word_position import WordPosition
from datatypes.faksimile_position import FaksimilePosition
from datatypes.word import Word
from datatypes.lineNumber import LineNumber
from datatypes.page import Page, STATUS_MERGED_OK
from datatypes.transkriptionField import TranskriptionField
from interactive_merger import LineComposer, InteractiveMergerShell, ManualMergerShell
from join_faksimileAndTranskription import get_filelist_and_manuscript_file, sort_faksimile_positions, sort_words
from process_files import update_svgposfile_status
from process_words_post_merging import post_merging_processing_and_saving
from util import ExternalViewer, create_highlighted_svg_file, get_empty_node_ids, record_changes,\
     record_changes_on_svg_file_to_page, record_changes_on_xml_file_to_page, get_mismatching_ids,\
     replace_chars

sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

UNITTESTING = False
PUNCTUATION_PATTERN = r"[{}]".format(string.punctuation)
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)
SINGLE_WORD_PATTERN = r"^[\w{}]$".format(string.punctuation)
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
MIN_THRESHOLD = 2
DO_DEBUG = False

class FaksimileLineComposer(LineComposer):
    """This class arranges the faksimile positions to lines.
    """
    DEBUG = False

    def __init__(self, faksimile_positions, threshold=10, num_lines_with_words=-1, page=None):
        self.current_line_index = 0
        self.current_faksimile_index = 0
        reference_list = [ word.faksimile_positions[0] for word in page.words if len(word.faksimile_positions) > 0 and word.verified ]\
                if page is not None\
                else None
        if reference_list is not None:
            print([fp.text for fp in reference_list])
        faksimile_positions = sort_faksimile_positions(faksimile_positions, reference_list=reference_list)
        self.lines_of_faksimile_positions = self._init_faksimile_positions_per_line(faksimile_positions, threshold=threshold, num_lines_with_words=num_lines_with_words)
        self.interactive_shell = InteractiveMergerShell(self, page=page)
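The method that follows, _init_faksimile_positions_per_line, starts a new line whenever a position's vertical midpoint drops more than `threshold` below the previous one, and recursively lowers the threshold until the number of detected lines matches the transcription. A standalone sketch of that clustering, not part of the diff (the Pos tuple stands in for WordPosition):

from collections import namedtuple

Pos = namedtuple('Pos', ['text', 'top', 'bottom'])

def group_into_lines(positions, threshold=10, expected=-1, min_threshold=2):
    lines, last = [[]], positions[0]
    for pos in positions:
        # New line when the vertical midpoint jumps by more than threshold.
        if (pos.top + pos.bottom) / 2 - (last.top + last.bottom) / 2 > threshold:
            lines.append([])
        lines[-1].append(pos)
        last = pos
    if expected > -1 and expected != len(lines) and threshold > min_threshold:
        return group_into_lines(positions, threshold - 1, expected, min_threshold)
    return lines

positions = [ Pos('a', 0, 10), Pos('b', 2, 12), Pos('c', 30, 40) ]
assert [ [p.text for p in line] for line in group_into_lines(positions) ] == [['a', 'b'], ['c']]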
""" if len(faksimile_positions) == 0: return [[]] lines_of_faksimile_positions = [[]] last_wp = faksimile_positions[0] index = 0 for wp in faksimile_positions: if (wp.top+wp.bottom)/2 - (last_wp.top+last_wp.bottom)/2 > threshold: lines_of_faksimile_positions.append([]) index += 1 lines_of_faksimile_positions[index].append(wp) last_wp = wp if num_lines_with_words > -1\ and num_lines_with_words != len(lines_of_faksimile_positions)\ and threshold > MIN_THRESHOLD: return self._init_faksimile_positions_per_line(faksimile_positions,\ threshold=threshold-1, num_lines_with_words=num_lines_with_words) return lines_of_faksimile_positions def create_faksimile_dictionary(self, line_of_faksimile_positions, mergeables_only=False) ->dict: """Create a faksimile_dictionary with fp.text as key and a list of fp as value. """ faksimile_text_dictionary = {} for faksimile_position in [ fp for fp in line_of_faksimile_positions\ if not mergeables_only or not fp.mergeable ]: if faksimile_position.text not in faksimile_text_dictionary.keys(): faksimile_text_dictionary.update({faksimile_position.text: []}) faksimile_text_dictionary[faksimile_position.text].append(faksimile_position) if faksimile_position.text == '-': if '–' not in faksimile_text_dictionary.keys(): faksimile_text_dictionary.update({'–': []}) faksimile_text_dictionary['–'].append(faksimile_position) return faksimile_text_dictionary def fix_for_unmereged_items_if_two_left(self, new_words, unmerged_words, unmerged_faksimile_positions) ->int: """Merge if there are only two left. [:return:] number of unmerged items """ if len(unmerged_words) == 1 and len(unmerged_faksimile_positions) == 1: self.merge_word_with_fp(unmerged_words[0], unmerged_faksimile_positions[0], new_words) unmerged_words, unmerged_faksimile_positions = [], [] return len(unmerged_words+unmerged_faksimile_positions) def fix_for_unmereged_items_split_words(self, new_words, unmerged_words, unmerged_faksimile_positions) ->int: """Merge if there are only two left. [:return:] number of unmerged items """ if len(unmerged_words) < len(unmerged_faksimile_positions): for faksimile_position in unmerged_faksimile_positions: line_number = self.get_line_number(faksimile_position, new_words) words_on_line = [ word for word in new_words\ if word.line_number == line_number and len(word.faksimile_positions) > 0] for word in words_on_line: if word.text.replace(word.faksimile_positions[0].text, '') == faksimile_position.text: left_word, right_word, _ = word.split(faksimile_position.text) new_words.remove(word) self.merge_word_with_fp(left_word, word.faksimile_positions[0], new_words) self.merge_word_with_fp(right_word, faksimile_position, new_words) unmerged_faksimile_positions.remove(faksimile_position) return len(unmerged_words+unmerged_faksimile_positions) def fix_for_unmereged_items_startswith(self, new_words, unmerged_words, unmerged_faksimile_positions, ignoreCase=False) ->int: """Do a final attempt at fixing unmerged words and faksimile_positions. 
    def fix_for_unmereged_items_startswith(self, new_words, unmerged_words, unmerged_faksimile_positions, ignoreCase=False) -> int:
        """Merge unmerged words with faksimile_positions whose texts are prefixes of each other.

        [:return:] number of unmerged items
        """
        for word in sorted(unmerged_words, key=lambda word: len(word.text), reverse=True):
            matches = [ fp for fp in unmerged_faksimile_positions if text_starts_with(word.text, fp.text, ignoreCase=ignoreCase) and not fp.joined ]
            if len(matches) > 0:
                faksimile_position = sorted(matches, key=lambda w: len(w.text), reverse=True)[0]
                self.merge_word_with_fp(word, faksimile_position, new_words)
                unmerged_words.remove(word)
                unmerged_faksimile_positions.remove(faksimile_position)
            else:
                matches = [ fp for fp in unmerged_faksimile_positions if text_starts_with(fp.text, word.text, ignoreCase=ignoreCase) and not fp.joined ]
                if len(matches) > 0:
                    faksimile_position = sorted(matches, key=lambda w: len(w.text), reverse=True)[0]
                    self.merge_word_with_fp(word, faksimile_position, new_words)
                    unmerged_words.remove(word)
                    unmerged_faksimile_positions.remove(faksimile_position)
        return len(unmerged_words+unmerged_faksimile_positions)

    def final_fix_for_unmereged_items(self, new_words, unmerged_words, unmerged_faksimile_positions) -> int:
        """Do a final attempt at fixing unmerged words and faksimile_positions.

        [:return:] number of unmerged items
        """
        self.fix_for_unmereged_items_if_two_left(new_words, unmerged_words, unmerged_faksimile_positions)
        self.fix_for_unmereged_items_split_words(new_words, unmerged_words, unmerged_faksimile_positions)
        num_unmerged = self.fix_for_unmereged_items_startswith(new_words, unmerged_words, unmerged_faksimile_positions)
        latest_unmerged_words = [ word for word in unmerged_words if not word.joined ]
        latest_unmerged_fps = [ fp for fp in unmerged_faksimile_positions if not fp.joined ]
        if len(latest_unmerged_fps) > 0:
            fp_ln_dict = {}
            for fp in latest_unmerged_fps:
                line_number = self.get_line_number(fp, new_words)
                if line_number > -1:
                    if line_number not in fp_ln_dict.keys():
                        fp_ln_dict.update({line_number: []})
                    fp_ln_dict[line_number].append(fp)
            for word in latest_unmerged_words:
                if word.line_number in fp_ln_dict.keys():
                    matches = fp_ln_dict[word.line_number]
                    if len(matches) > 0:
                        self.merge_word_with_fp(word, matches.pop(0), new_words)
            latest_unmerged_words = [ word for word in unmerged_words if not word.joined ]
            latest_unmerged_fps = [ fp for fp in unmerged_faksimile_positions if not fp.joined ]
        if len(latest_unmerged_words+latest_unmerged_fps) > 0:
            if self.fix_for_unmereged_items_startswith(new_words, latest_unmerged_words, latest_unmerged_fps, ignoreCase=True) == 0:
                return 0
            return self.fix_for_unmereged_items_if_two_left(new_words, latest_unmerged_words, latest_unmerged_fps)
        return 0
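fix_for_unmereged_items_startswith pairs leftovers greedily: longest words first, each matched to the longest unjoined faksimile text that is a prefix of it (or vice versa). A compact sketch of that matching rule on plain strings, not part of the diff:

def greedy_prefix_pairs(words, fp_texts, ignore_case=False):
    def norm(s): return s.lower() if ignore_case else s
    pairs = []
    for word in sorted(words, key=len, reverse=True):
        matches = [ fp for fp in fp_texts
                    if norm(word).startswith(norm(fp)) or norm(fp).startswith(norm(word)) ]
        if matches:
            best = max(matches, key=len)   # prefer the longest candidate
            pairs.append((word, best))
            fp_texts.remove(best)
    return pairs

assert greedy_prefix_pairs(['Nachmittags', 'und'], ['Nachmittag', 'und.']) \
       == [('Nachmittags', 'Nachmittag'), ('und', 'und.')]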
""" line_number = -1 for line in self.lines_of_faksimile_positions: if faksimile_position in line: joined_fps = [ fp for fp in line if fp.joined ] if len(joined_fps) > 0: line_numbers_of_joined_words = [ word for word in new_words\ if len(word.faksimile_positions) > 0\ and any(fp in word.faksimile_positions for fp in joined_fps) ] if len(line_numbers_of_joined_words) > 0: #print(faksimile_position.text, [ (w.line_number, w.text, w.faksimile_positions[0].id) for w in line_numbers_of_joined_words]) line_number = line_numbers_of_joined_words[0].line_number return line_number def get_line(self, line_of_words, index=-1, offset=2, interactive=False) -> list: """Return the line that corresponds to the line_of_words. """ if index > -1: start_index = index-offset\ if index >= offset\ else 0 end_index = index+offset+1\ if len(self.lines_of_faksimile_positions) > index+offset\ else len(self.lines_of_faksimile_positions) else: start_index = 0 end_index = len(self.lines_of_faksimile_positions) matched_line = [] mergeable_line_of_word_texts = [ word.text for word in line_of_words if word.mergeable ] word_text = ''.join(mergeable_line_of_word_texts) interactive_list = [] for i in range(start_index, end_index): current_line = [ fp for fp in self.lines_of_faksimile_positions[i] if not fp.joined ] current_text = ''.join([fp.text for fp in current_line if fp.mergeable]) if (len(word_text) == len(current_text) and word_text == current_text)\ or\ (len(word_text) <= len(current_text) and current_text.find(word_text) > -1): if interactive: interactive_list.append((i, current_line)) else: matched_line = current_line break elif (len(current_text) > 0 and len(word_text) > len(current_text) and word_text.find(current_text) > -1): matched_index = word_text.find(current_text) next_i = i+1 if matched_index == 0 else i-1 while len(word_text) > len(current_text)\ and next_i > -1 and next_i < len(self.lines_of_faksimile_positions): current_line += [ fp for fp in self.lines_of_faksimile_positions[next_i] if not fp.joined ] current_text = ''.join([fp.text for fp in current_line if fp.mergeable]) next_i = next_i+1 if matched_index == 0 else next_i-1 if interactive: interactive_list.append((i, current_line)) else: matched_line = current_line break if interactive: if len(interactive_list) > 0: return interactive_list else: for i in range(start_index, end_index): current_line = [ fp for fp in self.lines_of_faksimile_positions[i] if not fp.joined ] matched_line.append((i, current_line)) return matched_line def get_new_index(self, word, line_of_words, new_list_of_words, old_word_new_word_mapping): """Return index of word in new_list_of_words such that it can be inserted before this index. """ old_index = line_of_words.index(word) new_index = 0 if old_index > 0: previous_word = line_of_words[old_index-1] new_previous_word = old_word_new_word_mapping[previous_word]\ if old_word_new_word_mapping.get(previous_word) is not None\ else previous_word if new_previous_word in new_list_of_words: new_index = new_list_of_words.index(new_previous_word)+1 else: new_index = self.get_new_index(new_previous_word, line_of_words,\ new_list_of_words, old_word_new_word_mapping)+1 return new_index def join_unmergeable_words(self, words, old_word_new_word_mapping) -> Word: """Join all words and return new word. 
""" if len(words) > 1: new_word = words[0] for word2join in words[1:]: new_word.join(word2join) old_word_new_word_mapping.update({word2join: new_word}) old_word_new_word_mapping.update({words[0]: new_word}) return new_word else: old_word_new_word_mapping.update({words[0]: words[0]}) return words[0] def join_unmergeable_words_with_punctuation(self, line_of_words, old_word_new_word_mapping): """Join unmergeable words on line with punctionation words. """ index = 0 while index < len(line_of_words): if not line_of_words[index].mergeable\ and index+1 < len(line_of_words)\ and not line_of_words[index+1].mergeable\ and re.match('^[.,]$', line_of_words[index+1].text): line_of_words[index].join(line_of_words[index+1]) old_word_new_word_mapping.update({line_of_words[index+1]: line_of_words[index]}) line_of_words.remove(line_of_words[index+1]) index += 1 index += 1 def merge_lines(self, line_of_words, new_words, index=-1, offset=2, interactive=False) -> bool: """Merge a line of words with the corresponding line of faksimile positions. [:return:] interactive """ if len([word for word in line_of_words if not word.joined ]) == 0: return [], interactive line_of_faksimile_positions = self.get_line(line_of_words, index, offset=offset) if len(line_of_faksimile_positions) > 0: faksimile_text_dictionary = self.create_faksimile_dictionary(line_of_faksimile_positions) self.merge_mergeables(line_of_words, faksimile_text_dictionary, new_words) self.merge_unmergeables(line_of_words, line_of_faksimile_positions, new_words) elif interactive: interactive = self.interactive_shell.interactive_merge_lines(line_of_words, new_words, index, offset+4) elif len(line_of_words) == 1 and line_of_words[0].text == '–': line_of_words[0].line_number -= 1 else: if offset < 10: interactive = self.merge_lines(line_of_words, new_words, index, offset=offset+1) return interactive def merge_mergeables(self, line_of_words, faksimile_text_dictionary, new_words): """Merge words with faksimile positions for which there are keys in in faksimile_text_dictionary. 
""" for word in line_of_words: fp_list = faksimile_text_dictionary.get(word.text) if fp_list is not None and len(fp_list) > 0: self.merge_word_with_fp(word, fp_list.pop(0), new_words) def merge_unmergeables(self, line_of_words, line_of_faksimile_positions, new_words): """Merge unmergeable words and faksimile_positions """ old_word_new_word_mapping = {} self.join_unmergeable_words_with_punctuation(line_of_words, old_word_new_word_mapping) unmerged_words = [ word for word in line_of_words if not word.joined and not word.mergeable ] unmerged_fps = [ fp for fp in line_of_faksimile_positions if not fp.joined and not fp.mergeable ] if len(unmerged_words) > 0: if len(unmerged_words) == len(unmerged_fps): for i, word in enumerate(unmerged_words): new_index = self.get_new_index(word, line_of_words, new_words, old_word_new_word_mapping) self.merge_word_with_fp(word, unmerged_fps[i], new_words, new_index) else: - fp_index = 0 - unmerged_unity = [] - for word in unmerged_words: + try: + fp_index = 0 + unmerged_unity = [] + for word in unmerged_words: + if len(unmerged_unity) > 0 and fp_index < len(unmerged_fps): + previous_word = unmerged_unity[-1] + previous_index = line_of_words.index(previous_word) + if line_of_words.index(word) - previous_index > 1: + new_word = self.join_unmergeable_words(unmerged_unity, old_word_new_word_mapping) + new_index = self.get_new_index(unmerged_unity[0], line_of_words, new_words, old_word_new_word_mapping) + self.merge_word_with_fp(new_word, unmerged_fps[fp_index], new_words, new_index) + fp_index += 1 + unmerged_unity = [] + unmerged_unity.append(word) if len(unmerged_unity) > 0 and fp_index < len(unmerged_fps): - previous_word = unmerged_unity[len(unmerged_unity)-1] - previous_index = line_of_words.index(previous_word) - if line_of_words.index(word) - previous_index > 1: - new_word = self.join_unmergeable_words(unmerged_unity, old_word_new_word_mapping) - new_index = self.get_new_index(unmerged_unity[0], line_of_words, new_words, old_word_new_word_mapping) - self.merge_word_with_fp(new_word, unmerged_fps[fp_index], new_words, new_index) - fp_index += 1 - unmerged_unity = [] - unmerged_unity.append(word) - if len(unmerged_unity) > 0 and fp_index < len(unmerged_fps): - new_word = self.join_unmergeable_words(unmerged_unity, old_word_new_word_mapping) - new_index = self.get_new_index(unmerged_unity[0], line_of_words, new_words, old_word_new_word_mapping) - self.merge_word_with_fp(new_word, unmerged_fps[fp_index], new_words, new_index) + new_word = self.join_unmergeable_words(unmerged_unity, old_word_new_word_mapping) + new_index = self.get_new_index(unmerged_unity[0], line_of_words, new_words, old_word_new_word_mapping) + self.merge_word_with_fp(new_word, unmerged_fps[fp_index], new_words, new_index) + except RecursionError as e: + warnings.warn('RecursionError') for old_word, new_word in old_word_new_word_mapping.items(): old_word.joined = new_word.joined return new_words def merge_word_with_fp(self, word, faksimile_position, list_of_new_words, index=-1): """Merge word with faksimile position. """ word.joined, faksimile_position.joined = True, True word.faksimile_positions.append(faksimile_position) if index == -1: list_of_new_words.append(word) else: list_of_new_words.insert(index, word) def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False): """Return svg_pos_file and manuscript_file if they are ready for processing. 
""" svg_pos_file = None manuscript_tree = None - if manuscript_file is not None\ - and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')): + if manuscript_file is not None: manuscript_tree = ET.parse(manuscript_file) else: title_string = faksimile_page.title.replace(' ', '_') manuscript_file = '.{}xml'.format(sep) + sep + title_string + '.xml'\ if isdir('.{}xml'.format(sep)) else title_string + '.xml' if isfile(manuscript_file): manuscript_tree = ET.parse(manuscript_file) if manuscript_tree is not None: if redo_ok and len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0: svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0] if len(manuscript_tree.getroot().xpath('//page[@number="%s" and @status="OK"]/@output' % faksimile_page.page_number)) > 0: svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0] else: if not UNITTESTING: if len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0: msg = Fore.LIGHTBLUE_EX +'->' + Fore.CYAN + 'Data from page {0} already merged with {1}!'.format(\ faksimile_page.page_number,\ manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)[0]) else: msg = Fore.MAGENTA + 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, faksimile_page.page_number) print(msg, end='') print(Style.RESET_ALL) return svg_pos_file, manuscript_file def merge_faksimile_file_and_pages(faksimile_file, manuscript_file=None, page=None) -> int: """Merge the data of a faksimile file with the data of svgposfile. [:return:] exit status """ if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='') print(Style.RESET_ALL) faksimile_tree = ET.parse(faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces) if page is not None: faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\ if get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)[0]\ == page.page_tree.docinfo.URL ] exit_status = 0 for faksimile_page in faksimile_pages: svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file) if svg_pos_file is not None: image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field) if page is None: page = Page(svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file) write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\ file_type=FILE_TYPE_SVG_WORD_POSITION) if not UNITTESTING: print(Fore.LIGHTBLUE_EX + '->', end='') print(Fore.CYAN + 'Merging faksimile positions from page {0} with words from file {1} ... 
            exit_status = num_unmerged = merge_faksimile_positions_and_words(page, faksimile_page.word_positions)
            if num_unmerged > 0:
                page = Page(page.page_tree.docinfo.URL)
                for carrier in faksimile_page.word_positions:
                    carrier.joined = False
                exit_status = num_unmerged = merge_faksimile_positions_and_words(page, faksimile_page.word_positions, interactive=True)
            if not UNITTESTING:
                if num_unmerged == 0:
                    print(Fore.GREEN + '[OK]')
                    new_words = sort_words(page)
                    for word in new_words:
                        if len(word.faksimile_positions) == 0 or word.text != word.faksimile_positions[0].text:
                            word.verified = False
                    if page.is_locked():
                        page.unlock()
                    post_merging_processing_and_saving(svg_pos_file=page.page_tree.docinfo.URL, new_words=new_words, page=page, manuscript_file=manuscript_file)
                else:
                    print(Fore.RED + f'[ERROR: {num_unmerged} not joined!]\n')
                    print([ (word.id, word.text, word.line_number) for word in page.words if not word.joined])
                    print([ (fp.id, fp.text) for fp in faksimile_page.word_positions if not fp.joined])
                    print(Fore.RESET)
            else:
                if num_unmerged > 0:
                    unmerged_words = [ word for word in page.words if not word.joined]
                    unmerged_fps = [ fp for fp in faksimile_page.word_positions if not fp.joined ]
                    print([ (word.id, word.text, word.line_number) for word in unmerged_words])
                    print([ (fp.id, fp.text) for fp in unmerged_fps])
                    if len(unmerged_fps) == 0:
                        for word in page.words:
                            if len(word.faksimile_positions) < 1:
                                print(f'{word.line_number}: {word.id} {word.text}')
                            elif word.text != word.faksimile_positions[0].text:
                                print(f'{word.line_number}: {word.id} {word.text} {[(fp.id,fp.text) for fp in word.faksimile_positions]}')
                else:
                    words = sort_words(page)
                    for word in words:
                        if len(word.faksimile_positions) < 1:
                            print(f'{word.line_number}: {word.id} {word.text}')
                        elif not word.verified and word.text != word.faksimile_positions[0].text:
                            print(f'{word.line_number}: {word.id} {word.text} {[(fp.id,fp.text) for fp in word.faksimile_positions]}')
        page = None
    return exit_status

def merge_faksimile_positions_and_words(page, faksimile_positions, interactive=False) -> int:
    """Merge words with faksimile positions.

    [:return:] exit code
    """
    words = sort_words(page)
    mark_unmergeable_words_and_faksimile_positions(words, faksimile_positions)
    lines_with_words = set([ word.line_number for word in words])
    faksimile_lines_composer = FaksimileLineComposer(faksimile_positions, page=page)
    new_words = []
    if interactive:
        faksimile_lines_composer.interactive_shell.set_command_history()
    for index, line_number in enumerate(sorted(lines_with_words)):
        words_on_line = [ word for word in words if word.line_number == line_number]
        interactive = faksimile_lines_composer.merge_lines(words_on_line, new_words, index, interactive=interactive)
    unmerged_words = [ word for word in page.words if not word.joined ]
    unmerged_fps = [ fp for fp in faksimile_positions if not fp.joined ]
    exit_code = faksimile_lines_composer.final_fix_for_unmereged_items(new_words, unmerged_words, unmerged_fps)
    if exit_code == 0:
        page.words = new_words
    else:
        faksimile_lines_composer.interactive_shell.print_command_history()
        manual_merger = ManualMergerShell(unmerged_words, unmerged_fps, new_words, page=page)
        try:
            exit_code = manual_merger.run()
        except Exception as e:
            exit_code = 666
            print(e)
            manual_merger.print_history()
    return exit_code
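mark_unmergeable_words_and_faksimile_positions (next) flags a text as mergeable only when it occurs equally often on both sides; otherwise exact-text merging could pair the wrong occurrences. The counting rule in isolation, with invented data (not part of the diff):

from collections import Counter

word_texts = ['die', 'die', 'Nacht']
fp_texts = ['die', 'die', 'Nachts']
word_counts, fp_counts = Counter(word_texts), Counter(fp_texts)
mergeable = { text: word_counts[text] == fp_counts[text]
              for text in set(word_texts) | set(fp_texts) }
assert mergeable == {'die': True, 'Nacht': False, 'Nachts': False}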
""" unique_texts = set() for text_carrier in words + faksimile_positions: if 'joined' not in text_carrier.__dict__.keys(): text_carrier.joined = False text_carrier.mergeable = True unique_texts.add(text_carrier.text) for text in unique_texts: words_with_text = [ word for word in words if word.text == text ] faksimile_positions_with_text = [ fp for fp in faksimile_positions if fp.text == text ] if len(words_with_text) != len(faksimile_positions_with_text): for text_carrier in words_with_text + faksimile_positions_with_text: text_carrier.mergeable = False def text_starts_with(text1, text2, ignoreCase=False) ->bool: """Return text1.startswith(text2) """ if ignoreCase: return text1.lower().startswith(text2.lower()) else: return text1.startswith(text2) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION line wise. svgscripts/compare_faksimile_words_line_wise.py [OPTIONS] [xmlManuscriptFile] a directory containing a svg file containing information about the word positions on the faksimile. a xml file about a manuscript, containing information about its pages. OPTIONS: -h|--help: show help :return: exit code (int) """ correct_words_dir = None try: opts, args = getopt.getopt(argv, "hc:", ["help", "correct-words=", ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-c', '--correct-words'): correct_words_dir = arg if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if exists(file_a): file_b = None if len(args) > 1 and exists(args[1]): file_b = args[1] file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b, correction_dir=correct_words_dir) for faksimile_file in file_list: merge_faksimile_file_and_pages(faksimile_file, manuscript_file=manuscript_file) else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/extractWordPosition.py =================================================================== --- svgscripts/extractWordPosition.py (revision 98) +++ svgscripts/extractWordPosition.py (revision 99) @@ -1,547 +1,544 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract the position of the words in a svg file and write them to a xml file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py    (revision 98)
+++ svgscripts/extractWordPosition.py    (revision 99)
@@ -1,547 +1,544 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}

import inspect
import getopt
from lxml import etree as ET
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from progress.bar import Bar
import re
import sys
import warnings

from datatypes.lineNumber import LineNumber
from datatypes.matrix import Matrix
from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.pdf import PDFText
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from datatypes.word_insertion_mark import WordInsertionMark
+from util import process_warnings4status

sys.path.append('shared_util')
from myxmlwriter import write_pretty

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

class Extractor:
    """This class can be used to extract the word positions in a svg file and write them to a xml file.

    Args:
        [xml_dir (str): target directory]
        [title (str): title of document]
        [manuscript_file (str): xml file containing information about the archival unity to which the current page belongs]
    """
    UNITTESTING = False
    SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]

    def __init__(self, xml_dir=None, title=None, manuscript_file=None, compare2pdf=False):
        if bool(xml_dir):
            self.xml_dir = xml_dir
            not isdir(self.xml_dir) and mkdir(self.xml_dir)
        else:
            self.xml_dir = 'xml' if(isdir('xml')) else ''
        self.latest_status = None
        self.compare2pdf = compare2pdf
        self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
        self.title = title
        self.manuscript_file = manuscript_file
        self.manuscript_tree = None
        if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
            self.manuscript_tree = ET.parse(self.manuscript_file)
            self.title = self.manuscript_tree.getroot().get('title')
        elif bool(self.manuscript_file):
            raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
        elif bool(self.title):
            self.update_title_and_manuscript(self.title, False)
    def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None):
        """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).

        If the word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.

        :returns: the new word counter (int)
        """
        break_points = []
        if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
            for Sonderzeichen in self.SONDERZEICHEN_LIST:
                contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
                if True in contains_Sonderzeichen:
                    break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]]
                    for sz_point in [i for i, e in break_points]:
                        wim_index = len(page.word_insertion_marks)
                        x = float(word_part_objs[sz_point]['x'])
                        y = float(word_part_objs[sz_point]['y'])
                        if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None:
                            svg_path_tree = ET.parse(page.svg_file)
                            namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
                            xmin = transkription_field.xmin
                            ymin = transkription_field.ymin
                            wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
                                    line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
                            page.word_insertion_marks.append(wim)
        if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points
            THRESHOLDX = 20 # Threshold between line number and text
            last_x = -1
            for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
                if(last_x > -1 and (x - last_x > THRESHOLDX)):
                    break_points.append((i, i))
                last_x = x
        if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
            from_index = 0
            for end_point, next_from_index in break_points:
                new_word_part_objs = word_part_objs[from_index:end_point]
                new_endX = word_part_objs[end_point]['x']
                from_index = next_from_index
                index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
            if from_index > 0 and from_index < len(word_part_objs):
                new_word_part_objs = word_part_objs[from_index:]
                index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
            return index
        else:
            if len(word_part_objs) > 0:
                transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
                        debug_msg_string=debug_msg, transkription_field=transkription_field)
                text = self.get_word_from_part_obj(word_part_objs)
                line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
                if line_number == -1:
                    if transkription_positions[0].transform is not None:
                        line_number = page.get_line_number(transkription_positions[0].transform.getY())
                    if line_number == -1 and len(page.words) > 0:
                        lastWord = page.words[-1]
                        lastWord_lastTP = lastWord.transkription_positions[-1]
                        lastTP = transkription_positions[-1]
                        if transkription_positions[0].left > lastWord_lastTP.left\
                           and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2:
                            line_number = lastWord.line_number
                        else:
                            line_number = lastWord.line_number+1
                newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
                page.words.append(newWord)
                return int(index) + 1
            else:
                return int(index)
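add_word splits a run of glyphs wherever a break point was recorded — e.g. when a line number's digits and a word's letters ended up in one text run, detected by an x-gap larger than THRESHOLDX. The slicing scheme on plain dicts, not part of the diff (coordinates invented):

THRESHOLDX = 20  # x-gap that separates a line number from the first word

parts = [ {'text': '1', 'x': 10.0}, {'text': '2', 'x': 14.0}, {'text': 'Wort', 'x': 60.0} ]
break_points, last_x = [], -1
for i, x in enumerate(p['x'] for p in parts):
    if last_x > -1 and x - last_x > THRESHOLDX:
        break_points.append((i, i))  # (end of previous word, start of next)
    last_x = x

words, from_index = [], 0
for end_point, next_from_index in break_points:
    words.append(''.join(p['text'] for p in parts[from_index:end_point]))
    from_index = next_from_index
words.append(''.join(p['text'] for p in parts[from_index:]))
assert words == ['12', 'Wort']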
record_warnings=False, warning_filter='default', multipage_index=-1, marginals_page=None): """Extracts information about positions of text elements and writes them to a xml file. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file exit_status = 0 with warnings.catch_warnings(record=record_warnings) as w: warnings.simplefilter(warning_filter) page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile,\ multipage_index=multipage_index, marginals_page=marginals_page) - status_message = 'OK' - if w is not None and len(w) > 0: - status_message = 'with warnings' - if True in [ str(warn.message).startswith(PageCreator.WARNING_MISSING_USE_NODE4PWP) for warn in w ]: - status_message += ':{}:'.format(PageCreator.WARNING_MISSING_USE_NODE4PWP.lower()) - if True in [ str(warn.message).startswith(PageCreator.WARNING_MISSING_GLYPH_ID4WIM) for warn in w ]: - status_message += ':{}:'.format(PageCreator.WARNING_MISSING_GLYPH_ID4WIM.lower()) + status_message = process_warnings4status(w, [ PageCreator.WARNING_MISSING_USE_NODE4PWP, PageCreator.WARNING_MISSING_GLYPH_ID4WIM ],\ + '', 'OK', 'with warnings') + if status_message != 'OK': self.latest_status = status_message exit_status = 1 else: self.latest_status = None page.page_tree.getroot().set('status', status_message) write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) return exit_status else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, multipage_index=-1, marginals_page=None) -> PageCreator: """Extracts information about positions of text elements. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file transkription_field = TranskriptionField(file_name, multipage_index=multipage_index) svg_tree = ET.parse(file_name) page = PageCreator(xml_target_file, title=self.title, multipage_index=multipage_index,\ page_number=page_number, pdfFile=pdfFile,\ svg_file=svg_file, source=file_name, marginals_source=marginals_page) sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot()) page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict) if transkription_field is not None: page.init_line_numbers(LineNumber.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax) self.extract_word_position(svg_tree, page, transkription_field=transkription_field) page.create_writing_processes_and_attach2tree() page.update_and_attach_words2tree() for word_insertion_mark in page.word_insertion_marks: # it is not clear if we really need to know this alternative word ordering. 
See 'TODO.md' #word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark) word_insertion_mark.attach_object_to_tree(page.page_tree) return page else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extract_word_position(self, svg_tree, page, transkription_field=None): """Extracts word positions. """ counter = 0 word_part_obj = [] endSign = '%' last_matrix = None MAXBOTTOMDIFF = 5 MAXXDIFF = 6 if not Extractor.UNITTESTING: bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)])) for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field): current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field) # check for line breaks if (last_matrix is not None and len(word_part_obj) > 0 and (\ Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\ (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\ (abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\ or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()): endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\ round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\ str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix))) counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field) word_part_obj = [] endX = current_matrix.getX() if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))): word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} ) else: endSign = text_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT endX = current_matrix.add2X(tspan_item.get('x')) if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))): y = current_matrix.add2Y(tspan_item.get('y')) word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix }) if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0: """text_item has letterspacing class (set s & set t = new set with elements common to s and t) """ endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='tspan with letterspacing', transkription_field=transkription_field) word_part_obj = [] else: endSign = tspan_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='svg/text/tspan/\s', transkription_field=transkription_field) 
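                    # a whitespace-only tspan ends the current word: flush the parts collected so far and reset the accumulators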
word_part_obj = [] endSign = '%' last_matrix = current_matrix not bool(Extractor.UNITTESTING) and bar.next() if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\ transkription_field=transkription_field) word_part_obj = [] endSign = '%' not bool(Extractor.UNITTESTING) and bar.finish() def find_inserted_words_by_position(self, target_tree, x, y): """Returns an Array with the words that are inserted above the x, y position or [] if not found. """ warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.') MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 if(len(target_tree.getroot().xpath('//word[@id]')) > 0): result_list = [] minus2left = 20.0 minus2top = 19.0 while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX : result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ] minus2left -= 1 minus2top += 1 if len(result_list) > 0: result_bottom = result_list[len(result_list)-1].bottom result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)): result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def find_inserted_words(self, target_tree, word_insertion_mark): """Returns an Array with the words that are inserted above/underneath the word_insertion_mark. 
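        Insertion marks of type 'B' refer to words on the line below the mark; all other mark types refer to words on the line above it.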
""" warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.') if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1: return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y) if(len(target_tree.getroot().xpath('//word[@id]')) > 0): MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 result_list = [] x = word_insertion_mark.x y = word_insertion_mark.y if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line line_number = word_insertion_mark.line_number - 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: minus2top = 1.0 while len(result_list) == 0 and minus2top < MINY: for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y - minus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break minus2top += 1 elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line line_number = word_insertion_mark.line_number + 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: plus2top = 1.0 while len(result_list) == 0 and plus2top < MINY : for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y + plus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break plus2top += 1 if len(result_list) > 0: # now, collect more words that are right of already collected words result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width for item in target_tree.getroot().xpath(\ '//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)): result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def get_file_name(self, file_name, page_number=None): """Returns the file_name of the target xml file. """ dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else '' if bool(self.title): return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml' else: return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml')) def get_page_number(self, file_name, page_number=None): """ Returns page number as a string (with leading zero(s) if len(page_number) < 3). """ if not bool(page_number) and bool(re.search(r'\d', file_name)): """if page_number=None and filename contains digits, then split filename into its parts that contain only digits, remove empty strings and return the last part containing only digits. 
""" page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop() if bool(page_number): leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else '' return leading_zeros + str(page_number) else: return '' def get_style(self, etree_root): """Returns the style specification as a dictionary. :returns: sonderzeichen_list: list of keys for classes that are 'Sonderzeichen' style_dict: dictionary: key = class name (str), value = style specification (dictionary) """ style_dict = {} sonderzeichen_list = [] letterspacing_list = [] style = etree_root.find('style', etree_root.nsmap) if style is not None: for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))): style_key = style_item.split('{')[0].replace('.', '') style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','') \ for item in list(filter(lambda x: x!= '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';')))} style_dict[style_key] = style_value_dict if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'): sonderzeichen_list.append(style_key) if bool(style_value_dict.get('letter-spacing')): letterspacing_list.append(style_key) return sonderzeichen_list, letterspacing_list, style_dict def get_text_items(self, tree_root, transkription_field=None): """Returns all text elements with a matrix or (if transkription_field is specified) all text elements that are located inside the transkription field. """ if transkription_field is not None: return filter(lambda x: Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=x),\ tree_root.iterfind(".//text", tree_root.nsmap)) else: return tree_root.iterfind(".//text", tree_root.nsmap) def get_word_from_part_obj(self, word_part_obj): """Extracts all 'text' from a list of dicitonaries and concats it to a string. """ return ''.join([ dict['text'] for dict in word_part_obj]) def get_word_object_multi_char_x(self, word_part_obj_dict): """Returns the x of the last char of word_part_object. TODO: get real widths from svg_file!!! """ WIDTHFACTOR = 2.6 return word_part_obj_dict['x'] if len(word_part_obj_dict['text']) < 2 else word_part_obj_dict['x'] + len(word_part_obj_dict['text']) * WIDTHFACTOR def update_title_and_manuscript(self, title, update_manuscript=True): """Updates title and manuscript. """ self.title = title if update_manuscript or not bool(self.manuscript_file): self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml' if not isfile(self.manuscript_file): self.manuscript_tree = ET.ElementTree(ET.Element('manuscript', attrib={"title": self.title})) write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile') def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract the position of the words in a svg file and write them to a xml file. svgscripts/extractWordPosition.py [OPTIONS] svg file OR xml target file containing file name of svg file as "/page/@source". directory containing svg files OPTIONS: -h|--help: show help -c|--compare-to-pdf compare words to pdf and autocorrect -d|--xml-dir=xmlDir: target directory for the xml output file(s) -m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s) -p|--page=pageNumber: page number of the current page. For use with _one_ file only. 
        -P|--PDF=pdfFile: pdf file - used for word correction
        -s|--svg=svgFile: svg web file
        -t|--title=title: title of the manuscript to which the current page(s) belong(s)
        -x|--xml-target-file=xmlOutputFile: xml target file

    :return: exit code (int)
    """
    compare2pdf = False # off by default; enabled with -c|--compare-to-pdf
    manuscript_file = None
    page_number = None
    pdfFile = None
    svg_file = None
    title = None
    xml_target_file = None
    xml_dir = ".{}xml".format(sep)
    try:
        opts, args = getopt.getopt(argv, "hcd:m:t:p:s:x:P:", ["help", "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-c', '--compare-to-pdf'):
            compare2pdf = True
        elif opt in ('-d', '--xml-dir'):
            xml_dir = arg
        elif opt in ('-m', '--manuscript-file'):
            manuscript_file = arg
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-P', '--PDF'):
            pdfFile = arg
        elif opt in ('-x', '--xml-target-file'):
            xml_target_file = str(arg)
    files_to_process = list()
    for arg in args:
        if isfile(arg):
            files_to_process.append(arg)
        elif isdir(arg):
            files_to_process = files_to_process + list(filter(lambda file: '.svg' in file, listdir(arg)))
        else:
            print("'{}' does not exist!".format(arg))
            return 2
    if len(files_to_process) < 1 or args[0].endswith('xml'):
        if xml_target_file is None:
            xml_target_file = args[0] if len(args) > 0 else None
        if xml_target_file is not None and isfile(xml_target_file):
            target_file_tree = ET.parse(xml_target_file)
            file_name = target_file_tree.getroot().get('source')
            title = target_file_tree.getroot().get('title') if title is None else title
            page_number = target_file_tree.getroot().get('number') if page_number is None else page_number
            if svg_file is None:
                if len(target_file_tree.xpath('//svg-image')) > 0:
                    svg_file = target_file_tree.xpath('.//svg-image/@file-name')[0]\
                            if len(target_file_tree.xpath('.//svg-image/@file-name')) > 0 else None
                else:
                    svg_file = target_file_tree.xpath('.//svg/@file')[0]\
                            if len(target_file_tree.xpath('.//svg/@file')) > 0 else None
            files_to_process.insert(0, file_name)
            if xml_target_file in files_to_process:
                files_to_process.remove(xml_target_file)
        else:
            usage()
            return 2
    if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)):
        print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!")
        usage()
        return 2
    extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, compare2pdf=compare2pdf)
    for file in files_to_process:
        extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file)
    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/join_faksimileAndTranskription.py
===================================================================
--- svgscripts/join_faksimileAndTranskription.py	(revision 98)
+++ svgscripts/join_faksimileAndTranskription.py	(revision 99)
@@ -1,602 +1,604 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
""" # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET import re import shutil import string import sys import tempfile from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path from progress.bar import Bar import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from convert_wordPositions import create_pdf_with_highlighted_words, create_svg_with_highlighted_words from create_task import CorrectWords from datatypes.faksimile import FaksimilePage, get_paths_inside_rect from datatypes.lineNumber import LineNumber from datatypes.page import Page, STATUS_MERGED_OK from datatypes.transkriptionField import TranskriptionField from process_files import update_svgposfile_status from process_words_post_merging import post_merging_processing_and_saving from util import ExternalViewer, create_highlighted_svg_file, get_empty_node_ids, record_changes,\ record_changes_on_svg_file_to_page, record_changes_on_xml_file_to_page, get_mismatching_ids,\ replace_chars sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False PUNCTUATION_PATTERN = r"[{}]".format(string.punctuation) PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"') SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation) SINGLE_WORD_PATTERN = r"^[\w{}]$".format(string.punctuation) HIGHLIGHT_COLOR = 'red' OPACITY = '0.5' def create_task_correct_words(target_dir, xml_source_file=None, source_svg_file=None, page=None, unmatched_word_ids=None, unmatched_node_ids=None): """Create a task CorrectWords or process corrected files. 
""" exit_status = 0 if xml_source_file is None or source_svg_file is None: if xml_source_file is None and page is not None and isfile(page.page_tree.docinfo.URL): xml_source_file = page.page_tree.docinfo.URL if xml_source_file is None else xml_source_file elif xml_source_file is None: raise Exception('create_task_correct_words needs a xml_source_file or a page that has a valid tree source!') if source_svg_file is None and page is not None and isfile(page.faksimile_svgFile): source_svg_file = page.faksimile_svgFile if source_svg_file is None else source_svg_file elif source_svg_file is None: raise Exception('create_task_correct_words needs a source_svg_file or a page that has a faksimile_svgFile!') if page is None: page = Page(xml_source_file) correct_words = CorrectWords(xml_source_file, source_svg_file, target_dir, page=page,\ unmatched_node_ids=unmatched_node_ids) if not correct_words.has_been_created(page): if not page.is_locked(): reference_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.(xml|svg)') lock_dict = { 'reference_file': reference_file,\ 'message': 'Run:$ python3 {0} -c {1} {2}'.format(__file__, target_dir, source_svg_file)} write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file, script_name=__file__,\ file_type=FILE_TYPE_SVG_WORD_POSITION, **lock_dict) correct_words.create() if not UNITTESTING: print('Created a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description)) elif correct_words.has_been_finished(page): msg = 'Task "correct words" for page {} has been finished!'.format(str(page.number)) xml_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.xml', is_finished=True) transkription_svg = correct_words.get_target_filepath(page, is_faksimile_svg=False, is_finished=True) faksimile_svg = correct_words.get_target_filepath(page, is_finished=True) faksimile_file = faksimile_svg if isfile(faksimile_svg) else source_svg_file if isfile(xml_file): msg += '\n Words loaded from file {}.'.format(xml_file) page = record_changes_on_xml_file_to_page(xml_source_file, xml_file) page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=xml_file) elif isfile(transkription_svg): msg += '\n Words loaded from file {}.'.format(transkription_svg) page = record_changes_on_svg_file_to_page(xml_source_file, transkription_svg, word_ids=unmatched_word_ids) page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=transkription_svg) msg += '\n Faksimile loaded from file {}.'.format(faksimile_file) if not UNITTESTING: print(msg) exit_status = join_faksimileAndTranskription(faksimile_file, page=page) elif not UNITTESTING: print('There is a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description)) return exit_status def debug_function(words, input=''): """Custon debug function. """ if len([ word for word in words if word.debug_container.get('marked') ]) > 0: print(Fore.RED + 'marked word(s): {}'.format([ word.text for word in words if word.debug_container.get('marked') ])) if input != '': print('input: {}'.format(input)) print(Fore.RESET) def fix_errors(faksimile_file, unmerged_faksimile_positions, unmerged_words, text_field_id=None, faksimile_page=None, xml_source_file=None, manuscript_file=None, namespaces={}): """Creates a faksimile svg file and a pdf file highlighting the positions of the word positions that could not been merged. 
After correction, results are inserted into origianl file and processed again. :return: exit status (int) """ parser = ET.XMLParser(remove_blank_text=True) faksimile_tree = ET.parse(faksimile_file, parser) if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } if faksimile_page is None: faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces) if text_field_id is not None\ and text_field_id in [ faksimile_page.text_field.id for faksimile_page in faksimile_pages ]: faksimile_page = [ faksimile_page for faksimile_page in faksimile_pages if faksimile_page.text_field.id == text_field_id ][0] else: faksimile_page = faksimile_pages[0] if xml_source_file is None or manuscript_file is None: xml_source_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file) tmp_dir = tempfile.mkdtemp() tmp_pdf_file = tmp_dir + sep + 'output.pdf' tmp_svg_file = tmp_dir + sep + 'output.svg' tmp_faksimile = tmp_dir + sep + 'faksimile.svg' empyt_node_ids = get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)\ if len(unmerged_faksimile_positions) < len(unmerged_words) else [] highlight_node_ids = [ faksimile_position.id for faksimile_position in unmerged_faksimile_positions ] highlight_node_ids += empyt_node_ids create_highlighted_svg_file(faksimile_tree, highlight_node_ids, target_file=tmp_faksimile, local_image_path=faksimile_page.faksimile_image.local_path, namespaces=namespaces, highlight_color=HIGHLIGHT_COLOR) #create_pdf_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, pdf_file_name=tmp_pdf_file, bg_color=HIGHLIGHT_COLOR) create_svg_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, svg_file_name=tmp_svg_file, bg_color=HIGHLIGHT_COLOR) exit_status = 2 if isfile(tmp_svg_file) and isfile(tmp_faksimile): ExternalViewer.show_files(list_of_files=[tmp_svg_file, tmp_faksimile]) record_changes_on_svg_file_to_page(xml_source_file, tmp_svg_file, word_ids=[ word.id for word in unmerged_words ]) record_changes(faksimile_file, tmp_faksimile, highlight_node_ids, namespaces=namespaces) shutil.rmtree(tmp_dir) exit_status = join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False, join_single_char_words=True) return exit_status def get_filelist_and_manuscript_file(file_a, file_b=None, correction_dir=None): """Returns a file list and a manuscript file (or None) """ file_list = [] manuscript_file = None if isfile(file_a) and file_a.endswith('svg'): file_list.append(file_a) if file_b is not None and isfile(file_b): manuscript_file = file_b elif isfile(file_a) and file_a.endswith('xml'): manuscript_file = file_a if file_b is not None and isfile(file_b): file_list.append(file_b) elif file_b is not None and isdir(file_b): file_list = [ svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ] elif correction_dir is not None and isdir(correction_dir)\ and Path(correction_dir, CorrectWords.finish_dir).is_dir(): finish_dir = Path(correction_dir, CorrectWords.finish_dir) xml_files = list(finish_dir.glob('*.xml')) svg_files = list(finish_dir.glob('*.svg')) if len(xml_files + svg_files) > 1: manuscript_tree = ET.parse(manuscript_file) for xml_file in xml_files: output = manuscript_tree.xpath(f'.//page[contains(@output, "{xml_file.name}")]/@output') if len(output) > 0: file_list.append(output[0]) elif isdir(file_a): file_list = [ file_a + sep + svgfile for 
svgfile in listdir(file_a) if svgfile.endswith('svg') ] if file_b is not None and isfile(file_b): manuscript_file = file_b return file_list, manuscript_file def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False): """Return svg_pos_file and manuscript_file if they are ready for processing. """ svg_pos_file = None manuscript_tree = None - if manuscript_file is not None\ - and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')): + if manuscript_file is not None: + #and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')): manuscript_tree = ET.parse(manuscript_file) else: title_string = faksimile_page.title.replace(' ', '_') manuscript_file = '.{}xml'.format(sep) + sep + title_string + '.xml'\ if isdir('.{}xml'.format(sep)) else title_string + '.xml' if isfile(manuscript_file): manuscript_tree = ET.parse(manuscript_file) if manuscript_tree is not None: if redo_ok and len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0: svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0] if len(manuscript_tree.getroot().xpath('//page[@number="%s" and @status="OK"]/@output' % faksimile_page.page_number)) > 0: svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0] else: if not UNITTESTING: if len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0: msg = Fore.LIGHTBLUE_EX +'->' + Fore.CYAN + 'Data from page {0} already merged with {1}!'.format(\ faksimile_page.page_number,\ manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)[0]) else: msg = Fore.MAGENTA + 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, faksimile_page.page_number) print(msg, end='') print(Style.RESET_ALL) return svg_pos_file, manuscript_file def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, page=None, do_fix_errors=False, redo_ok=False, debug_word_text='', **kwargs): """Joins the data of a faksimile file with the data of svgposfile. 
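    :return: exit status (int); 0 if all faksimile positions and words could be joined, 2 (or the status of a follow-up task) otherwise.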
""" if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='') print(Style.RESET_ALL) if not do_fix_errors and 'do_fix_errors' in kwargs.keys(): do_fix_errors = kwargs.get('do_fix_errors') if not redo_ok and 'redo_ok' in kwargs.keys(): redo_ok = kwargs.get('redo_ok') if debug_word_text == '' and 'debug_word_text' in kwargs.keys(): debug_word_text = kwargs.get('debug_word_text') faksimile_tree = ET.parse(faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces) if page is not None: faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\ if get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)[0]\ == page.page_tree.docinfo.URL ] exit_status = 0 for faksimile_page in faksimile_pages: svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok) if svg_pos_file is not None: image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field) if page is None: page = Page(svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file) write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\ file_type=FILE_TYPE_SVG_WORD_POSITION) if not UNITTESTING: print(Fore.LIGHTBLUE_EX + '->', end='') print(Fore.CYAN + 'Joining data from page {0} with file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='') words = sort_words(page) if debug_word_text != '' and len([ word for word in words if word.text == debug_word_text ]) > 0: for word in words: if word.text == debug_word_text: word.debug_container.update({'marked': True}) if bool(kwargs.get('join_single_char_words')): removed_words = join_single_char_words(words) page.words = words page.update_and_attach_words2tree() #print([ word.text for word in page.words if word in removed_words ]) faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions) new_words = [] unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\ key=lambda text: len(text)) faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions, unique_faksimile_words) for word_text in unique_faksimile_words: process_word_text(new_words, word_text, faksimile_positions, words) if False not in [ word.joined for word in words if word.text != '.' 
]\ and False not in [ position.joined for position in faksimile_positions]\ and not UNITTESTING: if page.is_locked(): page.unlock() post_merging_processing_and_saving(svg_pos_file=svg_pos_file, new_words=new_words, page=page, manuscript_file=manuscript_file) print(Fore.GREEN + '[OK]') print(Style.RESET_ALL) elif not UNITTESTING: mismatch_words, mismatch_faksimile_positions = get_mismatching_ids(words, faksimile_positions) not_joined_fp = [ (position.id, position.text) for position in sorted(mismatch_faksimile_positions, key=lambda fp: fp.top) ] plural_fp = '' if len(not_joined_fp) < 2 else 's' not_joined_tw = [ (word.id, word.line_number, word.text) for word in sorted(mismatch_words, key=lambda word: word.transkription_positions[0].top) ] plural_tw = '' if len(not_joined_tw) < 2 else 's' print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp)) print([(position.id, position.text) for position in faksimile_positions if not position.joined]) print('--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw)) print([(word.id, word.line_number, word.text) for word in words if not word.joined ]) debug_function(new_words, input='new_words') debug_function(words, input='words') print(Style.RESET_ALL) if kwargs.get('correct_words') is not None: unmatched_node_ids = [ position.id for position in mismatch_faksimile_positions ] unmatched_node_ids += get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces) exit_status = create_task_correct_words(kwargs.get('correct_words'), page=page, source_svg_file=faksimile_file,\ unmatched_word_ids=[ word.id for word in mismatch_words ],\ unmatched_node_ids=unmatched_node_ids) elif do_fix_errors: exit_status = fix_errors(faksimile_file, [position for position in faksimile_positions if not position.joined],\ [ word for word in words if not word.joined ], text_field_id=faksimile_page.text_field.id,\ faksimile_page=faksimile_page, xml_source_file=svg_pos_file,\ manuscript_file=manuscript_file, namespaces=namespaces) else: exit_status = 2 elif False in [ word.joined for word in words ]: print([ (word.id, word.text) for word in words if not word.joined ]) exit_status = 2 page = None return exit_status def join_single_char_words(words, threshold_x=5, threshold_y=5): """Join single char words. 
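    A single char word is merged into its predecessor when the two are closer than threshold_x/threshold_y (see words_close_enough); single punctuation marks are joined using wider fixed thresholds (15, 12).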
:return: a list of removed words """ #all_single_char_words = [ word for word in words if re.match(r'^\w$', word.text) ] removed_words = [] all_single_char_words = [ word for word in words if re.match(SINGLE_WORD_PATTERN, word.text) ] if not UNITTESTING: bar = Bar('Joining single char words', max=len(all_single_char_words)) line_numbers = sorted(set(word.line_number for word in all_single_char_words)) for line_number in line_numbers: single_char_words = [ word for word in all_single_char_words if word.line_number == line_number ] index = len(single_char_words) while index > 0: index -= 1 word = None not UNITTESTING and bar.next() if single_char_words[index] in words: single_char_word_index = words.index(single_char_words[index]) if re.match(SINGLE_PUNCTUATION_PATTERN, single_char_words[index].text)\ and words_close_enough(words[single_char_word_index-1], single_char_words[index], 15, 12): words[single_char_word_index-1].join(single_char_words[index]) removed_words.append(words.pop(single_char_word_index)) #print('{0} -> {1}, {2}'.format(word.text, words[single_char_word_index-1].text)) elif index > 0\ and words_close_enough(single_char_words[index-1], single_char_words[index], threshold_x, threshold_y): words[single_char_word_index-1].join(single_char_words[index]) removed_words.append(words.pop(single_char_word_index)) elif single_char_word_index > 0\ and words[single_char_word_index-1].line_number == line_number\ and words_close_enough(words[single_char_word_index-1], single_char_words[index], threshold_x, threshold_y): words[single_char_word_index-1].join(single_char_words[index]) removed_words.append(words.pop(single_char_word_index)) not UNITTESTING and bar.finish() return removed_words def process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text='', min_length_split=5): """Joins faksimile_positions with text == word_text with words with text == word_text. 
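    If the number of matching words differs from the number of faksimile positions, the function retries with alt_word_text as a normalized variant of the text (e.g. 'ss' -> 'ß', '-' -> '–').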
""" text = word_text if alt_word_text == '' else alt_word_text fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ] words4word = [ word for word in words if word.text == word_text and not word.joined ] if alt_word_text != '': words4word += [ word for word in words if word.text == text and not word.joined ] words4word = sorted(words4word, key=attrgetter('id')) if len(fposition4word) == len(words4word): for index, faksimile_position in enumerate(fposition4word): faksimile_position.joined = True words4word[index].faksimile_positions = [ faksimile_position ] words[words.index(words4word[index])].joined = True new_words.append(words4word[index]) elif len(words4word) < len(fposition4word): if re.match(r'(.*)ss(.*)', text): alt_word_text = re.sub(r'ss', 'ß', text) process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split) elif re.match(SINGLE_PUNCTUATION_PATTERN, text): if text == '-': alt_word_text = text.replace('-', '–') process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split) else: print('single', word_text, len(fposition4word), len(words4word)) + """ elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text): alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text) debug_function(words4word, input='elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text) text {0}'.format(text)) if alt_word_text != '': pattern = r'(.*){0}(.*)'.format(alt_word_text) words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ] if len(words4word) < len(fposition4word): process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split) else: words4word = sorted(words4word, key=attrgetter('id')) for index, faksimile_position in enumerate(fposition4word): faksimile_position.joined = True words4word[index].faksimile_positions = [ faksimile_position ] if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\ and words.index(words4word[index])+1 < len(words)\ and words[words.index(words4word[index])+1].text == word_text[len(word_text)-1]: words4word[index].join(words[words.index(words4word[index])+1]) words[words.index(words4word[index])+1].joined = True words[words.index(words4word[index])].joined = True words4word[index].text = word_text new_words.append(words4word[index]) elif len(text) >= min_length_split and len([ word for word in words if word.text.startswith(text) and not word.joined ]) == len(fposition4word): new_words4word = [ word for word in words if word.text.startswith(text) and not word.joined ] debug_function(new_words4word, input='word.startswith {}'.format(text)) for index, fposition in enumerate(fposition4word): old_word = new_words4word[index] none_word, new_word, next_word = old_word.split(text, start_id=old_word.id) fposition4word[index].joined = True new_word.faksimile_positions = [ fposition4word[index] ] words[words.index(old_word)] = new_word if next_word is not None: next_word.id = len(words) next_word.joined = False words.append(next_word) new_word.joined = True new_words.append(new_word) elif len(text) >= min_length_split and len([ word for word in words if word.text.endswith(text) and not word.joined ]) == len(fposition4word): new_words4word = [ word for word in words if word.text.endswith(text) and 
not word.joined ] debug_function(new_words4word, input='word.endswith {}'.format(text)) for index, fposition in enumerate(fposition4word): old_word = new_words4word[index] before_word, new_word, none_word = old_word.split(text, start_id=old_word.id) fposition4word[index].joined = True new_word.faksimile_positions = [ fposition4word[index] ] words[words.index(old_word)] = new_word if before_word is not None: before_word.id = len(words) before_word.joined = False words.append(before_word) new_word.joined = True new_words.append(new_word) else: if len(text) > 1: new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ] debug_function(new_words4word, input='else text {0}'.format(text)) if len(new_words4word) == 0: alt_word_text = text[1:] process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split) else: for new_word in new_words4word: collected_text = new_word.text current_word = new_word while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0: previous_word = words[current_word.id-1] if word_text.endswith(previous_word.text + collected_text): words[current_word.id].joined = True previous_word.join(current_word) current_word = previous_word collected_text = current_word.text else: collected_text = previous_word.text + collected_text words4word.append(current_word) words4word = sorted(words4word, key=attrgetter('id')) for index, faksimile_position in enumerate(fposition4word): if index < len(words4word): faksimile_position.joined = True words4word[index].faksimile_positions = [ faksimile_position ] words4word[index].text = word_text words[words.index(words4word[index])].joined = True new_words.append(words4word[index]) else: print('<{0}> f{1}/t{2}, ids: {3}'.\ format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ])) + """ else: print('<{0}> f{1}/t{2}'.format(word_text, len(fposition4word), len(words4word))) -def sort_words(page): +def sort_words(page)->list: """Returns sorted words (from top left to bottom right). """ if -1 in [ word.line_number for word in page.words ]: warnings.warn('{} in page file {} for word with ids {}'.format(LineNumber.WARN_NO_LINE_NUMBER, page.page_tree.docinfo.URL, page.page_tree.xpath('./word[not(@line-number)]/@id'))) words = [] for line_number in page.line_numbers: word_on_line = [ word for word in page.words if word.line_number == line_number.id ] if line_number.id % 2 == 0: words += sorted(word_on_line, key=lambda word: word.transkription_positions[0].left) else: words += sorted(word_on_line, key=cmp_to_key(\ lambda wordA, wordB: -1\ if wordA.transkription_positions[0].left < wordB.transkription_positions[0].left\ and abs(wordA.transkription_positions[0].bottom-wordB.transkription_positions[0].bottom) < wordA.transkription_positions[0].height/2\ else 1)) for index, word in enumerate(words): words[index].id = index words[index].joined = len(words[index].faksimile_positions) > 0 and words[index].verified return words def sort_faksimile_positions(faksimile_positions, reference_list=None): """Returns sorted words (from top left to bottom right). 
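    (Here the sorted items are faksimile positions; their 'joined' flag is initialized from reference_list before sorting.)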
""" for faksimile_position in faksimile_positions: faksimile_position.joined = False\ if reference_list is None\ else faksimile_position in reference_list return sorted(faksimile_positions) """ return sorted(faksimile_positions, key=cmp_to_key(\ lambda positionA, positionB: -1\ if positionA.left+positionA.width/2 <= positionB.left+positionB.width/2\ and positionA.top+positionA.height/2 <= positionB.top+positionB.height/2\ else 1\ )\ ) """ @deprecated(reason="Writing process id is now set to word not word_position, TODO: check faksimile_positions for split candidates!") def update_writing_process(word): """Updates the writing process of the faksimile word position by synchronizing it with the corresponding transkription word position. If there are several transkription positions belonging to different writing processes but just one faksimile position, then we skip the update. We will fix these faksimile positions by manually adding more word positions and processing those additions in a later stage. """ writing_processes = [ writing_process_id for writing_process_id in set( tp.writing_process_id for tp in word.transkription_positions ) ] if len(writing_processes) == 1 and len(word.faksimile_positions) > 0: word.faksimile_positions[0].writing_process_id = writing_processes[0] def words_close_enough(wordA, wordB, threshold_x=10, threshold_y=5): """Return true if words are closer than thresholds """ return abs((wordA.transkription_positions[len(wordA.transkription_positions)-1].left)\ -wordB.transkription_positions[0].left) < threshold_x\ and abs(wordA.transkription_positions[len(wordA.transkription_positions)-1].bottom-wordB.transkription_positions[0].bottom) < threshold_y #return abs((wordA.transkription_positions[len(wordA.transkription_positions)-1].left+wordA.transkription_positions[len(wordA.transkription_positions)-1].width)\ # -wordB.transkription_positions[0].left) < threshold_x\ # and abs(wordA.transkription_positions[len(wordA.transkription_positions)-1].bottom-wordB.transkription_positions[0].bottom) < threshold_y def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION. svgscripts/join_faksimileAndTranskription.py [OPTIONS] [xmlManuscriptFile] a directory containing a svg file containing information about the word positions on the faksimile. a xml file about a manuscript, containing information about its pages. OPTIONS: -h|--help: show help -c|--correct-words=DIR create a taks "CorrectWords" in target dir DIR -d|--debug-word=WORD show debug information for word == WORD -f|--fix-errors: open faksimilie svg file if there are errors -i|--ignore-status-ok ignore status "OK:faksimile merged" in manuscript file and redo merging. 
-j|--join-single-char-words join single char words :return: exit code (int) """ commando_dict = { 'do_fix_errors': False, 'redo_ok': False, 'debug_word_text': '', 'correct_words': None,\ 'join_single_char_words': False } try: opts, args = getopt.getopt(argv, "hc:d:fij", ["help", "correct-words=", "debug-word=", "fix-errors", "ignore-status-ok",\ "join-single-char-words" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-c', '--correct-words'): commando_dict['correct_words'] = arg elif opt in ('-d', '--debug-word'): commando_dict['debug_word_text'] = arg elif opt in ('-f', '--fix-errors'): commando_dict['do_fix_errors'] = True elif opt in ('-i', '--ignore-status-ok'): commando_dict['redo_ok'] = True elif opt in ('-j', '--join-single-char-words'): commando_dict['join_single_char_words'] = True if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if exists(file_a): file_b = None if len(args) > 1 and exists(args[1]): file_b = args[1] file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b, correction_dir=commando_dict['correct_words']) #if commando_dict['correct_words'] is not None and isdir(commando_dict['correct_words']): # print('checking new function, please remove this condition if successful!') # for file in file_list: print(file) # return 0 for faksimile_file in file_list: join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, **commando_dict) else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/show_highlighted_svg_file.py =================================================================== --- svgscripts/show_highlighted_svg_file.py (revision 0) +++ svgscripts/show_highlighted_svg_file.py (revision 99) @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" This program can be used to show a svg file(s) with highlighted words. +""" +# Copyright (C) University of Basel 2019 {{{1 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see 1}}} + +import getopt +from progress.bar import Bar +import os +from os import listdir, sep, path, setpgrp, devnull, makedirs +from os.path import basename, commonpath, dirname, exists, isfile, isdir, join, realpath, splitext +import lxml.etree as ET +import sys +import tempfile + +if dirname(__file__) not in sys.path: + sys.path.append(dirname(__file__)) + +from util import create_highlighted_svg_file, ExternalViewer + +__author__ = "Christian Steiner" +__maintainer__ = __author__ +__copyright__ = 'University of Basel' +__email__ = "christian.steiner@unibas.ch" +__status__ = "Development" +__license__ = "GPL v3" +__version__ = "0.0.1" + +UNITTESTING = False + +def get_node_ids4word(faksimile_tree, word, namespaces=None) ->list: + """Return a list of node ids for word. 
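+    (A node id is the id of the parent rect of a title element whose text equals word.)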
+ """ + if namespaces is None: + namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } + return [ item.getparent().get('id') for item in faksimile_tree.xpath(f'//ns:rect/ns:title[text() = "{word}"]', namespaces=namespaces) ] + +def usage(): + """prints information on how to use the script + """ + if __package__ is not None: + print(main.__doc__.format(package=__package__, file=basename(__file__))) + else: + print(main.__doc__.format(package=dirname(__file__), file=basename(__file__))) + +def main(argv): + """This program can be used to copy a svg file(s) to a target directory, and also updating its image-path. + + {package}/{file} [OPTIONS] [ ...] + + a svg file containing information about the word positions on the faksimile. + word that should be highlighted. + + OPTIONS: + -h|--help: show help + + :return: exit code (int) + """ + try: + opts, args = getopt.getopt(argv, "h", ["help" ]) + except getopt.GetoptError: + usage() + return 2 + for opt, arg in opts: + if opt in ('-h', '--help'): + usage() + return 0 + if len(args) < 2: + usage() + return 2 + exit_status = 0 + messages = [] + svg_file = args[0] + target_file = join(tempfile.mkdtemp(), basename(svg_file)) + if exists(svg_file): + if not UNITTESTING: + print(f'Showing {basename(svg_file)} with highlighted words {[text for text in args[1:]]}') + faksimile_tree = ET.parse(svg_file) + namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } + node_ids = [] + for word in args[1:]: + node_ids += get_node_ids4word(faksimile_tree, word, namespaces=namespaces) + create_highlighted_svg_file(faksimile_tree, node_ids, target_file=target_file) + ExternalViewer.show_files(target_file) + else: + raise FileNotFoundError(f'File {svg_file} does not exist!') + return exit_status + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) Index: svgscripts/process_files.py =================================================================== --- svgscripts/process_files.py (revision 98) +++ svgscripts/process_files.py (revision 99) @@ -1,463 +1,448 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract information from all text svg files in directory. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style import csv import getopt import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import lxml.etree as ET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from convertPDF2SVG4Web import Converter from datatypes.page_creator import PageCreator from datatypes.transkriptionField import TranskriptionField from extractWordPosition import Extractor +from fix_missing_glyphs import fix_missing_glyphs +from util import update_svgposfile_status, update_manuscript_file sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False +WARN_MISSING_USE_NODE = f'with warnings:{PageCreator.WARNING_MISSING_USE_NODE4PWP}:' +WARN_MISSING_GLYPH = f'with warnings:{PageCreator.WARNING_MISSING_GLYPH_ID4WIM}:' class MyCSVHandler: """This class can be used to handle csv files that contain information about the tile and layout of the svg files. """ ENTRY_KEY_PAGE = 'pdf_page_number' ENTRY_KEY_FILE = 'svg_source_file' ENTRY_KEY_TITLE = 'manuscript_title' ENTRY_KEY_PAGE_NAMES = 'page_names' ENTRY_KEY_MARG_PAGE = 'marginals_page_entry' MANUSCRIPT_AE_REMOVAL = re.compile('[a-e]') MANUSCRIPT_KEY = 'Ms' MANUSCRIPT_PATTERN = re.compile(r'(\d+)(>\s)(.*)') MANUSCRIPT_TITLE_EXTENSION = 'Mp' MANUSCRIPT_TITLE_PARTS = re.compile(r'([I-X]+[a-e]*)(\s)(\d+\w*)(/\d+\w*)*') MARGINALS_PAGE = re.compile(r'([I-X]+[a-e]*)(\s)(\d+\w*)(\s)(Marg)') REMOVE_NONNUMERIC = re.compile('\D') - def __init__(self, csv_file_name, pdf_file, svg_dir): + def __init__(self, csv_file_name, pdf_file, svg_dir, title=None): self.csv_entries = [] self.pdf_file = pdf_file self.svg_dir = svg_dir + self.title = title self._init_csv_entries(csv_file_name) def _init_csv_entries(self, csv_file_name): """Init csv entries by reading the csv_file. 
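        An 'Ms' cell like '5> XIV 12/13' (illustration) maps pdf page 5 to the manuscript title 'Mp XIV' with the page names '12' and '13'.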
""" with open(csv_file_name, newline='') as csvfile: reader = csv.DictReader(csvfile) list_of_svg_files = [ svg_file for svg_file in listdir(self.svg_dir) if svg_file.endswith('.svg') ] marg_entry = None for row in reader: ms_string = row[self.MANUSCRIPT_KEY] manuscript_match = re.match(self.MANUSCRIPT_PATTERN, ms_string) if manuscript_match is not None: page_number = int(manuscript_match.group(1)) files_matching = [ svg_file for svg_file in list_of_svg_files\ if re.match(rf'([0]*{page_number})(.svg)', svg_file.replace(re.split(r'\d+\.svg', svg_file)[0], '')) ] if len(files_matching) > 0: svg_file = files_matching[0] title_parts = re.match(self.MANUSCRIPT_TITLE_PARTS, manuscript_match.group(3)) marginals_page = re.match(self.MARGINALS_PAGE, manuscript_match.group(3)) if marginals_page is not None: marg_entry = { self.ENTRY_KEY_PAGE: page_number, self.ENTRY_KEY_FILE: svg_file } elif title_parts is not None: title = self.MANUSCRIPT_AE_REMOVAL.sub('', title_parts.group(1)) manuscript_title = f'{self.MANUSCRIPT_TITLE_EXTENSION} {title}' entry = { self.ENTRY_KEY_PAGE: page_number,\ self.ENTRY_KEY_FILE: svg_file,\ self.ENTRY_KEY_TITLE: manuscript_title,\ self.ENTRY_KEY_PAGE_NAMES: [ f'{title_parts.group(3)}' ] } if title_parts.group(4) is not None: entry[self.ENTRY_KEY_PAGE_NAMES].append(title_parts.group(4).replace('/','')) if marg_entry is not None\ and marg_entry[self.ENTRY_KEY_PAGE] == page_number-1: entry[self.ENTRY_KEY_MARG_PAGE] = marg_entry marg_entry = None - self.csv_entries.append(entry) + if self.title is None\ + or self.title == manuscript_title: + self.csv_entries.append(entry) def process_files(self, svg_target_dir, xml_target_dir, error_handler=None) -> int: """Process files and return exit status. """ exit_status = 0 if len(self.csv_entries) > 0: converter = Converter(target_dir=svg_target_dir) extractor = Extractor(xml_dir=xml_target_dir) for entry in self.csv_entries: title = entry[self.ENTRY_KEY_TITLE] extractor.update_title_and_manuscript(title) #converter.title = title.replace(' ', '_') pdf_page_number = entry[self.ENTRY_KEY_PAGE] svgfile = f'{self.svg_dir}{sep}{entry[self.ENTRY_KEY_FILE]}' for index, page_number in enumerate(entry[self.ENTRY_KEY_PAGE_NAMES]): pdf_name_dictionary = { pdf_page_number: title.replace(' ', '_') + '_' + str(page_number) + '_web' } multipage_index = -1\ if len(entry[self.ENTRY_KEY_PAGE_NAMES]) == 1\ else index marginals_page = None\ if not bool(entry.get(self.ENTRY_KEY_MARG_PAGE))\ else f'{self.svg_dir}{sep}{entry[self.ENTRY_KEY_MARG_PAGE][self.ENTRY_KEY_FILE]}' try: - if not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number): + if page_has_status(WARN_MISSING_USE_NODE,\ + manuscript_file=extractor.manuscript_file, page_number=page_number)\ + or page_has_status(WARN_MISSING_GLYPH,\ + manuscript_file=extractor.manuscript_file, page_number=page_number): + svg_pos_file = get_page_output_file(page_number, manuscript_file=extractor.manuscript_file) + if svg_pos_file is not None and isfile(svg_pos_file): + fix_missing_glyphs(svg_pos_file, manuscript_file=extractor.manuscript_file) + elif not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number): exit_status = process_file(converter, extractor, svgfile, self.pdf_file, page_number,\ pdf_name_dictionary=pdf_name_dictionary, multipage_index=multipage_index,\ marginals_page=marginals_page) except Exception as err: print(err) if error_handler is not None: - error_handler.record_error(svgfile, pdffile, title, page_number, error=err) + 
error_handler.record_error(svgfile, self.pdf_file, title, page_number, error=err) if not UNITTESTING: print(Fore.RED) print('There was an error ->', err) print(Style.RESET_ALL) if error_handler is not None: error_handler.write() return exit_status class MyErrorHandler: """This class can be used to handle errors executing extractWordPosition.Extractor.extractAndWriteInformation. """ ERROR_LOG = 'error_log.xml' def __init__(self): self.tree = ET.ElementTree(ET.Element('error-log')) if isfile(MyErrorHandler.ERROR_LOG): parser = ET.XMLParser(remove_blank_text=True) self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser) def record_error(self, svgfile, pdffile, title, page_number, error=None): """Records an error. """ if len(self.tree.xpath('//error[@title="{0}" and @number="{1}"]'.format(title, page_number))) > 0: error_node = self.tree.xpath('//error[@title="{0}" and @number="{1}"]'.format(title, page_number))[0] else: error_node = ET.SubElement(self.tree.getroot(), 'error', attrib={'title': title, 'number': page_number}) ET.SubElement(error_node, 'svgfile').text = svgfile ET.SubElement(error_node, 'pdffile').text = pdffile if error is not None: error_node.set('type', str(type(error).__name__)) if str(error) != '': error_msg = ET.SubElement(error_node, 'error-msg') error_msg.text = str(error) if str(type(error).__name__) == 'ExpatError': error_msg.text += '->svgfile is empty!' def run(self, title=None, page_number=None, error_type=None): """Run all or some errors [:return:] exit status (int) """ xpath = '//error' if title is not None and page_number is not None: xpath = '//error[@title="{0}" and @number="{1}"]'.format(title, page_number) elif title is not None: xpath = '//error[@title="{0}"]'.format(title) elif page_number is not None: xpath = '//error[@number="{0}"]'.format(page_number) if error_type is not None: xpath = xpath + '[@type="{0}"]'.format(error_type)\ if title is None and page_number is None\ else xpath.replace(']', ' ') + 'and @type="{0}"]'.format(error_type) exit_status = 0 for error in self.tree.xpath(xpath): title = error.get('title') page_number = error.get('number') svgfile = error.xpath('./svgfile/text()')[0]\ if len(error.xpath('./svgfile/text()')) > 0 else None pdffile = error.xpath('./pdffile/text()')[0]\ if len(error.xpath('./pdffile/text()')) > 0 else None if svgfile is not None: converter = Converter(title=title) extractor = Extractor(title=title, compare2pdf=True) status = process_file(converter, extractor, svgfile, pdffile, page_number) if status > 0: exit_status = status if status < 2: error.getparent().remove(error) self.write() return exit_status def write(self): """Writes error log. """ write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG, script_name=__file__, file_type='xmlErrorLog') +def get_page_output_file(page_number: str, manuscript_file=None, manuscript_tree=None) ->str: + """Return filename of xml output file for page with page number page_number. 
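# A sketch of the xpath filter that MyErrorHandler.run above assembles; standalone,
# with made-up title and error-type values.
def build_error_xpath(title=None, page_number=None, error_type=None):
    xpath = '//error'
    if title is not None and page_number is not None:
        xpath = '//error[@title="{0}" and @number="{1}"]'.format(title, page_number)
    elif title is not None:
        xpath = '//error[@title="{0}"]'.format(title)
    elif page_number is not None:
        xpath = '//error[@number="{0}"]'.format(page_number)
    if error_type is not None:
        xpath = (xpath + '[@type="{0}"]'.format(error_type)
                 if title is None and page_number is None
                 else xpath.replace(']', ' ') + 'and @type="{0}"]'.format(error_type))
    return xpath

assert build_error_xpath(error_type='ExpatError') == '//error[@type="ExpatError"]'
assert build_error_xpath(title='Mp XVI', error_type='ExpatError') == '//error[@title="Mp XVI" and @type="ExpatError"]'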
+ """ + if manuscript_tree is None: + if manuscript_file is None or not isfile(manuscript_file): + msg = f'File {manuscript_file} does not exist!'\ + if manuscript_file is not None\ + else 'Please specify either manuscript_file or manuscript_tree' + raise Exception(msg) + manuscript_tree = ET.parse(manuscript_file) + if len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0: + return manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output') + return None + def is_page_ok(manuscript_file=None, page_number=None): """Returns true if page status is 'OK'. """ + return page_has_status('OK', manuscript_file=manuscript_file, page_number=page_number) + +def page_has_status(status, manuscript_file=None, page_number=None): + """Returns true if page status is 'OK'. + """ if manuscript_file is not None and isfile(manuscript_file): manuscript_tree = ET.parse(manuscript_file) if page_number is not None\ and len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0: - return manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('status') == 'OK'\ - and isfile(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output')) + return manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('status') == status return False def is_svg_ok(manuscript_file=None, page_number=None): """Returns true if svgfile contains a valid svg graphic location. """ if manuscript_file is not None and isfile(manuscript_file): manuscript_tree = ET.parse(manuscript_file) if page_number is not None\ and len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0\ and isfile(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output')): xml_source_tree = ET.parse(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output')) return len(xml_source_tree.xpath('//svg/@file')) > 0 and isfile(xml_source_tree.xpath('//svg/@file')[0]) return False def process_file(converter, extractor, svgfile, pdffile, page_number, pdf_name_dictionary=None, multipage_index=-1, marginals_page=None): """Processes file. [:return:] exit status (int) """ exit_status = 0 if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Processing file {} ...'.format(svgfile)) print(Style.RESET_ALL) if converter.pdf2svg(pdffile, page_number=page_number, name_dictionary=pdf_name_dictionary) == 0: for path_svg_file in converter.latest_converted_files: transkriptionField = TranskriptionField(path_svg_file, multipage_index=multipage_index) transkriptionField.shrink_svg_to_transkription_field() xml_target_file = extractor.get_file_name(svgfile, page_number) extraction_status = extractor.extractAndWriteInformation(svgfile, xml_target_file=xml_target_file,\ page_number=page_number, pdfFile=pdffile, svg_file=path_svg_file, record_warnings=True,\ multipage_index=multipage_index, marginals_page=marginals_page) if extraction_status < 2 and extractor.manuscript_file is not None: status = 'OK' if extraction_status == 1: status = extractor.latest_status exit_status = 1 update_svgposfile_status(xml_target_file, manuscript_file=extractor.manuscript_file, status=status) return exit_status def update_graphical_svg(converter, svgfile, pdffile, page_number, xml_target_file): """Create a new graphical svg file and update xml output file. 
[:return:] exit status (int) """ exit_status = 0 if isfile(xml_target_file): path_svg_file = converter.get_file_name(pdffile, page_number=page_number) if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(path_svg_file)) print(Style.RESET_ALL) if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) == 0: transkriptionField = TranskriptionField(path_svg_file) transkriptionField.shrink_svg_to_transkription_field() page = PageCreator(xml_target_file, svg_file=path_svg_file) write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) else: exit_status = 2 return exit_status -def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True): - """Updates manuscript file: adds status information about page. - """ - if isfile(manuscript_file): - parser = ET.XMLParser(remove_blank_text=True) - manuscript_tree = ET.parse(manuscript_file, parser) - if len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0: - node = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0] - old_status = node.get('status') - if old_status is None or 'OK' not in old_status.split(':'): - node.set('status', status) - elif append: - if status not in old_status.split(':'): - new_status = old_status + ':' + status - node.set('status', new_status) - else: - node.set('status', new_status) - if not bool(node.get('output')): - node.set('output', file_name) - else: - pages_node = manuscript_tree.getroot().find('pages')\ - if manuscript_tree.getroot().find('pages') is not None\ - else ET.SubElement(manuscript_tree.getroot(), 'pages') - new_id = len(pages_node.findall('page')) + 1 - ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name}) - write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT) - -def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True): - """Updates svg position file's status. Changes its status to status if it does not contain 'OK', - else it appends new status to old status. - """ - if isfile(file_name): - parser = ET.XMLParser(remove_blank_text=True) - file_tree = ET.parse(file_name, parser) - old_status = file_tree.getroot().get('status') - if old_status is None or 'OK' not in old_status.split(':'): - file_tree.getroot().set('status', status) - elif append: - if status not in old_status.split(':'): - new_status = old_status + ':' + status - file_tree.getroot().set('status', new_status) - else: - file_tree.getroot().set('status', new_status) - write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) - if manuscript_file is not None and isfile(manuscript_file): - page_number = file_tree.getroot().get('number') - update_manuscript_file(manuscript_file, page_number, file_name, status=status) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract information from all text svg files in a directory. svgscripts/process_files.py [OPTIONS] svgscripts/process_files.py [OPTIONS] svgscripts/process_files.py [OPTIONS] Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg). Directory containing svg files corresponding to pdf files (i.e. 
PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg). OPTIONS: -h|--help: show help -e|--run-error Rerun error cases. -g|--check-graphic-svg Check that graphical svg file exists or generate a new svg file. -n|--number=pageNumber Use this with OPTION -e|--run-error in order to specify an error case. -t|--title=title: title of the manuscript to which all files belong. -T|--error-type: error type, use this with OPTION -e|--run-error in order to specify an error case. -s|--svg-target-dir=svg-target-dir target directory for path svg files, i.e. svg files that can be displayed on the web. -x|--xml-target-dir=xml-target-dir target directory for xml files. :return: exit code (int) """ check_graphic_svg_exists = False csv_handler = None error_handler = MyErrorHandler() error_type = None number = None rerun_errors = False svg_target_dir = ".{}svg".format(sep) title = None xml_target_dir = ".{}xml".format(sep) try: opts, args = getopt.getopt(argv, "hegn:s:t:T:x:", ["help", "run-error", "check-graphic-svg", "number=", "svg-target-dir=", "title=", "error-type=", "xml-target-dir="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-e', '--run-error'): rerun_errors = True elif opt in ('-g', '--check-graphic-svg'): check_graphic_svg_exists = True elif opt in ('-t', '--title'): title = arg elif opt in ('-T', '--error-type'): error_type = arg elif opt in ('-n', '--number'): number = arg elif opt in ('-s', '--svg-target-dir'): svg_target_dir = arg elif opt in ('-x', '--xml-target-dir'): xml_target_dir = arg if rerun_errors: return error_handler.run(title=title, page_number=number, error_type=error_type) if len(args) == 1 and args[0].endswith('.xml'): source_tree = ET.parse(args[0]) if source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: svg_word_file_tree = ET.parse(source_tree.xpath('//page/@output')[0]) svg_dir = dirname(svg_word_file_tree.xpath('//page/@source')[0]) pdf_dir = dirname(svg_word_file_tree.xpath('//page/pdf/@file')[0]) else: print('File {} is not of type {}'.format(args[0], FILE_TYPE_XML_MANUSCRIPT)) usage() return 2 elif len(args) < 1 or\ (len(args) == 1\ and (True not in [ pdffile.endswith('pdf') for pdffile in listdir(args[0]) ]\ or True not in [ svgfile.endswith('svg') for svgfile in listdir(args[0]) ])\ ): print("Please specify both PDFDIR and TEXT_SVG_DIR!") usage() return 2 elif len(args) < 2: pdf_dir, svg_dir = args[0], args[0] elif isdir(args[0]) and isdir(args[1]): pdf_dir, svg_dir = args[0], args[1] if True in [ svgfile.endswith('pdf') for svgfile in listdir(args[1]) ]: pdf_dir, svg_dir = args[1], args[0] elif len(args) == 3\ and isfile(args[0]) and args[0].endswith('.csv')\ and isfile(args[1]) and args[1].endswith('.pdf')\ and isdir(args[2]): - csv_handler = MyCSVHandler(args[0], args[1], args[2]) + csv_handler = MyCSVHandler(args[0], args[1], args[2], title=title) return csv_handler.process_files(svg_target_dir, xml_target_dir, error_handler) else: not_existing = args[0] if not isdir(args[0]) else args[1] print("ERROR directory {} does not exist!".format(not_existing)) return 2 list_of_svg = [ svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg') ] list_of_pdf = [ pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf') ] converter = Converter(target_dir=svg_target_dir, title=title) extractor = Extractor(xml_dir=xml_target_dir, title=title, compare2pdf=True) exit_status = 0 for svgfile in list_of_svg: if svgfile.replace('.svg', '.pdf') in list_of_pdf: title = 
re.split(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)[1].replace('_', ' ') if extractor.title is None or extractor.title != title: extractor.update_title_and_manuscript(title) if converter.title is None or converter.title != title: converter.title = title.replace(' ', '_') if 'page' in svgfile: page_number = svgfile.replace('.svg','').split('page')[1] else: page_number = svgfile.replace('.svg','').split('_')[-1] pdffile = '{}{}{}'.format(pdf_dir, sep, svgfile.replace('.svg', '.pdf')) if not check_graphic_svg_exists and not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number): try: svgfile = '{}{}{}'.format(svg_dir, sep, svgfile) exit_status = process_file(converter, extractor, svgfile, pdffile, page_number) except Exception as err: error_handler.record_error(svgfile, pdffile, title, page_number, error=err) if not UNITTESTING: print(Fore.RED) print('There was an error ->', err) print(Style.RESET_ALL) elif not is_svg_ok(manuscript_file=extractor.manuscript_file, page_number=page_number): update_graphical_svg(converter, svgfile, pdffile, page_number, extractor.get_file_name(svgfile, page_number)) error_handler.write() return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/fix_missing_glyphs.py =================================================================== --- svgscripts/fix_missing_glyphs.py (revision 98) +++ svgscripts/fix_missing_glyphs.py (revision 99) @@ -1,205 +1,205 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to fix missing glyphs. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style import getopt import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import lxml.etree as ET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.page import Page, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word -from process_files import update_svgposfile_status +from util import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False REMOVE_SVG_WORD_POS_PAGE_ENDING = re.compile('_page[0-9]+\w*') def find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=0.0, ymin=0.0): """Finds missing glyph for a PositionalWordPart. 
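# The function below retries a strict glyph lookup with a growing tolerance; a
# minimal standalone sketch of that widening-search pattern (toy_lookup and its
# failure mode are assumptions, not project code):
def widening_search(lookup, start=-0.5, step=0.1, cap=15.5):
    threshold, results = start, []
    while threshold < cap and len(results) < 1:
        try:
            results = lookup(threshold)        # raises while nothing is found
        except Exception:
            threshold += step                  # widen the tolerance and retry
    return results

def toy_lookup(threshold):
    if threshold < 1.0:
        raise ValueError('nothing within threshold')
    return ['glyph']

print(widening_search(toy_lookup))             # ['glyph']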
:return: list of PositionalWordPart """ THRESHOLD = 15.5 #pwp = PositionalWordPart(node=positional_word_part_node) word_part_obj = { "x": pwp.left, "y": pwp.top, "text": pwp.text, "matrix": pwp.transform, "class": pwp.style_class } start_id = int(pwp.id) threshold = -0.5 positional_word_parts = [] while threshold < THRESHOLD and len(positional_word_parts) < 1: try: positional_word_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces,\ start_id=start_id, xmin=xmin, ymin=ymin, threshold=threshold, throw_error_if_not_found=True) except Exception: threshold += 0.1 return positional_word_parts def update_word(word, old_transkription_position, old_positional_word_part, positional_word_parts): """Updates word according to new positional_word_parts. :return: new transkription_position """ if len(positional_word_parts) > 0: debug_msg_string = 'update word from ' + __file__ old_transkription_position.positional_word_parts.remove(old_positional_word_part) positional_word_parts.reverse() for positional_word_part in positional_word_parts: old_transkription_position.positional_word_parts.insert(int(old_positional_word_part.id), positional_word_part) for index, positional_word_part in enumerate(old_transkription_position.positional_word_parts): positional_word_part.id = index transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ old_transkription_position.positional_word_parts, debug_msg_string=debug_msg_string, transkription_position_id=old_transkription_position.id) word.transkription_positions.remove(old_transkription_position) transkription_positions.reverse() for new_tp in transkription_positions: word.transkription_positions.insert(int(old_transkription_position.id), new_tp) text = '' for index, tp in enumerate(word.transkription_positions): tp.id = index tp.writing_process_id = old_transkription_position.writing_process_id for pwp in tp.positional_word_parts: text += pwp.text if word.text != text: word.text = text return transkription_positions[0] def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None): """Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION. """ if isfile(svg_word_pos_file): if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Fixing missing glyphs for file {} ... 
'.format(svg_word_pos_file), end='') print(Style.RESET_ALL) page = Page(svg_word_pos_file) transkription_field = TranskriptionField(page.svg_file) svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) words_without_glyphs = [ word for word in page.words\ if len([ tp for tp in word.transkription_positions\ if len([ pwp for pwp in tp.positional_word_parts if pwp.symbol_id is None]) > 0]) > 0 ] for word in words_without_glyphs: for transkription_position in word.transkription_positions: positional_word_parts = transkription_position.positional_word_parts[:] for positional_word_part in positional_word_parts: if positional_word_part.symbol_id is None: pwps = find_missing_glyph_for_pwp(positional_word_part, svg_path_tree, namespaces, xmin=transkription_field.xmin, ymin=transkription_field.ymin) new_transkription_position = update_word(word, transkription_position, positional_word_part, pwps) if new_transkription_position is not None: transkription_position = new_transkription_position page.update_and_attach_words2tree() write_pretty(xml_element_tree=page.page_tree, file_name=svg_word_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) page = Page(svg_word_pos_file) new_number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) if not UNITTESTING: result_color = Fore.LIGHTBLUE_EX if new_number_of_missing_glyphs == 0 else Fore.MAGENTA print(result_color + ' {0}/{1}'.format(number_of_missing_glyphs-new_number_of_missing_glyphs, number_of_missing_glyphs), end='') print(Fore.LIGHTBLUE_EX + ' fixed.', end='') print(Style.RESET_ALL) if len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) == 0: update_svgposfile_status(svg_word_pos_file, manuscript_file=manuscript_file, status='OK') def get_filelist_and_manuscript_file(file_a, file_b=None): """Returns a file list and a manuscript file (or None). """ file_list = [] manuscript_file = None source_tree = ET.parse(file_a) if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION\ and len([ word_part for word_part in source_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')]) > 0: # if symbol_ids are missing ... file_list.append(file_a) if file_b is not None: manuscript_file = file_b else: manuscript_file = REMOVE_SVG_WORD_POS_PAGE_ENDING.sub('', file_a) if not isfile(manuscript_file): manuscript_file = None elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: manuscript_file = file_a if file_b is not None: file_list.append(file_b) else: file_list = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower())) return file_list, manuscript_file def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to fix missing glyphs. svgscripts/fix_missing_glyphs.py [OPTIONS] -File [-File] an xml file about a manuscript, containing information about its pages. an xml file about a page, containing information about svg word positions.
OPTIONS: -h|--help: show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): file_b = None if len(args) > 1 and isfile(args[1]): file_b = args[1] file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b) for svg_word_pos_file in file_list: fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file) else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/datatypes/matrix.py =================================================================== --- svgscripts/datatypes/matrix.py (revision 98) +++ svgscripts/datatypes/matrix.py (revision 99) @@ -1,346 +1,348 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to transform a svg/text[@transform] matrix-string into a matrix representation. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import re import math class Matrix: """ This class transforms a svg @transform matrix-string into a matrix representation. 
Args: transform_matrix_string (str) string of the form 'matrix(1.0 0.0 0.0 1.0 0.0 0.0)' or 'rotate(10)' """ A = 0 B = 1 C = 2 D = 3 E = 4 F = 5 XINDEX = 4 YINDEX = 5 MATRIX_LENGTH = 6 DOWN = 1 STRAIGHT = 0 UP = -1 def __init__(self, transform_matrix_string=None, transkription_field=None, matrix_list=[]): self.matrix = [ 0.0 for i in range(Matrix.MATRIX_LENGTH) ] if len(matrix_list) < 6 else matrix_list if transform_matrix_string is not None: m = re.search('(?<=rotate\()[-]*[0-9]+', transform_matrix_string) if m is not None: # transform='rotate(a)' to transform='matrix(cos(a), sin(a), -sin(a), cos(a), 0, 0)' angle = float(m.group(0)) self.matrix[Matrix.A] = round(math.cos(math.radians(angle)), 3) self.matrix[Matrix.B] = round(math.sin(math.radians(angle)), 3) self.matrix[Matrix.C] = round(math.sin(math.radians(angle))*-1, 3) self.matrix[Matrix.D] = round(math.cos(math.radians(angle)), 3) self.matrix[Matrix.E] = 0 self.matrix[Matrix.F] = 0 - elif re.search(r'matrix\(\s*([-]*\d+(\.\d+(e-\d+)*)*[,\s][\s]*){5}[-]*\d+(\.\d+)*.*\s*\)', transform_matrix_string): + elif re.search(r'matrix\(\s*([-]*\d+([\.]*\d*(e-\d+)*)*[,\s][\s]*){5}[-]*\d+(\.\d+)*.*\s*\)', transform_matrix_string): + #elif re.search(r'matrix\(\s*([-]*\d+(\.\d+(e-\d+)*)*[,\s][\s]*){5}[-]*\d+(\.\d+)*.*\s*\)', transform_matrix_string): #elif re.search(r'matrix\(\s*([-]*[0-9].*\s){5}[-]*[0-9].*\s*\)', transform_matrix_string): # old-> does not include comma separated matrix string self.matrix = [ float(i) for i in transform_matrix_string.replace('matrix(','').\ replace(', ', ',').replace(',', ' ').replace(')','').split(' ') ] else: raise Exception('Error: string "{}" is not a valid transform matrix string!'.format(transform_matrix_string)) if transkription_field is not None: self.matrix[Matrix.XINDEX] -= transkription_field.xmin self.matrix[Matrix.YINDEX] -= transkription_field.ymin if(len(self.matrix) < Matrix.MATRIX_LENGTH): raise Exception('Error: string "{}" is not a valid matrix string!'.format(transform_matrix_string)) def add2X(self, add_to_x=0): """Return x-value of matrix (float) + add_to_x. """ return self.matrix[Matrix.XINDEX] + float(add_to_x) def add2Y(self, add_to_y=0): """Return y-value of matrix (float) + add_to_y. """ return self.matrix[Matrix.YINDEX] + float(add_to_y) def getX(self): """Return x-value of matrix (float). """ return self.matrix[Matrix.XINDEX] def getY(self): """Return y-value of matrix (float). """ return self.matrix[Matrix.YINDEX] def is_matrix_horizontal(self): """Returns whether matrix is horizontal. [:return:] True/False """ return self.matrix[Matrix.A] == 1 and self.matrix[Matrix.B] == 0 and self.matrix[Matrix.C] == 0 and self.matrix[Matrix.D] == 1 def get_new_x(self, x=0.0, y=0.0): """Returns new position of x. :return: (float) x """ top_left_x = x - self.matrix[self.E] if x != 0.0 else 0.0 top_left_y = y - self.matrix[self.F] if y != 0.0 else 0.0 return self.matrix[Matrix.A] * top_left_x + self.matrix[Matrix.C] * top_left_y + self.matrix[self.E] def get_new_y(self, x=0.0, y=0.0): """Returns new position of y. :return: (float) y """ top_left_x = x - self.matrix[self.E] if x != 0.0 else 0.0 top_left_y = y - self.matrix[self.F] if y != 0.0 else 0.0 return self.matrix[Matrix.B] * top_left_x + self.matrix[Matrix.D] * top_left_y + self.matrix[self.F] def get_old_x(self, x=0.0, y=0.0): """Returns old position of x. 
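# A worked check (standalone sketch) of the rotate()-to-matrix() conversion in
# Matrix.__init__ above: 'rotate(10)' becomes matrix(cos a, sin a, -sin a, cos a, 0, 0).
import math
angle = 10.0
a = round(math.cos(math.radians(angle)), 3)    # 0.985
b = round(math.sin(math.radians(angle)), 3)    # 0.174
print('matrix({0} {1} {2} {3} 0 0)'.format(a, b, round(-math.sin(math.radians(angle)), 3), a))
# -> matrix(0.985 0.174 -0.174 0.985 0 0)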
:return: (float) x """ old_x = (self.matrix[self.D]*x - self.matrix[Matrix.D]*self.matrix[Matrix.E] - self.matrix[Matrix.C]*y + self.matrix[Matrix.C]*self.matrix[Matrix.F])\ /(self.matrix[Matrix.A]*self.matrix[Matrix.D] - self.matrix[Matrix.B]*self.matrix[Matrix.C]) return self.add2X(old_x) def get_transformed_positions(self, x=0.0, y=0.0, width=0.0, height=0.0): """Returns transformed x, y, width and height. """ top_left_x = x top_left_y = y top_right_x = x + width top_right_y = y bottom_left_x = x bottom_left_y = y + height bottom_right_x = x + width bottom_right_y = y + height new_x = self.matrix[Matrix.A] * top_left_x + self.matrix[Matrix.C] * top_left_y + self.matrix[self.E] new_y = self.matrix[Matrix.B] * top_left_x + self.matrix[Matrix.D] * top_left_y + self.matrix[self.F] new_top_right_x = self.matrix[Matrix.A] * top_right_x + self.matrix[Matrix.C] * top_right_y + self.matrix[self.E] new_top_right_y = self.matrix[Matrix.B] * top_right_x + self.matrix[Matrix.D] * top_right_y + self.matrix[self.F] new_bottom_left_x = self.matrix[Matrix.A] * bottom_left_x + self.matrix[Matrix.C] * bottom_left_y + self.matrix[self.E] new_bottom_left_y = self.matrix[Matrix.B] * bottom_left_x + self.matrix[Matrix.D] * bottom_left_y + self.matrix[self.F] new_bottom_right_x = self.matrix[Matrix.A] * bottom_right_x + self.matrix[Matrix.C] * bottom_right_y + self.matrix[self.E] new_bottom_right_y = self.matrix[Matrix.B] * bottom_right_x + self.matrix[Matrix.D] * bottom_right_y + self.matrix[self.F] new_width = abs(new_top_right_x - new_x)\ if abs(new_top_right_x - new_x) >= abs(new_bottom_right_x - new_bottom_left_x)\ else abs(new_bottom_right_x - new_bottom_left_x) new_height = abs(new_bottom_left_y - new_y)\ if abs(new_bottom_left_y - new_y) >= abs(new_top_right_y - new_bottom_right_y)\ else abs(new_top_right_y - new_bottom_right_y) return new_x, new_y, new_width, new_height def clone_transformation_matrix(self): """Returns a matrix that contains only the transformation part. [:return:] (Matrix) a clone of this matrix """ return Matrix(matrix_list=self.matrix[0:4]+[0,0]) def isRotationMatrix(self): """Return whether matrix is a rotation matrix. """ return self.matrix[Matrix.A] < 1 or self.matrix[Matrix.B] != 0 def toCSSTransformString(self): """Returns the CSS3 transform string: 'rotate(Xdeg)' where X is the angle. """ angle = 0 if self.isRotationMatrix(): angle = int(round(math.degrees(math.asin(self.matrix[Matrix.B])), 0)) if angle == 0: angle = int(round(math.degrees(math.acos(self.matrix[Matrix.A])), 0)) return 'rotate({}deg)'.format(angle) def toString(self): """Returns a transform_matrix_string representation of the matrix. [:returns:] (str) 'matrix(X X X X X X)' """ return 'matrix(' + ' '.join([ str(round(x, 5)) for x in self.matrix ]) + ')' def get_rotation_direction(self): """Get rotation direction of rotation matrix. [:return:] (int) direction code Matrix.UP, Matrix.STRAIGHT, Matrix.DOWN """ if not self.isRotationMatrix(): return self.STRAIGHT else: angle = int(round(math.degrees(math.asin(self.matrix[Matrix.B])), 0)) return self.UP if angle < 0 else self.DOWN @staticmethod def IS_BENEATH_TF(matrix, transkription_field): """Returns true if matrix specifies a position beneath transkription_field. 
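# Companion sketch to toCSSTransformString/get_rotation_direction above: the angle
# comes back out of the matrix via asin of the B cell (the cell value is made up).
import math
b_cell = 0.174                                  # sin(10 degrees)
angle = int(round(math.degrees(math.asin(b_cell)), 0))
print('rotate({}deg)'.format(angle))            # rotate(10deg)
print('direction:', 'UP' if angle < 0 else 'DOWN')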
""" if matrix.getY() < transkription_field.ymax or matrix.getY() > transkription_field.documentHeight-10: return False if transkription_field.second_field is not None\ and matrix.getY() > transkription_field.second_field.ymin_without_title: return False return True @staticmethod def IS_IN_FOOTNOTE_AREA(transform_matrix_string, transkription_field, x=0.0, marginals_on_extra_page=False): """Returns true if matrix specifies a position that is part of the footnote area. text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ matrix = Matrix(transform_matrix_string=transform_matrix_string) if marginals_on_extra_page: return matrix.getY() < transkription_field.documentHeight-10\ and matrix.getY() > transkription_field.documentHeight/4\ and matrix.getX() + x > transkription_field.documentWidth/4\ and not Matrix.IS_IN_MARGIN_FIELD(transform_matrix_string, transkription_field, marginals_on_extra_page=True) if not Matrix.IS_BENEATH_TF(matrix, transkription_field): return False is_part = matrix.getX() + x > transkription_field.xmin\ if transkription_field.is_page_verso()\ else matrix.getX() + x > transkription_field.documentWidth/4 return is_part @staticmethod def NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, transkription_field, marginals_on_extra_page=False): """Returns true if matrix specifies a position that is part of the footnote area. text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ matrix = Matrix(transform_matrix_string=node.get('transform')) x = sorted([ float(x.get('x')) for x in node.getchildren()])[-1]\ if len(node.getchildren()) > 0 else 0.0 if marginals_on_extra_page: return matrix.getY() < transkription_field.documentHeight-10\ and matrix.getY() > transkription_field.documentHeight/4\ and matrix.getX() + x > transkription_field.documentWidth/4\ and not Matrix.IS_IN_MARGIN_FIELD(node.get('transform'), transkription_field, marginals_on_extra_page=True) if not Matrix.IS_BENEATH_TF(matrix, transkription_field): return False is_part = matrix.getX() + x > transkription_field.xmin\ if transkription_field.is_page_verso()\ else matrix.getX() + x > transkription_field.documentWidth/4 return is_part @staticmethod def IS_IN_MARGIN_FIELD(transform_matrix_string, transkription_field, marginals_on_extra_page=False): """Returns true if matrix specifies a position that is part of the margin field. text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ line_number_area_width = 15\ if transkription_field.line_number_area_width == 0.0\ else transkription_field.line_number_area_width matrix = Matrix(transform_matrix_string=transform_matrix_string) if matrix.getY() < transkription_field.ymin or matrix.getY() > transkription_field.ymax: return False if marginals_on_extra_page: return matrix.getX() > transkription_field.xmax is_part = matrix.getX() < transkription_field.xmin - line_number_area_width\ if transkription_field.is_page_verso()\ else matrix.getX() > transkription_field.xmax + line_number_area_width return is_part @staticmethod def IS_IN_PLACE_OF_PRINTING_AREA(transform_matrix_string, transkription_field): """Returns true if matrix specifies a position that is part of the area where the places of printing ('Druckorte') are printed. 
text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ matrix = Matrix(transform_matrix_string=transform_matrix_string) if not Matrix.IS_BENEATH_TF(matrix, transkription_field): return False is_part = matrix.getX() < transkription_field.xmin\ if transkription_field.is_page_verso()\ else matrix.getX() < transkription_field.documentWidth/4 return is_part @staticmethod def IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=None, matrix=None): """Returns true if matrix specifies a position that is part of transkription field. text_node (lxml.etree.Element) transkription_field (datatypes.transkription_field.TranskriptionField) """ if matrix is None and not bool(text_node.get('transform')): return False if matrix is None: matrix = Matrix(transform_matrix_string=text_node.get('transform')) is_part = matrix.getX() > transkription_field.xmin and matrix.getX() < transkription_field.xmax\ and matrix.getY() > transkription_field.ymin and matrix.getY() < transkription_field.ymax - if not is_part and matrix.isRotationMatrix() and len([child.text for child in text_node.getchildren() if not re.match(r'^\s*$', child.text)]) > 0: + #if not is_part and matrix.isRotationMatrix() and len([child.text for child in text_node.getchildren() if not re.match(r'^\s*$', child.text)]) > 0: + if not is_part and len([child.text for child in text_node.getchildren() if not re.match(r'^\s*$', child.text)]) > 0: first_tspan_node = [ child for child in text_node.getchildren() if not re.match(r'^\s*$', child.text)][0] x = matrix.add2X(float(first_tspan_node.get('x'))) y = matrix.add2Y(float(first_tspan_node.get('y'))) new_x = matrix.get_new_x(x=x, y=y) new_y = matrix.get_new_y(x=x, y=y) return new_x > transkription_field.xmin and new_x < transkription_field.xmax\ and new_y > transkription_field.ymin and new_y < transkription_field.ymax return is_part @staticmethod def IS_NEARX_TRANSKRIPTION_FIELD(transform_matrix_string, transkription_field, diffx=20.0): """Returns true if matrix specifies a position that is on its x axis near the transkription_field. transform_matrix_string (str): string from which to init Matrix. transkription_field (svgscripts.TranskriptionField) diffx (float): defines threshold for positions that count as near. """ matrix = Matrix(transform_matrix_string=transform_matrix_string) MINLEFT = transkription_field.xmin - diffx MAXRIGHT = transkription_field.xmax + diffx return matrix.getY() > transkription_field.ymin and matrix.getY() < transkription_field.ymax\ and ((matrix.getX() > MINLEFT and matrix.getX() < transkription_field.xmin)\ or (matrix.getX() > transkription_field.xmax and matrix.getX() < MAXRIGHT)) @staticmethod def DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b, diff_threshold=0.001): """Returns whether the conversion factors (a-d) differ more than diff_threshold. """ if matrix_a is None or matrix_b is None: return not (matrix_a is None and matrix_b is None) return abs(matrix_a.matrix[Matrix.A] - matrix_b.matrix[Matrix.A]) > diff_threshold\ or abs(matrix_a.matrix[Matrix.B] - matrix_b.matrix[Matrix.B]) > diff_threshold\ or abs(matrix_a.matrix[Matrix.C] - matrix_b.matrix[Matrix.C]) > diff_threshold\ or abs(matrix_a.matrix[Matrix.D] - matrix_b.matrix[Matrix.D]) > diff_threshold def __eq__(self, other): """Return self.matrix == other.matrix. """ if other is None: return False return self.matrix == other.matrix def __hash__(self): """Return hash value. 
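# Standalone sketch of DO_CONVERSION_FACTORS_DIFFER above: only the conversion
# cells a-d are compared, the translation cells e/f are ignored (the sample
# matrix lists are made up).
def factors_differ(matrix_a, matrix_b, diff_threshold=0.001):
    if matrix_a is None or matrix_b is None:
        return not (matrix_a is None and matrix_b is None)
    return any(abs(a - b) > diff_threshold for a, b in zip(matrix_a[:4], matrix_b[:4]))

rotated    = [0.985, 0.174, -0.174, 0.985, 12.0, 7.5]
translated = [0.985, 0.174, -0.174, 0.985, 99.0, -3.0]
upright    = [1.0, 0.0, 0.0, 1.0, 12.0, 7.5]
print(factors_differ(rotated, translated))   # False: same conversion factors
print(factors_differ(rotated, upright))      # True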
""" return hash((self.matrix[Matrix.E], self.matrix[Matrix.F])) Index: svgscripts/datatypes/faksimile.py =================================================================== --- svgscripts/datatypes/faksimile.py (revision 98) +++ svgscripts/datatypes/faksimile.py (revision 99) @@ -1,196 +1,199 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a faksimile page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import re from lxml import etree as ET from os import path from os.path import isdir, isfile, sep, basename from svgpathtools.parser import parse_path from .faksimile_image import FaksimileImage from .matrix import Matrix from .text_field import TextField from .word_position import WordPosition class FaksimilePage: """ This class represents a faksimile page. Args: xml_target_file (str): name of the xml file to which page info will be written. xml_source_file (str): name of the xml file that will be instantiated. 
""" XML_TAG = 'faksimile-page' def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None): xml_file = xml_source_file if xml_source_file is not None else xml_target_file self.title = title self.page_number = page_number self.xml_file = xml_file if xml_file is not None and isfile(xml_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_file, parser) self.title = self.page_tree.getroot().get('title') self.page_number = self.page_tree.getroot().get('page-number') self.width = float(self.page_tree.getroot().get('width')) if bool(self.page_tree.getroot().get('width')) else 0.0 self.height = float(self.page_tree.getroot().get('height')) if bool(self.page_tree.getroot().get('height')) else 0.0 else: self.page_tree = ET.ElementTree(ET.Element(self.XML_TAG)) if title is not None: self.page_tree.getroot().set('title', title) if page_number is not None: self.page_tree.getroot().set('page-number', str(page_number)) if xml_target_file is not None: self.remove_tags_from_page_tree([WordPosition.FAKSIMILE]) if svg_source_file is not None: self.page_tree.getroot().set('svg-source-file', svg_source_file) if faksimile_image is not None: faksimile_image.attach_object_to_tree(self.page_tree) if text_field is not None: text_field.attach_object_to_tree(self.page_tree) self.svg_source_file = self.page_tree.getroot().get('svg-source-file') self.faksimile_image = FaksimileImage(node=self.page_tree.getroot().find('.//' + FaksimileImage.XML_TAG))\ if len(self.page_tree.getroot().findall('.//' + FaksimileImage.XML_TAG)) > 0 else None self.text_field = TextField(node=self.page_tree.getroot().find('.//' + TextField.XML_TAG))\ if len(self.page_tree.getroot().findall('.//' + TextField.XML_TAG)) > 0 else None self.word_positions = [ WordPosition(node=node) for node in self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE) ]\ if len(self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE)) > 0 else [] def append_word_position(self, word_position): """Appends word_position to word_positions and attaches it to page_tree. """ self.word_positions.append(word_position) word_position.attach_object_to_tree(self.page_tree) @classmethod def get_faksimile_pages(cls, svg_file, page_number='') -> list: """Creates and returns text fields contained in a svg_file as a list. """ svg_tree = ET.parse(svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } return cls.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces, page_number=page_number) @staticmethod def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='') -> list: """Creates and returns text fields contained in a svg_tree as a list. 
""" THRESHOLD_X = 10 if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } source_file_name = svg_tree.docinfo.URL image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap), source_file_name) xml_dir = '.{}xml'.format(sep) faksimile_pages = list() title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name)) title = title_string.replace('-', ' ') rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap)\ if rect.get('id', svg_tree.getroot().nsmap).startswith(title_string)\ and rect.get('id', svg_tree.getroot().nsmap).endswith(str(page_number)) ] for text_field_rect in rect_list: tf_x = float(text_field_rect.get('x', svg_tree.getroot().nsmap)) - image.x tf_y = float(text_field_rect.get('y', svg_tree.getroot().nsmap)) - image.y tf_width = float(text_field_rect.get('width', svg_tree.getroot().nsmap)) tf_height = float(text_field_rect.get('height', svg_tree.getroot().nsmap)) id = text_field_rect.get('id', svg_tree.getroot().nsmap) target_file_name = xml_dir + sep + id + '.xml' if isdir(xml_dir) else id + '.xml' page_number = re.sub(r'.*[,_]', '', id) if page_number.startswith('0'): page_number = page_number.lstrip('0') text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y) faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\ title=title, page_number=page_number, faksimile_image=image, text_field=text_field) x_min = text_field.xmin + image.x y_min = text_field.ymin + image.y #rect_titles = svg_tree.getroot().xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}"]/ns:title'.format(\ # x_min, text_field.xmax + image.x - THRESHOLD_X, y_min, text_field.ymax + image.y, text_field.id), namespaces=namespaces) rect_titles = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\ y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces) rect_titles += get_paths_inside_rect(svg_tree, '//ns:path/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\ y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces) for rect_title in rect_titles: rect = rect_title.getparent() x, y, height, width = 0.0, 0.0, 0.0, 0.0 if rect.tag.endswith('path'): path = parse_path(rect.get('d')) x, xmax, y, ymax = path.bbox() width = xmax - x height = ymax - y else: x = float(rect.get('x', svg_tree.getroot().nsmap)) y = float(rect.get('y', svg_tree.getroot().nsmap)) height = float(rect.get('height', svg_tree.getroot().nsmap)) width = width=float(rect.get('width', svg_tree.getroot().nsmap)) matrix = None if bool(rect.get('transform')): matrix = Matrix(transform_matrix_string=rect.get('transform')) faksimile_page.append_word_position(\ WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=rect_title.text, height=height,\ width=width, x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE)) faksimile_pages.append(faksimile_page) return faksimile_pages def remove_tags_from_page_tree(self, list_of_tags_to_remove): """Removes the tags specified in the list from the target tree. 
""" for xpath2remove in list_of_tags_to_remove: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) def get_paths_inside_rect(svg_tree, xpath, x_min, x_max, y_min, y_max, not_id, namespaces={}): """Returns a list of all paths selected by xpath that are inside x_min, x_max, y_min, y_max and do not have id == not_id. """ paths = [] if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } for path_node in svg_tree.xpath(xpath, namespaces=namespaces): append_node = path_node if not path_node.tag.endswith('path') and not path_node.tag.endswith('rect'): path_node = path_node.getparent() x, xmax, y, ymax = -1, -1, -1, -1 + init_xy = False if path_node.tag.endswith('rect'): x = float(path_node.get('x')) if bool(path_node.get('x')) else -1 y = float(path_node.get('y')) if bool(path_node.get('y')) else -1 xmax = x + float(path_node.get('width')) if bool(path_node.get('width')) else -1 ymax = y + float(path_node.get('height')) if bool(path_node.get('height')) else -1 + init_xy = True elif path_node.tag.endswith('path') and bool(path_node.get('d')) and path_node.get('d') != 0: path = parse_path(path_node.get('d')) x, xmax, y, ymax = path.bbox() - if x > -1 and xmax > -1 and y > -1 and ymax > -1: + init_xy = True + if init_xy: if bool(path_node.get('transform')): matrix = Matrix(transform_matrix_string=path_node.get('transform')) x, xmax = matrix.get_new_x(x=x, y=y), matrix.get_new_x(x=xmax, y=ymax) y, ymax = matrix.get_new_y(x=x, y=y), matrix.get_new_y(x=xmax, y=ymax) width = xmax - x height = ymax - y if x > x_min and x < x_max\ and y > y_min and y < y_max\ and path_node.get('id') != not_id: paths.append(append_node) return paths Index: svgscripts/datatypes/positional_object.py =================================================================== --- svgscripts/datatypes/positional_object.py (revision 98) +++ svgscripts/datatypes/positional_object.py (revision 99) @@ -1,143 +1,145 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent an object with positional information. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import sys from .matrix import Matrix from .attachable_object import AttachableObject sys.path.append('py2ttl') from class_spec import SemanticClass class PositionalObject(AttachableObject,SemanticClass): """ This (super) class represents an object with positional information. Args: id (int): object id matrix (datatypes.Matrix): matrix containing information about conversion. 
height (float): height of object width (float): width of object x (float): x position of object y (float): y position of object """ XML_TAG = 'positional-object' floatKeys = [ 'height', 'width', 'left', 'top', 'bottom'] intKeys = [ ] stringKeys = [ ] def __init__(self, node=None, id=0, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, tag=XML_TAG): self.floatKeys = [] self.floatKeys += PositionalObject.floatKeys self.intKeys = [] self.intKeys += PositionalObject.intKeys self.stringKeys = [ 'id' ] self.stringKeys += PositionalObject.stringKeys self.attachable_objects = [] if node is not None: self.id = str(node.get('id')) + if id > 0 and str(id) != self.id: + self.id = str(id) self.height = float(node.get('height')) self.width = float(node.get('width')) self.left = float(node.get('left')) self.top = float(node.get('top')) self.bottom = float(node.get('bottom')) self.transform = Matrix(node.get('transform')) if bool(node.get('transform')) and 'matrix(' in node.get('transform') else None self.tag = node.tag else: self.id = str(id) self.height = round(height, 3) self.width = round(width, 3) self.left = round(x, 3) self.top = round(y, 3) self.bottom = round(y + height, 3) self.transform = matrix self.tag = tag def attach_object_to_tree(self, target_tree): """Attach object to tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree, self.tag) for key in self.floatKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) for key in self.intKeys + self.stringKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(self.__dict__[key])) if self.transform is not None and self.transform.isRotationMatrix(): obj_node.set('transform', self.transform.toString()) for attachable_object in self.attachable_objects: attachable_object.attach_object_to_tree(obj_node) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {} for intKey in cls.intKeys: properties.update(cls.create_semantic_property_dictionary(intKey, int)) for floatKey in cls.floatKeys: properties.update(cls.create_semantic_property_dictionary(floatKey, float, cardinality=1)) for stringKey in cls.stringKeys: properties.update(cls.create_semantic_property_dictionary(stringKey, str, cardinality=1)) properties.update(cls.create_semantic_property_dictionary('transform', str)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) @staticmethod def POSITIONS_OVERLAP_HORIZONTALLY(position_a, position_b): """Returns whether position a and b overlap horizontally. """ return (position_a.left < position_b.left+position_b.width)\ and (position_a.left+position_a.width > position_b.left) @staticmethod def POSITIONS_OVERLAP_VERTICALLY(position_a, position_b): """Returns whether position a and b overlap vertically. """ return (position_a.top < position_b.bottom)\ and (position_a.bottom > position_b.top) @staticmethod def POSITIONS_ARE_STACKED(position_a, position_b): """Returns whether position a and b are stacked, i.e. are above each other.
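# Throwaway sketch exercising the two overlap predicates above (and feeding the
# stacking test whose body follows below); SimpleNamespace boxes stand in for
# real positional objects.
from types import SimpleNamespace

def box(left, top, width, height):
    return SimpleNamespace(left=left, top=top, width=width,
                           height=height, bottom=top + height)

def overlap_horizontally(a, b):   # same test as POSITIONS_OVERLAP_HORIZONTALLY
    return a.left < b.left + b.width and a.left + a.width > b.left

def overlap_vertically(a, b):     # same test as POSITIONS_OVERLAP_VERTICALLY
    return a.top < b.bottom and a.bottom > b.top

word  = box(10, 100, 40, 12)
above = box(15, 80, 30, 12)       # same column, one line higher
print(overlap_horizontally(word, above))   # True
print(overlap_vertically(word, above))     # False -> the two boxes are stacked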
""" return PositionalObject.POSITIONS_OVERLAP_HORIZONTALLY(position_a, position_b)\ and (not PositionalObject.POSITIONS_OVERLAP_VERTICALLY(position_a, position_b)\ or abs(position_a.top-position_b.top) > (position_a.height/4 + position_b.height/4)) Index: svgscripts/datatypes/text_connection_mark.py =================================================================== --- svgscripts/datatypes/text_connection_mark.py (revision 98) +++ svgscripts/datatypes/text_connection_mark.py (revision 99) @@ -1,92 +1,101 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a text connection mark ("Anschlusszeichen"). """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import sys -from .footnotes import extract_footnotes_as_strings +from .footnotes import extract_footnotes from .reference import Reference from .special_word import SpecialWord +from .transkriptionField import TranskriptionField class TextConnectionMark(SpecialWord): """ This class represents a text connection mark. """ XML_TAG = 'text-connection-mark' XML_SUB_TAG = Reference.XML_TAG SPECIAL_CHAR_LIST = [ '*', 'W' ] + FOOTNOTE_CONTAINS = [ 'Anschlußzeichen', 'Hinzufügungszeichen' ] def __init__(self, id=0, line_number=-1, text='*', transkription_positions=[], faksimile_positions=[], text_source=None): super(TextConnectionMark, self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.text_source = text_source def add_content(self, node): """Adds content to TextConnectionMark. """ self.text_source = Reference.create_cls(node=node) def attach_word_to_tree(self, target_tree): """Attaches TextConnectionMark to tree target_tree. """ node = super(TextConnectionMark,self).attach_word_to_tree(target_tree) if self.text_source is not None: self.text_source.attach_object_to_tree(node) - @staticmethod - def find_content_in_footnotes(list_of_text_connection_marks, transkription_field, svg_tree, title='', page_number=''): + @classmethod + def find_content_in_footnotes(cls, page, transkription_field=None, svg_tree=None, title='', page_number='', footnotes=None, skip_after=-1.0): """Find content for the TextConnectionMark. 
""" - footnotes = extract_footnotes_as_strings(transkription_field=transkription_field, svg_tree=svg_tree, contains_string='Anschlußzeichen') - for text_connection_mark in list_of_text_connection_marks: - relevant_footnotes = [ footnote_string for footnote_string in footnotes if footnote_string.strip().startswith(str(text_connection_mark.line_number)+ ':') ] + if footnotes is None: + if svg_tree is None: + svg_tree = ET.parse(page.source) + if transkription_field is None: + transkription_field = TranskriptionField(page.source) + footnotes = extract_footnotes(page, transkription_field=transkription_field, svg_tree=svg_tree, contains_strings=cls.FOOTNOTE_CONTAINS, skip_after=skip_after) + else: + footnotes = [ footnote for footnote in footnotes if True in [ contains_string in footnote.content for contains_string in cls.FOOTNOTE_CONTAINS ] ] + for text_connection_mark in page.text_connection_marks: + relevant_footnotes = [ footnote.content for footnote in footnotes if footnote.content.strip().startswith(str(text_connection_mark.line_number)+ ':') ] if len(relevant_footnotes) > 0: footnote_string = relevant_footnotes[0].strip() line_number = int(footnote_string.split(':')[0]) is_uncertain = footnote_string.endswith('?') - reference_string = footnote_string.replace('?', '').split('zu')[1].strip() + reference_string = footnote_string.replace('?', '').split('zu ')[1].strip() text_connection_mark.text_source = Reference.create_cls(is_uncertain=is_uncertain,\ - reference_string=reference_string, title=title, page_number=page_number) + reference_string=reference_string, title=page.title, page_number=page.number) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = super(TextConnectionMark,cls).get_semantic_dictionary() dictionary['properties'].update(cls.create_semantic_property_dictionary('text_source', Reference,\ cardinality=1, name='textConnectionMarkHasTextSource', label='text connection mark has a text source')) return cls.return_dictionary_after_updating_super_classes(dictionary) @classmethod def get_special_char_list(cls): """Returns a list of the chars that define this special word. """ return cls.SPECIAL_CHAR_LIST Index: svgscripts/datatypes/super_page.py =================================================================== --- svgscripts/datatypes/super_page.py (revision 98) +++ svgscripts/datatypes/super_page.py (revision 99) @@ -1,293 +1,294 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a super page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile, basename, dirname from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import sys import warnings from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .mark_foreign_hands import MarkForeignHands from .text_connection_mark import TextConnectionMark from .text_field import TextField from .writing_process import WritingProcess class SuperPage: """ This super class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. """ FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition' FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile' ADD2Y = 7 PAGE_RECTO = 'recto' PAGE_VERSO = 'verso' STATUS_MERGED_OK = 'faksimile merged' STATUS_POSTMERGED_OK = 'words processed' UNITTESTING = False XML_TAG = 'page' def __init__(self, xml_file, title=None, page_number='', orientation='North', multipage_index=-1, page_type=PAGE_VERSO, should_xml_file_exist=False): self.properties_dictionary = {\ 'faksimile_image': (FaksimileImage.XML_TAG, None, FaksimileImage),\ 'faksimile_svgFile': ('data-source/@file', None, str),\ 'multipage_index': ('page/@multipage-index', multipage_index, int),\ 'marginals_source': ('page/@marginals-source', None, str),\ 'number': ('page/@number', str(page_number), str),\ 'orientation': ('page/@orientation', orientation, str),\ 'page_type': ('page/@pageType', page_type, str),\ 'pdfFile': ('pdf/@file', None, str),\ 'source': ('page/@source', None, str),\ 'svg_file': ('svg/@file', None, str),\ 'svg_image': (SVGImage.XML_TAG, None, SVGImage),\ 'text_field': (FaksimileImage.XML_TAG + '/' + TextField.XML_TAG, None, TextField),\ 'title': ('page/@title', title, str),\ } + self.bak_file = None self.online_properties = [] self.line_numbers = [] self.lines = [] self.mark_foreign_hands = [] self.page_tree = None self.sonderzeichen_list = [] self.style_dict = {} self.text_connection_marks = [] self.word_deletion_paths = [] self.word_insertion_marks = [] self.words = [] self.writing_processes = [] self.xml_file = xml_file if not self.is_page_source_xml_file(): msg = f'ERROR: xml_source_file {self.xml_file} is not of type "{self.FILE_TYPE_SVG_WORD_POSITION}"' raise Exception(msg) self._init_tree(should_xml_file_exist=should_xml_file_exist) def add_style(self, sonderzeichen_list=[], letterspacing_list=[], style_dict={}, style_node=None): """Adds a list of classes that are sonderzeichen and a style dictionary to page. 
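Usage sketch (editor's illustration with hypothetical class names):
    page.add_style(sonderzeichen_list=['st22'], style_dict={'st22': {'font-family': 'Sonderzeichen105'}})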
""" self.sonderzeichen_list = sonderzeichen_list self.letterspacing_list = letterspacing_list self.style_dict = style_dict if style_node is not None: self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') } self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ] self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('letterspacing-list')) ] elif bool(self.style_dict): style_node = ET.SubElement(self.page_tree.getroot(), 'style') if len(self.sonderzeichen_list) > 0: style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list)) if len(self.letterspacing_list) > 0: style_node.set('letterspacing-list', ' '.join(self.letterspacing_list)) for key in self.style_dict.keys(): self.style_dict[key]['name'] = key ET.SubElement(style_node, 'class', attrib=self.style_dict[key]) fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value } fontsizes = sorted(fontsize_dict.values(), reverse=True) # create a mapping between fontsizes and word stages self.fontsizekey2stage_mapping = {} for fontsize_key, value in fontsize_dict.items(): if value >= fontsizes[0]-1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION }) elif value <= fontsizes[-1]+1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION }) else: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION }) def get_biggest_fontSize4styles(self, style_set={}): """Returns biggest font size from style_dict for a set of style class names. [:returns:] (float) biggest font size OR 1 if style_dict is empty """ if bool(self.style_dict): sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True) return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1 else: return 1 def get_line_number(self, y): """Returns line number id for element at y. [:return:] (int) line number id or -1 """ if len(self.line_numbers) > 0: result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ] return result_list[0] if len(result_list) > 0 else -1 else: return -1 def init_all_properties(self, overwrite=False): """Initialize all properties. """ for property_key in self.properties_dictionary.keys(): if property_key not in self.online_properties: self.init_property(property_key, overwrite=overwrite) def init_property(self, property_key, value=None, overwrite=False): """Initialize all properties. 
Args: property_key: key of property in self.__dict__ value: new value to set to property overwrite: whether or not to update values from xml_file (default: read only) """ if value is None: if property_key not in self.online_properties: xpath, value, cls = self.properties_dictionary.get(property_key) if len(self.page_tree.xpath('//' + xpath)) > 0: value = self.page_tree.xpath('//' + xpath)[0] if value is not None: if cls.__module__ == 'builtins': self.update_tree(value, xpath) self.__dict__.update({property_key: cls(value)}) else: value = cls(node=value)\ if type(value) != cls\ else value self.__dict__.update({property_key: value}) self.__dict__.get(property_key).attach_object_to_tree(self.page_tree) else: self.__dict__.update({property_key: value}) self.online_properties.append(property_key) elif overwrite or property_key not in self.online_properties: xpath, default_value, cls = self.properties_dictionary.get(property_key) if cls.__module__ == 'builtins': self.__dict__.update({property_key: cls(value)}) self.update_tree(value, xpath) else: self.__dict__.update({property_key: value}) self.__dict__.get(property_key).attach_object_to_tree(self.page_tree) self.online_properties.append(property_key) def is_locked(self): """Return true if page is locked. """ return len(self.page_tree.xpath('//metadata/lock')) > 0 def is_page_source_xml_file(self, source_tree=None): """Return true if xml_file is of type FILE_TYPE_SVG_WORD_POSITION. """ if not isfile(self.xml_file): return True if source_tree is None: source_tree = ET.parse(self.xml_file) return source_tree.getroot().find('metadata/type').text == self.FILE_TYPE_SVG_WORD_POSITION def lock(self, reference_file, message=''): """Lock tree such that ids of words etc. correspond to ids in reference_file, optionally add a message that will be shown. """ if not self.is_locked(): metadata = self.page_tree.xpath('./metadata')[0]\ if len(self.page_tree.xpath('./metadata')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'metadata') lock = ET.SubElement(metadata, 'lock') ET.SubElement(lock, 'reference-file').text = reference_file if message != '': ET.SubElement(lock, 'message').text = message def unlock(self): """Unlock tree by removing the lock node (and any lock message) from the metadata. """ if self.is_locked(): lock = self.page_tree.xpath('//metadata/lock')[0] lock.getparent().remove(lock) def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]): """Update word ids and attach them to page.page_tree. 
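Usage sketch (editor's illustration; the function argument may be a single callable or a list of callables):
    page.update_and_attach_words2tree(update_function_on_word=lambda word: word.simplify_transkription_positions())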
""" if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_property_dictionary(self, property_key, default_value): """Update properties_dictionary. """ content = self.properties_dictionary.get(property_key) if content is not None: self.properties_dictionary.update({property_key: (content[0], default_value, content[2])}) else: msg = f'ERROR: properties_dictionary does not contain a key {property_key}!' raise Exception(msg) def update_tree(self, value, xpath): """Update tree. """ node_name = dirname(xpath) node = self.page_tree.xpath('//' + node_name)[0]\ if len(self.page_tree.xpath('//' + node_name)) > 0\ else ET.SubElement(self.page_tree.getroot(), node_name) node.set(basename(xpath).replace('@', ''), str(value)) def _init_tree(self, should_xml_file_exist=False): """Initialize page_tree from xml_file if it exists. """ if isfile(self.xml_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(self.xml_file, parser) elif not should_xml_file_exist: self.page_tree = ET.ElementTree(ET.Element('page')) self.page_tree.docinfo.URL = self.xml_file else: msg = f'ERROR: xml_source_file {self.xml_file} does not exist!' raise FileNotFoundError(msg) Index: svgscripts/datatypes/reference.py =================================================================== --- svgscripts/datatypes/reference.py (revision 98) +++ svgscripts/datatypes/reference.py (revision 99) @@ -1,167 +1,173 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a text reference. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import re import sys from .attachable_object import AttachableObject sys.path.append('py2ttl') from class_spec import SemanticClass +NON_INT = re.compile(r'\D+') class Reference(AttachableObject,SemanticClass): """ This class represents a text reference. Args: id (int): object id first_line (int) first line of reference last_line (int) last line of reference is_uncertain (bool) whether reference is uncertain title (str) title of reference page_number (str) page_number of reference tag (str) xml tag """ XML_TAG = 'reference' intKeys = [ 'first_line', 'last_line'] boolKeys = [ 'is_uncertain' ] stringKeys = [ 'title', 'page_number', 'word_reference' ] def __init__(self, node=None, id=0, first_line=-1, last_line=-1, is_uncertain=False, title=None, page_number=None, word_reference=None, tag=XML_TAG): self.intKeys = [] self.intKeys += Reference.intKeys self.intKeys.append('id') self.stringKeys = [] self.stringKeys += Reference.stringKeys self.boolKeys = [] self.boolKeys += Reference.boolKeys self.id = id self.first_line = first_line self.last_line = last_line self.is_uncertain = is_uncertain self.title = title self.page_number = page_number self.word_reference = word_reference self.tag = tag def attach_object_to_tree(self, target_tree): """Attach object to tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree, self.tag) for key in self.boolKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(self.__dict__[key]).lower()) for key in self.intKeys: if self.__dict__[key] is not None and self.__dict__[key] > -1: obj_node.set(key.replace('_','-'), str(self.__dict__[key])) for key in self.stringKeys: if self.__dict__[key] is not None and self.__dict__[key] != '': obj_node.set(key.replace('_','-'), str(self.__dict__[key])) @classmethod def create_cls_from_node(cls, node): """Creates a Reference from a (lxml.etree.Element) node. :return: (datatypes.reference) Reference """ instance = cls() for key in instance.boolKeys: xml_key = key.replace('_', '-') if bool(node.get(xml_key)): instance.__dict__[key] = node.get(xml_key) == 'true' for key in instance.intKeys: xml_key = key.replace('_', '-') if bool(node.get(xml_key)): instance.__dict__[key] = int(node.get(xml_key)) for key in instance.stringKeys: xml_key = key.replace('_', '-') if bool(node.get(xml_key)): instance.__dict__[key] = node.get(xml_key) return instance @classmethod def create_cls(cls, node=None, id=0, is_uncertain=False, reference_string='', title='', page_number=''): """Creates a Reference from a (lxml.etree.Element) node or a reference_string. 
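Usage sketch (editor's illustration, hypothetical reference strings):
    Reference.create_cls(reference_string='128,4-7')  # -> page_number '128', first_line 4, last_line 7
    Reference.create_cls(reference_string='4-7')      # -> first_line 4, last_line 7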
:return: (datatypes.reference) Reference """ if node is not None: return cls.create_cls_from_node(node) else: first_line = -1 last_line = -1 word_reference = None if re.match(r'[0-9]+([a-z]+)*,[0-9]+(-[0-9]+)*', reference_string): page_number = reference_string.split(',')[0] line_numbers = reference_string.split(',')[1].split('-') - first_line = int(line_numbers[0]) - last_line = int(line_numbers[1]) if len(line_numbers) > 1 else -1 + first_line = _safe_get_int(line_numbers[0]) + last_line = _safe_get_int(line_numbers[1]) if len(line_numbers) > 1 else -1 else: if ',' not in reference_string: if re.match(r'\D+.*', reference_string): word_reference = reference_string.strip() else: line_numbers = reference_string.split('-') - first_line = int(line_numbers[0]) - last_line = int(line_numbers[1]) if len(line_numbers) > 1 else -1 + first_line = _safe_get_int(line_numbers[0]) + last_line = _safe_get_int(line_numbers[1]) if len(line_numbers) > 1 else -1 else: if ' ' not in reference_string: raise Exception('String "{}" is not a valid reference_string'.format(reference_string)) title = reference_string.split(' ')[0] return cls.create_cls(id=id, is_uncertain=is_uncertain, reference_string=reference_string[len(title)+1:],\ title=title, page_number=page_number) return cls(id=id, is_uncertain=is_uncertain, first_line=first_line, last_line=last_line,\ title=title, page_number=page_number, word_reference=word_reference) + @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {} properties.update({'first_line': { 'class': int, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality', 'name': 'firstLineOfReference',\ 'label': 'first line of reference'}}) properties.update({'last_line': { 'class': int, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality', 'name': 'lastLineOfReference',\ 'label': 'last line of reference'}}) properties.update({'word_reference': { 'class': str, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality', 'name': 'wordReference',\ 'label': 'refers to word on same line'}}) properties.update({'is_uncertain': { 'class': bool, 'cardinality': 0, 'name': 'IsUncertain', 'label': 'whether something is uncertain'}}) properties.update(cls.create_semantic_property_dictionary('title', str, cardinality=0)) properties.update(cls.create_semantic_property_dictionary('page_number', str, cardinality=0)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) +def _safe_get_int(line_reference) -> int: + """Return the line reference as an int, stripping any non-digit characters (e.g. a trailing letter). + """ + return int(NON_INT.sub('', line_reference)) Index: svgscripts/datatypes/style.py =================================================================== --- svgscripts/datatypes/style.py (revision 98) +++ svgscripts/datatypes/style.py (revision 99) @@ -1,205 +1,206 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent the style of a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy from lxml import etree as ET import re import sys from .color import Color sys.path.append('py2ttl') from class_spec import SemanticClass class Style(SemanticClass): """ This class represents the style of a word. Args: manuscript: a ArchivalManuscriptUnity """ NIETSCHES_FONTS = { 'german': 'deutsche Schreibschrift', 'latin': 'lateinische Schreibschrift' } COLOR_KEYS = [ 'black', 'red', 'blue', 'green', 'grey' ] RELEVANT_STYLE_KEYS = [ 'font-family', 'fill', 'stroke' ] ADDITIONAL_STYLE_KEYS = [ 'font-size' ] PERCENTS = [ '80%', '70%' ] WRITING_INSTRUMENTS = { (COLOR_KEYS[0], False): 'schwarze Tinte',\ (COLOR_KEYS[0], True): 'Bleistift',\ (COLOR_KEYS[4], True): 'Bleistift',\ (COLOR_KEYS[4], False): 'Bleistift',\ (COLOR_KEYS[1], False): 'braune Tinte',\ (COLOR_KEYS[1], True): 'Rotstift',\ (COLOR_KEYS[2], False): 'violette Tinte',\ (COLOR_KEYS[2], True): 'Blaustift',\ (COLOR_KEYS[3], False): '„Tinte der letzten Korrektur“',\ (COLOR_KEYS[3], True): '„Tinte der letzten Korrektur“'} def __init__(self, manuscript=None, writing_process_id=-1, extended_styles=False, deletion_color=None): self.color = Color.create_cls(manuscript=manuscript) self.css_styles = [] self.css_string = None self.deletion_color = deletion_color self.is_german = True self.font = self.NIETSCHES_FONTS['german'] self.font_family = 'Weidemann-Book' self.font_size = '' self.manuscript = manuscript self.relevant_key_map = {} relevant_style_keys = self.RELEVANT_STYLE_KEYS + self.ADDITIONAL_STYLE_KEYS\ if extended_styles else self.RELEVANT_STYLE_KEYS for key in relevant_style_keys: if not key.startswith('font'): self.relevant_key_map.update({key: self.set_color}) elif key == 'font-family': self.relevant_key_map.update({key: self.set_font}) elif key == 'font-size': self.relevant_key_map.update({key: self.set_size}) self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, False)] self.writing_process_id = writing_process_id def create_a_copy_wo_writing_process_id(self): new_self = copy.deepcopy(self) new_self.writing_process_id = -1 return new_self def create_a_copy(self, reduce_writing_process_id=False): writing_process_id = self.writing_process_id\ if not reduce_writing_process_id\ else self.writing_process_id-1 copy = Style(manuscript=self.manuscript, writing_process_id=writing_process_id) copy.color = self.color copy.font_family = self.font_family copy.process_style_classes() if copy.manuscript is not None: copy.manuscript.update_styles(copy) return copy def create_css_styles(self): """Create css styles. 
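Editor's illustration (hypothetical hex values): for a deleted word in a bold font at writing stage 1, css_string becomes roughly 'text-decoration:line-through;text-decoration-color:#0075B2;-webkit-text-decoration-color:#0075B2;font-weight:bold;font-size:80%;color:#000000;'.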
""" if self.deletion_color is not None: self.css_styles.append('text-decoration:line-through;') self.css_styles.append(f'text-decoration-color:{self.deletion_color.hex_color};') self.css_styles.append(f'-webkit-text-decoration-color:{self.deletion_color.hex_color};') if self.font_family.endswith('Bold'): self.css_styles.append(f'font-weight:bold;') #if self.font_size != '': # self.css_styles.append(f'font-size:{self.font_size};') if self.writing_process_id > 0: self.css_styles.append(f'font-size:{self.PERCENTS[self.writing_process_id-1]};') self.css_styles.append(f'color:{self.color.hex_color};') self.css_string = ''.join(self.css_styles) @classmethod def create_cls(cls, page, style_string, manuscript=None, create_css=False, deletion_color=None, writing_process_id=-1): """Creates a Style from a style_string. :return: (datatypes.style) Style """ style = cls(manuscript=manuscript, extended_styles=create_css, deletion_color=deletion_color, writing_process_id=writing_process_id) style_dict = { key: key_dict for key, key_dict in page.style_dict.items()\ if any(relevant_key in key_dict.keys() for relevant_key in style.relevant_key_map.keys()) } - for style_key in style_string.split(' '): - if style_key in style_dict.keys(): - dictionary = style_dict[style_key] - for key, set_function in [ (key, func) for key, func in style.relevant_key_map.items() if key in dictionary.keys() ]: - if callable(set_function): - set_function(dictionary[key]) + if style_string is not None: + for style_key in style_string.split(' '): + if style_key in style_dict.keys(): + dictionary = style_dict[style_key] + for key, set_function in [ (key, func) for key, func in style.relevant_key_map.items() if key in dictionary.keys() ]: + if callable(set_function): + set_function(dictionary[key]) style.process_style_classes() if create_css: style.create_css_styles() return style @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ properties = {} properties.update(cls.create_semantic_property_dictionary('font', str, cardinality=1,\ name='styleHasFont', label='style has font', comment='Connects a style with the kind of font Nietzsche used in writing.')) properties.update(cls.create_semantic_property_dictionary('writing_instrument', str, cardinality=1,\ name='styleHasWritingInstrument', label='style has writing instrument', comment='Connects a style with the description of a writing instrument.')) properties.update(cls.create_semantic_property_dictionary('color', Color, cardinality=1,\ name='styleHasColor', label='style has color', comment='Connects a style with a color.')) #properties.update(cls.create_semantic_property_dictionary('css_styles', str,\ properties.update(cls.create_semantic_property_dictionary('css_string', str,\ subPropertyOf=cls.STOFF_STYLE_HAS_CSS_URL_STRING,\ name='styleHasCSS', label='style has css', comment='Connects a style with CSS style.')) dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties } return cls.return_dictionary_after_updating_super_classes(dictionary) def process_style_classes(self): """Infere writing instrument from font-family and color. 
""" if self.font_family.startswith('NewsGothic'): self.is_german = False self.font = self.NIETSCHES_FONTS['latin'] if self.color.name in self.COLOR_KEYS: self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, self.font_family.endswith('Bold'))] def set_color(self, hex_color: str): if hex_color != 'none': self.color = Color.create_cls(hex_color=hex_color, manuscript=self.manuscript) def set_font(self, font_family: str): self.font_family = font_family def set_size(self, font_size: str): self.font_size = font_size @classmethod def remove_irrelevant_style_keys(cls, style_string, page, extended_styles=False) -> str: """Return a style_string without irrelevant style keys. """ relevant_style_keys = cls.RELEVANT_STYLE_KEYS + cls.ADDITIONAL_STYLE_KEYS\ if extended_styles else cls.RELEVANT_STYLE_KEYS return ' '.join(sorted( style_key for style_key in style_string.split(' ')\ if len(\ [ key for key in page.style_dict[style_key].keys()\ if key in relevant_style_keys ]\ ) > 0 )) def __eq__(self, other): """Returns true if self is qualitatively identical to other. Reason: For qualities, the idea of numerical identity is silly. """ if other is None: return False return self.color == other.color\ and self.font_family == other.font_family\ and self.writing_process_id == other.writing_process_id\ and self.css_styles == other.css_styles\ and self.font_size == other.font_size def __hash__(self): """Return a hash value for self. """ return hash((self.color.__hash__, self.font_family, self.writing_process_id)) Index: svgscripts/datatypes/word.py =================================================================== --- svgscripts/datatypes/word.py (revision 98) +++ svgscripts/datatypes/word.py (revision 99) @@ -1,832 +1,862 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy import inspect from lxml import etree as ET from operator import attrgetter import re import string import sys import warnings from .box import Box from .editor_comment import EditorComment from .matrix import Matrix from .path import Path from .simple_word import SimpleWord from .style import Style from .word_deletion_path import WordDeletionPath from .word_position import WordPosition from .transkription_position import TranskriptionPosition from .writing_process import WritingProcess SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation) def execute_function_on_parts(word_parts, func_name): """Execute function on parts and add those parts instead of original word to word_parts. 
:return: new word_parts, output from func """ copy_parts = word_parts[:] for word in word_parts: output = eval('word.{0}()'.format(func_name)) if len(word.word_parts) > 0: for part_word in word.word_parts: copy_parts.insert(copy_parts.index(word), part_word) copy_parts.remove(word) word.word_parts = [] return copy_parts, output def update_transkription_position_ids(word): """Update transkription_position' ids according to index. """ word_part_ids = [ wp.id for wp in word.word_parts ] if len(word_part_ids) != len(set(word_part_ids)): for id, wp in enumerate(word.word_parts): wp.id = id for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))): transkription_position.id = index transkription_position.has_box = None transkription_position.deleted = False class Word(SimpleWord): """ This class represents a word. """ COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ] APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' } DATA = 'debug-data' RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText'] XML_TAG = 'word' XML_EARLIER_VERSION = 'earlier-version' XML_OVERWRITES = 'overwrites' XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\ 'isDeletionOfWord': 'deletesEarlierPart',\ 'isExtensionOfWord': 'extendsEarlierVersion',\ 'isTransformationOfWord': 'transformsEarlierPart' } def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None): super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.corrections = [] self.deleted = deleted self.deletion_paths = [] self.debug_container = {} self.debug_msg = None self.earlier_version = earlier_version self.edited_text = None self.editor_comment = None self.isClarificationOfWord = None self.isDeletionOfWord = None self.isExtensionOfWord = None self.isTransformationOfWord = None if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0: self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ]) self.overwrites_word = None self.styles = styles\ if styles is not None\ else [] self.verified = None self.writing_process_id = writing_process_id self.writing_processes = [] self.word_insertion_mark = None self.word_box = None self.word_parts = word_parts if word_parts is not None else [] self.word_part_objs = word_part_objs if word_part_objs is not None else [] def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Add a word deletion path to word. """ if len(self.word_parts) > 0: for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) elif self.deleted and len(self.transkription_positions) > 0: word_path = Path.create_path_from_transkription_position(self.transkription_positions[0],\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) self.deletion_paths = [ deletion_path for deletion_path in deletion_paths\ if do_paths_intersect_saveMode(deletion_path, word_path) ] def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. 
""" word_node = super(Word,self).attach_word_to_tree(target_tree) if self.deleted is not None: word_node.set('deleted', str(self.deleted).lower()) if self.verified is not None: word_node.set('verified', str(self.verified).lower()) if self.edited_text is not None: word_node.set('edited-text', self.edited_text) if self.editor_comment is not None: self.editor_comment.attach_object_to_tree(word_node) if self.writing_process_id > -1: word_node.set('writing-process-id', str(self.writing_process_id)) for index, word_part in enumerate(self.word_parts): word_part.id = index word_part.attach_word_to_tree(word_node) if self.earlier_version is not None: earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION) self.earlier_version.attach_word_to_tree(earlier_node) if self.overwrites_word is not None\ and len(self.overwrites_word.transkription_positions) > 0: overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES) self.overwrites_word.attach_word_to_tree(overwrite_node) if self.word_box is not None: self.word_box.attach_object_to_tree(word_node) if len(self.corrections) > 0: word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ]))) for key in self.XML_CORRECTION_DICT.keys(): if self.__dict__[key] is not None: word_node.set(self.XML_CORRECTION_DICT[key], 'true') return word_node def belongs_to_multiple_writing_processes(self, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. """ if len(self.word_parts) > 0 and include_parts: return len(set(word.writing_process_id for word in self.word_parts)) > 1 return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1 def set_parent_word_writing_process_id(self): """Set writing_process_id for parent word. """ ids = set(word.transkription_positions[0].style for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None) if len(ids) > 1: self.writing_process_id = max([style.writing_process_id for style in ids]) if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\ for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\ > 1: self.writing_process_id += 1 @classmethod def create_cls(cls, word_node): """Creates a word from a (lxml.Element) node. 
[:return:] Word """ cls = super(Word,cls).create_cls(word_node) cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 cls.split_strings = None cls.join_string = word_node.get('join') if bool(word_node.get('split')): cls.split_strings = word_node.get('split').split(' ') if ''.join(cls.split_strings) != cls.text: error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ format(word_node.getroottree().docinfo.URL, str(cls.id))\ + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ + 'Text attribute: "{0}".\n'.format(cls.text) raise Exception(error_msg) cls.verified = word_node.get('verified') == 'true'\ if bool(word_node.get('verified')) else None cls.deleted = word_node.get('deleted') == 'true'\ if bool(word_node.get('deleted')) else None cls.edited_text = word_node.get('edited-text') cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\ if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ] if bool(word_node.get('corrections')): for index in [ int(i) for i in word_node.get('corrections').split(' ') ]: if index < len(cls.word_parts): cls.corrections.append(cls.word_parts[index]) cls.earlier_version = None if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0: cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0] for key_value in cls.XML_CORRECTION_DICT.values(): if word_node.get(key_value) == 'true': cls.__dict__[key_value] = True if cls.earlier_version is not None: for word_part in cls.word_parts: for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\ and len(cls.word_parts) <= len(cls.earlier_version.word_parts): try: word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id] except Exception: msg = f'{cls.id} {cls.text}: {word_part.id}' raise Exception(msg) for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls.earlier_version for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\ if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\ else None cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\ if len(word_node.xpath('./' + Box.XML_TAG)) > 0\ else None return cls + @classmethod + def join_words(cls, list_of_words): + """Creates a word from a list of words. 
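+ Usage sketch (editor's illustration, hypothetical words): joining parts with texts 'Auf-' and 'gabe' yields text 'Auf-gabe' and edited_text 'Aufgabe':
+ new_word = Word.join_words([word_a, word_b])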
+ + [:return:] Word + """ + if len(list_of_words) > 1: + deleted = True in [ word.deleted for word in list_of_words ]\ + and len(set([ word.deleted for word in list_of_words ])) == 1 + line_number = list_of_words[0].line_number\ + if len(set([ word.line_number for word in list_of_words ])) == 1\ + else -1 + for word in list_of_words: + if len(word.word_parts) > 0: + index = list_of_words.index(word) + list_of_words.remove(word) + for part_word in reversed(word.word_parts): + list_of_words.insert(index, part_word) + new_word = cls(id=list_of_words[0].id, text=''.join([word.text for word in list_of_words]),\ + line_number=line_number, deleted=deleted, word_parts=list_of_words) + if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]: + change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0] + new_word.edited_text = new_word.text.replace(change_text, change_text[:-1]) + for id, word in enumerate(new_word.word_parts): word.id = id + return new_word + if len(list_of_words) > 0: + return list_of_words[0] + else: + return None + def create_earlier_version(self, root_word=None, id=0): """Create an earlier version of word. """ if root_word is None: root_word = self root_word.set_parent_word_writing_process_id() word_parts = [] non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\ if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ] non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts) if non_single_punctuation_word_parts_length > 0\ and len([ word_part for word_part in non_single_punctuation_word_parts\ if word_part.deleted ])\ == non_single_punctuation_word_parts_length: self.deleted = True for word_part in non_single_punctuation_word_parts: word_part.deleted = False for id, word_part in enumerate(self.word_parts): earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id) if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) elif word_part.overwrites_word is not None\ and (len(word_part.transkription_positions) > 0\ and word_part.overwrites_word.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style\ != word_part.overwrites_word.transkription_positions[0].style): word_part.overwrites_word.id = word_part.id word_parts.append(word_part.overwrites_word) word_part.isTransformationOfWord = word_part.overwrites_word #print(f'transform: {self.text}') if word_part not in self.corrections: self.corrections.append(word_part) elif root_word.writing_process_id > -1\ and (len(word_part.transkription_positions) > 0\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style.writing_process_id\ == root_word.writing_process_id): word_part.extendsEarlierVersion = True #print('extends') if word_part not in self.corrections: self.corrections.append(word_part) else: if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) else: #print(f'default: {self.text}') word_parts.append(earlierWordPart) text = ''.join([ word.text for word in word_parts ])\ if len(word_parts) > 0\ else self.text if len(word_parts) == 1: self.transkription_positions += 
word_parts[0].transkription_positions self.faksimile_positions += word_parts[0].faksimile_positions word_parts = [] new_transkription_positions = copy.deepcopy(self.transkription_positions) if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None: writing_process_id = self.transkription_positions[0].style.writing_process_id for new_tp in new_transkription_positions: new_tp.style.writing_process_id = writing_process_id return Word(id=id, text=text, transkription_positions=new_transkription_positions,\ faksimile_positions=self.faksimile_positions, line_number=self.line_number,\ word_parts=word_parts) def create_correction_history(self, page=None, box_style=None): """Create correction history. """ if self.word_box is not None: manuscript = self.transkription_positions[0].style.manuscript\ if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None\ else None style = Style() if box_style is not None: style = box_style if page is not None: style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript) for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]: style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions) for transkription_position in transkription_positions: transkription_position.style = style self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\ line_number=self.line_number) for word_part in self.word_parts: word_part.create_correction_history(page=page, box_style=box_style) if len(self.word_parts) > 0: earlier_version = self.create_earlier_version() extending_words = self._get_parts_with_property_key('extendsEarlierVersion') if len(extending_words) > 0: for word in extending_words: word.isExtensionOfWord = earlier_version if self.has_mixed_status('deleted', include_parts=True): self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ]) if len(self.corrections) > 0: self.earlier_version = earlier_version @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. 
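Usage sketch (editor's illustration; word_part_objs as extracted from the svg source):
    word = Word.CREATE_WORD(page=page, word_part_objs=word_part_objs, endX=endX, matrix=matrix, debug_msg='from svg')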
[:return:] Word """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number text = word_node.get('text') deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true' transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ] word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\ if len(word_node.findall('.//' + Word.DATA)) > 0\ else [ item.attrib for item in word_node.findall('.//part')] return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions, word_part_objs=word_part_objs) elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file WIDTH = 5 TOPCORRECTION = 2.0 FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize height = height x = round(float(word_part_objs[0]['x']), 3) if(page is not None and bool(page.style_dict)): HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' ')) biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set) height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3) TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size if endSign is not None and '%' in endSign: lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\ for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\ if bool(page.style_dict[key].get('font-size'))] lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1 endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR elif endSign is not None and '%' in endSign: endX = float(endX) + WIDTH bottom = round(float(word_part_objs[0]['y']), 3) y = round(bottom - height + TOPCORRECTION, 3) width = round(float(endX) - x, 3) transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ] text = ''.join([ dict['text'] for dict in word_part_objs]) line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs) word.debug_msg = debug_msg return word else: error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. 
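Usage sketch (editor's illustration):
    dictionary = Word.get_semantic_dictionary()  # entries live under cls.CLASS_KEY and cls.PROPERTIES_KEY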
""" dictionary = super(Word,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\ cardinality=1, cardinality_restriction='minCardinality',\ name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\ name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\ name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\ comment='Word has been deleted by the author using a deletion path.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comment', EditorComment,\ name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\ name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\ name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\ name='isClarificationOfWord', label='word is a clarification of word',\ comment='The author has used this part of the word in order to clarify the appearance of that word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\ name='isDeletionOfWord', label='word is a deletion of word',\ comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\ name='isExtensionOfWord', label='word is a extension of word',\ comment='The author has used this part of a word in order to extend an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\ name='isTransformationOfWord', label='word is a transformation of word',\ comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\ name='overwritesWord', label='word overwrites word',\ comment='The author has used this word in order to overwrite that word.')) # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING, # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class. 
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\ name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\ subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)) super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\ name='isCorrectionOfWord', label='word is a correction of word',\ comment='The author has used this word in order to correct that word.') for key in cls.XML_CORRECTION_DICT.keys(): correction_dict = dictionary[cls.PROPERTIES_KEY].get(key) correction_dict.update(super_property_dictionary) dictionary[cls.PROPERTIES_KEY].update({key: correction_dict}) return cls.return_dictionary_after_updating_super_classes(dictionary) def has_mixed_status(self, property_key, include_parts=False, concerns_word=True): """Returns true if transkription_positions have mixed status concerning the property_key in their __dict__. """ if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions): return False if len(self.word_parts) > 0 and include_parts: if concerns_word: if False in set(property_key in word.__dict__.keys() for word in self.word_parts): return False return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1 else: return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\ if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1 return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1 def init_word(self, page): """Initialize word with objects from page. """ super(Word,self).init_word(page) if self.writing_process_id > -1: self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ] writing_processes = self.writing_processes for word_part in self.word_parts: word_part.init_word(page) self.lines += word_part.lines self.writing_processes += word_part.writing_processes self.lines = [ line for line in set(self.lines) ] self.writing_processes = [ wp for wp in set(self.writing_processes)] if self.overwrites_word is not None: self.overwrites_word.init_word(page) if self.earlier_version is not None: if self.earlier_version.writing_process_id == -1: self.earlier_version.writing_process_id = self.writing_process_id-1 if self.earlier_version.line_number == -1: self.earlier_version.line_number = self.line_number self.earlier_version.init_word(page) def join(self, other_word, append_at_end_of_new_word=True): """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. """ if append_at_end_of_new_word: self.text = self.text + other_word.text for position in other_word.transkription_positions: position.id = str(len(self.transkription_positions)) self.transkription_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 self.simplify_transkription_positions() def partition_according_to_deletion(self): """Partition a word according to its transkription_positions' deletion status ->split word and add partial words as its parts. 
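Usage sketch (editor's illustration): for a word whose transkription positions have mixed 'deleted' status, the call splits it in place and fills word.word_parts:
    word.partition_according_to_deletion()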
""" if self.has_mixed_status('deleted'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.deleted != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.deleted if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] self.line_number = -1 self.deleted = False elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion') elif not self.deleted\ and len(self.transkription_positions) > 0\ and self.transkription_positions[0].deleted: self.deleted = True def partition_according_to_writing_process_id(self): """Partition a word according to its transkription_positions' writing_process_ids ->split word and add partial words as its parts. """ if self.belongs_to_multiple_writing_processes(): last_writing_process_id = -1 transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.writing_process_id != last_writing_process_id\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_writing_process_id = transkription_position.writing_process_id if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id') if self.belongs_to_multiple_writing_processes(include_parts=True): self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0] elif len(self.transkription_positions) > 0: self.writing_process_id = self.transkription_positions[0].writing_process_id def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False): """Determines whether word is over a word box. 
""" word_over_box = None if len(self.word_parts) > 0: for word in self.word_parts: current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None)) if current_word is not None and current_word.word_box is not None: word_over_box = current_word else: new_tp_dict = {} for index, transkription_position in enumerate(self.transkription_positions): if previous_word_has_box and index == 0: if len(transkription_position.positional_word_parts) > 0: transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2 #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}') else: transkription_position.left += 1 word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) containing_boxes = [ box_path for box_path in box_paths\ if word_path.is_partially_contained_by(box_path)\ or box_path.do_paths_intersect(word_path) ] if len(containing_boxes) > 0: if previous_word_has_box: print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}') self._set_box_to_transkription_position(containing_boxes[0], word_path,\ transkription_position, new_tp_dict, tr_xmin) box_paths.remove(containing_boxes[0]) for replace_tp in new_tp_dict.keys(): for tp in new_tp_dict.get(replace_tp): self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp) self.transkription_positions.remove(replace_tp) word_over_box = self._get_partial_word_over_box() update_transkription_position_ids(self) return word_over_box def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def set_writing_process_id_to_transkription_positions(self, page): """Determines the writing process id of the transkription_positions. """ for transkription_position in self.transkription_positions: if len(transkription_position.positional_word_parts) > 0: for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): if font_key in page.fontsizekey2stage_mapping.keys(): transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) def simplify_transkription_positions(self): """Merge transkription_positions if possible. """ index = len(self.transkription_positions)-1 while index > 0\ and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]: current_tp = self.transkription_positions[index] index -= 1 previous_tp = self.transkription_positions[index] if previous_tp.is_mergebale_with(current_tp): positional_word_parts = previous_tp.positional_word_parts positional_word_parts += current_tp.positional_word_parts transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id) if len(transkription_positions) == 1: transkription_positions[0].writing_process_id = previous_tp.writing_process_id\ if previous_tp.writing_process_id != -1\ else current_tp.writing_process_id self.transkription_positions.pop(index+1) self.transkription_positions[index] = transkription_positions[0] #print(self.text, len(self.transkription_positions)) def split(self, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. 
""" previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split') next_text = ''.join([ pwp.text for pwp in next_pwps ]) nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) return previousWord, currentWord, nextWord def split_according_to_status(self, status, splits_are_parts=False): """Split a word according to its transkription_positions' text. 
:return: a list of new word.Word """ new_words = [] if self.has_mixed_status(status): last_status = None transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.__dict__[status] != last_status\ and len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.__dict__[status] if len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) if splits_are_parts: self.word_parts += new_words if len(self.word_parts) > 0: self.transkription_positions = [] return new_words def undo_partitioning(self): """Undo partitioning. """ if len(self.word_parts) > 0: for word_part in self.word_parts: word_part.undo_partitioning() if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]): self.transkription_positions += word_part.transkription_positions self.earlier_version = None self.edited_text = None self.word_box = None self.word_parts = [] self.corrections = [] self.earlier_versions = [] self.box_paths = [] def _create_new_word(self, transkription_positions, status, new_id=0): """Create a new word from self and transkription_positions. """ newWord = Word(id=new_id, transkription_positions=transkription_positions) for key in self.COPY_PROPERTY_KEY: if key != status and key in self.__dict__.keys(): newWord.__dict__[key] = self.__dict__[key] if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys(): newWord.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(transkription_positions[0].__dict__[status]) else: newWord.__dict__[status] = transkription_positions[0].__dict__[status] return newWord def _get_parts_with_property_key(self, property_key): """Return a list of word_parts with property == property_key. """ word_parts = [] for word_part in self.word_parts: if property_key in word_part.__dict__.keys(): word_parts.append(word_part) else: word_parts += word_part._get_parts_with_property_key(property_key) return word_parts def _get_partial_word_over_box(self): """Partition a word according to its transkription_positions' has_box ->split word and add partial words as its parts. 
:return: word over box or self """ word_over_box = None if self.has_mixed_status('has_box'): transkription_positions = [] last_word_box = None for transkription_position in self.transkription_positions: if transkription_position.has_box != last_word_box\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_word_box is not None: word_over_box = newWord word_over_box.word_box = last_word_box transkription_positions = [] transkription_positions.append(transkription_position) last_word_box = transkription_position.has_box if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_word_box is not None: word_over_box = newWord word_over_box.word_box = last_word_box self.transkription_positions = [] elif len(self.word_parts) > 0: #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box') for word_part in self.word_parts: if word_over_box is None: word_over_box = word_part._get_partial_word_over_box() else: break elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1: word_over_box = self word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box return word_over_box def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin): """Set box_path to transkription_position that is contained by box_path. Create new transkription_positions by splitting old ones if necessaryand add them to new_transkription_positions_dictionary. """ if box_path.contains_path(word_path): transkription_position.has_box = box_path elif box_path.contains_start_of_path(word_path): split_position = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[0].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path elif box_path.contains_end_of_path(word_path): split_position = box_path.path.bbox()[0] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path else: # box_path in the middle of word_pathz split_position1 = box_path.path.bbox()[0] - tr_xmin split_position2 = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position1, split_position2) if len(new_tps) >= 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path def do_paths_intersect_saveMode(mypath1, mypath2): """Returns true if paths intersect, false if not or if there was an exception. 
""" try: return mypath1.path.intersect(mypath2.path, justonemode=True)\ or mypath1.is_partially_contained_by(mypath2) except AssertionError: return False Index: svgscripts/datatypes/footnotes.py =================================================================== --- svgscripts/datatypes/footnotes.py (revision 98) +++ svgscripts/datatypes/footnotes.py (revision 99) @@ -1,336 +1,347 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract footnotes from a svg file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import lxml.etree as ET import warnings __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from .atypical_writing import AtypicalWriting from .clarification import Clarification from .editor_correction import EditorCorrection from .line_continuation import LineContinuation from .matrix import Matrix from .standoff_tag import StandoffTag from .text import Text from .transkriptionField import TranskriptionField from .uncertain_decipherment import UncertainDecipherment UNITTESTING = False DEBUG = False class FootnoteColumns: """This class represents footnote columns. """ REFERENCE_PATTERN = re.compile('.*(\d+-)*[0-9]+:') EXTENDED_REFERENCE_PATTERN = re.compile('.*(\d+(-|/))*[0-9]+:') REFERENCE_GROUP = re.compile('(.*\D)((\d+-)*[0-9]+:)') EXCEPTION = re.compile('((\d+/)+[0-9]+:)') - def __init__(self, nsmap, nodes, bottom_values, style_dict, debug=False): + def __init__(self, nsmap, nodes, bottom_values, style_dict, debug=False, skip_after=-1.0): self.bottom_values = bottom_values self.footnote_columns = [] self.footnote_keys = {} self.index = 0 self.nodes = nodes self.nsmap = nsmap + self.skip_after = skip_after self.style_dict = style_dict self.debug = debug self._init_columns() def _init_columns(self): """Initialize footnote column positions by creating lists in self.footnote_columns and adding the positions a keys to self.footnote_keys while the index of self.footnote_columns are their values. 
""" first_line_fn_nodes = sorted([ item for item in self.nodes\ - if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == round(self.bottom_values[0], 1) ],\ + if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == round(self.bottom_values[0], 1)\ + and Matrix(transform_matrix_string=item.get('transform')).getX() > self.skip_after],\ key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX()) current_nodes = [] for node in first_line_fn_nodes: matrix = Matrix(transform_matrix_string=node.get('transform')) if len(node.getchildren()) > 0: for tspan in node.findall('tspan', self.nsmap): x = matrix.add2X(float(tspan.get('x'))) current_nodes.append({ 'x': x, 'text': tspan.text }) elif node.text is not None: x = matrix.getX() current_nodes.append({ 'x': x, 'text': node.text }) if re.match(self.EXTENDED_REFERENCE_PATTERN,\ ''.join([ item.get('text') for item in current_nodes])): current_nodes = self._remove_unused_texts(current_nodes) self.footnote_columns.append([]) self.footnote_keys.update({ round(current_nodes[0].get('x')): len(self.footnote_columns)-1 }) current_nodes = [] if len(self.footnote_keys) == 0: raise Exception(f'ERROR: there are no footnote_keys') def _remove_unused_texts(self, nodes): """Remove tspan that contain text that is not a line reference. """ threshold = 100 node_text = ''.join([ item.get('text') for item in nodes]) match = re.match(self.REFERENCE_GROUP, node_text) if match is not None and match.group(1) is not None\ and not re.match(self.EXCEPTION, node_text): unused_text = '' index = 0 for item in nodes: unused_text += item.get('text') if match.group(1).startswith(unused_text): index += 1 else: break if len(nodes) > index+1: counter = 0 has_gap = False for item in nodes[index:]: if len(nodes) > index+counter+1\ and nodes[index+counter+1].get('x')-nodes[index+counter].get('x') > threshold: index += counter+1 has_gap = True break counter += 1 if has_gap: return nodes[index+1:] return nodes[index:] return nodes def append(self, footnote): """Append footnote to a column """ self.footnote_columns[self.index].append(footnote) @classmethod - def create_cls(cls, style_dict=None, page=None, transkription_field=None, svg_tree=None, svg_file=None, marginals_on_extra_page=False): + def create_cls(cls, style_dict=None, page=None, transkription_field=None, svg_tree=None, svg_file=None, marginals_on_extra_page=False, skip_after=-1.0): """Returns all footnotes as a list of Text. 
""" if page is not None and page.source is not None and svg_file is None: svg_file = page.source\ if page.marginals_source is None\ else page.marginals_source if transkription_field is None and svg_file is not None: multipage_index = -1\ if page is None\ else page.multipage_index transkription_field = TranskriptionField(svg_file, multipage_index=multipage_index) if svg_tree is None and svg_file is not None: svg_tree = ET.parse(svg_file) if style_dict is None and page is not None: style_dict = StandoffTag.create_relevant_style_dictionary(page) if page is not None and page.marginals_source is not None: marginals_on_extra_page = True svg_tree = ET.parse(page.marginals_source) nodes_in_footnote_area = cls.EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field, marginals_on_extra_page=marginals_on_extra_page) - bottom_values = cls.GET_BOTTOM_VALUES(nodes_in_footnote_area) + bottom_values = cls.GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area) if len(bottom_values) == 0: return None else: - return cls(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, style_dict) + return cls(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, style_dict, skip_after=skip_after) - def extract_footnotes(self, contains_string='') -> list: + def extract_footnotes(self, contains_string='', contains_strings=None) -> list: """Returns all footnotes as a list of Text. """ left_value = -1 - for bottom_value in set([ round(bottom_value, 1) for bottom_value in self.bottom_values]): - nodes_on_line = [ item for item in self.nodes if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == bottom_value ] - nodes_on_line = sorted(nodes_on_line, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX()) + for bottom_value in self.bottom_values: + nodes_on_line = sorted([ item for item in self.nodes\ + if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == bottom_value\ + and Matrix(transform_matrix_string=item.get('transform')).getX() > self.skip_after\ + ],\ + key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX()) footnote = None matrix = None for node in nodes_on_line: matrix = Matrix(transform_matrix_string=node.get('transform')) footnote, left_value = self._process_content_and_markup(node, footnote, matrix) if footnote is not None: self.append(footnote) footnotes = self.toList() + if contains_strings is not None: + footnotes = [ footnote for footnote in footnotes if True in [ contains_string in footnote.content for contains_string in contains_strings] ] if contains_string != '': footnotes = [ footnote for footnote in footnotes if contains_string in footnote.content ] return footnotes def get_index(self, left_value) -> int: """Return index of column for left value. """ index = -1 if round(left_value) in self.footnote_keys.keys(): index = self.footnote_keys[round(left_value)] else: for key, value in self.footnote_keys.items(): if abs(key - round(left_value)) < 2: index = value break return index def register_index(self, left_value): """Register index for next column to be used. """ index = self.get_index(left_value) if index > -1: self.index = index else: error_value = round(left_value) msg = f'Left value not part of columns: {error_value} -> {self.footnote_keys}' raise Exception(msg) def toList(self): """Return footnotes as a list of Text. 
""" footnotes = [] for footnote_list in self.footnote_columns: for footnote in footnote_list: if re.match(self.REFERENCE_PATTERN, footnote.content): footnotes.append(footnote) - else: + elif len(footnotes) > 0: footnotes[-1].join(footnote) + else: + print([ footnote.content for footnote in self.footnote_columns[1]]) + print(self.footnote_keys) + raise Exception(f'List of footnotes empty and footnote "{footnote.content}" does not match {self.REFERENCE_PATTERN.pattern}!') return footnotes def _process_content_and_markup(self, node, footnote, matrix): """Process content and markup of node. [:return:] (footnote: Text, left_value: float) """ startIndex = 0 next_text = node.text left_value = matrix.getX() items = [ item for item in node.findall('tspan', self.nsmap)] if len(items) > 0: next_text = ''.join([ item.text for item in items]) left_value = matrix.add2X(float(items[0].get('x'))) elif bool(node.get('x')): left_value = matrix.add2X(float(node.get('x'))) if footnote != None and\ ((re.match(r'.*[0-9]+:', next_text)\ and re.match(r'.*[0-9]+:', footnote.content)\ and not re.match(r'.*\d-', footnote.content))\ or (self.get_index(left_value) > -1\ and self.get_index(left_value) != self.index)): if DEBUG and re.match(r'.*[0-9]+:', next_text)\ and not re.match(r'.*[0-9]+:', footnote.content): print(footnote, next_text) self.append(footnote) footnote = None if len(items) > 0: for item in items: footnote, left_value = self._process_content_and_markup(item, footnote, matrix) else: if footnote is None: footnote = Text(content=next_text) try: self.register_index(left_value) except Exception: print(self.footnote_columns) raise Exception(f'{footnote}') else: startIndex = footnote.append(next_text) if bool(node.get('class')): standoff_markups = StandoffTag.create_cls(startIndex, len(footnote.content), node.get('class'), style_dict=self.style_dict) if len(standoff_markups) > 0: if len(footnote.standoff_markups) > 0: standoff_markups = footnote.standoff_markups[-1].join_list(standoff_markups) if len(standoff_markups) > 0: footnote.standoff_markups += standoff_markups return footnote, left_value @staticmethod def EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field=None, marginals_on_extra_page=False) ->list: """Return a list of nodes that are in footnote area. """ if transkription_field is None and svg_tree is not None: transkription_field = TranskriptionField(svg_tree.docinfo.URL) nodes_in_footnote_area = [ item for item in filter(lambda node: Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, transkription_field,\ marginals_on_extra_page=marginals_on_extra_page),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] for node in nodes_in_footnote_area: if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, marginals_on_extra_page=marginals_on_extra_page): for child in node.getchildren(): if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, x=float(child.get('x')), marginals_on_extra_page=marginals_on_extra_page): node.remove(child) return nodes_in_footnote_area @staticmethod - def GET_BOTTOM_VALUES(nodes_in_footnote_area) ->list: - """Return sorted list of bottom values. + def GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area) ->list: + """Return sorted list of unique bottom values. 
""" - return sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ]) + return sorted([ bottom_value for bottom_value in set(round(Matrix(transform_matrix_string=item.get('transform')).getY(),1) for item in nodes_in_footnote_area) ]) def extract_footnotes_as_strings(transkription_field=None, svg_tree=None, svg_file=None, contains_string='', marginals_extra=False): """Returns all footnotes as a list of strings. """ if transkription_field is None and svg_file is not None: transkription_field = TranskriptionField(svg_file) if svg_tree is None and svg_file is not None: svg_tree = ET.parse(svg_file) footnotes = [] nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ]) for bottom_value in bottom_values: nodes_on_line = [ item for item in nodes_in_footnote_area if Matrix(transform_matrix_string=item.get('transform')).getY() == bottom_value ] nodes_on_line = sorted(nodes_on_line, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX()) footnote_string = '' for node in nodes_on_line: if len(node.getchildren()) == 0: if footnote_string != '' and re.match(r'.*[0-9]+:', node.text): footnotes.append(footnote_string) footnote_string = node.text else: footnote_string += node.text else: next_string = ''.join([ item.text for item in node.findall('tspan', svg_tree.getroot().nsmap)]) if footnote_string != '' and re.match(r'.*[0-9]+:', next_string): footnotes.append(footnote_string) footnote_string = next_string else: footnote_string += next_string footnotes.append(footnote_string) if contains_string != '': footnotes = [ footnote_string for footnote_string in footnotes if contains_string in footnote_string ] return footnotes -def extract_footnotes(page, transkription_field=None, svg_tree=None, svg_file=None, contains_string='') ->list: +def extract_footnotes(page, transkription_field=None, svg_tree=None, svg_file=None, contains_string='', contains_strings=None, skip_after=-1.0) ->list: """Returns all footnotes as a list of Text. 
""" marginals_on_extra_page = False if page.marginals_source is not None: marginals_on_extra_page = True svg_tree = ET.parse(page.marginals_source) if transkription_field is None: transkription_field = TranskriptionField(page.source) footnote_columns = FootnoteColumns.create_cls(page=page, transkription_field=transkription_field,\ - svg_tree=svg_tree, svg_file=svg_file, marginals_on_extra_page=marginals_on_extra_page) + svg_tree=svg_tree, svg_file=svg_file, marginals_on_extra_page=marginals_on_extra_page, skip_after=skip_after) if footnote_columns is None: return [] - return footnote_columns.extract_footnotes(contains_string=contains_string) + return footnote_columns.extract_footnotes(contains_string=contains_string, contains_strings=contains_strings) if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/datatypes/simple_word.py =================================================================== --- svgscripts/datatypes/simple_word.py (revision 98) +++ svgscripts/datatypes/simple_word.py (revision 99) @@ -1,124 +1,125 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent a simple word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc from lxml import etree as ET import sys from .line import Line from .faksimile_position import FaksimilePosition from .transkription_position import TranskriptionPosition from .word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass class SimpleWord(SemanticClass, metaclass=abc.ABCMeta): """ This class represents a simple word. """ XML_TAG = 'simple-word' XML_SUB_TAG = 'content' def __init__(self, id=0, line_number=-1, line=None, text='', deleted=False, transkription_positions=None, faksimile_positions=None): self.id = id self.text = text self.line_number = line_number self.lines = [] if line is not None: self.lines.append(line) self.transkription_positions = transkription_positions if transkription_positions is not None else [] self.faksimile_positions = faksimile_positions if faksimile_positions is not None else [] def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. 
""" if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() if len(target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)) > 0: word_node = target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)[0] word_node.getparent().remove(word_node) word_node = ET.SubElement(target_tree, self.XML_TAG, attrib={'id': str(self.id)}) word_node.set('text', self.text) if self.line_number > -1: word_node.set('line-number', str(self.line_number)) - for transkription_position in self.transkription_positions: + for id, transkription_position in enumerate(self.transkription_positions): + transkription_position.id = id transkription_position.attach_object_to_tree(word_node) for faksimile_position in self.faksimile_positions: faksimile_position.attach_object_to_tree(word_node) return word_node @classmethod def create_cls(cls, word_node): """Creates a cls from a (lxml.Element) node. [:return:] cls """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else -1 text = word_node.get('text') - transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('./' + WordPosition.TRANSKRIPTION) ] + transkription_positions = [ TranskriptionPosition(id=id, node=node) for id, node in enumerate(word_node.findall('./' + WordPosition.TRANSKRIPTION)) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('./' + WordPosition.FAKSIMILE) ] return cls(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) else: error_msg = 'word_node has not been defined' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'lines': {cls.CLASS_KEY: Line,\ cls.CARDINALITY: 1,\ cls.CARDINALITY_RESTRICTION: 'minCardinality',\ cls.PROPERTY_NAME: 'wordBelongsToLine',\ cls.PROPERTY_LABEL: 'word belongs to a line',\ cls.PROPERTY_COMMENT: 'Relating a word to a line.'}} properties.update(cls.create_semantic_property_dictionary('transkription_positions', TranskriptionPosition,\ name='hasTranskriptionPosition', cardinality=1, cardinality_restriction='minCardinality')) properties.update(cls.create_semantic_property_dictionary('faksimile_positions', FaksimilePosition,\ name='hasFaksimilePosition')) #, cardinality=1, cardinality_restriction='minCardinality')) properties.update(cls.create_semantic_property_dictionary('text', str, cardinality=1,\ subPropertyOf=cls.HOMOTYPIC_HAS_TEXT_URL_STRING)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def init_word(self, page): """Initialize word with objects from page. 
""" for transkription_position in self.transkription_positions: transkription_position.svg_image = page.svg_image self.faksimile_positions = FaksimilePosition.create_list_of_cls(self.faksimile_positions, page.faksimile_image, page.text_field) if self.line_number > -1: self.lines += [ line for line in page.lines if line.id == self.line_number ] Index: svgscripts/process_words_post_merging.py =================================================================== --- svgscripts/process_words_post_merging.py (revision 98) +++ svgscripts/process_words_post_merging.py (revision 99) @@ -1,475 +1,483 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path as PathlibPath from progress.bar import Bar import re import shutil import string from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.path import Path as SVGPath from svgpathtools.path import Line import sys import tempfile import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.box import Box from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.path import Path from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids -from util import back_up +from extract_line_continuation import extract_line_continuations +from util import back_up, process_warnings4status from process_files import update_svgposfile_status from process_footnotes import categorize_footnotes sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False DEBUG_WORD = None MERGED_DIR = 'merged' +WARNING_FOOTNOTES_ERROR = 'footnotes not processed' +WARNING_LINE_CONTINUATION = 'line continuation fail' def categorize_paths(page, transkription_field=None): """Categorize all paths that are part of the transkription field. :return: a dictionary containig a list for each category of path. 
""" if page.source is not None and isfile(page.source): MAX_HEIGHT_LINES = 1 max_line = sorted(\ [line_number.bottom-line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0],\ reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17 tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0 tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0 paths, attributes = svg_to_paths.svg2paths(page.source) allpaths_on_tf = [] allpaths_outside_tf = [] attributes_outside_tf = [] if transkription_field is None: transkription_field = TranskriptionField(page.source) for index, path in enumerate(paths): attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and path.bbox()[0] >= tr_xmin\ and path.bbox()[1] <= transkription_field.xmax: allpaths_on_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) elif len(path) > 0\ and path != transkription_field.path: allpaths_outside_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) #print(index, allpaths_outside_tf[len(allpaths_outside_tf)-1].path, path) attributes_outside_tf.append(attribute) path_dict = { 'text_area_deletion_paths': [],\ 'deletion_or_underline_paths': [],\ 'box_paths': [],\ 'dots_paths': [],\ 'word_connector_paths': [],\ 'uncategorized_paths': [] } for mypath in allpaths_on_tf: xmin, xmax, ymin, ymax = mypath.path.bbox() start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin) if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1: path_dict.get('dots_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed(): path_dict.get('box_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed(): path_dict.get('word_connector_paths').append(mypath) elif abs(ymax-ymin) < MAX_HEIGHT_LINES: mypath.start_line_number = start_line_number path_dict.get('deletion_or_underline_paths').append(mypath) elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin): # Check for "ladder", i.e. 
a path with 3 segments (seg0 is horizontal on line x, seg1 moves to line x+1, seg2 is horizontal on line x+1) if start_line_number + 1 == page.get_line_number(mypath.path.end.imag-tr_ymin)\ and len(mypath.path._segments) == 3\ and abs(mypath.path._segments[0].bbox()[3]-mypath.path._segments[0].bbox()[2]) < MAX_HEIGHT_LINES\ and abs(mypath.path._segments[2].bbox()[3]-mypath.path._segments[2].bbox()[2]) < MAX_HEIGHT_LINES: for index in 0, 2: new_path = Path(parent_path=mypath, path=SVGPath(mypath.path._segments[index])) new_path.start_line_number = page.get_line_number(new_path.path.start.imag-tr_ymin) path_dict.get('deletion_or_underline_paths').append(new_path) else: path_dict.get('text_area_deletion_paths').append(mypath) else: path_dict.get('uncategorized_paths').append(mypath) underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin) path_dict.update({'underline_path': underline_path}) path_dict['uncategorized_paths'] += process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\ paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line) return path_dict elif not UNITTESTING: error_msg = 'Svg source file {} does not exist!'.format(page.source)\ if page.source is not None else 'Page does not contain a source file!' raise FileNotFoundError(error_msg) return {} def copy_page_to_merged_directory(page, manuscript_file=None): """Copy page to directory that contains the first version of all svg_pos_files that have been merged with the faksimile position data. MERGED_DIR is a subfolder of svg_pos_files-directory. """ svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) target_dir = svg_pos_file.parent / MERGED_DIR if not target_dir.is_dir(): target_dir.mkdir() target_pos_file = target_dir / svg_pos_file.name save_page(page, str(svg_pos_file), target_svg_pos_file=str(target_pos_file), status=STATUS_MERGED_OK, manuscript_file=manuscript_file) def find_special_words(page, transkription_field=None): """Find special words, remove them from words, process their content. 
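
# categorize_paths above keys its triage on bounding-box height and on path
# topology (continuous/closed). A simplified decision table over precomputed
# features; the real code additionally detects dots, "ladder" paths and
# text-area deletions:
def categorize(height, is_continuous, is_closed, max_height_lines=1, max_line=17):
    if height < max_height_lines:
        return 'deletion_or_underline_paths'
    if max_height_lines < height < max_line and is_continuous and is_closed:
        return 'box_paths'
    if height > max_line and is_continuous and not is_closed:
        return 'word_connector_paths'
    return 'uncategorized_paths'

assert categorize(0.4, True, False) == 'deletion_or_underline_paths'
assert categorize(10, True, True) == 'box_paths'
assert categorize(25, True, False) == 'word_connector_paths'
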
""" if page.source is None or not isfile(page.source): raise FileNotFoundError('Page does not have a source!') if transkription_field is None: transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) special_char_list = MarkForeignHands.get_special_char_list() special_char_list += TextConnectionMark.get_special_char_list() single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ] if not UNITTESTING: bar = Bar('find special words', max=len(single_char_words)) for word in single_char_words: not bool(UNITTESTING) and bar.next() if word.text == MarkForeignHands.CLASS_MARK: id = len(page.mark_foreign_hands) page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id)) page.words.remove(word) elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\ or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\ and any(style in page.sonderzeichen_list for style\ in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))): id = len(page.text_connection_marks) page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id)) page.words.remove(word) not bool(UNITTESTING) and bar.finish() svg_tree = ET.parse(page.source) page.update_page_type(transkription_field=transkription_field) page.update_line_number_area(transkription_field, svg_tree=svg_tree) if page.marginals_source is not None: svg_tree = ET.parse(page.marginals_source) italic_classes = [ key for key in page.style_dict\ if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ] if len(page.mark_foreign_hands) > 0: MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\ SonderzeichenList=page.sonderzeichen_list) if len(page.text_connection_marks) > 0: - TextConnectionMark.find_content_in_footnotes(page.text_connection_marks, transkription_field, svg_tree,\ - title=page.title, page_number=page.number) + TextConnectionMark.find_content_in_footnotes(page, transkription_field, svg_tree) def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks all words that intersect with deletion paths as deleted and adds these paths to word_deletion_paths. [:return:] list of .path.Path that might be word_underline_paths """ if not UNITTESTING: bar = Bar('mark words that intersect with deletion paths', max=len(page.words)) for word in page.words: not bool(UNITTESTING) and bar.next() word = mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) for part_word in word.word_parts: part_word = mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) word.partition_according_to_deletion() not bool(UNITTESTING) and bar.finish() # return those paths in deletion_paths that are not in page.word_deletion_paths return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ] def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks word if it intersects with deletion paths as deleted and adds these paths to word_deletion_paths. 
[:return:] word """ word.deleted = False for transkription_position in word.transkription_positions: word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) intersecting_paths = [ deletion_path for deletion_path in deletion_paths\ if do_paths_intersect_saveMode(deletion_path, word_path) ] if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number: relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ] #print(word.line_number, word_path.path.bbox(), [ path.path.bbox() for path in relevant_paths]) if len(intersecting_paths) > 0: #print(f'{word.line_number}: {word.id}, {word.text}: {intersecting_paths}') transkription_position.deleted = True for deletion_path in intersecting_paths: if deletion_path.parent_path is not None: deletion_path = deletion_path.parent_path if deletion_path not in page.word_deletion_paths: deletion_path.tag = Path.WORD_DELETION_PATH_TAG deletion_path.attach_object_to_tree(page.page_tree) page.word_deletion_paths.append(deletion_path) return word def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None): """Process words after merging with faksimile word positions. """ if page is None and svg_pos_file is None: raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!') if page is None: page = Page(svg_pos_file) if page.source is None or not isfile(page.source): raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file)) if svg_pos_file is None: svg_pos_file = page.page_tree.docinfo.URL if new_words is not None: page.words = sorted(new_words, key=attrgetter('id')) for word_node in page.page_tree.xpath('.//word'): word_node.getparent().remove(word_node) manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\ if manuscript_file is not None\ else None copy_page_to_merged_directory(page, manuscript_file=manuscript_file) transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) update_faksimile_line_positions(page) - find_special_words(page, transkription_field=transkription_field) - #update_writing_process_ids(page) + status = STATUS_MERGED_OK page.update_styles(manuscript=manuscript, partition_according_to_styles=True) - #TODO: find_hyphenated_words(page) categorize_paths(page, transkription_field=transkription_field) - categorize_footnotes(page) - save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=STATUS_POSTMERGED_OK, manuscript_file=manuscript_file) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('default') + try: + find_special_words(page, transkription_field=transkription_field) + categorize_footnotes(page) + extract_line_continuations(page, warning_message=WARNING_LINE_CONTINUATION) + except Exception: + warnings.warn(WARNING_FOOTNOTES_ERROR) + status = process_warnings4status(w, [ WARNING_FOOTNOTES_ERROR, WARNING_LINE_CONTINUATION ], status, STATUS_POSTMERGED_OK) + save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list: """Process word boxes: partition words according to word boxes. 
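
# post_merging_processing_and_saving now records warnings and hands them to
# util.process_warnings4status, which decides whether the page keeps
# STATUS_MERGED_OK or advances to STATUS_POSTMERGED_OK. A sketch of the
# recording pattern; the status derivation itself lives in util.py and is not
# reproduced here:
import warnings

WARNING_LINE_CONTINUATION = 'line continuation fail'

def extraction_step():
    # stand-in for extract_line_continuations(page, warning_message=...)
    warnings.warn(f'{WARNING_LINE_CONTINUATION}: There is no line for 12.5')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('default')
    extraction_step()

messages = [ str(w.message) for w in caught ]
assert any(message.startswith(WARNING_LINE_CONTINUATION) for message in messages)
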
[:return:] a list of paths that are not boxes """ MAX_HEIGHT_LINES = 1 not_boxes = [] if not UNITTESTING: bar = Bar('process word boxes', max=len(page.words)) svg_tree = ET.parse(page.source) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } allpaths_on_margin_field = [] if paths is None or attributes is None: paths = [] raw_paths, attributes = svg_to_paths.svg2paths(page.source) for index, raw_path in enumerate(raw_paths): paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page)) for index, mypath in enumerate(paths): path = mypath.path xmin, xmax, ymin, ymax = path.bbox() attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\ or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\ and abs(ymax-ymin) < max_line: allpaths_on_margin_field.append(mypath)#Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) box_line_number_dict = {} for box_path in sorted(box_paths, key=lambda path: path.get_median_y()): line_number = page.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin)) if line_number > 0: if line_number not in box_line_number_dict.keys(): box_line_number_dict.update({ line_number: [ box_path ]}) else: box_line_number_dict.get(line_number).append(box_path) boxes = [] for line_number in box_line_number_dict.keys(): box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x()) margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\ if page.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\ key=lambda path: path.get_x()) threshold = 3 if line_number % 2 == 0 else 1.5 if len(margin_boxes_on_line) > 0: for box_path in box_paths_on_line: #print(line_number, box_path.path.d(), len(margin_boxes_on_line)) box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\ transkription_field=transkription_field, namespaces=namespaces, threshold=threshold) if box is not None: boxes.append(box) else: not_boxes += box_paths_on_line if len(boxes) > 0: for word in page.words: word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin) word.create_correction_history(page) if not bool(UNITTESTING): bar.next() elif word.earlier_version is not None: #print(f'{word.text} -> {word.earlier_version.text}') if word.earlier_version.earlier_version is not None: print(f'{word.earlier_version.earlier_version.text}') not bool(UNITTESTING) and bar.finish() return not_boxes def reset_page(page): """Reset all words that have word_parts in order to run the script a second time. 
""" svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) first_merge_version = svg_pos_file.parent / MERGED_DIR / svg_pos_file.name if first_merge_version.exists(): page = Page(str(first_merge_version)) else: word_with_wordparts = [ word for word in page.words if len(word.word_parts) > 0 ] word_with_wordparts += [ word for word in page.words if word.earlier_version is not None ] page_changed = False if len(word_with_wordparts) > 0: for word in word_with_wordparts: word.undo_partitioning() update_transkription_position_ids(word) page_changed = True no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if len(no_line_numbers) > 0: for word in no_line_numbers: if len(word.transkription_positions) > 0: word.line_number = page.get_line_number((word.transkription_positions[0].top+word.transkription_positions[0].bottom)/2) else: msg = f'Word {word.id} {word.text} has no transkription_position!' warnings.warn(msg) page_changed = True if page_changed: page.update_and_attach_words2tree() def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None): """Save page to target_file and update status of file. """ page.update_and_attach_words2tree() if not UNITTESTING: if target_svg_pos_file is None: target_svg_pos_file = svg_pos_file if status is not None: update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status) write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def update_faksimile_line_positions(page): """Update faksimile_positions of the lines """ num_lines = len(page.line_numbers) ymin = page.text_field.ymin\ if page.text_field is not None\ else 0.0 for line_number in page.line_numbers: if len([ word.faksimile_positions[0] for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) > 0: line_number.faksimile_inner_top = min([ word.faksimile_positions[0].top for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) line_number.faksimile_inner_bottom = max([ word.faksimile_positions[0].bottom for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) if line_number.id % 2 == 0: line_number.faksimile_outer_top = line_number.faksimile_inner_top - ymin line_number.faksimile_outer_bottom = line_number.faksimile_inner_bottom - ymin for index, line_number in enumerate(page.line_numbers): if line_number.faksimile_inner_bottom == 0.0\ or line_number.faksimile_inner_bottom < line_number.faksimile_inner_top: if index == 0 and num_lines > 1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].top elif index == num_lines-1 and page.text_field is not None: line_number.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3) elif index > 0 and index < num_lines-1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].faksimile_inner_top\ if page.line_numbers[index+1].faksimile_inner_top > page.line_numbers[index-1].faksimile_inner_bottom\ else page.line_numbers[index-1].faksimile_inner_bottom line_number.attach_object_to_tree(page.page_tree) def update_writing_process_ids(page): """Update the writing_process_ids of the words and split accordingly. 
""" for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to process words after they have been merged with faksimile data. svgscripts/process_words_post_merging.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help -i|--include-missing-line-number run script on files that contain words without line numbers -r|--rerun rerun script on a svg_pos_file that has already been processed :return: exit code (int) """ status_not_contain = STATUS_POSTMERGED_OK include_missing_line_number = False try: opts, args = getopt.getopt(argv, "hir", ["help", "include-missing-line-number", "rerun" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-i', '--include-missing-line-number'): include_missing_line_number = True elif opt in ('-r', '--rerun'): status_not_contain = '' if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain): reset_page(page) no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if not include_missing_line_number and len(no_line_numbers) > 0: not UNITTESTING and print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!') for word in no_line_numbers: not UNITTESTING and print(f'Word {word.id}: {word.text}') else: back_up(page, page.xml_file) not UNITTESTING and print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL) post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/extract_footnotes.py =================================================================== --- svgscripts/extract_footnotes.py (revision 98) +++ svgscripts/extract_footnotes.py (revision 99) @@ -1,90 +1,88 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract footnotes from a svg file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} import getopt import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import lxml.etree as ET __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.transkriptionField import TranskriptionField -from datatypes.footnotes import extract_footnotes, extract_footnotes_as_strings +from datatypes.footnotes import extract_footnotes def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract footnotes from a svg file. svgscripts/extract_footnotes.py [OPTIONS] a xml file containing information about the position of the svg words. OPTIONS: -h|--help: show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 svg_file = args[0] if svg_file.endswith('xml'): page = Page(svg_file) svg_file = page.source else: usage() return 2 - #footnotes = extract_footnotes_as_strings(svg_file=svg_file) - #print(footnotes) footnotes = extract_footnotes(page, svg_file=svg_file) for footnote in footnotes: print(footnote.content) for markup in footnote.standoff_markups: print(f'->{markup.markup}, start:{markup.startIndex}, end:{markup.endIndex}') return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/extract_line_continuation.py =================================================================== --- svgscripts/extract_line_continuation.py (revision 98) +++ svgscripts/extract_line_continuation.py (revision 99) @@ -1,218 +1,219 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract line continuations. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style import getopt import lxml.etree as ET import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import warnings __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from datatypes.box import text_node_is_inside_match_box, tspan_node_is_inside_match_box from datatypes.line import Line from datatypes.line_continuation import LineContinuation from datatypes.matrix import Matrix from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.reference import Reference from datatypes.transkriptionField import TranskriptionField from util import back_up sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT UNITTESTING = False DEBUG = False -def extract_line_continuations(page: Page, svg_file=None): +def extract_line_continuations(page: Page, svg_file=None, warning_message='WARNING'): """Extract line continuations. """ if svg_file is None: if page.source is None or not isfile(page.source): raise Exception('Function "extract_line_continuations" needs a page with a valid source or a svg_file!') svg_file = page.source + if not UNITTESTING: + print(Fore.CYAN + f'Extracting line continuations on {page.title}, {page.number} ...' + Style.RESET_ALL) svg_tree = ET.parse(svg_file) transkription_field = TranskriptionField(svg_file) page.update_line_number_area(transkription_field, svg_tree=svg_tree) for line in page.lines: line.editor_comments = [] namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } arrow_style_key = [ key for key, value in page.style_dict.items() if value.get('font-family') == 'Frutiger-Europeen'][0]\ if len([ key for key, value in page.style_dict.items() if value.get('font-family') == 'Frutiger-Europeen']) > 0\ else None if arrow_style_key is not None: for arrow in _extract_arrow_nodes(svg_tree, arrow_style_key, transkription_field, namespaces): matrix = Matrix(transform_matrix_string=arrow.get('transform'))\ if not arrow.tag.endswith('tspan')\ else Matrix(transform_matrix_string=arrow.getparent().get('transform')) line = _get_line_of_arrow(arrow, page, transkription_field) if line is not None: reference_counter = 0 reference = None while reference is None and reference_counter < 2: reference = _get_reference(svg_tree, arrow, matrix, transkription_field, namespaces, is_from_reference=(reference_counter==0)) reference_counter += 1 if reference is not None: line.editor_comments.append(LineContinuation(reference=reference, to_reference=(reference_counter>1))) else: to_reference = (matrix.getX() > transkription_field.xmax) line.editor_comments.append(LineContinuation(reference=Reference(), to_reference=to_reference)) else: y = round(matrix.getY() - transkription_field.ymin, 2) - warnings.warn(f'There is no line for {y}') + warnings.warn(f'{warning_message}: There is no line for {y}') for line in page.lines: line.attach_object_to_tree(page.page_tree) if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def _extract_arrow_nodes(svg_tree: ET.ElementTree, arrow_style_key: str, transkription_field=None, namespaces=None) ->list: """Extract arrow nodes from svg_tree. 
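
# _extract_arrow_nodes selects text and tspan nodes whose class attribute
# contains the arrow style key and whose text is ')'. A self-contained lxml
# sketch; the "st21" style key and the SVG snippet are made up for illustration:
import lxml.etree as ET

svg = ET.fromstring(
    '<svg xmlns="http://www.w3.org/2000/svg">'
    '<text class="st21" transform="matrix(1 0 0 1 20 45)">)</text>'
    '<text class="st3">no arrow</text>'
    '</svg>')
namespaces = { 'ns': 'http://www.w3.org/2000/svg' }
arrows = [ node for node in svg.xpath('//ns:text[contains(@class, "st21")]'
                                      '|//ns:tspan[contains(@class, "st21")]',
                                      namespaces=namespaces)
           if node.text == ')' ]
assert len(arrows) == 1
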
""" if transkription_field is None: transkription_field = TranskriptionField(svg_tree.docinfo.URL) if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } return [ arrow for arrow in svg_tree.xpath('//ns:text[contains(@class, "{0}")]'.format(arrow_style_key)\ + '|//ns:tspan[contains(@class, "{0}")]'.format(arrow_style_key),\ namespaces=namespaces)\ if arrow.text == ')' and node_is_on_marginalia(arrow, transkription_field) ] def _get_arrow_y(arrow: ET.Element, matrix=None) ->float: """Return y of arrow node. """ if matrix is None: matrix = Matrix(transform_matrix_string=arrow.get('transform'))\ if not arrow.tag.endswith('tspan')\ else Matrix(transform_matrix_string=arrow.getparent().get('transform')) if arrow.tag.endswith('tspan'): return matrix.add2Y(add_to_y=arrow.get('y')) else: return matrix.getY() def _get_line_of_arrow(arrow: ET.Element, page: Page, transkription_field: TranskriptionField, matrix=None) ->Line: """Return Line next to arrow. """ arrow_y = _get_arrow_y(arrow, matrix=matrix) line_number = page.get_line_number(round(arrow_y - transkription_field.ymin, 2) -.5) lines = [ line for line in page.lines if line.id == line_number ] if len(lines) > 0: return lines[0] return None def _get_reference(svg_tree: ET.ElementTree, arrow: ET.Element, arrow_matrix: Matrix, transkription_field: TranskriptionField, namespaces: dict, is_from_reference=True) ->Reference: """Return reference. """ reference = None arrow_left = arrow_matrix.add2X(add_to_x=arrow.get('x'))\ if arrow.tag.endswith('tspan')\ else arrow_matrix.getX() arrow_y = _get_arrow_y(arrow, matrix=arrow_matrix) xmin = 0\ if arrow_left < transkription_field.xmin\ else transkription_field.xmax + transkription_field.line_number_area_width xmax = arrow_left ymin = arrow_y -5 ymax = arrow_y +5 if not is_from_reference: xmin = xmax xmax = transkription_field.xmin - transkription_field.line_number_area_width\ if arrow_left < transkription_field.xmin\ else transkription_field.documentWidth + transkription_field.line_number_area_width text_nodes_on_arrow_line = sorted([ text_node for text_node in svg_tree.xpath('//ns:text', namespaces=namespaces)\ if text_node != arrow and text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax) ],\ key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX()) reference_string = '' for text_node in text_nodes_on_arrow_line: reference_string += ''.join([ child.text for child in text_node.getchildren()])\ if len(text_node.getchildren()) > 0\ else text_node.text if reference_string != '': reference = Reference.create_cls(reference_string=reference_string) return reference def node_is_on_marginalia(node: ET.Element, transkription_field: TranskriptionField) ->bool: """Return true if node is on marginalia. 
""" if node.tag.endswith('tspan'): return tspan_node_is_inside_match_box(node, 0, transkription_field.xmin, transkription_field.ymin, transkription_field.ymax)\ or tspan_node_is_inside_match_box(node, transkription_field.xmax, transkription_field.documentWidth, transkription_field.ymin, transkription_field.ymax) return text_node_is_inside_match_box(node, 0, transkription_field.xmin, transkription_field.ymin, transkription_field.ymax)\ or text_node_is_inside_match_box(node, transkription_field.xmax, transkription_field.documentWidth, transkription_field.ymin, transkription_field.ymax) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract the line continuations. svgscripts/extract_line_continuation.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK): if not UNITTESTING: - print(Fore.CYAN + f'Extracting line continuations on {page.title}, {page.number} ...' + Style.RESET_ALL) back_up(page, page.xml_file) extract_line_continuations(page) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/util.py =================================================================== --- svgscripts/util.py (revision 98) +++ svgscripts/util.py (revision 99) @@ -1,435 +1,509 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to copy a faksimile svg file with the option of highlighting some word boxes. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see <http://www.gnu.org/licenses/>. 1}}} from colorama import Fore, Style from datetime import datetime from functools import cmp_to_key import getopt import inspect import itertools import lxml.etree as ET import re import shutil import signal import string import subprocess from svgpathtools import svg_to_paths import sys import tempfile import os from os import listdir, sep, path, setpgrp, devnull, makedirs from os.path import basename, commonpath, dirname, exists, isfile, isdir, realpath, splitext import warnings import wget import xml.etree.ElementTree as XET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.faksimile import FaksimilePage, get_paths_inside_rect from datatypes.faksimile_image import FaksimileImage from datatypes.lineNumber import LineNumber from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.page import Page from datatypes.page_creator import PageCreator from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import update_transkription_position_ids from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT -from process_files import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False HIGHLIGHT_COLOR = 'red' OPACITY = '0.5' class ExternalViewer: """This class can be used to show files with external viewers. """ file_format_viewer_dict = { '.pdf': PDF_READER, '.svg': SVG_EDITOR } @classmethod def show_files(cls, single_file=None, list_of_files=[]): """Opens file(s) with corresponding external viewer(s). """ DEVNULL = None if type(single_file) == list: list_of_files = single_file elif single_file is not None: list_of_files.append(single_file) if len(list_of_files) > 1: DEVNULL = open(devnull, 'wb') process_list = [] list_of_files.reverse() while len(list_of_files) > 0: file2open = list_of_files.pop() viewer = cls.file_format_viewer_dict.get(splitext(file2open)[1]) if viewer is not None: if len(list_of_files) > 0: process_list.append(\ subprocess.Popen([viewer, file2open], stdout=DEVNULL, stderr=DEVNULL, preexec_fn=os.setsid)) else: subprocess.run([viewer, file2open]) for process in process_list: os.killpg(os.getpgid(process.pid), signal.SIGTERM) if DEVNULL is not None: DEVNULL.close() def back_up(page: Page, reference_file, bak_dir='./bak') -> str: """Back up an xml_source_file. :return: target_file_name """ date_string = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') makedirs(bak_dir, exist_ok=True) - target_file_name = bak_dir + sep + basename(page.page_tree.docinfo.URL) + '_' + date_string - write_pretty(xml_element_tree=page.page_tree, file_name=target_file_name,\ + page.bak_file = bak_dir + sep + basename(page.page_tree.docinfo.URL) + '_' + date_string + write_pretty(xml_element_tree=page.page_tree, file_name=page.bak_file,\ script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, reference_file),\ file_type=FILE_TYPE_SVG_WORD_POSITION) - return target_file_name + return page.bak_file + +def back_up_svg_file(svg_tree: ET.ElementTree, namespaces=None, bak_dir='./bak') -> str: + """Back up an svg file.
+ + :return: target_file_name + """ + if namespaces is None: + namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } + date_string = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') + makedirs(bak_dir, exist_ok=True) + bak_file = bak_dir + sep + date_string + '_' + basename(svg_tree.docinfo.URL) + copy_faksimile_svg_file(target_file=bak_file, faksimile_tree=svg_tree, namespaces=namespaces) + return bak_file def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, abs_image_path=None, local_image_path=None, namespaces=None): """Copy a faksimile_svg_file to target_file. """ if faksimile_source_file is None and faksimile_tree is not None: faksimile_source_file = faksimile_tree.docinfo.URL elif faksimile_source_file is None: raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file') if target_file is not None and target_directory is not None: target_file = target_directory + sep + target_file elif target_file is None and target_directory is not None: target_file = target_directory + sep + basename(faksimile_source_file) elif target_file is None: raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory') paths, attributes, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True) for key in [ key for key in svg_attributes.keys() if key.startswith('xmlns:') ]: try: XET.register_namespace(key.replace('xmlns:', ''), svg_attributes[key]) except ValueError: pass XET.register_namespace('', 'http://www.w3.org/2000/svg') if namespaces is None: namespaces = { 'ns': svg_attributes['xmlns'], 'xlink': svg_attributes['xmlns:xlink'],\ 'sodipodi': svg_attributes['xmlns:sodipodi'] } if faksimile_tree is not None: element = XET.fromstring(ET.tostring(faksimile_tree))\ if type(faksimile_tree) == ET._ElementTree\ else XET.fromstring(XET.tostring(faksimile_tree.getroot())) target_tree = XET.ElementTree(element) else: target_tree = XET.parse(faksimile_source_file) if (local_image_path is not None or abs_image_path is not None)\ and len(target_tree.findall('.//ns:image', namespaces=namespaces)) > 0: image_node = target_tree.findall('.//ns:image', namespaces=namespaces)[0] if local_image_path is not None: image_node.set('{%s}href' % namespaces['xlink'], local_image_path) if abs_image_path is not None: image_node.set('{%s}absref' % namespaces['sodipodi'], abs_image_path) target_tree.write(target_file) def copy_faksimile_update_image_location(faksimile_source_file=None, faksimile_tree=None, target_file=None, target_directory=None, overwrite=False): """Copy a faksimile_svg_file to target_file and update image location. 
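For orientation, a minimal sketch (not part of the patch) of how the two backup helpers above might be used; the file name is hypothetical, and revision 99 now also stores the backup path on the page as page.bak_file:

page = Page('xml/Mp_XV_page79a.xml')    # hypothetical xml source file
target = back_up(page, page.xml_file)   # writes ./bak/Mp_XV_page79a.xml_<timestamp>
assert target == page.bak_file          # the path is also recorded on the page
svg_tree = ET.parse(page.source)        # back up the svg file the page points to
bak_svg = back_up_svg_file(svg_tree)    # writes ./bak/<timestamp>_<svg file name>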
""" if faksimile_source_file is None and faksimile_tree is not None: faksimile_source_file = faksimile_tree.docinfo.URL elif faksimile_source_file is None: raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file') if target_file is not None and target_directory is not None: target_file = target_directory + sep + target_file elif target_file is None and target_directory is not None: target_file = target_directory + sep + basename(faksimile_source_file) elif target_directory is None and target_file is not None: target_directory = dirname(target_file) elif target_file is None: raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory') source_tree = ET.parse(faksimile_source_file) if faksimile_tree is None else faksimile_tree namespaces = { k if k is not None else 'ns': v for k, v in source_tree.getroot().nsmap.items() } image_nodes = source_tree.xpath('//ns:image', namespaces=namespaces) local_image_path = None abs_image_path = None user_abs_image_path = None if len(image_nodes) > 0: image = FaksimileImage.CREATE_IMAGE(image_nodes[0], source_file=faksimile_source_file) abs_image_path = image.local_path for user_name in USER_ROOT_LOCATION_DICT.keys(): if user_name in target_directory: user_abs_image_path = abs_image_path.replace(FAKSIMILE_LOCATION, USER_ROOT_LOCATION_DICT[user_name]).replace('//','/') break # if target_directory is subdir of FAKSIMILE_LOCATION if realpath(target_directory).startswith(realpath(FAKSIMILE_LOCATION)): common_path = commonpath([ realpath(target_directory), realpath(dirname(image.local_path)) ]) relative_directory = '/'.join(\ [ '..' for d in realpath(target_directory).replace(common_path + '/', '').split('/') ]) local_image_path = relative_directory + realpath(image.local_path).replace(common_path, '') if not isfile(target_directory + sep + local_image_path): local_image_path = None elif abs_image_path is not None: local_image_path = abs_image_path if abs_image_path is not None and not isfile(abs_image_path): wget.download(image.URL, out=dirname(abs_image_path)) if not isfile(target_file) or overwrite: abs_image_path = user_abs_image_path if user_abs_image_path is not None else abs_image_path copy_faksimile_svg_file(target_file=target_file, faksimile_source_file=faksimile_source_file,\ faksimile_tree=faksimile_tree, abs_image_path=abs_image_path,\ local_image_path=local_image_path, namespaces=namespaces) else: msg = 'File {0} not copied to directory {1}, it already contains a file {2}.'.format(faksimile_source_file, target_directory, target_file) warnings.warn(msg) def copy_xml_file_word_pos_only(xml_source_file, target_directory): """Copy word positions of a xml file to target directory. 
:return: (str) xml_target_file """ xml_target_file = target_directory + sep + basename(xml_source_file) source_page = Page(xml_source_file) target_page = PageCreator(xml_target_file, title=source_page.title, page_number=source_page.number, orientation=source_page.orientation) target_page.words = source_page.words target_page.update_and_attach_words2tree() write_pretty(xml_element_tree=target_page.page_tree, file_name=xml_target_file,\ script_name=__file__ + '({})'.format(inspect.currentframe().f_code.co_name), file_type=FILE_TYPE_SVG_WORD_POSITION) return xml_target_file -def create_highlighted_svg_file(faksimile_tree, node_ids, target_file=None, target_directory=None, local_image_path=None, namespaces={}, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY): +def create_highlighted_svg_file(faksimile_tree, node_ids, nodes_color_dict=None, target_file=None, target_directory=None, local_image_path=None, namespaces=None, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY): """Highlights the nodes of a faksimile_tree that are specified by the list of node_ids and writes the tree to a file. """ - if len(namespaces) == 0: + if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } for node in itertools.chain(*[\ faksimile_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces)\ for node_id in node_ids\ ]): node.set('fill', highlight_color) node.set('opacity', opacity) node.set('style', '') copy_faksimile_update_image_location(target_file=target_file, faksimile_tree=faksimile_tree, target_directory=target_directory) def get_empty_node_ids(faksimile_tree, x_min=0.0, x_max=0.0, y_min=0.0, y_max=0.0, text_field_id=None, faksimile_page=None, namespaces={}): """Returns a list of ids of rect and path nodes that do not have a title element. """ THRESHOLD_X = 10 if faksimile_page is not None: x_min = faksimile_page.text_field.xmin + faksimile_page.faksimile_image.x x_max = faksimile_page.text_field.xmax + faksimile_page.faksimile_image.x - THRESHOLD_X y_min = faksimile_page.text_field.ymin + faksimile_page.faksimile_image.y y_max = faksimile_page.text_field.ymax + faksimile_page.faksimile_image.y text_field_id = faksimile_page.text_field.id if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } empty_node_ids = [] nodes_without_title = faksimile_tree.xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\ x_min, x_max, y_min, y_max, text_field_id), namespaces=namespaces) nodes_without_title += get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]', x_min, x_max, y_min, y_max, text_field_id, namespaces=namespaces) for node_without_title in nodes_without_title: empty_node_ids.append(node_without_title.get('id')) return empty_node_ids def get_mismatching_ids(words, faksimile_positions): """ Return the list of mismatching words and the list of mismatching faksimile_positions as a 2-tuple. """ mismatching_words = [] mismatching_faksimile_positions = [] faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions) word_texts = [ word.text for word in words if word.text != '.'
] for word_text in set(word_texts): if word_text not in unique_faksimile_words: mismatching_words += [ word for word in words if word.text == word_text ] for faksimile_position_text in unique_faksimile_words: if faksimile_position_text not in set(word_texts): mismatching_faksimile_positions += [ faksimile_position for faksimile_position in faksimile_positions\ if faksimile_position.text == faksimile_position_text ] return mismatching_words, mismatching_faksimile_positions +def process_warnings4status(warnings, warning_messages, current_status, ok_status, status_prefix='') ->str: + """Process potential warnings and return actual status. + """ + if warnings is not None and len(warnings) > 0: + status = status_prefix + for warning_message in warning_messages: + if True in [ str(warn.message).startswith(warning_message) for warn in warnings ]: + status += f':{warning_message}:' + if status != status_prefix: + return status + return f'{current_status}:{ok_status}:' + else: + return f'{current_status}:{ok_status}:' + def record_changes(original_svg_file, changed_svg_file, node_ids, namespaces={}): """Copy changes made to changed_svg_file to original_svg_file. """ old_tree = ET.parse(original_svg_file) new_tree = ET.parse(changed_svg_file) if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() } for node_id in node_ids: new_titles = new_tree.xpath('//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=namespaces) old_nodes = old_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces) if len(new_titles) > 0 and len(old_nodes) > 0: if old_nodes[0].find('ns:title', namespaces=namespaces) is not None: old_nodes[0].find('ns:title', namespaces=namespaces).text = new_titles[0].text else: old_title_id_string = new_titles[0].get('id') old_title = ET.SubElement(old_nodes[0], 'title', attrib={ 'id': old_title_id_string }) old_title.text = new_titles[0].text elif len(old_nodes) > 0: for old_node in old_nodes: old_node.getparent().remove(old_node) copy_faksimile_svg_file(target_file=original_svg_file, faksimile_tree=old_tree) def record_changes_on_svg_file_to_page(xml_source_file, svg_file, word_ids=None): """Copy changes made to svg_file to xml_source_file. 
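As an aside, a minimal sketch (not part of the patch) of how the new process_warnings4status helper above might be driven, e.g. around the revised extract_line_continuations from extract_line_continuation.py; the warning label and status strings are hypothetical, and the stdlib module is aliased because the helper's first parameter is also named warnings:

import warnings as warnings_module

with warnings_module.catch_warnings(record=True) as caught_warnings:
    warnings_module.simplefilter('always')
    extract_line_continuations(page, warning_message='LINE_CONTINUATION')  # page: a loaded Page
# If a recorded warning starts with 'LINE_CONTINUATION', the helper returns
# ':LINE_CONTINUATION:' (prefixed with status_prefix); otherwise 'faksimile merged:OK:'.
status = process_warnings4status(caught_warnings, ['LINE_CONTINUATION'], 'faksimile merged', 'OK')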
:return: datatypes.page.Page """ svg_tree = ET.parse(svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } transkription_field = TranskriptionField(svg_file) page = Page(xml_source_file) words = [ word for word in page.words if word.id in word_ids ]\ if word_ids is not None else page.words new_page_words = [] for word in words: word_id = 'word_' + str(word.id) + '_' recorded_ids = [] for transkription_position in word.transkription_positions: transkription_position_id = word_id + str(transkription_position.id) tp_nodes = svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]'.format(transkription_position_id), namespaces=namespaces) if len(tp_nodes) > 0: record_changes_to_transkription_position(tp_nodes[0], transkription_position,\ transkription_field.xmin, transkription_field.ymin, namespaces=namespaces) recorded_ids.append(transkription_position_id) extra_nodes = [ node for node in\ svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[contains(@id, "{0}")]'.format(word_id), namespaces=namespaces)\ if node.get('id') not in recorded_ids ] if len(extra_nodes) > 0: for extra_node in extra_nodes: old_ids = [ inkscape_id.replace('#','') for inkscape_id in\ svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]/@inkscape:label'.format(extra_node.get('id')),\ namespaces=namespaces) ] if len(old_ids) > 0 and re.match(r'word_[0-9]+_[0-9]+', old_ids[0]): old_id_list = old_ids[0].split('_') ref_word_id = int(old_id_list[1]) ref_tp_id = old_id_list[2] ref_words = [ word for word in page.words if word.id == ref_word_id ] if len(ref_words) > 0: ref_tps = [ tp for tp in ref_words[0].transkription_positions\ if tp.id == ref_tp_id ] if len(ref_tps) > 0: ref_words[0].transkription_positions.remove(ref_tps[0]) record_changes_to_transkription_position(extra_node,\ ref_tps[0], transkription_field.xmin, transkription_field.ymin, namespaces=namespaces) word.transkription_positions.append(ref_tps[0]) for word in page.words: if word.has_mixed_status('text'): new_page_words += [ word for word in word.split_according_to_status('text') if word.text is not None and word.text != '' ] elif len(word.transkription_positions) > 0: new_text = [ tp.text for tp in word.transkription_positions if tp.text is not None and tp.text != '' ] if len(new_text) > 0: word.text = new_text[0] new_page_words.append(word) page.words = new_page_words page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids) page.unlock() if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\ script_name=__file__ + ' -> ' + inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_SVG_WORD_POSITION) return page def record_changes_on_xml_file_to_page(xml_source_file, xml_file) -> Page: """Copy changes made to xml_file to xml_source_file. 
:return: datatypes.page.Page """ copy_page = Page(xml_file) page = Page(xml_source_file) page.unlock() back_up(page, xml_file) page.words = [] for word in copy_page.words: if word.split_strings is None\ or len(word.split_strings) == 0: page.words.append(word) else: next_word = word for split_string in word.split_strings: _, new_word, next_word = next_word.split(split_string) page.words.append(new_word) if next_word is not None: page.words.append(next_word) page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids) remove_words_if_done = [] for word in page.words: if 'join_string' in word.__dict__.keys()\ and word.join_string is not None: if word.id > 0\ and page.words[word.id-1].text + word.text == word.join_string: page.words[word.id-1].join(word) remove_words_if_done.append(word) elif word.id+1 < len(page.words)\ and word.text + page.words[word.id+1].text == word.join_string: word.join(page.words[word.id+1]) remove_words_if_done.append(page.words[word.id+1]) for word in remove_words_if_done: page.words.remove(word) page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids) if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\ script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, xml_file), file_type=FILE_TYPE_SVG_WORD_POSITION) return page def record_changes_to_transkription_position(node, transkription_position, xmin=0.0, ymin=0.0, namespaces=None): """Record changes made to node to transkription_position. """ if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in node.nsmap.items() } if bool(node.get('x')): transkription_position.left = float(node.get('x')) - xmin if bool(node.get('y')): transkription_position.top = float(node.get('y')) - ymin if bool(node.get('width')): transkription_position.width = float(node.get('width')) if bool(node.get('height')): transkription_position.height = float(node.get('height')) if len(node.xpath('./ns:title/text()', namespaces=namespaces)) > 0: transkription_position.text = node.xpath('./ns:title/text()', namespaces=namespaces)[0] def replace_chars(words, faksimile_positions, unique_faksimile_words=None): """Return unique_faksimile_words and faksimile_positions, with characters changed according to transcription words.
""" if unique_faksimile_words is None: unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\ key=lambda text: len(text)) for index, word_text in enumerate(unique_faksimile_words): if len([ word for word in words if word.text == word_text ]) == 0: if re.match(r'.*".*', word_text)\ and len([ word for word in words if word.text == word_text.replace('"', '“') ]) > 0: unique_faksimile_words[index] = word_text.replace('"', '“') elif re.match(r'.*ss.*', word_text)\ and len([ word for word in words if word.text == word_text.replace('ss', 'ß') ]) > 0: unique_faksimile_words[index] = word_text.replace('ss', 'ß') elif re.match(r'.*-.*', word_text)\ and len([ word for word in words if word.text == word_text.replace('-', '–') ]) > 0: unique_faksimile_words[index] = word_text.replace('-', '–') for faksimile_position in [ faksimile_position for faksimile_position in faksimile_positions\ if faksimile_position.text == word_text ]: faksimile_position.text = unique_faksimile_words[index] elif word_text == '-'\ and len([ word for word in words if word.text == '–' ]) > 0: print([ word.text for word in words if word.text == word_text ]) print([ word.text for word in words if word.text == '–' ]) return faksimile_positions, unique_faksimile_words +def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True): + """Updates svg position file's status. Changes its status to status if it does not contain 'OK', + else it appends new status to old status. + """ + if isfile(file_name): + parser = ET.XMLParser(remove_blank_text=True) + file_tree = ET.parse(file_name, parser) + old_status = file_tree.getroot().get('status') + if old_status is None or 'OK' not in old_status.split(':'): + file_tree.getroot().set('status', status) + elif append: + if status not in old_status.split(':'): + new_status = old_status + ':' + status + file_tree.getroot().set('status', new_status) + else: + file_tree.getroot().set('status', new_status) + write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) + if manuscript_file is not None and isfile(manuscript_file): + page_number = file_tree.getroot().get('number') + update_manuscript_file(manuscript_file, page_number, file_name, status=status) + +def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True): + """Updates manuscript file: adds status information about page. 
+ """ + if isfile(manuscript_file): + parser = ET.XMLParser(remove_blank_text=True) + manuscript_tree = ET.parse(manuscript_file, parser) + if len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0: + node = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0] + old_status = node.get('status') + if old_status is None or 'OK' not in old_status.split(':'): + node.set('status', status) + elif append: + if status not in old_status.split(':'): + new_status = old_status + ':' + status + node.set('status', new_status) + else: + node.set('status', new_status) + if not bool(node.get('output')): + node.set('output', file_name) + else: + pages_node = manuscript_tree.getroot().find('pages')\ + if manuscript_tree.getroot().find('pages') is not None\ + else ET.SubElement(manuscript_tree.getroot(), 'pages') + new_id = len(pages_node.findall('page')) + 1 + ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name}) + write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT) + Index: svgscripts/process_footnotes.py =================================================================== --- svgscripts/process_footnotes.py (revision 98) +++ svgscripts/process_footnotes.py (revision 99) @@ -1,263 +1,277 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see <http://www.gnu.org/licenses/>. 1}}} from colorama import Fore, Style import getopt import lxml.etree as ET import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path as PathlibPath from progress.bar import Bar import re import shutil import sys import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.atypical_writing import AtypicalWriting from datatypes.clarification import Clarification from datatypes.editor_comment import EditorComment from datatypes.editor_correction import EditorCorrection from datatypes.footnotes import extract_footnotes from datatypes.line_continuation import LineContinuation from datatypes.standoff_tag import StandoffTag from datatypes.text import Text +from datatypes.text_connection_mark import TextConnectionMark from datatypes.uncertain_decipherment import UncertainDecipherment from util import back_up from process_files import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False ATYPICAL_GROUP = re.compile(r'(.*:.*]\s*)(¿)(.*)') CLARIFICATION_GROUP = re.compile(r'(.*:.*]\s*)(Vk)(.*)') CONTINUATION_GROUP = re.compile(r'(.*:\s*)(Fortsetzung\s*)') COMMENT_GROUP = re.compile(r'(.*:.*])') EDITOR_CORRECTION_GROUP = re.compile(r'(.*:.*]\s*)(>[?]*)(.*)') LINE_REFERENCE_GROUP = re.compile(r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)') LINE_REFERENCE_GROUP_START_INDEX = 1 LINE_REFERENCE_GROUP_MID_INDEX = 2 LINE_REFERENCE_GROUP_END_INDEX = 3 LINE_COMMENT_GROUP = re.compile(r'(.*\d+:)') UNCERTAINTY_WORD_GROUP = re.compile(r'(.*:.*]\s*)([>]*\?)(.*)') UNCERTAINTY_EDITOR_GROUP = re.compile(r'(.*)(\?)') WORD_REFERENCE_GROUP = re.compile(r'(.*[0-9]+:\s*)(.*)(].*)') DEBUG = False -def categorize_footnotes(page, footnotes=None, debug=False): +def categorize_footnotes(page, footnotes=None, debug=False, skip_after=-1.0, find_content=False): """Categorize footnotes. """ DEBUG = debug if footnotes is None: - footnotes = extract_footnotes(page) + footnotes = extract_footnotes(page, skip_after=skip_after) for footnote in footnotes: line_match = re.match(LINE_REFERENCE_GROUP, footnote.content) if line_match is not None: _process_line_match(page, footnote, line_match) else: warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>') + if find_content and len(page.text_connection_marks) > 0: + TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes) page.update_and_attach_words2tree() for line in page.lines: line.attach_object_to_tree(page.page_tree) DEBUG = False if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def _is_uncertain(footnote) -> bool: """Return whether footnote contains sign for uncertainty.
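A sketch (not part of the patch) of how the extended entry point above might be invoked for a single page; the path and the x threshold are hypothetical:

page = Page('xml/Mp_XV_page79a.xml')
categorize_footnotes(page, skip_after=680.5, find_content=True)
# extracts the footnotes (ignoring text nodes left of x=680.5), attaches editor
# comments to the matching lines and words, and, because find_content is True,
# resolves the content of any text connection marks on the page.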
""" uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content) return (uncertain_match is not None\ and len([ markup for markup in footnote.standoff_markups\ if markup.css_string.endswith('italic;')\ and uncertain_match.end() >= markup.startIndex\ and uncertain_match.end() <= markup.endIndex ]) > 0) def _process_line_match(page, footnote, line_match): """Process footnote if reference to a line matches. """ word_match = re.match(WORD_REFERENCE_GROUP, footnote.content) end_line_number = int(line_match.group(LINE_REFERENCE_GROUP_END_INDEX)) lines = [] if line_match.group(LINE_REFERENCE_GROUP_START_INDEX) is not None: if line_match.group(LINE_REFERENCE_GROUP_MID_INDEX) is not None: line_ids = [ int(line_id) for line_id in\ line_match.group(LINE_REFERENCE_GROUP_START_INDEX).split('/')\ if line_id != '' ] + [ end_line_number ] lines = [ line for line in page.lines if line.id in line_ids ] else: start_line_number = int(line_match.group(1)[0:-1]) lines = [ line for line in page.lines if line.id >= start_line_number and line.id <= end_line_number ] else: lines = [ line for line in page.lines if line.id == end_line_number ] if word_match is not None: _process_word_match(page, footnote, line_match, word_match.group(2), end_line_number) elif len(lines) > 0: uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content) for line in lines: _process_line_reference(page, footnote, line, _is_uncertain(footnote)) else: warnings.warn(f'Footnote refers to missing line {line_number}: {footnote}') def _process_line_reference(page, footnote, line, is_uncertain): """Process footnote if there is a line reference. """ continuation_match = re.match(CONTINUATION_GROUP, footnote.content) if continuation_match is not None: reference_string = footnote.content[continuation_match.end():] if is_uncertain: reference_string = reference_string[:-1] line.editor_comments.append(LineContinuation.create_cls(reference_string=reference_string, is_uncertain=is_uncertain)) else: comment_match = re.match(LINE_COMMENT_GROUP, footnote.content) if comment_match is not None: is_uncertain = _is_uncertain(footnote) comment = footnote.content[comment_match.end():-1].strip()\ if is_uncertain\ else footnote.content[comment_match.end():].strip() line.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain)) else: warnings.warn(f'Unknown editor comment for line "{line.id}": <{footnote}>') def _process_word_match(page, footnote, line_match, word_text, line_number, parent_word_composition=None): """Process footnote if there is a word reference. 
""" referred_words = [ word for word in page.words\ if word.line_number == line_number\ and (word.text == word_text\ or re.match(rf'\W*{word_text}\W', word.text)\ or word.edited_text == word_text) ] + referred_word_parts = [ word.word_parts for word in page.words\ + if word.line_number == line_number\ + and len(word.word_parts) > 0\ + and word_text in [ wp.text for wp in word.word_parts ] ] overwritten_word_matches = [ word for word in page.words\ if word.line_number == line_number\ and len(word.word_parts) > 0\ and len([word_part for word_part in word.word_parts\ if word_part.overwrites_word is not None\ and word_part.overwrites_word.text == word_text]) > 0] if len(referred_words) > 0\ - or len(overwritten_word_matches) > 0: + or len(overwritten_word_matches) > 0\ + or len(referred_word_parts) > 0: word = None if len(referred_words) == 1: word = referred_words[0] elif len(overwritten_word_matches) > 0: word = [ word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts\ if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text][0] + elif len(referred_word_parts) > 0: + word = [ word_part for word_part in referred_word_parts[0] if word_part.text == word_text ][0] else: word = [ better_word for better_word in referred_words if better_word.text == word_text][0] atypical_match = re.match(ATYPICAL_GROUP, footnote.content) correction_match = re.match(EDITOR_CORRECTION_GROUP, footnote.content) clarification_match = re.match(CLARIFICATION_GROUP, footnote.content) is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, footnote.content) is not None if correction_match is not None: correction = correction_match.group(3).strip() word.editor_comment = EditorCorrection(correction_text=correction, is_uncertain=is_uncertain) if not is_uncertain: word.edited_text = correction elif clarification_match is not None: word.editor_comment = Clarification(text=footnote.extract_part(word_text, css_filter='bold;')) elif atypical_match is not None: text = footnote.extract_part(word_text, css_filter='bold;')\ if footnote.markup_contains_css_filter('bold;')\ else None word.editor_comment = AtypicalWriting(text=text) elif is_uncertain: word.editor_comment = UncertainDecipherment() else: comment_match = re.match(COMMENT_GROUP, footnote.content) if comment_match is not None: is_uncertain = _is_uncertain(footnote) comment = footnote.content[comment_match.end():-1].strip()\ if is_uncertain\ else footnote.content[comment_match.end():].strip() word.editor_comment = EditorComment(comment=comment, is_uncertain=is_uncertain) else: warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>') elif re.match(r'.*\s.*', word_text): for word_part in word_text.split(' '): _process_word_match(page, footnote, line_match, word_part, line_number, parent_word_composition=word_text) else: warnings.warn(f'No word found with text "{word_text}" on line {line_number}: <{footnote}>') def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to process the footnotes of a page. svgscripts/process_footnotes.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. 
OPTIONS: - -h|--help show help + -h|--help show help + -s|--skip-until=left skip all text nodes with an x position smaller than left :return: exit code (int) """ + skip_after=-1.0 try: - opts, args = getopt.getopt(argv, "h", ["help" ]) + opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until=" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 + elif opt in ('-s', '--skip-until'): + skip_after = float(arg) if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK): if not UNITTESTING: print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL) back_up(page, page.xml_file) - categorize_footnotes(page) + categorize_footnotes(page, skip_after=skip_after, find_content=True) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: tests_svgscripts/test_matrix.py =================================================================== --- tests_svgscripts/test_matrix.py (revision 98) +++ tests_svgscripts/test_matrix.py (revision 99) @@ -1,237 +1,245 @@ import unittest import lxml.etree as ET from os import sep, path from os.path import isdir, dirname import sys sys.path.append('svgscripts') from datatypes.matrix import Matrix from datatypes.transkriptionField import TranskriptionField class FakeTF: def __init__(self): self.xmin = 297.6379999999997 self.xmax = 765.354 self.ymin = 157.328 self.ymax = 752.6040160033832 class TestMatrix(unittest.TestCase): def setUp(self): self.x = 219.4058 self.y = 106.4634 self.matrix_string = 'matrix(1 0 0 1 {} {})'.format(str(self.x), str(self.y)) self.test_data_dir = dirname(__file__) + sep + 'test_data' if not isdir(self.test_data_dir): self.test_data_dir = dirname(dirname(__file__)) + sep + 'test_data' self.test_file = self.test_data_dir + sep + 'test_ai.svg' self.rotation_angle = 20 self.rotation_matrix_string = 'matrix(0.94 0.342 -0.342 0.94 0 0)' self.test_margin_field_file = self.test_data_dir + sep + 'W_I_8_neu_125-01.svg' self.test_place_printing_verso = self.test_data_dir + sep + 'N_VII_1_xp5_4_page5.svg' self.test_place_printing_recto = self.test_data_dir + sep + 'N_VII_1_xp5_4_page6.svg' self.multipage = f'{self.test_data_dir}{sep}pdfsvg{sep}csv{sep}15.svg' self.marginals_extra = f'{self.test_data_dir}{sep}pdfsvg{sep}csv{sep}45.svg' self.marginals_extra_fn = f'{self.test_data_dir}{sep}pdfsvg{sep}csv{sep}44.svg' def test_Matrix(self): matrix = Matrix(self.matrix_string) self.assertEqual(matrix.getX(), self.x) self.assertEqual(matrix.add2X(1), self.x + 1) self.assertEqual(matrix.getY(), self.y) matrix = Matrix('matrix(0.98966578,0.1433933,-0.0913015,0.9958233,0,0)') self.assertEqual(matrix.getX(), 0) matrix = Matrix('matrix(1 2.998719e-04 -2.998719e-04 1 415.3643 476.7988)') def test_Matrix_rotation(self): rotation_string = 'rotate({})'.format(self.rotation_angle) rotation_stringC = 'rotate(-{})'.format(self.rotation_angle) matrixA = Matrix(rotation_string) matrixB = Matrix(self.rotation_matrix_string) matrixC = Matrix(rotation_stringC) self.assertEqual(matrixA.matrix[Matrix.A], matrixB.matrix[Matrix.A]) self.assertEqual(matrixA.matrix[Matrix.B], matrixB.matrix[Matrix.B])
self.assertEqual(matrixA.matrix[Matrix.C], matrixB.matrix[Matrix.C]) self.assertEqual(matrixA.matrix[Matrix.D], matrixB.matrix[Matrix.D]) self.assertEqual(matrixA.matrix[Matrix.E], matrixB.matrix[Matrix.E]) self.assertEqual(matrixA.matrix[Matrix.F], matrixB.matrix[Matrix.F]) self.assertEqual(matrixA.toString(), self.rotation_matrix_string) self.assertEqual(matrixC.toCSSTransformString(), 'rotate(-{}deg)'.format(self.rotation_angle)) def test_get_rotation_direction(self): rotation_string = 'rotate(-{})'.format(self.rotation_angle) matrixA = Matrix(rotation_string) matrixB = Matrix(self.rotation_matrix_string) matrixC = Matrix(self.matrix_string) self.assertEqual(matrixA.get_rotation_direction(), Matrix.UP) self.assertEqual(matrixB.get_rotation_direction(), Matrix.DOWN) self.assertEqual(matrixC.get_rotation_direction(), Matrix.STRAIGHT) def test_isRotationMatrix(self): rotation_string = 'rotate({})'.format(self.rotation_angle) matrixA = Matrix(rotation_string) self.assertEqual(matrixA.isRotationMatrix(), True) matrixB = Matrix(self.matrix_string) self.assertEqual(matrixB.isRotationMatrix(), False) def test_toCSSTransformString(self): rotation_string = 'rotate({})'.format(self.rotation_angle) matrixA = Matrix(rotation_string) self.assertEqual(matrixA.toCSSTransformString(), 'rotate({}deg)'.format(self.rotation_angle)) matrixB = Matrix(self.rotation_matrix_string) self.assertEqual(matrixB.toCSSTransformString(), 'rotate({}deg)'.format(self.rotation_angle)) def test_Matrix_Exception(self): with self.assertRaises(Exception): Matrix('matrix({})'.format(' '.join([ '0.0' for i in range(5)]))) def test_Matrix_TranskriptionField(self): tf = TranskriptionField(self.test_file) matrix = Matrix(self.matrix_string, transkription_field=tf) self.assertEqual(round(matrix.getX(), 3) , 28.706) self.assertEqual(round(matrix.getY(), 3) , 31.563) def test_get_transformed_positions(self): # Test relies on the example from "https://developer.mozilla.org/en-US/docs/Web/SVG/Attribute/transform" x = 10 y = 10 width = 30 height = 20 matrix = Matrix(transform_matrix_string='matrix(3 1 -1 3 30 40)') new_x, new_y, new_width, new_height = matrix.get_transformed_positions(x=x, y=y, width=width, height=height) self.assertEqual(new_x, 50) self.assertEqual(new_y, 80) self.assertEqual(new_width, 90) self.assertEqual(new_height, 60) def test_is_matrix_horizontal(self): matrix = Matrix(transform_matrix_string='matrix(3 1 -1 3 30 40)') self.assertEqual(matrix.is_matrix_horizontal(), False) matrix = Matrix(transform_matrix_string='matrix(1 0 0 1 30 40)') self.assertEqual(matrix.is_matrix_horizontal(), True) def test_is_part_of_transkription_field(self): tf = TranskriptionField(self.test_file) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 244.1211 91.7134)'}) self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), True) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 244.1211 51.7134)'}) self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 44.1211 91.7134)'}) self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 244.1211 891.7134)'}) self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 844.1211 91.7134)'}) self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, 
text_node=text_node), False) text_node = ET.Element('text', attrib={'transform': 'matrix(0.866 -0.5 0.5 0.866 356.4303 753.4836)'}) tspan_node = ET.SubElement(text_node, 'tspan', attrib={'x': '41.82', 'y': '0'}) tspan_node.text = 'De' fake_tf = FakeTF() self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(fake_tf, text_node=text_node), True) + """ + local_file = '/home/knister0/ownCloud/myNietzscheDE/KGW-IX_12/Bd_12_XIV-XVI_Druck_als_SVG/03.svg' + tf = TranskriptionField(local_file) + svg_tree = ET.parse(local_file) + namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } + text_node = svg_tree.xpath('//ns:text[@transform="matrix(1 0 0 1 173.7407 144.8535)"]', namespaces=namespaces)[0] + self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), True) + """ def test_is_nearx_tf(self): tf = TranskriptionField(self.test_file) matrix_string = 'matrix(1 0 0 1 180.8755 315.9131)' self.assertEqual(Matrix.IS_NEARX_TRANSKRIPTION_FIELD(matrix_string, tf), True) matrix_string = 'matrix(1 0 0 1 100.8755 315.9131)' self.assertEqual(Matrix.IS_NEARX_TRANSKRIPTION_FIELD(matrix_string, tf), False) def test_do_conversion_factors_differ(self): self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(None, None), False) matrix_a = Matrix('matrix(1 0 0 1 180.8755 315.9131)') self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, None), True) matrix_b = Matrix('matrix(1 0 0 1 100.8755 315.9131)') self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), False) matrix_b = Matrix('matrix(0 0 0 1 100.8755 315.9131)') self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), True) matrix_b = Matrix('matrix(1 1 0 1 100.8755 315.9131)') self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), True) matrix_b = Matrix('matrix(1 0 1 1 100.8755 315.9131)') self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), True) matrix_b = Matrix('matrix(1 0 0 0 100.8755 315.9131)') self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), True) def test_clone_transformation_matrix(self): matrix_a = Matrix(matrix_list=[ 1, 0, 0, 1, 180.8755, 315.9131 ]) matrix_b = matrix_a.clone_transformation_matrix() self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), False) self.assertEqual(matrix_b.matrix[Matrix.E], 0) self.assertEqual(matrix_b.matrix[Matrix.F], 0) def test_toString(self): matrix_string = 'matrix(1.0 0.0 0.0 1.0 180.8755 315.9131)' matrix = Matrix(matrix_string) self.assertEqual(matrix.toString(), matrix_string) def test_get_semanticAndDataDict(self): matrix = Matrix('rotate(20)') #self.assertEqual(matrix.get_data_dictionary()['body'].get('matrix'), matrix.matrix) def test_is_in_margin_field(self): tf = TranskriptionField(self.test_margin_field_file) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 178.8916 182.0127)'}) self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf), True) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 357.7339 818.3276)'}) self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf), False) tf = TranskriptionField(self.marginals_extra) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 778.519 407.1094)'}) self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf, marginals_on_extra_page=True), True) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 51.8503 1056.1182)'}) 
self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf, marginals_on_extra_page=True), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 552.9165 1072.1025)'}) self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf, marginals_on_extra_page=True), False) def test_is_in_place_of_printing_area(self): tf = TranskriptionField(self.test_place_printing_verso) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 42.5195 575.8736)'}) self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), True) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 109.145 575.8736)'}) self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), True) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 191.0571 575.8736)'}) self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), False) tf = TranskriptionField(self.test_place_printing_recto) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 28.3462 575.8736)'}) self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), True) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 28.3462 583.8736)'}) self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), True) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 170.0791 575.8736)'}) self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), False) def test_is_in_footnote_area(self): tf = TranskriptionField(self.test_place_printing_verso) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 42.5195 575.8736)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 109.145 575.8736)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 191.0571 575.8736)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), True) tf = TranskriptionField(self.test_place_printing_recto) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 28.3462 575.8736)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 28.3462 583.8736)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 170.0791 575.8736)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), True) tf = TranskriptionField(self.multipage, multipage_index=0) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 395.7141 463.6953)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 395.7141 453.6953)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), True) tf = TranskriptionField(self.marginals_extra) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 552.9165 1072.1025)'}) self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf, marginals_on_extra_page=True), True) text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 51.8503 1056.1182)'}) 
        self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf, x=5.352, marginals_on_extra_page=True), False)
        text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 215.5483 1056.1182)'})
        self.assertTrue(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf, x=24.732, marginals_on_extra_page=True))
        svg_tree = ET.parse(self.marginals_extra_fn)
        tf = TranskriptionField(self.marginals_extra)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        node = svg_tree.xpath('//ns:text[@transform="matrix(1 0 0 1.0101 698.1499 85.3594)"]', namespaces=namespaces)[0]
        self.assertFalse(Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, tf, marginals_on_extra_page=True))
        self.assertFalse(Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), tf, marginals_on_extra_page=True))
        node = svg_tree.xpath('//ns:text[@transform="matrix(1 0 0 1 215.5483 1056.1182)"]', namespaces=namespaces)[0]
        self.assertTrue(Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, tf, marginals_on_extra_page=True))

if __name__ == "__main__":
    unittest.main()

Index: tests_svgscripts/test_faksimile.py
===================================================================
--- tests_svgscripts/test_faksimile.py	(revision 98)
+++ tests_svgscripts/test_faksimile.py	(revision 99)
@@ -1,78 +1,82 @@
import unittest
from os import sep, path
from os.path import isdir, dirname, basename
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.faksimile_image import FaksimileImage
from datatypes.text_field import TextField

class TestFaksimilePage(unittest.TestCase):
    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        if not isdir(DATADIR):
            DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
        self.svg_file = DATADIR + sep + 'W-II-1,49et50.svg'
        self.svg_testmatrix = DATADIR + sep + 'TESTMATRIX_1.svg'
        self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
        self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'

    def test_init(self):
        image = FaksimileImage(file_name='test.jpg', height=10, width=10)
        text_field = TextField(width=10, height=10, x=10, y=10)
        faksimile = FaksimilePage(title='test', page_number=1, faksimile_image=image, text_field=text_field)
        self.assertEqual(faksimile.page_tree.getroot().get('title'), 'test')
        self.assertEqual(faksimile.page_tree.getroot().get('page-number'), '1')
        self.assertEqual(faksimile.faksimile_image.width, 10)
        self.assertEqual(faksimile.text_field.width, 10)

    def test_GET_TEXTFIELDS(self):
        svg_tree = ET.parse(self.svg_file)
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 2)
        text_field = pages[0].text_field
        self.assertEqual(text_field.width, 663.333)
        result_dir = '.{}xml{}'.format(sep, sep) if isdir('xml') else ''
        self.assertEqual(pages[0].xml_file, result_dir + 'W-II-1_49.xml')
        self.assertEqual(pages[0].title, 'W II 1')
        self.assertEqual(pages[0].page_number, '49')
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, page_number='49')
        self.assertEqual(len(pages), 1)
        svg_tree = ET.parse(self.svg_testmatrix)
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 1)
        self.assertEqual(len(pages[0].word_positions), 1)
        self.assertEqual(pages[0].word_positions[0].transform.toCSSTransformString(), 'rotate(45deg)')
        svg_tree = ET.parse(self.faksimile_file)
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 2)
        textfield_id = pages[1].title.replace(' ', '-') + '_' + pages[1].page_number
        #print([ position.id for position in pages[0].word_positions])
        self.assertEqual(textfield_id not in [ position.id for position in pages[0].word_positions ], True)
        self.assertEqual('path1237' in [ position.id for position in pages[0].word_positions ], True)
        self.assertEqual('Vorgangs' in [ position.text for position in pages[0].word_positions ], False)
        svg_tree = ET.parse(self.faksimile_file)
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 2)
        self.assertEqual(pages[0].page_number, '5')
        svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Eric/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/W-II-1,141et142.svg')
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 1)

    def test_get_paths_inside_rect(self):
        svg_tree = ET.parse(self.faksimile_file)
        paths = get_paths_inside_rect(svg_tree, '//ns:path', 360, 786, 92, 765, 'N-VII-1_5')
        self.assertEqual(len(paths), 1)
        svg_tree = ET.parse(self.svg_testmatrix)
        paths = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', 0, 2038.72, 0, 974.08002, 'TESTMATRIX_1')
        self.assertEqual(len(paths), 1)
+        svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Mp_XIV/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/Mp-XIV-1,419a.svg')
+        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+        paths = get_paths_inside_rect(svg_tree, '//ns:rect', 52, 800, 58, 900, 'Mp-XIV-1_419a', namespaces=namespaces)
+        self.assertEqual(len([ path for path in paths if 'seinen' in path.xpath('./ns:title/text()', namespaces=namespaces)]), 1)

if __name__ == "__main__":
    unittest.main()

Index: tests_svgscripts/test_create_manuscript.py
===================================================================
--- tests_svgscripts/test_create_manuscript.py	(revision 98)
+++ tests_svgscripts/test_create_manuscript.py	(revision 99)
@@ -1,50 +1,51 @@
import unittest
from os import sep, path, remove
from os.path import isfile
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
import create_manuscript
from datatypes.manuscript import ArchivalManuscriptUnity

class TestCreateManuscript(unittest.TestCase):
    def setUp(self):
        create_manuscript.UNITTESTING = True
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.content_file = DATADIR + sep + 'content.txt'

    def test_create_page_url_mapping(self):
        mapping = {}
        create_manuscript.create_page_url_mapping(self.content_file, mapping)
        self.assertTrue('Mp XV' in mapping.keys())
        #print(mapping)
        #mapping = {}
        #create_manuscript.create_page_url_mapping('content.txt', mapping, default_title='Mp XV')
        #print(mapping)
        creator = create_manuscript.ManuscriptCreator('')
        pages_node = ET.Element('pages')
        #creator._create_or_update_pages(pages_node, mapping['Mp XV'])
        #print(ET.dump(pages_node))

    def test_get_or_create_element(self):
        creator = create_manuscript.ManuscriptCreator('')
        manuscript_tree = ET.ElementTree(ET.Element(ArchivalManuscriptUnity.XML_TAG))
        self.assertEqual(len(manuscript_tree.xpath('test')), 0)
        node = creator._get_or_create_element(manuscript_tree.getroot(), 'test', create_id=True)
        self.assertEqual(len(manuscript_tree.xpath('test')), 1)
        node = creator._get_or_create_element(manuscript_tree.getroot(), 'test[@id="0"]')
        self.assertEqual(len(manuscript_tree.xpath('test')), 1)
        node = creator._get_or_create_element(manuscript_tree.getroot(), 'page[@number="10"]')
        self.assertEqual(node.get('number'), '10')
        node = creator._get_or_create_element(manuscript_tree.getroot(), 'page[@number="0"]', create_id=True)
        self.assertEqual(node.get('id'), '1')
        self.assertEqual(node.get('number'), '0')

+    @unittest.skip('files missing')
    def test_main(self):
        create_manuscript.main(['-x', 'xml', '-t', 'Mp XV', self.content_file])

if __name__ == "__main__":
    unittest.main()

Index: tests_svgscripts/test_text_connection_mark.py
===================================================================
--- tests_svgscripts/test_text_connection_mark.py	(revision 98)
+++ tests_svgscripts/test_text_connection_mark.py	(revision 99)
@@ -1,76 +1,76 @@
import unittest
from os import sep, path
from os.path import dirname, isdir
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.reference import Reference
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.word import Word

class TestTextConnectionMark(unittest.TestCase):
    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        self.xml_file = DATADIR + sep + 'N_VII_1_page008.xml'
        mylist = {'text': '*', 'id': '0', 'line-number': '2' }
        self.node = ET.Element(TextConnectionMark.XML_TAG, attrib=mylist)
        word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)'))
        self.transkription_positions = [ word_position ]
        word_position.attach_object_to_tree(self.node)
        self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page013.xml'

    def test_create_cls(self):
        text_connection_mark = TextConnectionMark.create_cls(self.node)
        self.assertEqual(text_connection_mark.id, 0)
        self.assertEqual(text_connection_mark.transkription_positions[0].bottom, 11)
        self.assertEqual(text_connection_mark.transkription_positions[0].height, 10)
        self.assertEqual(text_connection_mark.transkription_positions[0].top, 1)
        self.assertEqual(text_connection_mark.transkription_positions[0].left, 0)
        self.assertEqual(text_connection_mark.transkription_positions[0].width, 10)
        self.assertEqual(text_connection_mark.text, '*')
        self.assertEqual(text_connection_mark.line_number, 2)
        self.assertEqual(text_connection_mark.transkription_positions[0].transform.isRotationMatrix(), True)

    def test_attach_word_to_tree(self):
        text_connection_mark = TextConnectionMark.create_cls(self.node)
        text_connection_mark.text_source = Reference(first_line=1, title='ASDF', page_number='5c')
        empty_tree = ET.ElementTree(ET.Element('page'))
        text_connection_mark.attach_word_to_tree(empty_tree)
        #print(ET.dump(empty_tree.getroot()))
        for node in empty_tree.xpath('//' + TextConnectionMark.XML_TAG):
            mark = TextConnectionMark.create_cls(node)
            self.assertEqual(mark.id, 0)
            self.assertEqual(mark.transkription_positions[0].bottom, 11)
            self.assertEqual(mark.transkription_positions[0].height, 10)
            self.assertEqual(mark.transkription_positions[0].top, 1)
            self.assertEqual(mark.transkription_positions[0].left, 0)
            self.assertEqual(mark.transkription_positions[0].width, 10)
            self.assertEqual(mark.text, '*')
            self.assertEqual(mark.line_number, 2)
            self.assertEqual(mark.transkription_positions[0].transform.isRotationMatrix(), True)
            self.assertEqual(mark.text_source.first_line, text_connection_mark.text_source.first_line)
            self.assertEqual(mark.text_source.page_number, text_connection_mark.text_source.page_number)

    def test_get_semanticAndDataDict(self):
        dictionary = TextConnectionMark.get_semantic_dictionary()
        #print(dictionary)

    def test_find_content(self):
        page = Page(self.test_tcm_xml)
        transkription_field = TranskriptionField(page.source)
        svg_tree = ET.parse(page.source)
-        text_connection_marks = [ TextConnectionMark.create_cls_from_word(word) for word in page.words if word.text == TextConnectionMark.SPECIAL_CHAR_LIST[1]]
-        TextConnectionMark.find_content_in_footnotes(text_connection_marks, transkription_field, svg_tree, title=page.title, page_number=page.number)
-        self.assertEqual(len(text_connection_marks), 4)
-        for tcm in text_connection_marks:
+        page.text_connection_marks = [ TextConnectionMark.create_cls_from_word(word) for word in page.words if word.text == TextConnectionMark.SPECIAL_CHAR_LIST[1]]
+        TextConnectionMark.find_content_in_footnotes(page, transkription_field, svg_tree)
+        self.assertEqual(len(page.text_connection_marks), 4)
+        for tcm in page.text_connection_marks:
            self.assertEqual(tcm.text_source is not None, True)
            self.assertEqual(tcm.text_source.first_line > -1, True)
            self.assertEqual(tcm.text_source.page_number, '14')

if __name__ == "__main__":
    unittest.main()

Index: tests_svgscripts/test_data/N_VII_1_page006.xml
===================================================================
--- tests_svgscripts/test_data/N_VII_1_page006.xml	(revision 98)
+++ tests_svgscripts/test_data/N_VII_1_page006.xml	(revision 99)
@@ -1,1276 +1,1276 @@
svgWordPosition
2019-08-02 15:17:37
2019-08-02 15:17:37
2019-08-02 15:30:59
2019-08-02 15:30:59
- 2020-08-27 09:47:22
+ 2020-10-09 18:22:57

Index: tests_svgscripts/test_data/pdfsvg/W_II_1_page015.xml
===================================================================
--- tests_svgscripts/test_data/pdfsvg/W_II_1_page015.xml	(revision 98)
+++ tests_svgscripts/test_data/pdfsvg/W_II_1_page015.xml	(revision 99)
@@ -1,3183 +1,18 @@
-
+
svgWordPosition
2019-06-17 22:47:39
2019-07-11 15:02:53
2019-07-04 11:13:33
2019-07-11 15:38:20
2019-08-02 09:46:39
- 2019-10-11 09:52:26
+ 2020-10-09 18:31:56
[... approx. 3,165 deleted lines of word-position XML omitted ...]

Index: tests_svgscripts/test_word.py
===================================================================
--- tests_svgscripts/test_word.py	(revision 98)
+++ tests_svgscripts/test_word.py	(revision 99)
@@ -1,469 +1,481 @@
import unittest
from os import sep, path
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from process_words_post_merging import reset_page, update_writing_process_ids
from datatypes.box import Box
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.matrix import Matrix
import datatypes.page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.style import Style
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, execute_function_on_parts, update_transkription_position_ids
from datatypes.word_deletion_path import WordDeletionPath
from datatypes.word_position import WordPosition
sys.path.append('py2ttl')
from class_spec import SemanticClass

class Page:
    def __init__(self):
        self.svg_file = None

    def get_line_number(self, input=0):
        return -1

    def get_biggest_fontSize4styles(self, style_set={}):
        return 7

class TestWord(unittest.TestCase):
    TESTCASE = None

    def setUp(self):
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.test_file = DATADIR + sep + 'N_VII_1_page009.xml'
        self.word_deletion_path_file = DATADIR + sep + 'N_VII_1_page138.xml'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }]
        x = 0
        for dict in self.word_part_objs:
            dict['class'] = 'st22'
            dict['x'] = x
            dict['y'] = 11
            x += 1
        mylist = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true' }
        word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)'))
        self.transkription_positions = [ word_position ]
        self.word_node = ET.Element('word', attrib=mylist)
        word_position.attach_object_to_tree(self.word_node)
        x = 0
        for char in mylist['text']:
            ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' })
            x += 1

    def test_add_deletion_paths(self):
        page = datatypes.page.Page(self.word_deletion_path_file, add_deletion_paths_to_words=False)
        word = [ word for word in
page.words if word.text == 'AufBau'][0] #self.assertTrue(word.deleted) self.assertTrue(len(word.word_parts) > 0) self.assertTrue(word.word_parts[0].deleted) word.add_deletion_paths(page.word_deletion_paths, tr_xmin=28.347656, tr_ymin=49.921875) self.assertTrue(len(word.word_parts[0].deletion_paths) > 0) #print(word.deletion_paths) + def test_join_words(self): + words = [ Word(id=4, text='asdf-', line_number=1, deleted=True), Word(id=5, text='bsdf', line_number=2, deleted=False) ] + new_word = Word.join_words(words) + self.assertEqual(new_word.id, 4) + self.assertEqual(new_word.text, 'asdf-bsdf') + self.assertEqual(new_word.edited_text, 'asdfbsdf') + self.assertEqual(new_word.deleted, False) + self.assertEqual(new_word.line_number, -1) + words = [ Word(id=1, word_parts=[Word(id=4, text='asdf-', line_number=1, deleted=True), Word(id=5, text='bsdf', line_number=2, deleted=False)]),\ + Word(id=4, text='.', line_number=2, deleted=True), Word(id=5, text='.', line_number=2, deleted=False) ] + new_word = Word.join_words(words) + self.assertEqual(new_word.text, 'asdf-bsdf..') def test_Word_with_word_part_objs(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) self.assertEqual(word.id, 0) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') def test_Word_with_word_node(self): word = Word.create_cls(self.word_node) self.assertEqual(word.id, 0) self.assertEqual(word.deleted, True) self.assertEqual(word.transkription_positions[0].bottom, 11) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 1) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') self.assertEqual(word.line_number, 2) self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True) def test_attach_word_to_tree(self): newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) empty_tree = ET.ElementTree(ET.Element('page')) newWord.attach_word_to_tree(empty_tree) for word_node in empty_tree.getroot().xpath('//word'): word = Word.CREATE_WORD(word_node=word_node) self.assertEqual(word.id, 0) self.assertEqual(word.deleted, False) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') @unittest.skipUnless(TESTCASE is None or TESTCASE == 0, 'Not testing this case') def test_create_correction_history_case0(self): # Case 1: whole word over box box = Box(earlier_text='XYX') word = Word(text='ASDF', transkription_positions=[TranskriptionPosition()]) word.word_box = box word.create_correction_history() self.assertEqual(word.earlier_version is None, True) self.assertEqual(word.overwrites_word is not None, True) @unittest.skipUnless(TESTCASE is None or TESTCASE == 1, 'Not testing this case') def test_create_correction_history_case1(self): # Case 2: part of word over box box = Box(earlier_text='XYX') partA = Word(text='A', 
transkription_positions=[TranskriptionPosition()]) partA.word_box = box partB = Word(text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) word.create_correction_history() self.assertEqual(word.earlier_version is None, True) self.assertEqual(word.word_parts[0].overwrites_word is not None, True) @unittest.skipUnless(TESTCASE is None or TESTCASE == 2, 'Not testing this case') def test_create_correction_history_case3(self): # Case 3: part of word over box, word under box is part of earlier version box = Box(earlier_text='XYX') tp0 = TranskriptionPosition() tp0.style = Style(writing_process_id=0) tp1 = TranskriptionPosition() tp1.style = Style(writing_process_id=1) partA = Word(id=0, text='Test', transkription_positions=[ tp0]) partB = Word(id=1, text='er', transkription_positions=[ tp1]) partB.word_box = box word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] ) word.create_correction_history(box_style=tp0.style) self.assertEqual(word.text, 'Tester') self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.earlier_version.text, 'TestXYX') self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) @unittest.skipUnless(TESTCASE is None or TESTCASE == 3, 'Not testing this case') def test_create_correction_history_case4(self): # Case 4: part of word is deleted partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.edited_text, 'SDF') @unittest.skipUnless(TESTCASE is None or TESTCASE == 4, 'Not testing this case') def test_create_correction_history_case5(self): tp0 = TranskriptionPosition() tp0.style = Style(writing_process_id=0) tp1 = TranskriptionPosition() tp1.style = Style(writing_process_id=1) partA = Word(id=0, text='Test', transkription_positions=[ tp0]) partB = Word(id=1, text='er', transkription_positions=[ tp1]) word = Word(text='Tester', word_parts=[ partA, partB ] ) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.word_parts[1].extendsEarlierVersion, True) self.assertEqual(word.word_parts[1].isExtensionOfWord, word.earlier_version) #@unittest.skipUnless(TESTCASE is None or TESTCASE == 5, 'Not testing this case') #@unittest.skip('case tested, relies on a local xml file') def test_create_correction_history_case_full(self): page = datatypes.page.Page('xml/N_VII_1_page138.xml') manuscript = ArchivalManuscriptUnity() reset_page(page) update_writing_process_ids(page) word = [ word for word in page.words if word.text == 'Verschiedenes' and word.line_number == 4 ][0] wordAufBau = [ word for word in page.words if word.text == 'AufBau' ][0] #page.words = [ word ] page.update_styles(manuscript=manuscript, partition_according_to_styles=True) word.word_parts[0].transkription_positions[0].has_box = Box(earlier_text='v') self.assertEqual(len(word.word_parts), 2) word_over_box = word._get_partial_word_over_box() update_transkription_position_ids(word) word.create_correction_history(page) self.assertEqual(word.writing_process_id, 1) 
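# Aside: the correction-history cases above encode one rule worth stating
# plainly: where a word part overwrites a box, the earlier version is obtained
# by substituting the box's earlier_text for that part (case 3: 'Test' plus
# 'er' written over 'XYX' yields the earlier text 'TestXYX'). A hedged sketch
# of just that substitution, not the Word.create_correction_history
# implementation:
def reconstruct_earlier_text(parts):
    """parts: list of (text, earlier_box_text or None) tuples."""
    return ''.join(box if box is not None else text for text, box in parts)

assert reconstruct_earlier_text([('Test', None), ('er', 'XYX')]) == 'TestXYX'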
self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.earlier_version.text, 'verschiedenes') #print(word.earlier_version.id, [ (w.id, w.text) for w in word.earlier_version.word_parts ]) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) #print(ET.dump(word_node)) """ self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) self.assertEqual(word.word_parts[1].overwrites_word is not None, True) """ word = wordAufBau page.words = [ word ] page.update_styles(manuscript=manuscript, partition_according_to_styles=True) word.word_parts[0].deleted = True word.word_parts[1].transkription_positions[0].has_box = Box(earlier_text='b') self.assertEqual(len(word.word_parts), 3) word_over_box = word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 3) update_transkription_position_ids(word) word.create_correction_history(page) self.assertEqual(word.writing_process_id, 2) self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.text, 'AufBau') self.assertEqual(word.edited_text, 'Bau') self.assertEqual(word.earlier_version.text, 'Aufbau') self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) self.assertEqual(word.word_parts[1].overwrites_word is not None, True) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) #print(ET.dump(word_node)) newWord = Word.create_cls(word_node) #@unittest.skip('') def test_earlier_version(self): partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) earlier_version = word.create_earlier_version() self.assertEqual(earlier_version is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord, earlier_version.word_parts[0]) def test_undo_partitioning(self): tps = [] for i, xy in enumerate([ 3, 4, 5 ]): tps.append(TranskriptionPosition(id=i, x=xy, y=xy, height=10, width=10)) partA = Word(id=0, text='Auf', writing_process_id=1, deleted=True, transkription_positions=[ tps[0]]) partB = Word(id=1, text='B', writing_process_id=2, transkription_positions=[tps[1]]) partC = Word(id=2, text='au', writing_process_id=1,transkription_positions=[tps[2]]) word = Word(text='Aufbau', writing_process_id=2, word_parts=[ partA, partB, partC ] ) word.undo_partitioning() self.assertEqual(len(word.transkription_positions), len(tps)) self.assertEqual(len(word.word_parts), 0) """ page = datatypes.page.Page('xml/N_VII_1_page138.xml') word = page.words[77] word.undo_partitioning() self.assertEqual(len(word.word_parts), 0) self.assertEqual(len(word.transkription_positions), 3) update_transkription_position_ids(word) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) print(ET.dump(word_node)) """ def test_split(self): page = Page() pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), 
transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('b') self.assertEqual(previousWord.id, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(currentWord.id, 1) self.assertEqual(nextWord.id, 2) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('bc') self.assertEqual(previousWord.id, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(currentWord.id, 1) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('ab', start_id=10) self.assertEqual(currentWord.id, 10) self.assertEqual(currentWord.text, 'ab') self.assertEqual(currentWord.transkription_positions[0].width, 2.1) self.assertEqual(nextWord.id, 11) self.assertEqual(nextWord.transkription_positions[0].width, 5.2) word_part_objs=[{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\ {'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\ {'text': 'x', 'class':'st22', 'x': 10, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split('Insofer') word_part_objs=[{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split('Insofern') def test_join(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word) self.assertEqual(word.text, 'abc.') other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word, append_at_end_of_new_word=False) self.assertEqual(word.text, '.abc.') """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_get_semanticAndDataDict(self): dictionary = Word.get_semantic_dictionary() #print(dictionary) info_dict = dictionary['properties'].get('isDeletionOfWord') self.assertEqual(SemanticClass.SUPER_PROPERTY in info_dict.keys(), True) super_info_dict = info_dict[SemanticClass.SUPER_PROPERTY] #print(info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME)) def test_simplify_transkription_positions(self): node_string = """ """ nodeA = ET.fromstring(node_string) node_string = """ """ nodeB = ET.fromstring(node_string) word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ]) self.assertEqual(len(word.transkription_positions), 2) word.simplify_transkription_positions() self.assertEqual(len(word.transkription_positions), 1) word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ]) word.transkription_positions[1].writing_process_id = -1 word.simplify_transkription_positions() 
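# Aside: the simplification asserted around here can be pictured as collapsing
# adjacent transkription positions into a single bounding box, with a
# writing_process_id of -1 treated as 'unknown' and adopting its neighbour's
# id. A hypothetical sketch under those assumptions, not the datatypes.word
# implementation:
def merge_position_boxes(boxes):
    """boxes: dicts with 'left', 'top', 'width', 'height'; returns one box."""
    left = min(b['left'] for b in boxes)
    top = min(b['top'] for b in boxes)
    right = max(b['left'] + b['width'] for b in boxes)
    bottom = max(b['top'] + b['height'] for b in boxes)
    return {'left': left, 'top': top, 'width': right - left, 'height': bottom - top}

assert merge_position_boxes([{'left': 0, 'top': 0, 'width': 5, 'height': 10},
                             {'left': 5, 'top': 0, 'width': 5, 'height': 10}]) == \
       {'left': 0, 'top': 0, 'width': 10, 'height': 10}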
self.assertEqual(len(word.transkription_positions), 1) self.assertEqual(word.transkription_positions[0].writing_process_id, 0) """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_partition(self): page = datatypes.page.Page(self.test_file) word = page.words[67] self.assertEqual(word.belongs_to_multiple_writing_processes(), True) word.partition_according_to_writing_process_id() self.assertEqual(len(word.word_parts), 3) self.assertEqual(word.belongs_to_multiple_writing_processes(), False) self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) newWord = Word.create_cls(word_node) self.assertEqual(len(newWord.word_parts), 3) #print(ET.dump(empty_tree.getroot())) def test_partition_deletion(self): page = datatypes.page.Page(self.test_file) word = page.words[67] for transkription_position in word.transkription_positions: transkription_position.deleted = transkription_position.writing_process_id == 1 self.assertEqual(word.has_mixed_status('deleted'), True) word.partition_according_to_deletion() self.assertEqual(len(word.word_parts), 3) self.assertEqual(word.has_mixed_status('deleted'), False) self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True) page = datatypes.page.Page(self.test_file) word = page.words[67] word.partition_according_to_writing_process_id() #print([(word.text, word.deleted) for word in word.word_parts]) word.word_parts[1].transkription_positions[1].deleted = True word.partition_according_to_deletion() self.assertEqual(len(word.word_parts), 4) #print([(word.text, word.deleted) for word in word.word_parts]) partA = Word(text='A', deleted=True) partB = Word(text='SDF', deleted=False) word = Word(text='ASDF', word_parts=[ partA, partB]) self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True) def test_execute_function_on_parts(self): page = datatypes.page.Page(self.test_file) word_parts = [ page.words[67], page.words[68] ] word_parts, none = execute_function_on_parts(word_parts, 'partition_according_to_writing_process_id') self.assertEqual(len(word_parts) == 4, True) def test_process_word_boxes(self): page = datatypes.page.Page(self.pdf_xml) page.source = self.pdf_xml_source page.update_styles(partition_according_to_styles=True) tr = TranskriptionField(page.source) box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\ 'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\ 'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\ 'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\ 'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315'] box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ] indices = [30, 277, 288, 297, 321] for word_id, index in enumerate(indices): word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin) self.assertEqual(word_over_box is not None, True) self.assertEqual(word_over_box == page.words[index] or word_over_box in page.words[index].word_parts, True) #self.assertEqual(word_over_box in page.words[index].word_parts, True) def test_process_word_several_boxesOn1LIne(self): page = datatypes.page.Page(self.pdf_xml) page.source = 
self.pdf_xml_source for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() tr = TranskriptionField(page.source) box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\ 'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\ 'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\ 'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\ 'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315'] box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ] indices = [30, 277, 288, 297, 321] empty_tree = ET.ElementTree(ET.Element('page')) for word_id, index in enumerate(indices): word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin) self.assertEqual(word_over_box is not None, True) def test_split_according_to_status(self): page = datatypes.page.Page(self.test_file) word = page.words[67] for transkription_position in word.transkription_positions: transkription_position.text = 'asdf'\ if transkription_position.writing_process_id == 1\ else word.text self.assertEqual(word.has_mixed_status('text'), True) new_words = word.split_according_to_status('text') #print([word.text for word in new_words ]) self.assertEqual(len(new_words) > 1, True) self.assertEqual(new_words[0].id, word.id) self.assertEqual(new_words[0].deleted, word.deleted) self.assertEqual(new_words[1].id, word.id+1) manuscript = ArchivalManuscriptUnity() page = datatypes.page.Page(self.test_file) word = page.words[67] page.words = [ word ] page.update_styles(manuscript=manuscript) new_words = word.split_according_to_status('style', splits_are_parts=True) self.assertEqual(len(word.word_parts), 3) def test__create_new_word(self): manuscript = ArchivalManuscriptUnity() page = datatypes.page.Page(self.test_file) word = page.words[67] page.words = [ word ] page.update_styles(manuscript=manuscript) newWord = word._create_new_word([ word.transkription_positions[0] ], 'style') for key in Word.COPY_PROPERTY_KEY: self.assertEqual(newWord.__dict__[key], word.__dict__[key]) self.assertEqual(len(newWord.styles), 1) def test__get_partial_word_over_box(self): word = Word(text='test', transkription_positions=[ TranskriptionPosition(id=0), TranskriptionPosition(id=1) ]) word.transkription_positions[0].has_box = Box(earlier_text='asdf') word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 2) partA = Word(id=0, text='A', transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition(), TranskriptionPosition(id=1)]) partB.transkription_positions[0].has_box = Box(earlier_text='asdf') word = Word(text='ASDF', word_parts=[ partA, partB]) word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 2) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_footnotes.py =================================================================== --- tests_svgscripts/test_footnotes.py (revision 98) +++ tests_svgscripts/test_footnotes.py (revision 99) @@ -1,54 +1,69 @@ import unittest from os import sep, path, remove from os.path import isdir, isfile, dirname import shutil import sys import lxml.etree as ET import warnings import sys sys.path.append('svgscripts') import datatypes.footnotes from 
datatypes.footnotes import FootnoteColumns, extract_footnotes, extract_footnotes_as_strings, UNITTESTING, DEBUG from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField class TestExtractFootnotes(unittest.TestCase): def setUp(self): datatypes.footnotes.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_footnote = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.test_footnote_verso = DATADIR + sep + 'N_VII_1_xp5_4_page5.svg' self.test_footnote_recto = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg' self.test_footnote_multi = DATADIR + sep + 'N_VII_1_xp5_4_page13.svg' self.test_footnote_multi_xml = DATADIR + sep + 'N_VII_1_page013.xml' self.test_categorize_footnote = DATADIR + sep + 'N_VII_1_page006.xml' def test_extract_footnotes(self): footnotes = extract_footnotes_as_strings(svg_file=self.test_footnote_multi, contains_string='Anschlußzeichen') self.assertEqual(len(footnotes), 4) page = Page(self.test_footnote_multi_xml) footnotes = extract_footnotes(page, svg_file=self.test_footnote_multi, contains_string='Anschlußzeichen') self.assertEqual(len(footnotes), 4) footnotes = extract_footnotes(page, svg_file=self.test_footnote_multi) self.assertEqual(len(footnotes), 7) + """ + page = Page('xml/Mp_XIV_page418.xml') + footnotes = extract_footnotes(page, skip_after=183) + print(footnotes) + """ def test_columns(self): svg_tree = ET.parse(self.test_footnote_multi) transkription_field = TranskriptionField(self.test_footnote_multi) nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ]) footnote_columns = FootnoteColumns(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, None) self.assertEqual(len(footnote_columns.footnote_columns), 2) footnote_columns.register_index(184) footnote_columns.append('asdf') self.assertEqual(len(footnote_columns.footnote_columns[0]), 1) with self.assertRaises(Exception): FootnoteColumns(svg_tree.getroot().nsmap, [], bottom_values, None) + """ + local_file = page.source#'/home/knister0/ownCloud/myNietzscheDE/KGW-IX_12/Bd_12_XIV-XVI_Druck_als_SVG//07.svg' + svg_tree = ET.parse(local_file) + transkription_field = TranskriptionField(local_file) + nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field),\ + svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] + bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ]) + footnote_columns = FootnoteColumns(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, None) + """ + if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_convert_wordPositions.py =================================================================== --- tests_svgscripts/test_convert_wordPositions.py (revision 98) +++ tests_svgscripts/test_convert_wordPositions.py (revision 99) @@ -1,67 +1,67 @@ import unittest from os import sep, path, remove import lxml.etree as ET import lxml.html import sys sys.path.append('svgscripts') import convert_wordPositions from convert_wordPositions 
import Converter, SVGConverter, HTMLConverter, JSONConverter from datatypes.page import Page from datatypes.page_creator import PageCreator from datatypes.transkription_position import TranskriptionPosition class TestConverter(unittest.TestCase): def setUp(self): DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_file = DATADIR + sep + 'test.xml' self.test_svg_file = DATADIR + sep + 'test421.svg' self.outputfile_txt = 'test.txt' self.outputfile_html = 'test.html' self.outputfile_svg = 'test.svg' self.outputfile_json = 'test.json' def test_main(self): - argv = ['-t', '-s', self.test_svg_file, self.test_file] + argv = ['-x', '-s', self.test_svg_file, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) - argv = ['-t', '-s', self.test_svg_file, '-o', self.outputfile_txt, self.test_file] + argv = ['-x', '-s', self.test_svg_file, '-o', self.outputfile_txt, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) self.assertEqual(path.isfile(self.outputfile_txt), True) - argv = ['-t', '-s', self.test_svg_file, '-o', self.outputfile_html, self.test_file] + argv = ['-x', '-s', self.test_svg_file, '-o', self.outputfile_html, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) self.assertEqual(path.isfile(self.outputfile_html), True) html_tree = lxml.html.parse(self.outputfile_html) self.assertEqual(html_tree.getroot().tag, 'html') - argv = ['-t', '-s', self.test_svg_file, '-o', self.outputfile_svg, self.test_file] + argv = ['-x', '-s', self.test_svg_file, '-o', self.outputfile_svg, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) self.assertEqual(path.isfile(self.outputfile_svg), True) svg_tree = ET.parse(self.outputfile_svg) self.assertEqual(svg_tree.getroot().tag, '{http://www.w3.org/2000/svg}svg') - argv = ['-t', '-k', 'number', '-o', self.outputfile_json, self.test_file] + argv = ['-x', '-k', 'number', '-o', self.outputfile_json, self.test_file] self.assertEqual(convert_wordPositions.main(argv), 0) def test_create_converter(self): page = PageCreator(self.test_file, svg_file=self.test_svg_file) converter = Converter.CREATE_CONVERTER(page, False, 'SVG') self.assertEqual(isinstance(converter, SVGConverter), True) converter = Converter.CREATE_CONVERTER(page, False, 'HTML') self.assertEqual(isinstance(converter, HTMLConverter), True) converter = Converter.CREATE_CONVERTER(page, False, 'JSON') self.assertEqual(isinstance(converter, JSONConverter), True) converter = Converter.CREATE_CONVERTER(page, False) self.assertEqual(isinstance(converter, Converter), True) def test_get_transkription_positions(self): tp = [ TranskriptionPosition(), TranskriptionPosition(), TranskriptionPosition() ] page = PageCreator(self.test_file, svg_file=self.test_svg_file) converter = Converter.CREATE_CONVERTER(page, False, 'SVG') converter._get_transkription_positions(tp, stage_version='1+') def tearDown(self): bool(path.isfile(self.outputfile_txt)) and remove(self.outputfile_txt) bool(path.isfile(self.outputfile_html)) and remove(self.outputfile_html) bool(path.isfile(self.outputfile_svg)) and remove(self.outputfile_svg) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_process_files.py =================================================================== --- tests_svgscripts/test_process_files.py (revision 98) +++ tests_svgscripts/test_process_files.py (revision 99) @@ -1,50 +1,55 @@ import unittest from os import sep, path, remove -from os.path import isfile +from os.path import isfile, dirname import lxml.etree as ET 
import warnings import sys sys.path.append('svgscripts') import process_files from process_files import MyCSVHandler, MyErrorHandler +from datatypes.page_creator import PageCreator class TestProcessFiles(unittest.TestCase): def setUp(self): process_files.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.csv_file = DATADIR + sep + 'bd_12_Layout.csv' self.dir = DATADIR + sep + 'pdfsvg' self.csv_dir = self.dir + sep + 'csv' self.manuscript = self.dir + sep + 'W_II_1.xml' self.graphic_file = self.dir + sep + 'W_II_1_page001_web.svg' self.multipdf = DATADIR + sep + 'Bd_12_Mp_XIV_-XVI_Druck.pdf' @unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover') def test_main(self): #self.assertEqual(process_files.main([ self.manuscript ]), 1) argv = [ '-g', '-x', self.dir, '-s', self.dir, self.dir ] self.assertEqual(process_files.main(argv), 0) self.assertEqual(isfile(self.graphic_file), True) def test_csvhandler(self): csv_handler = MyCSVHandler(self.csv_file, self.multipdf, self.csv_dir) self.assertEqual(len([ entry for entry in csv_handler.csv_entries if entry[MyCSVHandler.ENTRY_KEY_FILE] is not None ]), 9) - #self.assertEqual(csv_handler.process_files('asdf', 'asdf'), 0) + csv_handler = MyCSVHandler(self.csv_file, self.multipdf, self.csv_dir, title="Mp XV") + self.assertEqual(len([ entry for entry in csv_handler.csv_entries if entry[MyCSVHandler.ENTRY_KEY_FILE] is not None ]), 2) #print(csv_handler.csv_entries) + #self.assertEqual(csv_handler.process_files('asdf', 'asdf'), 0) - def test_is_page_ok(self): + def test_page_status(self): self.assertEqual(process_files.is_page_ok(manuscript_file=self.manuscript, page_number=2), True) + #self.assertEqual(process_files.page_has_status(process_files.WARN_MISSING_USE_NODE, manuscript_file=self.manuscript, page_number='1'), True) + #self.assertEqual(process_files.get_page_output_file('2', manuscript_file=self.manuscript), dirname(self.manuscript) + sep + 'W_II_1_page001.xml') def test_is_svg_ok(self): self.assertEqual(process_files.is_svg_ok(manuscript_file=self.manuscript, page_number=1), True) @unittest.skip('') def test_run(self): error_handler = MyErrorHandler() error_handler.run(page_number='15') if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_extract_line_continuation.py =================================================================== --- tests_svgscripts/test_extract_line_continuation.py (revision 98) +++ tests_svgscripts/test_extract_line_continuation.py (revision 99) @@ -1,48 +1,48 @@ import unittest from os import sep, path, remove from os.path import isfile import lxml.etree as ET import warnings import sys sys.path.append('svgscripts') import extract_line_continuation from datatypes.page import Page from datatypes.transkriptionField import TranskriptionField class TestExtractLineContinuation(unittest.TestCase): def setUp(self): extract_line_continuation.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.w_I_8_125_svg = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.w_I_8_125_xml = DATADIR + sep + 'W_I_8_new_page125.xml' def test_get_arrow_y(self): arrow = ET.Element('text') arrow.set('transform', 'matrix(1 0 0 1 10 20)') self.assertEqual(extract_line_continuation._get_arrow_y(arrow), 20.0) tspan = ET.SubElement(arrow, 'tspan') tspan.set('y', '10.0') self.assertEqual(extract_line_continuation._get_arrow_y(tspan), 30.0) def test_get_line_of_arrow(self): svg_tree = ET.parse(self.w_I_8_125_svg) page = Page(self.w_I_8_125_xml) 
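# Aside: test_get_arrow_y above pins the arithmetic down: for a bare <text>
# node the arrow's y is the ty offset of its matrix transform (20.0), while a
# <tspan> child adds its own y attribute to the parent's ty (20.0 + 10.0 =
# 30.0). A restatement of just that sum, with hypothetical names:
def arrow_y(parent_ty, tspan_y=None):
    return parent_ty if tspan_y is None else parent_ty + float(tspan_y)

assert arrow_y(20.0) == 20.0 and arrow_y(20.0, '10.0') == 30.0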
transkription_field = TranskriptionField(self.w_I_8_125_svg) arrows = extract_line_continuation._extract_arrow_nodes(svg_tree, 'st7') line = extract_line_continuation._get_line_of_arrow(arrows[0], page, transkription_field) self.assertEqual(line.id, 15) def test_extract_line_continuations(self): page = Page(self.w_I_8_125_xml) extract_line_continuation.extract_line_continuations(page, svg_file=self.w_I_8_125_svg) lines_with_continuations = [ line for line in page.lines if len(line.editor_comments) > 0] self.assertEqual(len(lines_with_continuations), 2) page = Page('xml/N_VII_1_page029.xml') extract_line_continuation.extract_line_continuations(page) lines_with_continuations = [ line for line in page.lines if len(line.editor_comments) > 0] - print(lines_with_continuations) + #print(lines_with_continuations) self.assertEqual(len(lines_with_continuations), 1) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_util.py =================================================================== --- tests_svgscripts/test_util.py (revision 98) +++ tests_svgscripts/test_util.py (revision 99) @@ -1,229 +1,242 @@ import unittest from os import sep, path, remove, listdir from os.path import isdir, isfile, dirname, basename import shutil import sys import lxml.etree as ET import sys import tempfile import warnings sys.path.append('svgscripts') import util from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT from datatypes.faksimile import FaksimilePage from datatypes.page import Page from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.word_position import WordPosition from datatypes.word import Word sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT class TestCopy(unittest.TestCase): def setUp(self): util.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_dir = DATADIR self.faksimile_dir = DATADIR + sep + 'faksimile_svg' self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg' self.image = DATADIR + sep + 'image.jpg' self.svg_testrecord = DATADIR + sep + 'TESTRECORD.svg' self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' self.tmp_dir = tempfile.mkdtemp() def test_copy(self): tmp_image = self.tmp_dir + sep + basename(self.image) target_file = 'asdf.svg' shutil.copy(self.image, self.tmp_dir) util.copy_faksimile_svg_file(target_file, faksimile_source_file=self.faksimile_file,\ target_directory=self.tmp_dir, local_image_path=tmp_image) self.assertEqual(isfile(self.tmp_dir + sep + target_file), True) util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_file,\ target_directory=self.tmp_dir, local_image_path=tmp_image) self.assertEqual(isfile(self.tmp_dir + sep + basename(self.faksimile_file)), True) with self.assertRaises(Exception): util.copy_faksimile_svg_file() with self.assertRaises(Exception): util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_source_file) def test_copy_xml(self): old_page = Page(self.xml_file) xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) self.assertEqual(isfile(xml_file), True) page = Page(xml_file) self.assertEqual(len(page.words), len(old_page.words)) self.assertEqual(len(page.line_numbers), 0) def test_create_highlighted_svg_file(self): target_file = self.tmp_dir + sep + basename(self.faksimile_file) tmp_image = self.tmp_dir + sep + basename(self.image) faksimile_tree = 
ET.parse(self.faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } node_ids = ['rect947', 'rect951', 'rect953', 'rect955', 'rect959', 'rect961', 'rect963'] highlight_color = 'blue' util.create_highlighted_svg_file(faksimile_tree, node_ids, target_directory=self.tmp_dir, highlight_color=highlight_color, namespaces=namespaces) self.assertEqual(isfile(target_file), True) new_tree = ET.parse(target_file) for node in new_tree.xpath('//ns:rect[@fill="{0}"]|//ns:path[@fill="{0}"]'.format(highlight_color), namespaces=namespaces): node_ids.remove(node.get('id')) self.assertEqual(len(node_ids), 0) def test_get_empty_node_ids(self): faksimile_tree = ET.parse(self.faksimile_file) faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] empty_node_ids = util.get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page) self.assertEqual('rect1085' in empty_node_ids, True) def test_record_changes(self): new_tree = ET.parse(self.faksimile_file) old_tree = ET.parse(self.faksimile_file) empty_node_id = 'rect1085' title_node_id = 'test001' namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() } node = new_tree.xpath('//ns:rect[@id="{0}"]'.format(empty_node_id), namespaces=namespaces)[0] title = ET.SubElement(node, 'title', attrib={ 'id': title_node_id }) title.text = 'test' new_file = self.tmp_dir + sep + 'new.svg' old_file = self.tmp_dir + sep + 'old.svg' util.copy_faksimile_svg_file(target_file=new_file, faksimile_tree=new_tree) util.copy_faksimile_svg_file(target_file=old_file, faksimile_tree=old_tree) util.record_changes(old_file, new_file, [ empty_node_id ], namespaces=namespaces) test_tree = ET.parse(old_file) self.assertEqual(len(test_tree.xpath('//ns:rect[@id="{0}"]/ns:title[@id="{1}"]'.format(empty_node_id, title_node_id), namespaces=namespaces)), 1) def test_replace_chars(self): page = Page(self.xml_file) faksimile_tree = ET.parse(self.faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } word_position = WordPosition(id='rect1159', text='„Gedächtniß"') wps, texts = util.replace_chars(page.words, [ word_position ]) self.assertEqual(texts[0].endswith('“'), True) self.assertEqual(wps[0].text.endswith('“'), True) word_position = WordPosition(id='rect1173', text='-') wps, texts = util.replace_chars(page.words, [ word_position ]) self.assertEqual(wps[0].text.endswith('–'), True) def test_mismatch_words(self): page = Page(self.xml_file) faksimile_tree = ET.parse(self.faksimile_file) faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] page = Page('xml/N_VII_1_page174.xml') faksimile_tree = ET.parse('faksimile_svg/N-VII-1,173et174.svg') faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0] self.assertEqual('-' in [ tp.text for tp in faksimile_page.word_positions], True) wps, texts = util.replace_chars(page.words,faksimile_page.word_positions) self.assertEqual('–' in texts, True) self.assertEqual(len([ faksimile_position for faksimile_position in wps\ if faksimile_position.text == '–' ]), 4) mismatching_words, mismatching_faksimile_positions = util.get_mismatching_ids(page.words, faksimile_page.word_positions) self.assertEqual(len([word for word in mismatching_words if word.text.endswith('“') ]), 0) self.assertEqual(len([word for word in mismatching_words if word.text.endswith('–') ]), 0) + def test_process_warnings(self): + with warnings.catch_warnings(record=True) as w: + 
warnings.simplefilter('default') + warnings.warn('Test1: asdf') + warnings.warn('Test2: asdf') + status = util.process_warnings4status(w, ['Test1', 'Test2' ], 'asdf', 'OK', status_prefix='with warnings') + #print(status) + self.assertTrue('Test1' in status.split(':')) + self.assertTrue('Test2' in status.split(':')) + @unittest.skip('test uses external program, has been tested') def test_show_files(self): list_of_files = [ self.test_dir + sep + file for file in listdir(self.test_dir) if file.endswith('pdf') ][0:2] util.ExternalViewer.show_files(single_file=self.faksimile_file, list_of_files=list_of_files) def test_record_changes_to_page(self): page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 1 ]) old_length = len(page.words) self.assertEqual(page.words[1].text, 'asdf') self.assertEqual(page.words[1].transkription_positions[0].width, 353) page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 13 ]) self.assertEqual(page.words[13].text, 'er') self.assertEqual(page.words[14].text, '=') self.assertEqual(len(page.words), old_length+1) page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 64 ]) self.assertEqual(page.words[64].text, 'Simplifications-apparat') self.assertEqual(len(page.words[64].transkription_positions), 3) self.assertEqual(len(page.words), old_length-1) @unittest.skipUnless(__name__ == "__main__", 'tests all words') def test_extended__record_changes_to_page(self): page = Page(self.xml_file) old_length = len(page.words) page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord) self.assertEqual(page.words[1].text, 'asdf') self.assertEqual(page.words[13].text, 'er') self.assertEqual(page.words[14].text, '=') self.assertEqual(page.words[65].text, 'Simplifications-apparat') self.assertEqual(len(page.words), old_length) def test_copy_faksimile_update_image_location(self): test_dir = self.tmp_dir #FAKSIMILE_LOCATION + '/Myriam/Fertig/' util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir) with self.assertWarns(UserWarning): util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir) def test_record_changes_on_xml(self): old_page = Page(self.xml_file) xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) tree = ET.parse(xml_file) node = tree.xpath('//word[@id="135"]')[0] counter =0 while node.get('text') != 'gar' or counter > 5: counter += 1 nextnode = node.getnext() node.set('text', node.get('text') + nextnode.get('text')) for element in nextnode.getchildren(): node.append(element) nextnode.getparent().remove(nextnode) write_pretty(xml_element_tree=tree, file_name=xml_file,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file) self.assertEqual(len(new_page.words), len(old_page.words)-2) self.assertEqual(len([ word for word in new_page.words if word.text == 'gar']), 1) old_page = Page(self.xml_file) xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) tree = ET.parse(xml_file) node = tree.xpath('//word[@id="138"]')[0] counter =0 while node.get('text') != 'nichtvorkommt.' 
while node.get('text') != 'nichtvorkommt.' and counter < 5: counter += 1 nextnode = node.getnext() node.set('text', node.get('text') + nextnode.get('text')) for element in nextnode.getchildren(): node.append(element) nextnode.getparent().remove(nextnode) node.set('split', 'nicht vorkommt.') write_pretty(xml_element_tree=tree, file_name=xml_file,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) joined_page = Page(xml_file) self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.']), 1) self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.'][0].split_strings), 2) self.assertEqual(len(joined_page.words), len(old_page.words)-1) new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file) self.assertEqual(len(new_page.words), len(old_page.words)) self.assertEqual(len([word for word in new_page.words if word.text == 'vorkommt.']), 1) self.assertEqual(len([word for word in old_page.words if word.text == 'nicht']),\ len([word for word in new_page.words if word.text == 'nicht'])) xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir) tree = ET.parse(xml_file) old_page = Page(xml_file) nodes = tree.xpath('//word[@id>="85" and @id<="87"]') self.assertEqual(len(nodes), 3) prevWordText = nodes[0].get('text') nodes[0].set('join', prevWordText + 'z') nodes[1].set('split', 'z u') lastWordText = nodes[2].get('text') nodes[2].set('join', 'u' + lastWordText) write_pretty(xml_element_tree=tree, file_name=xml_file,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) joined_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file) self.assertEqual(len(joined_page.words), len(old_page.words)-1) def test_back_up(self): test_dir = self.tmp_dir page = Page(self.xml_file) target_file_name = util.back_up(page, self.xml_file, bak_dir=test_dir) self.assertEqual(isfile(target_file_name), True) + svg_tree = ET.parse(page.svg_file) + namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } + util.back_up_svg_file(svg_tree, namespaces) def tearDown(self): shutil.rmtree(self.tmp_dir, ignore_errors=True) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_reference.py =================================================================== --- tests_svgscripts/test_reference.py (revision 98) +++ tests_svgscripts/test_reference.py (revision 99) @@ -1,47 +1,50 @@ import unittest from os import sep, path import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.reference import Reference class TestReference(unittest.TestCase): def test_init(self): reference_string = '5' reference = Reference.create_cls(reference_string=reference_string, title='ASDF') self.assertEqual(reference.first_line, 5) reference_string = '5,5' reference = Reference.create_cls(reference_string=reference_string, title='ASDF') self.assertEqual(reference.first_line, 5) self.assertEqual(reference.page_number, str(5)) reference_string = 'ASDF 5,5-8' reference = Reference.create_cls(reference_string=reference_string) self.assertEqual(reference.title, 'ASDF') self.assertEqual(reference.first_line, 5) self.assertEqual(reference.last_line, 8) self.assertEqual(reference.page_number, str(5)) + reference_string = 'ASDF 5,5 a .'
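+ # a reference string with trailing tokens should still yield the page number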
+ reference = Reference.create_cls(reference_string=reference_string) + self.assertEqual(reference.page_number, str(5)) def test_attach_create(self): reference_string = 'ASDF 5,5-8' reference = Reference.create_cls(is_uncertain=True, reference_string=reference_string) empty_tree = ET.ElementTree(ET.Element('page')) reference.attach_object_to_tree(empty_tree) #print(ET.dump(empty_tree.getroot())) nodes = empty_tree.xpath('//' + Reference.XML_TAG) self.assertEqual(len(nodes), 1) reference_copy = Reference.create_cls(node=nodes[0]) self.assertEqual(reference.id, reference_copy.id) self.assertEqual(reference.is_uncertain, reference_copy.is_uncertain) self.assertEqual(reference.title, reference_copy.title) self.assertEqual(reference.page_number, reference_copy.page_number) self.assertEqual(reference.first_line, reference_copy.first_line) self.assertEqual(reference.last_line, reference_copy.last_line) def test_get_semantic_dictionary(self): dictionary = Reference.get_semantic_dictionary() #print(dictionary) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_show_highlighted_svg_file.py =================================================================== --- tests_svgscripts/test_show_highlighted_svg_file.py (revision 0) +++ tests_svgscripts/test_show_highlighted_svg_file.py (revision 99) @@ -0,0 +1,30 @@ +import unittest +from os import sep, path, remove, listdir +from os.path import isdir, isfile, dirname, basename +import shutil +import sys +import lxml.etree as ET +import tempfile +import warnings + +sys.path.append('svgscripts') + +import show_highlighted_svg_file + +class TestCopy(unittest.TestCase): + def setUp(self): + show_highlighted_svg_file.UNITTESTING = True + DATADIR = path.dirname(__file__) + sep + 'test_data' + self.faksimile_dir = DATADIR + sep + 'faksimile_svg' + self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg' + self.tmp_dir = tempfile.mkdtemp() + + def test_main(self): + show_highlighted_svg_file.main([self.faksimile_file, 'Muster']) + + def tearDown(self): + shutil.rmtree(self.tmp_dir, ignore_errors=True) + +if __name__ == "__main__": + unittest.main() Index: tests_svgscripts/test_join_faksimileAndTranskription.py =================================================================== --- tests_svgscripts/test_join_faksimileAndTranskription.py (revision 98) +++ tests_svgscripts/test_join_faksimileAndTranskription.py (revision 99) @@ -1,122 +1,123 @@ import unittest from os import sep, path, remove from os.path import isdir, isfile, dirname import shutil import sys import lxml.etree as ET import warnings sys.path.append('svgscripts') import join_faksimileAndTranskription from datatypes.faksimile import FaksimilePage from datatypes.page import Page from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.word_position import WordPosition class TestJoin(unittest.TestCase): def setUp(self): join_faksimileAndTranskription.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.faksimile_dir = DATADIR + sep + 'faksimile_svg' self.manuscript = DATADIR + sep + 'N_VII_1.xml' self.manuscript_copy = self.manuscript.replace('.', '_copy.') self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg' self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml' self.Mp_XIV_1_mytest_421 = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml' self.correction_dir = DATADIR + sep + 'correction_dir' self.page138 = DATADIR + sep + 'N_VII_1_page138.xml'
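A note on an idiom that recurs throughout these tests: lxml exposes an element's namespaces via getroot().nsmap, but the default namespace is keyed by None, which XPath cannot reference, so the tests rebuild the map under the prefix 'ns'. A minimal self-contained sketch (the inline SVG string is illustrative only, not part of the test data):

    import lxml.etree as ET

    svg = '<svg xmlns="http://www.w3.org/2000/svg"><rect id="rect947"/></svg>'
    tree = ET.ElementTree(ET.fromstring(svg))
    # remap the default namespace (key None) to the prefix 'ns' for XPath
    namespaces = { k if k is not None else 'ns': v for k, v in tree.getroot().nsmap.items() }
    assert len(tree.xpath('//ns:rect[@id="rect947"]', namespaces=namespaces)) == 1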
def test_sort_words(self): page = Page(self.Mp_XIV_1_mytest_421) words_line7 = [ word for word in page.words if word.line_number == 7 ] page.words = words_line7 sorted_words = join_faksimileAndTranskription.sort_words(page) self.assertEqual(len(sorted_words), len(words_line7)) for index, word in enumerate(words_line7): self.assertEqual(sorted_words[index], word) def test_sort_faksimile_positions(self): faksimile_tree = ET.parse(self.faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces) self.assertEqual(len(faksimile_pages), 2) svg_pos_file, manuscript_file = join_faksimileAndTranskription.get_svgPosFile_and_manuscriptFile(faksimile_pages[0], manuscript_file=self.manuscript, redo_ok=True) sorted_positions = join_faksimileAndTranskription.sort_faksimile_positions(faksimile_pages[0].word_positions) page = Page(svg_pos_file) #print(max(sorted_positions).text) for index in range(0, 10): id = sorted_positions[index].id if len(faksimile_tree.getroot().xpath('//ns:rect[@id="{0}"]/ns:title/text()|//ns:path[@id="{0}"]/ns:title/text()'\ .format(id), namespaces=namespaces)) > 0: word_text = faksimile_tree.getroot().xpath('//ns:rect[@id="{0}"]/ns:title/text()|//ns:path[@id="{0}"]/ns:title/text()'\ .format(id), namespaces=namespaces)[0] #print(sorted_positions[index].left, sorted_positions[index].top, word_text, page.words[index].text) self.assertEqual(word_text, page.words[index].text) @unittest.skipUnless(__name__ == "__main__", 'test uses path from within dir') def test_get_filelist_and_manuscript_file(self): file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(self.faksimile_dir, self.manuscript) self.assertEqual(len(file_list), 1) self.assertEqual(file_list[0], self.faksimile_file) self.assertEqual(manuscript_file, self.manuscript) file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(self.manuscript, self.faksimile_file) self.assertEqual(len(file_list), 1) self.assertEqual(file_list[0], self.faksimile_file) self.assertEqual(manuscript_file, self.manuscript) file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(self.manuscript, correction_dir=self.correction_dir) self.assertEqual(len(file_list), 1) self.assertEqual(file_list[0], self.page138) @unittest.skipUnless(__name__ == "__main__", 'test uses path from within dir') def test_get_svgPosFile_and_manuscriptFile(self): faksimile_tree = ET.parse(self.faksimile_file) faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree) self.assertEqual(len(faksimile_pages), 2) svg_pos_file, manuscript_file = join_faksimileAndTranskription.get_svgPosFile_and_manuscriptFile(faksimile_pages[0], manuscript_file=self.manuscript, redo_ok=True) self.assertEqual(svg_pos_file, self.manuscript.replace('.', '_page00{}.'.format(faksimile_pages[0].page_number))) self.assertEqual(manuscript_file, self.manuscript) + @unittest.skip('join changed ... 
fix me') def test_join_faksimileAndTranskription(self): self.assertEqual(join_faksimileAndTranskription.join_faksimileAndTranskription(self.faksimile_file, self.manuscript), 0) #self.assertEqual(join_faksimileAndTranskription.join_faksimileAndTranskription(self.faksimile_file, self.manuscript, test_word_text='gar'), 0) @unittest.skip('function update_writing_process is deprecated') def test_update_writing_process(self): page = Page(self.xml_file) word = page.words[12] self.assertEqual(len(word.faksimile_positions), 1) self.assertEqual(word.faksimile_positions[0].writing_process_id, -1) join_faksimileAndTranskription.update_writing_process(word) self.assertEqual(word.faksimile_positions[0].writing_process_id, 0) #@unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover') @unittest.skip('test takes too long, has been tested') def test_fix_errors(self): page = Page(self.xml_file) word_position = WordPosition(id='rect945', text='Lenken') exit_status = join_faksimileAndTranskription.fix_errors(self.faksimile_file, [ word_position], [page.words[12]], xml_source_file=self.xml_file, manuscript_file=self.manuscript ) self.assertEqual(exit_status, 0) @unittest.skip('tested with local file') def test_join_single_chars(self): page = Page('xml/N_VII_1_page016.xml') words = join_faksimileAndTranskription.sort_words(page) join_faksimileAndTranskription.join_single_char_words(words) new_words = [ word for word in words if word.text == 'selber' ] self.assertEqual(len(new_words), 1) new_words = [ word for word in words if word.text == 's' ] self.assertEqual(len(new_words), 0) def test_get_mismatching_ids(self): page = Page(self.xml_file) word_position = WordPosition(id='rect945', text='Lenken') mwords, mfps = join_faksimileAndTranskription.get_mismatching_ids([ page.words[12]], [ word_position ]) self.assertEqual(mwords[0].text, 'Denken') self.assertEqual(mfps[0].text, 'Lenken') if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_fix_missing_glyphs.py =================================================================== --- tests_svgscripts/test_fix_missing_glyphs.py (revision 98) +++ tests_svgscripts/test_fix_missing_glyphs.py (revision 99) @@ -1,79 +1,83 @@ import unittest from os import sep, path, remove from os.path import isdir, isfile, dirname import shutil import sys import lxml.etree as ET import warnings sys.path.append('svgscripts') import fix_missing_glyphs from datatypes.page import Page from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField class TestMissingGlyphs(unittest.TestCase): def setUp(self): fix_missing_glyphs.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.manuscript = DATADIR + sep + 'pdfsvg' + sep + 'W_II_1.xml' self.manuscript_copy = self.manuscript.replace('.', '_copy.') self.svgposfile = DATADIR + sep + 'pdfsvg' + sep + 'W_II_1_page015.xml' self.svgposfile_copy = DATADIR + sep + 'pdfsvg' + sep + 'W_II_1_page015_copy.xml' def test_main(self): argv_fileNotFound = [ 'asdf' ] with self.assertRaises(FileNotFoundError): fix_missing_glyphs.main(argv_fileNotFound) #shutil.copy(self.manuscript, self.manuscript_copy) #shutil.copy(self.svgposfile, self.svgposfile_copy) #self.assertEqual(fix_missing_glyphs.main([self.manuscript_copy]), 0) #shutil.copy(self.svgposfile_copy, self.svgposfile) #remove(self.manuscript_copy) shutil.copy(self.svgposfile, self.svgposfile_copy)
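# running main() on the copied file must succeed with exit status 0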
self.assertEqual(fix_missing_glyphs.main([self.svgposfile_copy]), 0) remove(self.svgposfile_copy) + @unittest.skip('fix me') def test_update_word(self): page = Page(self.svgposfile) pwps = page.words[5].transkription_positions[0].positional_word_parts new_left = 10 old_left = pwps[0].left new_width = pwps[0].width + old_left - new_left pwps[0].left = new_left pwps[0].width = new_width pwps[0].text = 'X' + pwps[0].text original_text = page.words[5].text word = page.words[5] #print(ET.dump(pwp_node)) fix_missing_glyphs.update_word(word, word.transkription_positions[0], word.transkription_positions[0].positional_word_parts[0], [ pwps[0] ]) #print(ET.dump(pwp_node.getparent().getparent())) self.assertEqual(word.transkription_positions[0].positional_word_parts[0].width, new_width) self.assertEqual(word.text, 'X' + original_text) + @unittest.skip('Fix me') def test_find_missing_glyph_for_pwp(self): page = Page(self.svgposfile) transkription_field = TranskriptionField(page.svg_file) svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } positional_word_part_node = page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')[0]\ if len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) > 0 else None + print(ET.dump(positional_word_part_node)) pwp = PositionalWordPart(node=positional_word_part_node) pwps = fix_missing_glyphs.find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=transkription_field.xmin, ymin=transkription_field.ymin) self.assertEqual(len(pwps), 2) + @unittest.skip('fix me') def test_get_filelist_and_manuscript_file(self): file_list, manuscript_file = fix_missing_glyphs.get_filelist_and_manuscript_file(self.manuscript, self.svgposfile) self.assertEqual(len(file_list), 1) self.assertEqual(file_list[0], self.svgposfile) self.assertEqual(manuscript_file, self.manuscript) file_list, manuscript_file = fix_missing_glyphs.get_filelist_and_manuscript_file(self.svgposfile, self.manuscript) self.assertEqual(len(file_list), 1) self.assertEqual(file_list[0], self.svgposfile) self.assertEqual(manuscript_file, self.manuscript) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_simple_word.py =================================================================== --- tests_svgscripts/test_simple_word.py (revision 98) +++ tests_svgscripts/test_simple_word.py (revision 99) @@ -1,28 +1,36 @@ import unittest from os import sep, path from os.path import dirname, isdir import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.matrix import Matrix from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.simple_word import SimpleWord from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.word import Word class TestSimpleWord(unittest.TestCase): def test_get_semanticAndDataDict(self): dictionary = SimpleWord.get_semantic_dictionary() #print(dictionary) def test_create_cls_from_word(self): word = Word(text='test') mark = MarkForeignHands.create_cls_from_word(word) self.assertEqual(mark.text, word.text) self.assertEqual(type(mark), MarkForeignHands) - + + def test_attach(self): + word = SimpleWord() + word.transkription_positions.append(TranskriptionPosition(id=0)) + word.transkription_positions.append(TranskriptionPosition(id=0)) + tree = ET.Element('page') + word.attach_word_to_tree(tree) + self.assertEqual(len(tree.xpath('//' + 
TranskriptionPosition.XML_TAG)), 2) + if __name__ == "__main__": unittest.main() Index: fixes/test_fix_old_data.py =================================================================== --- fixes/test_fix_old_data.py (revision 0) +++ fixes/test_fix_old_data.py (revision 99) @@ -0,0 +1,72 @@ +import lxml.etree as ET +from os import sep, path, remove +from os.path import isdir, isfile, dirname, basename +import shutil +import sys +import tempfile +import unittest +import warnings + +import fix_old_data + +sys.path.append('svgscripts') +from datatypes.faksimile import FaksimilePage +from datatypes.mark_foreign_hands import MarkForeignHands +from datatypes.page import Page +from datatypes.path import Path +from datatypes.positional_word_part import PositionalWordPart +from datatypes.text_connection_mark import TextConnectionMark +from datatypes.transkriptionField import TranskriptionField +from datatypes.word import Word +from datatypes.word_position import WordPosition +from process_words_post_merging import MERGED_DIR + + +class TestFixFaksimile(unittest.TestCase): + def setUp(self): + fix_old_data.UNITTESTING = True + DATADIR = path.dirname(__file__) + sep + 'test_data' + self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml' + self.fix_transkription_positions = DATADIR + sep + 'Mp_XIV_page419a.xml' + + def test_fix_faksimile(self): + page = Page(self.xml_file) + fp = page.words[0].faksimile_positions[0] + left = fp.left + top = fp.top + self.assertEqual(fix_old_data.fix_faksimile_positions(page), True) + self.assertEqual(fp.left, left + page.text_field.xmin) + self.assertEqual(fp.top, top + page.text_field.ymin) + + def test_fix_faksimile_line_position(self): + page = Page(self.xml_file) + fix_old_data.fix_faksimile_line_position(page) + for line_number in page.line_numbers: + #print(line_number.id) + self.assertTrue(line_number.faksimile_inner_top < line_number.faksimile_inner_bottom) + + @unittest.skip('already tested, interactive') + def test_fix_transkription_positions(self): + page = Page(self.fix_transkription_positions) + merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)) + fix_old_data.sync_words_linewise(merged_page.words, page.words, page.line_numbers) + self.assertTrue(fix_old_data.fix_transkription_positions(page)) + + @unittest.skip('already tested, interactive') + def test_join_words(self): + page = Page(self.fix_transkription_positions) + fix_old_data.join_words_interactive(page) + + @unittest.skip('already tested, local file') + def test_fix_graphical_svg_file(self): + fix_old_data.fix_graphical_svg_file(Page('xml/Mp_XIV_page418.xml')) + + @unittest.skip('already tested, local file') + def test_get_words(self): + page = Page('xml/Mp_XIV_page418.xml') + print([ word.text for word in page.words if word.id == 300]) + words = fix_old_data._get_words_from_response('300-310', page.words) + print(words) + +if __name__ == "__main__": + unittest.main() Index: fixes/fix_old_data.py =================================================================== --- fixes/fix_old_data.py (revision 0) +++ fixes/fix_old_data.py (revision 99) @@ -0,0 +1,538 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" This program can be used to process words after they have been merged with faksimile data. 
+""" +# Copyright (C) University of Basel 2019 {{{1 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see 1}}} + +from colorama import Fore, Style +from deprecated import deprecated +from functools import cmp_to_key +import getopt +import inspect +import lxml.etree as ET +import re +import shutil +import string +from svgpathtools import svg2paths2, svg_to_paths +from svgpathtools.path import Path as SVGPath +from svgpathtools.path import Line +import sys +import tempfile +from operator import attrgetter +import os +from os import listdir, sep, path, setpgrp, devnull +from os.path import exists, isfile, isdir, dirname, basename +from progress.bar import Bar +import warnings + + +sys.path.append('svgscripts') +from convert_wordPositions import HTMLConverter +from datatypes.box import Box +from datatypes.faksimile import FaksimilePage +from datatypes.manuscript import ArchivalManuscriptUnity +from datatypes.mark_foreign_hands import MarkForeignHands +from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK +from datatypes.path import Path +from datatypes.text_connection_mark import TextConnectionMark +from datatypes.transkriptionField import TranskriptionField +from datatypes.word import Word, update_transkription_position_ids +from join_faksimileAndTranskription import sort_words +from util import back_up, back_up_svg_file, copy_faksimile_svg_file +from process_files import update_svgposfile_status +from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR + +sys.path.append('shared_util') +from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT +from main_util import create_function_dictionary + + +__author__ = "Christian Steiner" +__maintainer__ = __author__ +__copyright__ = 'University of Basel' +__email__ = "christian.steiner@unibas.ch" +__status__ = "Development" +__license__ = "GPL v3" +__version__ = "0.0.1" + +UNITTESTING = False +MAX_SVG_XY_THRESHOLD = 10 + +#TODO: fix all svg graphical files: change xlink:href to href!!!! + +def save_page(page, attach_first=False, backup=False): + """Write page to xml file + """ + if backup: + back_up(page, page.xml_file) + if attach_first: + page.update_and_attach_words2tree() + script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}' + write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ + script_name=script_name, file_type=FILE_TYPE_SVG_WORD_POSITION) + +def page_already_changed(page) -> bool: + """Return whether page has alreadybeen changed by function + """ + return len(\ + page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}:{inspect.currentframe().f_back.f_code.co_name}"]')\ + ) > 0 + +def fix_faksimile_line_position(page, redo=False) -> bool: + """Create a faksimile line position. 
+ """ + if not redo and page_already_changed(page): + return False; + update_faksimile_line_positions(page) + if not UNITTESTING: + save_page(page) + return True + +def check_faksimile_positions(page, redo=False) -> bool: + """Check faksimile line position. + """ + if len(page.page_tree.xpath('//data-source/@file')) > 0: + svg_file = page.page_tree.xpath('//data-source/@file')[0] + svg_tree = ET.parse(svg_file) + positions_are_equal_counter = 0 + page_changed = False + for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree): + if page.title == faksimile_page.title\ + and page.number == faksimile_page.page_number: + #print([fp.id for fp in faksimile_page.word_positions ]) + for word in page.words: + for fp in word.faksimile_positions: + rect_fps = [ rfp for rfp in faksimile_page.word_positions if rfp.id == fp.id ] + if len(rect_fps) > 0: + rfp = rect_fps[0] + if fp.left != rfp.left or fp.top != rfp.top: + #print(f'{fp.id}: {fp.left}/{rfp.left} {fp.top}/{rfp.top}') + fp.left = rfp.left + fp.top = rfp.top + fp.bottom = fp.top + rfp.height + word.attach_word_to_tree(page.page_tree) + page_changed = True + else: + positions_are_equal_counter += 1 + print(f'{positions_are_equal_counter}/{len(page.words)} are equal') + if page_changed and not UNITTESTING: + save_page(page) + return page_changed + +def fix_faksimile_positions(page, redo=False) -> bool: + """Set faksimile positions to absolute values. + + [:return:] fixed + """ + if not redo and len(page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}"]')) > 0: + return False + x_min = page.text_field.xmin + y_min = page.text_field.ymin + for word in page.words: + for fp in word.faksimile_positions: + fp.left = fp.left + x_min + fp.top = fp.top + y_min + fp.bottom = fp.bottom + y_min + word.attach_word_to_tree(page.page_tree) + if not UNITTESTING: + print(f'writing to {page.page_tree.docinfo.URL}') + write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ + script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) + return True + +def fix_transkription_positions(page, redo=False) -> bool: + """Fix transkription positions of merged words + + [:return:] fixed + """ + if not isdir(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR)\ + or not isfile(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)): + return False + merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)) + sync_dictionary = sync_words_linewise(merged_page.words, page.words, merged_page.line_numbers) + words = [] + for source_word in merged_page.words: + words.append(source_word) + if bool(sync_dictionary.get(source_word)): + _sync_transkriptions_with_words(source_word, sync_dictionary) + if source_word.text != ''.join([ t.get_text() for t in source_word.transkription_positions ]): + text = ''.join([ t.get_text() for t in source_word.transkription_positions ]) + print(f'{source_word.line_number}: {source_word.text} has transkription_positions with text "{text}".') + response = input('Change? 
+ response = input('Change? [Y/n]>') + if not response.startswith('n'): + new_sync_dictionary = sync_words_linewise(merged_page.words, page.words,\ + [ line for line in merged_page.line_numbers if line.id == source_word.line_number ], force_sync_on_word=source_word) + if bool(new_sync_dictionary.get(source_word)): + _sync_transkriptions_with_words(source_word, new_sync_dictionary) + else: + raise Exception(f'Could not find source_word {source_word.text} in {new_sync_dictionary}!') + page.words = words + page.update_and_attach_words2tree() + if not UNITTESTING: + print(f'writing to {page.page_tree.docinfo.URL}') + save_page(page) + return True + +def fix_graphical_svg_file(page, redo=False) -> bool: + """Fix glyphs of word for which there is a /changed-word in page.page_tree + """ + svg_tree = ET.parse(page.svg_file) + transkription_field = TranskriptionField(page.source) + namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } + back_up_svg_file(svg_tree, namespaces=namespaces) + for deleted_word_node in page.page_tree.xpath('//deleted-word'): + deleted_word = Word.create_cls(deleted_word_node) + _run_function_on_nodes_for_word(svg_tree, namespaces, deleted_word, transkription_field, _set_node_attribute_to, 'visibility', 'hidden') + for changed_word_node in page.page_tree.xpath('//changed-word'): + changed_word = Word.create_cls(changed_word_node) + try: + word = [ word for word in page.words if word.id == changed_word.id and word.text == changed_word.text ][0] + left_difference = word.transkription_positions[0].left - changed_word.transkription_positions[0].left + _run_function_on_nodes_for_word(svg_tree, namespaces, word, transkription_field, _add_value2attribute, 'x', left_difference) + except IndexError: + warnings.warn(f'There is no word for changed_word {changed_word.id}: "{changed_word.text}" in {page.page_tree.docinfo.URL}!') + copy_faksimile_svg_file(target_file=page.svg_file, faksimile_tree=svg_tree, namespaces=namespaces) + return True + +def _add_value2attribute(node, attribute, value): + """Add value to attribute of node. + """ + node.set(attribute, str(float(node.get(attribute)) + value)) + node.set('changed', 'true') + +def _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=0.1) -> list: + """Return nodes with symbol_id whose x and y lie within threshold of svg_x and svg_y. + """ + nodes = [ node for node in svg_tree.xpath(\ + f'//ns:use[@xlink:href="#{symbol_id}" and @x > {svg_x-threshold} and @x < {svg_x+threshold} and @y > {svg_y-threshold} and @y < {svg_y+threshold} ]',\ + namespaces=namespaces) if not bool(node.get('changed')) ] + if len(nodes) == 0 and threshold < MAX_SVG_XY_THRESHOLD: + return _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=threshold+1) + return nodes + +def _run_function_on_nodes_for_word(svg_tree, namespaces, word, transkription_field, function_on_node, attribute, value): + """Run function on nodes for words. + """ + for tp in word.transkription_positions: + for pwp in tp.positional_word_parts: + symbol_id = pwp.symbol_id + svg_x = pwp.left + transkription_field.xmin + svg_y = pwp.bottom + transkription_field.ymin + nodes = _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y) + if len(nodes) > 0: + node = nodes[0] + function_on_node(node, attribute, value) + +def _set_node_attribute_to(node, attribute, value): + """Set attribute of node to value.
+ """ + node.set(attribute, str(value)) + node.set('changed', 'true') + +def _get_words_from_response(response, words) ->list: + """Return a list of word that correspond to indices + """ + if re.match(r'\d+-\d+', response)\ + or re.match(r'\d+\+', response): + index_boundaries = [] + if response[-1] == '+': + index_boundaries.append(int(response[:response.index('+')])) + index_boundaries.append(index_boundaries[0]+1) + else: + index_boundaries = [ int(i) for i in response.split('-') ] + index_boundaries_length_diff = len(response.split('-')[0]) - len(response.split('-')[1]) + if index_boundaries_length_diff > 0: + index_boundaries[1] = int(response.split('-')[0][0-index_boundaries_length_diff-1] + response.split('-')[1]) + indices = [ i for i in range(index_boundaries[0], index_boundaries[1]+1) ] + if index_boundaries[0] > index_boundaries[1]: + indices = [ index_boundaries[0] ] + while indices[-1] > index_boundaries[1]: + indices.append(indices[-1]-1) + else: + indices = [ int(i) for i in response.split(' ') ] + result_words = [] + for index in indices: + if len([ word for word in words if word.id == index ]) > 0: + result_words += [ word for word in words if word.id == index ] + return result_words + +def _split_word(page, word, split_text): + """Split word. + """ + index = page.words.index(word) + _, left, right = word.split(split_text) + page.words[index] = left + page.words.insert(index+1, right) + +def join_words_interactive(page, redo=False) -> bool: + """Join words interactively. + """ + HTMLConverter(page).convert() + print('Specify ids of words to join.') + print('[s=split and join word]') + print('[d=mark deleted|i=fix ids|u=undelete|l[:value]=change line to value for ids|r=reload|b=restore backup|q=quit]>') + response = input('>') + if response.startswith('i'): + print(f'writing to {page.page_tree.docinfo.URL}') + save_page(page, attach_first=True) + return join_words_interactive(Page(page.page_tree.docinfo.URL)) + elif response.startswith('r'): + return join_words_interactive(Page(page.page_tree.docinfo.URL)) + elif response.startswith('b'): + if page.bak_file is not None: + return join_words_interactive(Page(page.bak_file)) + else: + print('Could not restore backup file, please restore manually!') + elif response.startswith('l'): + words = [] + line_number = -1 + if re.match(r'l:\d+\s\d+', response): + line_number = int(response.replace('l:', '').split(' ')[0]) + words = _get_words_from_response(re.compile('l:\d+\s').sub('', response), page.words) + else: + if not re.match(r'l:\d+$', response): + new_response_line = input('Specify new line number>') + if re.match(r'^\d+$', new_response_line): + line_number = int(new_response_line) + else: + line_number = int(response.replace('l:', '')) + new_response = input(f'Specify ids of words for which line number should be changed to {line_number}>') + if re.match(r'\d+', new_response): + words = _get_words_from_response(new_response, page.words) + if line_number != -1: + for word in words: word.line_number = line_number + if not UNITTESTING: + print(f'writing to {page.page_tree.docinfo.URL}') + save_page(page, attach_first=True) + page = Page(page.page_tree.docinfo.URL) + return join_words_interactive(page) + elif response.startswith('d') or response.startswith('u'): + if re.match(r'[du]\w*\s\d+', response): + words = _get_words_from_response(re.compile('[du]\w*\s').sub('', response), page.words) + else: + deletion_target = 'delete' if response.startswith('d') else 'undelete' + new_response = input(f'Specify ids of words to 
{deletion_target}. >') + if re.match(r'\d+', new_response): + words = _get_words_from_response(new_response, page.words) + if len(words) > 0: + for word in words: word.deleted = response.startswith('d') + if not UNITTESTING: + print(f'writing to {page.page_tree.docinfo.URL}') + save_page(page, attach_first=True) + page = Page(page.page_tree.docinfo.URL) + return join_words_interactive(page) + elif response.startswith('s'): + if re.match(r's\s\w+\s\d+', response): + words = _get_words_from_response(re.compile('s\s\w+\s').sub('', response), page.words) + split_text = response.split(' ')[1] + else: + split_text = input('Input split text>') + new_response = input(f'Specify ids of words to split. >') + if re.match(r'\d+', new_response): + words = _get_words_from_response(new_response, page.words) + if len(words) > 0 and split_text != '': + for word in words: _split_word(page, word, split_text) + if not UNITTESTING: + print(f'writing to {page.page_tree.docinfo.URL}') + save_page(page, attach_first=True) + page = Page(page.page_tree.docinfo.URL) + return join_words_interactive(page) + elif re.match(r'\d+', response): + words = _get_words_from_response(response, page.words) + if len(words) > 0: + if len(set([ word.line_number for word in words ])) == 1\ + and len(set([ word.deleted for word in words ])) == 1: + new_word = words[0] + for word2join in words[1:]: + page.words.remove(word2join) + new_word.join(word2join) + else: + new_word = Word.join_words(words) + index = len(page.words) + if words[0] in page.words: + index = page.words.index(words[0]) + elif len([ word for word in page.words if words[0] in word.word_parts ]) > 0: + index = page.words.index([ word for word in page.words if words[0] in word.word_parts ][0]) + for word2join in words: + if word2join in page.words: + page.words.remove(word2join) + elif len([ word for word in page.words if word2join in word.word_parts ]) > 0: + page.words.remove([ word for word in page.words if word2join in word.word_parts ][0]) + page.words.insert(index, new_word) + if not UNITTESTING: + print(f'writing to {page.page_tree.docinfo.URL}') + save_page(page, attach_first=True) + page = Page(page.page_tree.docinfo.URL) + return join_words_interactive(page) + return True + +def sync_words_linewise(source_words, target_words, lines, force_sync_on_word=None) -> dict: + """Sync words and create a dictionary with source_words as keys, referring to a list of corresponding target words. + """ + result_dict = {} + for word in target_words + source_words: word.processed = False + for line in lines: + source_words_on_line = sorted([ word for word in source_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left) + target_words_on_line = sorted([ word for word in target_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left) + if len(target_words_on_line) == len(source_words_on_line): + _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word) + elif len(source_words_on_line) < len(target_words_on_line): + _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word) + else: + print('okey dokey') + return result_dict + +def _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict): + """Force sync on word.
+ """ + unprocessed_target_words = [t_word for t_word in target_words_on_line if not t_word.processed] + if len(unprocessed_target_words) > 0: + print([ (i, t_word.text) for i, t_word in enumerate(unprocessed_target_words)]) + response = input(f'Please specify indices of words to sync {force_sync_on_word.text} with: [default:0-{len(unprocessed_target_words)-1}]>') + indices = [ i for i in range(0, len(unprocessed_target_words)) ] + if re.match(r'\d+-\d+', response): + index_strings = response.split('-') + indices = [ i for i in range(int(index_strings[0]), int(index_strings[1])+1) ] + elif response != '': + indices = [ int(i) for i in response.split(' ') ] + target_words = [] + for i in indices: target_words.append(unprocessed_target_words[i]) + result_dict.update({ force_sync_on_word: target_words }) + else: + raise Exception(f'There are no unprocessed target_words for {force_sync_on_word.text} on line {force_sync_on_word.line_number}!') + +def _sync_transkriptions_with_words(word, sync_dictionary): + """Sync transkription_positions of word with syncronized words. + """ + word.transkription_positions = [] + for target_word in sync_dictionary[word]: + word.transkription_positions += target_word.transkription_positions + +def _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None): + """Sync if there are more target words. + """ + current_source_word = None + for target_word in target_words_on_line: + if current_source_word is not None\ + and current_source_word.text.startswith(''.join([ w.text for w in result_dict[current_source_word]]) + target_word.text): + result_dict[current_source_word].append(target_word) + target_word.processed = True + if current_source_word.text == ''.join([ w.text for w in result_dict[current_source_word]]): + current_source_word = None + elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ]) > 0: + source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ][0] + target_word.processed = True + source_word.processed = True + result_dict.update({ source_word: [ target_word ] }) + elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ]) > 0: + current_source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ][0] + current_source_word.processed = True + target_word.processed = True + result_dict.update({ current_source_word: [ target_word ] }) + else: + msg = f'On line {target_word.line_number}: target_word "{target_word.text}" does not have a sibling in {[ s.text for s in source_words_on_line if not s.processed ]}' + warnings.warn(msg) + if force_sync_on_word is not None: + _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict) + +def _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None): + """Sync same length + """ + for i, word in enumerate(source_words_on_line): + if word.text == target_words_on_line[i].text: + word.processed = True + target_words_on_line[i].processed = True + result_dict.update({ word: [ target_words_on_line[i] ] }) + elif len([ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ]) > 0: + target_word = [ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ][0] + word.processed = True + 
target_word.processed = True + result_dict.update({ word: [ target_word ] }) + else: + msg = f'On line {word.line_number}: source_word "{word.text}" does not have a sibling in {[ s.text for s in target_words_on_line]}' + warnings.warn(msg) + if force_sync_on_word is not None: + _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict) + +def usage(): + """Prints information on how to use the script. + """ + print(main.__doc__) + +def main(argv): + """This program can be used to fix faksimile positions -> set them to their absolute values. + + svgscripts/fix_old_data.py [OPTIONS] <manuscript_file|page_file> + + <manuscript_file> an xml file about a manuscript, containing information about its pages. + <page_file> an xml file about a page, containing information about svg word positions. + + OPTIONS: + -h|--help show help + -c|--check-faksimile-positions check whether faksimile positions have been updated + -j|--join-words join words by id interactive + -l|--faksimile-line-position create faksimile line positions + -p|--faksimile-positions fix old faksimile positions + -r|--redo rerun + -s|--fix-graphical-svg fix position of glyph <use> nodes for words marked by 'changed-word' and 'deleted-word' in the xml file + -t|--transkription-positions fix transkription positions of merged words + + :return: exit code (int) + """ + function_list = [] + function_dict = create_function_dictionary(['-c', '--check-faksimile-positions'], check_faksimile_positions) + function_dict = create_function_dictionary(['-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict) + function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict) + function_dict = create_function_dictionary(['-t', '--transkription-positions'], fix_transkription_positions, function_dictionary=function_dict) + function_dict = create_function_dictionary(['-s', '--fix-graphical-svg'], fix_graphical_svg_file, function_dictionary=function_dict) + function_dict = create_function_dictionary(['default', '-j', '--join-words'], join_words_interactive, function_dictionary=function_dict) + redo = False + try: + opts, args = getopt.getopt(argv, "hcjplrst", ["help", "check-faksimile-positions", "join-words", "faksimile-positions", "faksimile-line-position", "redo", "fix-graphical-svg", "transkription-positions" ]) + except getopt.GetoptError: + usage() + return 2 + for opt, arg in opts: + if opt in ('-h', '--help'): + usage() + return 0 + elif opt in ('-r', '--redo'): + redo = True + elif opt in function_dict.keys(): + function_list.append(function_dict[opt]) + if len(function_list) == 0: + function_list.append(function_dict['default']) + if len(args) < 1: + usage() + return 2 + exit_status = 0 + xml_file = args[0] + if isfile(xml_file): + counters = { f.__name__: 0 for f in function_list } + for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK): + for current_function in function_list:
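+ # each page gets backed up before a fix function may modify it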
+ if not UNITTESTING: + print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' + Style.RESET_ALL) + back_up(page, page.xml_file) + counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0 + if not UNITTESTING: + for function_name, counter in counters.items(): + print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]') + else: + raise FileNotFoundError('File {} does not exist!'.format(xml_file)) + return exit_status + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:]))
Index: fixes/test_data/N_VII_1_page138.xml =================================================================== --- fixes/test_data/N_VII_1_page138.xml (revision 0) +++ fixes/test_data/N_VII_1_page138.xml (revision 99) @@ -0,0 +1,2519 @@ [2519 added lines: svgWordPosition word and position test data for N_VII_1, page 138; XML markup not reproduced here.]
Index: fixes/test_data/Mp_XIV_page419a.xml =================================================================== --- fixes/test_data/Mp_XIV_page419a.xml (revision 0) +++ fixes/test_data/Mp_XIV_page419a.xml (revision 99) @@ -0,0 +1,4931 @@ [4931 added lines: svgWordPosition word and position test data for Mp_XIV, page 419a; XML markup not reproduced here.]
Index: fixes/test_data/merged/Mp_XIV_page419a.xml =================================================================== --- fixes/test_data/merged/Mp_XIV_page419a.xml (revision 0) +++ fixes/test_data/merged/Mp_XIV_page419a.xml (revision 99) @@ -0,0 +1,4910 @@ [4910 added lines: merged svgWordPosition test data for Mp_XIV, page 419a; XML markup not reproduced here.]
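The option handling in fix_old_data.main above maps getopt flags to fix functions via create_function_dictionary from shared_util/main_util.py. A minimal sketch of that dispatch pattern, consistent with how the helper is called above but not necessarily identical to the real implementation (fix_a and fix_b are hypothetical placeholders):

    import getopt
    import sys

    def create_function_dictionary(keys, function, function_dictionary=None):
        # map every flag in keys to the same callable
        if function_dictionary is None:
            function_dictionary = {}
        for key in keys:
            function_dictionary[key] = function
        return function_dictionary

    def fix_a(page, redo=False) -> bool:   # hypothetical fix function
        return True

    def fix_b(page, redo=False) -> bool:   # hypothetical fix function
        return False

    function_dict = create_function_dictionary(['-a', '--fix-a'], fix_a)
    function_dict = create_function_dictionary(['default', '-b', '--fix-b'], fix_b, function_dictionary=function_dict)
    opts, args = getopt.getopt(sys.argv[1:], "ab", ["fix-a", "fix-b"])
    function_list = [ function_dict[opt] for opt, _ in opts if opt in function_dict ]
    if len(function_list) == 0:
        function_list.append(function_dict['default'])

Each selected function is then applied page by page, exactly as main() does above with its counters.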