Index: TODO.md =================================================================== --- TODO.md (revision 24) +++ TODO.md (revision 25) @@ -1,40 +1,43 @@ # Wortsuche: - Die Wortsuche sollte über die Nähe der Wörter zueinander gewichtet werden. - Wortpfade, d.h. Abfolgen der Wörter sollen vermieden werden, da dies nicht automatisch generiert werden kann und höchst fehleranfällig ist. - Daher sollen die Worteinfügungen auch nicht dafür verwendet werden, alternative Textverläufe aufzuzeichnen. # Ontologie: - Values: Could knora-base:GeomValue be used for PositionalObject values? - but see https://github.com/dhlab-basel/Knora/issues/169 # Skript: - write a script that creates a class from a equivalentClass, i.e. a class from the shared ontologies. # Probleme: - Wie mit Worteinfügungen in Wortkomposita umgehen? # TODO - make datatypes: - Page [ok] - Word [ok] --> deal with non-horizontal text <<<< DONE! --> add style info to word --> connect style with character glyph-id from svg path file --> handle word layers, i.e. later correction of words by insertion + - WritingProcess + - TODO: test page.create_writing_processes()!!! + - correlates with font size: + - font 10-9: stage 0 + - font 8-6.5: stage 1 + - font 6-5: stage 2 - Style - WordPosition [ok] - TranskriptionPosition [ok] -->TODO: simplify by joininng - FaksimilePosition [ok] - LineNumber [ok] - Marginalien - Freehand: - Deletion - WordInsertionMark [reDO] - --> get glyph-id from svg path file - --> do not connect to previous and next word --> create word layers? - -->TODO: fix everything!!!!!!! - Underline Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 24) +++ svgscripts/datatypes/page.py (revision 25) @@ -1,239 +1,269 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile #from myxmlwriter import write_pretty from .class_spec import SemanticClass from .image import Image from .word import Word from .lineNumber import LineNumber +from .writing_process import WritingProcess from .word_insertion_mark import WordInsertionMark from .transkriptionField import TranskriptionField class Page(SemanticClass): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. """ def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, pdfFile=None, svg_file=None, extract_transkription_field_only=False): self.title = title self.line_numbers = [] self.style_dict = {} self.sonderzeichen_list = [] self.svg_file = None self.pdfFile = None self.source = None self.number = int(page_number) if page_number is not None else -1 if xml_source_file is not None: if isfile(xml_source_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_source_file, parser) self.title = self.page_tree.getroot().get('title') self.number = self.page_tree.getroot().get('number') self.source = self.page_tree.getroot().get('source') self.init_words() self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None if pdfFile is not None and self.pdfFile is None: self.pdfFile = pdfFile ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) #write_pretty(xml_element_tree=self.page_tree, file_name=xml_source_file, script_name=__file__, file_type='svgWordPosition') if svg_file is not None and self.svg_file is None: self.svg_file = svg_file tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) #write_pretty(xml_element_tree=self.page_tree, file_name=xml_source_file, script_name=__file__, file_type='svgWordPosition') else: raise Exception('File "{}" does not exist!'.format(xml_source_file)) elif xml_target_file is not None: self.word_insertion_marks = [] self.words = [] + self.writing_processes = [] self.svg_file = svg_file self.pdfFile = pdfFile if isfile(xml_target_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_target_file, parser) self.source = self.page_tree.getroot().get('source') if bool(self.page_tree.getroot().get('title')): self.title = self.page_tree.getroot().get('title') elif title is not None: self.page_tree.getroot().set('title', title) if self.svg_file is None: self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 elif len(self.page_tree.xpath('.//svg/@file')) == 0: tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) else: self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 if self.pdfFile is None: self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None elif len(self.page_tree.xpath('.//pdf/@file')) == 0: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) - for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG ]: + for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG, WritingProcess.XML_TAG ]: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) else: self.page_tree = ET.ElementTree(ET.Element('page')) self.pdfFile = pdfFile self.svg_file = svg_file if title is not None: self.page_tree.getroot().set('title', title) self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower()) if page_number is not None: self.page_tree.getroot().set('number', str(page_number)) if self.pdfFile is not None: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) if self.svg_file is not None: tf = TranskriptionField(self.svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) self.svg_image = Image(file_name=self.svg_file, height=self.height, width=self.width) if self.svg_file is not None\ else None def init_line_numbers(self, line_numbers, document_bottom): """Init line numbers. """ even_index = 0 MINABOVE = 1 self.line_numbers = [] if len(line_numbers) > 0: first_line_bottom = line_numbers[even_index].top - MINABOVE self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 while even_index < len(line_numbers): self.line_numbers.append(LineNumber(id=line_numbers[even_index].id-1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=line_numbers[even_index].top-MINABOVE)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 self.line_numbers.append(LineNumber(id=line_numbers[even_index-1].id+1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=document_bottom)) for line_number in self.line_numbers: line_number.attach_object_to_tree(self.page_tree) def init_words(self): self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.CREATE_WORD(word_node=word_node) for word_node in self.page_tree.getroot().xpath('//word') ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] + self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] """ for index, word in enumerate(self.words): for word_insertion_mark in self.word_insertion_marks: self.words[index] = word_insertion_mark.attach_and_update_word_if_involved(word) if self.words[index] != word: break """ + def create_writing_processes_and_attach2tree(self): + """Creates three stages of Nietzsche's process of writing. + """ + stages = [ [], [], [] ] + for word in self.words: + l = [ set( pwp.style_class.split(' ')[1] for pwp in transkription_position.positional_word_parts ) for transkription_position in word.transkription_positions ] + for font_key in set(item for sublist in l for item in sublist): + stage_index = self.fontsizekey2stage_mapping.get(font_key) + if word not in stages[stage_index]: + stages[stage_index].append(word) + for index, words in enumerate(stages): + writing_process = WritingProcess(version=index, words=words) + self.writing_processes.append(writing_process) + writing_process.attach_object_to_tree(self.page_tree) + def add_style(self, sonderzeichen_list=[], letterspacing_list=[], style_dict={}, style_node=None): """Adds a list of classes that are sonderzeichen and a style dictionary to page. """ self.sonderzeichen_list = sonderzeichen_list self.letterspacing_list = letterspacing_list self.style_dict = style_dict if style_node is not None: self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') } self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ] self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('letterspacing-list')) ] elif bool(self.style_dict): style_node = ET.SubElement(self.page_tree.getroot(), 'style') if len(self.sonderzeichen_list) > 0: style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list)) if len(self.letterspacing_list) > 0: style_node.set('letterspacing-list', ' '.join(self.letterspacing_list)) for key in self.style_dict.keys(): self.style_dict[key]['name'] = key ET.SubElement(style_node, 'class', attrib=self.style_dict[key]) + fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value } + fontsizes = sorted(fontsize_dict.values(), reverse=True) + # create a mapping between fontsizes and word stages + self.fontsizekey2stage_mapping = {} + for fontsize_key, value in fontsize_dict.items(): + if value >= fontsizes[0]-1: + self.fontsizekey2stage_mapping.update({ fontsize_key: 0 }) + elif value <= fontsizes[len(fontsizes)-1]+1: + self.fontsizekey2stage_mapping.update({ fontsize_key: 2 }) + else: + self.fontsizekey2stage_mapping.update({ fontsize_key: 1 }) def get_biggest_fontSize4styles(self, style_set={}): """Returns biggest font size from style_dict for a set of style class names. [:returns:] (float) biggest font size OR 1 if style_dict is empty """ if bool(self.style_dict): sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True) return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1 else: return 1 def get_line_number(self, y): """Returns line number id for element at y. [:return:] (int) line number id or -1 """ if len(self.line_numbers) > 0: result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ] return result_list[0] if len(result_list) > 0 else -1 else: return -1 @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'title': (str, 1, '/page/@title'), 'number': (str, 1, '/page/@number'),\ 'line_numbers': (LineNumber, SemanticClass.LIST, '/page/@number|/page/@title'),\ 'words': (Word, SemanticClass.LIST, '/page/@number|/page/@title'),\ 'svg_image': (Image, 1, '/page/svg'),\ + 'writing_processes': (WritingProcess, SemanticClass.LIST, '/page/@number|/page/@title'),\ 'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST, '/page/@number|/page/@title')} dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary Index: svgscripts/datatypes/writing_process.py =================================================================== --- svgscripts/datatypes/writing_process.py (revision 0) +++ svgscripts/datatypes/writing_process.py (revision 25) @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" This class can be used to represent a text version. +""" +# Copyright (C) University of Basel 2019 {{{1 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see 1}}} + +__author__ = "Christian Steiner" +__maintainer__ = __author__ +__copyright__ = 'University of Basel' +__email__ = "christian.steiner@unibas.ch" +__status__ = "Development" +__license__ = "GPL v3" +__version__ = "0.0.1" + +from lxml import etree as ET + +from .attachable_object import AttachableObject +from .class_spec import SemanticClass +from .word import Word + +class WritingProcess(AttachableObject,SemanticClass): + """ + This class represents a stage in Nietzsche's process of writing the text. + + Args: + version (int): stage in the writing process + words (list of datatypes.word.Word): all words that belong to this stage + """ + XML_TAG = 'writing-process' + FIRST_VERSION = 0 + INSERTION_AND_ADDITION = 1 + LATER_INSERTION_AND_ADDITION = 2 + VERSION_DESCRIPTION = [ 'first version', 'insertion and addition', 'later insertion and addition' ] + + def __init__(self, version=FIRST_VERSION, words=[]): + self.version = version + self.words = words + self.description = WritingProcess.VERSION_DESCRIPTION[self.version]\ + if self.version < len(WritingProcess.VERSION_DESCRIPTION) else '' + + @classmethod + def create_writing_process_from_xml(cls, node, all_words=[]): + """Creates a WritingProcess by instantiating a -node. + + [:return:] (datatypes.writing_process) WritingProcess + """ + version = int(node.get('version'))\ + if bool(node.get('version')) else cls.FIRST_VERSION + word_ids = [ int(word_node.get('id')) for word_node in node.xpath('./word') ] + # take those words from all_words that have an id specified in word_ids + # (set & set -> subset of common elements) + words = [ word for word in all_words if len(set(word_ids) & set([word.id])) > 0 ] + return cls(version=version, words=words) + + @classmethod + def get_semantic_dictionary(cls): + """ Creates and returns a semantic dictionary as specified by SemanticClass. + """ + dictionary = {} + class_dict = cls.get_class_dictionary() + properties = {\ + 'version': (int, 1, '{}/@version'.format(WritingProcess.XML_TAG)),\ + 'description': (str, 1, '{}/@description'.format(WritingProcess.XML_TAG)),\ + 'words': (Word, SemanticClass.LIST, '{}/word/@id'.format(WritingProcess.XML_TAG))\ + } + dictionary.update({'class': class_dict}) + dictionary.update({'properties': properties}) + return dictionary + + def attach_object_to_tree(self, target_tree): + """Attach object to tree. + """ + obj_node = target_tree.getroot().xpath('//' + WritingProcess.XML_TAG + '[@version="%s"]' % self.version)[0] \ + if(len(target_tree.getroot().xpath('//' + WritingProcess.XML_TAG + '[@version="%s"]' % self.version)) > 0) \ + else ET.SubElement(target_tree.getroot(), WritingProcess.XML_TAG) + obj_node.set('version', str(self.version)) + if self.description != '': + obj_node.set('description', self.description) + for word in self.words: + ET.SubElement(obj_node, 'word', attrib={'id': str(word.id)}) Index: svgscripts/datatypes/test_writing_process.py =================================================================== --- svgscripts/datatypes/test_writing_process.py (revision 0) +++ svgscripts/datatypes/test_writing_process.py (revision 25) @@ -0,0 +1 @@ +link ../test_writing_process.py \ No newline at end of file Property changes on: svgscripts/datatypes/test_writing_process.py ___________________________________________________________________ Added: svn:special ## -0,0 +1 ## +* \ No newline at end of property Index: svgscripts/test_writing_process.py =================================================================== --- svgscripts/test_writing_process.py (revision 0) +++ svgscripts/test_writing_process.py (revision 25) @@ -0,0 +1,51 @@ +import unittest +from os import sep, path +from os.path import isdir, dirname +import lxml.etree as ET +import sys + +dir_changed = False +if not isdir('datatypes'): + sys.path.append(dirname(sys.path[0])) + dir_changed = True + +from datatypes.writing_process import WritingProcess +from datatypes.word import Word + +class TestWritingProcess(unittest.TestCase): + def setUp(self): + DATADIR = dirname(__file__) + sep + 'test_data' + if not isdir(DATADIR): + DATADIR = dirname(dirname(__file__)) + sep + 'test_data' + self.test_target_file = DATADIR + sep + 'test.xml' + self.words = [ Word(id=id, text=text) for id, text in enumerate([ 'Hello', 'World', '!' ])] + + def test_init(self): + wp = WritingProcess() + self.assertEqual(wp.version, 0) + self.assertEqual(wp.description, WritingProcess.VERSION_DESCRIPTION[0]) + + def test_attachable(self): + empty_tree = ET.ElementTree(ET.Element('page')) + wp = WritingProcess(words=self.words) + wp.attach_object_to_tree(empty_tree) + wp_node = empty_tree.xpath('//' + WritingProcess.XML_TAG) + self.assertEqual(len(wp_node), 1) + self.assertEqual(wp_node[0].get('version'), str(wp.version)) + self.assertEqual(len(wp_node[0].getchildren()), len(self.words)) + + def test_create_writing_process_from_xml(self): + empty_tree = ET.ElementTree(ET.Element('page')) + wp = WritingProcess(words=self.words) + wp.attach_object_to_tree(empty_tree) + wp_node = empty_tree.xpath('//' + WritingProcess.XML_TAG) + words = self.words + [ Word(id=len(self.words)) ] + wp2 = WritingProcess.create_writing_process_from_xml(wp_node[0], all_words=words) + self.assertEqual(len(wp2.words), len(self.words)) + self.assertEqual(wp2.words[0], self.words[0]) + + def test_semantics(self): + dict = WritingProcess.get_semantic_dictionary() + #print(dict) +if __name__ == "__main__": + unittest.main() Index: svgscripts/test_data/test.xml =================================================================== --- svgscripts/test_data/test.xml (revision 24) +++ svgscripts/test_data/test.xml (revision 25) @@ -1,4591 +1,4591 @@ - - - + + + svgWordPosition 2019-05-03 20:14:18 2019-06-03 09:32:42 2019-05-29 10:35:57 Index: svgscripts/convert_wordPositions.py =================================================================== --- svgscripts/convert_wordPositions.py (revision 24) +++ svgscripts/convert_wordPositions.py (revision 25) @@ -1,265 +1,292 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert the word positions to HTML for testing purposes. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import getopt from lxml.html import builder as E from lxml.html import open_in_browser import lxml from os import sep, listdir, mkdir, path from os.path import exists, isfile, isdir import re import sys from svgpathtools import svg_to_paths import xml.etree.ElementTree as ET from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.transkriptionField import TranskriptionField +from datatypes.writing_process import WritingProcess from datatypes.word import Word from myxmlwriter import write_pretty __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class Converter: """The converter super class. """ def __init__(self, page, non_testing=True, show_word_insertion_mark=False): self.page = page self.non_testing = non_testing self.show_word_insertion_mark = show_word_insertion_mark - def convert(self, output_file=None): + def _get_words(self, stage_version=''): + """Returns the words of the indicated stage_version. + """ + all_words = self.page.words + if stage_version != '': + if re.match(r'^\d$', stage_version): + all_words = self.page.writing_processes[int(stage_version)].words + elif re.match(r'^\d\+$', stage_version): + all_words = [] + for stage_index in range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)): + all_words += self.page.writing_processes[stage_index].words + elif re.match(r'^\d\-\d$', stage_version): + all_words = [] + start_stop = [ int(i) for i in re.split(r'-', stage_version) ] + for stage_index in range(start_stop[0], start_stop[1]+1): + all_words += self.page.writing_processes[stage_index].words + return all_words + + def convert(self, output_file=None, stage_version=''): """Prints all words. """ first_word_of_line = None out = sys.stdout if output_file is not None: out = open(output_file, 'w') - for word in self.page.words: + for word in self._get_words(stage_version=stage_version): if first_word_of_line is None or first_word_of_line.line_number != word.line_number: out.write('\n') first_word_of_line = word if word.line_number % 2 == 0: out.write(str(word.line_number).zfill(2) + ' ') else: out.write(' ') - out.write(word.text + ' ') + if word.text is not None: + out.write(word.text + ' ') out.close() @classmethod def CREATE_CONVERTER(cls, page, non_testing=True,converter_type='', show_word_insertion_mark=False): """Returns a converter of type converter_type. [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None """ cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() } cls_key = converter_type + 'Converter' if bool(cls_dict.get(cls_key)): return cls_dict.get(cls_key)(page, non_testing, show_word_insertion_mark) else: return Converter(page, non_testing, show_word_insertion_mark) class SVGConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text. """ BG_COLOR = 'yellow' OPACITY = '0.2' def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY): Converter.__init__(self, page, non_testing, show_word_insertion_mark) self.bg_color = bg_color self.opacity = opacity - def convert(self, output_file=None): + def convert(self, output_file=None, stage_version=''): """Converts Page to SVG """ title = self.page.title if(self.page.title is not None) else 'Test Page' title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title transkription_field = TranskriptionField(self.page.svg_file) if bool(transkription_field.get_svg_attributes('xmlns')): ET.register_namespace('', transkription_field.get_svg_attributes('xmlns')) if bool(transkription_field.get_svg_attributes('xmlns:xlink')): ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink')) svg_tree = ET.parse(self.page.svg_file) transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'}) colors = [ 'yellow', 'orange' ] color_index = 0 - for word in self.page.words: + for word in self._get_words(stage_version=stage_version): for transkription_position in word.transkription_positions: rect_node = ET.SubElement(transkription_node, 'rect',\ attrib={'id': str(transkription_position.id), 'x': str(transkription_position.left + transkription_field.xmin),\ 'y': str(transkription_position.top + transkription_field.ymin), 'width': str(transkription_position.width),\ 'height': str(transkription_position.height), 'fill': colors[color_index], 'opacity': self.opacity}) if transkription_position.transform is not None: matrix = transkription_position.transform.clone_transformation_matrix() matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3) matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3) rect_node.set('transform', matrix.toString()) rect_node.set('x', str(round(transkription_position.left - transkription_position.transform.matrix[Matrix.XINDEX], 3))) rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3))) ET.SubElement(rect_node, 'title').text = word.text color_index = (color_index + 1) % len(colors) if output_file is not None: svg_tree.write(output_file) class HTMLConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file. """ CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; } .highlight1 { background-color: pink; opacity: 0.2; } .word-insertion-mark { background-color: orange; opacity: 0.2; } """ def __init__(self, page, non_testing=True, show_word_insertion_mark=False): Converter.__init__(self, page, non_testing, show_word_insertion_mark) - def convert(self, output_file=None): + def convert(self, output_file=None, stage_version=''): """Converts Page to HTML """ title = self.page.title if(self.page.title is not None) else 'Test Page' title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title width = self.page.width height = self.page.height style_content = ' position: relative; width: {}px; height: {}px; background-image: url({}); background-size: {}px {}px '\ .format(width, height, path.abspath(self.page.svg_file), width, height) style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS) head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style) transkription = E.DIV(id="transkription") counter = 0 - for word in self.page.words: + for word in self._get_words(stage_version=stage_version): highlight_class = 'highlight' + str(counter) word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text) for transkription_position in word.transkription_positions: style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\ transkription_position.top, transkription_position.left, transkription_position.width, transkription_position.height) if transkription_position.transform is not None: style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString()) transform_origin_x = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1\ if (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1 < 0 else 0 style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height) link = E.A(' ', E.CLASS(highlight_class), title=word_title, style=style_content) transkription.append(link) counter = (counter + 1) % 2 word_insertion_mark_class = 'word-insertion-mark' if self.show_word_insertion_mark: for word_insertion_mark in self.page.word_insertion_marks: wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number)) style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\ word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height) link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content) transkription.append(link) html = E.HTML(head,E.BODY(transkription)) bool(self.non_testing) and open_in_browser(html) if output_file is not None: with open(output_file, 'wb') as f: f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8')) f.closed def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes. svgscripts/convert_wordPositions.py OPTIONS OPTIONS: -h|--help: show help -H|--HTML [default] convert to HTML test file -o|--output=outputFile save output to file outputFile -S|--SVG convert to SVG test file -s|--svg=svgFile: svg web file -T|--TEXT convert to TEXT output -t|--testing execute in test mode, do not write to file or open browser -w|--word-insertion-mark show word insertion mark on HTML + -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. } :return: exit code (int) """ convert_to_type = None svg_file = None output_file = None non_testing = True show_word_insertion_mark = False page = None + stage_version = '' try: - opts, args = getopt.getopt(argv, "htHSTws:o:", ["help", "testing", "HTML", "SVG", "TEXT", "word-insertion-mark", "svg=", "output="]) + opts, args = getopt.getopt(argv, "htHSTws:o:v:", ["help", "testing", "HTML", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 + elif opt in ('-v', '--version'): + if re.match(r'^(\d|\d\+|\d\-\d)$', arg): + stage_version = arg + else: + raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg)) elif opt in ('-w', '--word-insertion-mark'): show_word_insertion_mark = True elif opt in ('-S', '--SVG'): convert_to_type = 'SVG' elif opt in ('-T', '--TEXT'): convert_to_type = 'TEXT' elif opt in ('-H', '--HTML'): convert_to_type = 'HTML' elif opt in ('-t', '--testing'): non_testing = False elif opt in ('-s', '--svg'): svg_file = arg elif opt in ('-o', '--output'): output_file = arg if len(args) < 1: usage() return 2 if convert_to_type is None: if output_file is not None and len(re.split(r'\.', output_file)) > 1: output_file_part_list = re.split(r'\.', output_file) convert_to_type = output_file_part_list[len(output_file_part_list)-1].upper() else: convert_to_type = 'HTML' for word_position_file in args: if not isfile(word_position_file): print("'{}' does not exist!".format(word_position_file)) return 2 if svg_file is not None: if isfile(svg_file): page = Page(xml_source_file=word_position_file, svg_file=svg_file) else: print("'{}' does not exist!".format(word_position_file)) return 2 else: page = Page(xml_source_file=word_position_file) if page.svg_file is None: print('Please specify a svg file!') usage() return 2 converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark) - converter.convert(output_file=output_file) + converter.convert(output_file=output_file, stage_version=stage_version) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/extractWordPosition.py =================================================================== --- svgscripts/extractWordPosition.py (revision 24) +++ svgscripts/extractWordPosition.py (revision 25) @@ -1,550 +1,551 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract the position of the words in a svg file and write them to a xml file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import re import getopt import sys from os import sep, listdir, mkdir, path from os.path import exists, isfile, isdir from datetime import datetime from lxml import etree as ET from svgpathtools import svg2paths2 import warnings from myxmlwriter import write_pretty from datatypes.lineNumber import LineNumber from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.pdf import PDFText from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word from datatypes.word_insertion_mark import WordInsertionMark __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class Extractor: """ This class can be used to extract the word positions in a svg file and write it to a xml file. Args: [xml_dir (str): target directory] [title (str): title of document] [manuscript_file (str): xml file containing information about the archival unity to which the current page belongs [extract_transkription_field_only (Boolean): if true extract_word_position will extract word positions only that are part of the transkription field. """ SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ] def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False): if bool(xml_dir): self.xml_dir = xml_dir not isdir(self.xml_dir) and mkdir(self.xml_dir) else: self.xml_dir = 'xml' if(isdir('xml')) else '' self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else '' self.title = title self.manuscript_file = manuscript_file self.extract_transkription_field_only = extract_transkription_field_only self.manuscript_tree = None if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file): self.manuscript_tree = ET.parse(self.manuscript_file) self.title = self.manuscript_tree.getroot().get('title') elif bool(self.manuscript_file): raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file)) elif bool(self.title): if not bool(self.manuscript_file): self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml' if not isfile(self.manuscript_file): self.manuscript_tree = ET.ElementTree(ET.Element('page', attrib={"title": self.title})) write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile') def get_page_number(self, file_name, page_number=None): """ Returns page number as a string (with leading zero(s) if len(page_number) < 3). """ if not bool(page_number) and bool(re.search(r'\d', file_name)): """if page_number=None and filename contains digits, then split filename into its parts that contain only digits, remove empty strings and return the last part containing only digits. """ page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop() if bool(page_number): leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else '' return leading_zeros + str(page_number) else: return '' def get_file_name(self, file_name, page_number=None): """Returns the file_name of the target xml file. """ dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else '' if bool(self.title): return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml' else: return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml')) def get_style(self, etree_root): """Returns the style specification as a dictionary. :returns: sonderzeichen_list: list of keys for classes that are 'Sonderzeichen' style_dict: dictionary: key = class name (str), value = style specification (dictionary) """ style_dict = {} sonderzeichen_list = [] letterspacing_list = [] style = etree_root.find('style', etree_root.nsmap) if style is not None: for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))): style_key = style_item.split('{')[0].replace('.', '') style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','') \ for item in list(filter(lambda x: x!= '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';')))} style_dict[style_key] = style_value_dict if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'): sonderzeichen_list.append(style_key) if bool(style_value_dict.get('letter-spacing')): letterspacing_list.append(style_key) return sonderzeichen_list, letterspacing_list, style_dict def get_word_from_part_obj(self, word_part_obj): """Extracts all 'text' from a list of dicitonaries and concats it to a string. """ return ''.join([ dict['text'] for dict in word_part_obj]) def find_inserted_words_by_position(self, target_tree, x, y): """Returns an Array with the words that are inserted above the x, y position or [] if not found. """ warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.') MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 if(len(target_tree.getroot().xpath('//word[@id]')) > 0): result_list = [] minus2left = 20.0 minus2top = 19.0 while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX : result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ] minus2left -= 1 minus2top += 1 if len(result_list) > 0: result_bottom = result_list[len(result_list)-1].bottom result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)): result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def find_inserted_words(self, target_tree, word_insertion_mark): """Returns an Array with the words that are inserted above/underneath the word_insertion_mark. """ warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.') if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1: return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y) if(len(target_tree.getroot().xpath('//word[@id]')) > 0): MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 result_list = [] x = word_insertion_mark.x y = word_insertion_mark.y if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line line_number = word_insertion_mark.line_number - 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: minus2top = 1.0 while len(result_list) == 0 and minus2top < MINY: for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y - minus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break minus2top += 1 elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line line_number = word_insertion_mark.line_number + 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: plus2top = 1.0 while len(result_list) == 0 and plus2top < MINY : for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y + plus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break plus2top += 1 if len(result_list) > 0: # now, collect more words that are right of already collected words result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width for item in target_tree.getroot().xpath(\ '//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)): result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None): """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word). If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created. :returns: the new word counter (int) """ break_points = [] if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points for Sonderzeichen in self.SONDERZEICHEN_LIST: contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ] if True in contains_Sonderzeichen: break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]] for sz_point in [i for i, e in break_points]: wim_index = len(page.word_insertion_marks) x = float(word_part_objs[sz_point]['x']) y = float(word_part_objs[sz_point]['y']) if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None: svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } xmin = transkription_field.xmin ymin = transkription_field.ymin wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\ line_number=page.get_line_number(y-1), mark_type=Sonderzeichen) page.word_insertion_marks.append(wim) if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points THRESHOLDX = 20 # Threshold between line number and text last_x = -1 for i, x in enumerate([float(dict['x']) for dict in word_part_objs]): if(last_x > -1 and (x - last_x > THRESHOLDX)): break_points.append((i, i)) last_x = x if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words from_index = 0 for end_point, next_from_index in break_points: new_word_part_objs = word_part_objs[from_index:end_point] new_endX = word_part_objs[end_point]['x'] from_index = next_from_index index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) if from_index > 0 and from_index < len(word_part_objs): new_word_part_objs = word_part_objs[from_index:] index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) return index else: if len(word_part_objs) > 0: transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\ debug_msg_string=debug_msg, transkription_field=transkription_field) text = self.get_word_from_part_obj(word_part_objs) line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2) newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions) #newWord = Word.CREATE_WORD(page=page, word_part_objs=word_part_objs, id=index, endX=endX, endSign=endSign, matrix=matrix, debug_msg=debug_msg) #newWord.attach_word_to_tree(page.page_tree) -> now we attach all words with update_and_attach_words2tree() page.words.append(newWord) return int(index) + 1 else: return int(index) def get_bottoms(self, tree_root, from_position=-1.0, to_position=-1.0, transkription_field=None): """Returns all unique bottom values (Float) as a sorted list. """ bottom_list = sorted(set(item.get('transform').split(' ')[5].replace(')','') for item in tree_root.findall(".//text", tree_root.nsmap)), key=float) if transkription_field is not None: from_position = transkription_field.ymin to_position = transkription_field.ymax if (from_position > 0.0 and to_position > 0.0): return [ item for item in filter(lambda x: float(x) > from_position and float(x) < to_position, bottom_list) ] else: return bottom_list def get_text_items(self, tree_root, transkription_field=None): """Returns all text elements with a matrix or (if transkription_field is specified) all text elements that are located inside the transkription field. """ if transkription_field is not None: return filter(lambda x: Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\ tree_root.iterfind(".//text", tree_root.nsmap)) else: return tree_root.iterfind(".//text", tree_root.nsmap) def extract_line_numbers(self, svg_tree, transkription_field): """Extracts line numbers and write them to a xml file. """ nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] line_numbers = [ LineNumber(raw_text_node=item, transkription_field=transkription_field)\ for item in filter(lambda x: LineNumber.IS_A_LINE_NUMBER(x), nodes_near_tf)] if len(line_numbers) > 0: MINABOVE = 3 last_to_position = transkription_field.ymin for line_number in line_numbers: above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom) last_to_position = above_current_line_bottom if len(bottoms) > 0: current_line_top = float(bottoms[len(bottoms)-1]) - transkription_field.ymin + MINABOVE line_number.setTop(current_line_top) return line_numbers def get_word_object_multi_char_x(self, word_part_obj_dict): """Returns the x of the last char of word_part_object. TODO: get real widths from svg_file!!! """ WIDTHFACTOR = 2.6 return word_part_obj_dict['x'] if len(word_part_obj_dict['text']) < 2 else word_part_obj_dict['x'] + len(word_part_obj_dict['text']) * WIDTHFACTOR def extract_word_position(self, svg_tree, page, transkription_field=None): """Extracts word positions. """ counter = 0 word_part_obj = [] endSign = '%' last_matrix = None MAXBOTTOMDIFF = 5 MAXXDIFF = 6 for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field): current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field) # check for line breaks if (last_matrix is not None and len(word_part_obj) > 0 and (\ Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\ (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\ (abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\ or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()): endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\ round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\ str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix))) counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field) word_part_obj = [] endX = current_matrix.getX() if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))): word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} ) else: endSign = text_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT endX = current_matrix.add2X(tspan_item.get('x')) if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))): y = current_matrix.add2Y(tspan_item.get('y')) word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix }) if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0: """text_item has letterspacing class (set s & set t = new set with elements common to s and t) """ endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='tspan with letterspacing', transkription_field=transkription_field) word_part_obj = [] else: endSign = tspan_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='svg/text/tspan/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' last_matrix = current_matrix if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\ transkription_field=transkription_field) word_part_obj = [] endSign = '%' def update_and_attach_words2tree(self, page): """Update word ids and attach them to page.page_tree. """ for node in page.page_tree.xpath('//word'): node.getparent().remove(node) for index, word in enumerate(page.words): word.id = index word.attach_word_to_tree(page.page_tree) def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None): """Extracts information about positions of text elements. [:returns:] (datatypes.page) the Page containing all information. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file transkription_field = TranskriptionField(file_name) if bool(self.extract_transkription_field_only) else None svg_tree = ET.parse(file_name) page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\ svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only) sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot()) page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict) if transkription_field is not None: page.init_line_numbers(self.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax) self.extract_word_position(svg_tree, page, transkription_field=transkription_field) if page.pdfFile is not None and isfile(page.pdfFile): pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST) pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field) self.update_and_attach_words2tree(page) + page.create_writing_processes_and_attach2tree() for word_insertion_mark in page.word_insertion_marks: # it is not clear if we really need to know this alternative word ordering. See 'TODO.md' #word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark) word_insertion_mark.attach_object_to_tree(page.page_tree) return page else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None): """Extracts information about positions of text elements and writes them to a xml file. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile) write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition') return 0 else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract the position of the words in a svg file and write them to a xml file. svgscripts/extractWordPosition.py [OPTIONS] svg file OR xml target file containing file name of svg file as "/page/@source". directory containing svg files OPTIONS: -h|--help: show help -d|--xml-dir=xmlDir: target directory for the xml output file(s) -m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s) -o|--only-transkription-field: extract only words that are part of the transkription field. -p|--page=pageNumber: page number of the current page. For use with _one_ file only. -P|--PDF=pdfFile: pdf file - used for word correction -s|--svg=svgFile: svg web file -t|--title=title: title of the manuscript to which the current page(s) belong(s) -x|--xml-target-file=xmlOutputFile: xml target file :return: exit code (int) """ extract_transkription_field_only = False manuscript_file = None page_number = None pdfFile = None svg_file = None title = None xml_target_file = None xml_dir = ".{}xml".format(sep) try: opts, args = getopt.getopt(argv, "hod:m:t:p:s:x:P:", ["help", "only-transkription-field", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 elif opt in ('-o', '--only-transkription-field'): extract_transkription_field_only = True elif opt in ('-d', '--xml-dir'): xml_dir = arg elif opt in ('-m', '--manuscript-file'): manuscript_file = arg elif opt in ('-t', '--title'): title = arg elif opt in ('-p', '--page'): page_number = str(arg) elif opt in ('-s', '--svg'): svg_file = arg elif opt in ('-P', '--PDF'): pdfFile = arg elif opt in ('-x', '--xml-target-file'): xml_target_file = str(arg) files_to_process = list() for arg in args: if isfile(arg): files_to_process.append(arg) elif isdir(arg): files_to_process = files_to_process + list(filter(lambda file: '.svg' in file, listdir(arg))) else: print("'{}' does not exist!".format(arg)) return 2 if len(files_to_process) < 1 or args[0].endswith('xml'): if xml_target_file is None: xml_target_file = args[0] if len(args) > 0 else None if xml_target_file is not None and isfile(xml_target_file): target_file_tree = ET.parse(xml_target_file) file_name = target_file_tree.getroot().get('source') title = target_file_tree.getroot().get('title') if title is None else title page_number = target_file_tree.getroot().get('number') if page_number is None else page_number extract_transkription_field_only = (target_file_tree.getroot().get('transkription-field-only') == 'true')\ if target_file_tree.getroot().get('transkription-field-only') is not None else False if svg_file is None: svg_file = target_file_tree.xpath('.//svg/@file')[0]\ if len(target_file_tree.xpath('.//svg/@file')) > 0 else None files_to_process.insert(0, file_name) if xml_target_file in files_to_process: files_to_process.remove(xml_target_file) else: usage() return 2 if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)): print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!") usage() return 2 extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, extract_transkription_field_only=extract_transkription_field_only) for file in files_to_process: extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/test_page.py =================================================================== --- svgscripts/test_page.py (revision 24) +++ svgscripts/test_page.py (revision 25) @@ -1,95 +1,82 @@ import unittest from os import sep, path from os.path import isdir, dirname, basename import lxml.etree as ET import sys dir_changed = False if not isdir('datatypes'): sys.path.append(dirname(sys.path[0])) dir_changed = True +from datatypes.lineNumber import LineNumber from datatypes.page import Page +from datatypes.writing_process import WritingProcess from datatypes.word import Word -from datatypes.lineNumber import LineNumber class TestPage(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.test_file = DATADIR + sep + 'test.xml' self.test_svg_file = DATADIR + sep + 'test421.svg' def test_Page(self): page = Page(xml_source_file=self.test_file, svg_file=self.test_svg_file) self.assertEqual(page.title, 'Mp XIV 1') self.assertEqual(page.number, '421') self.assertEqual(len(page.sonderzeichen_list), 2) self.assertEqual('st21' in page.sonderzeichen_list, True) self.assertEqual('st23' in page.sonderzeichen_list, True) self.assertEqual(page.style_dict['st0']['fill'], '#F8F9F8') self.assertEqual(page.width, 493.23) + stage0 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 0 ] + stage1 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 1 ] + stage2 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 2 ] + fontStage0 = float(page.style_dict.get(stage0[0]).get('font-size').replace('px','')) + fontStage1 = float(page.style_dict.get(stage1[0]).get('font-size').replace('px','')) + fontStage2 = float(page.style_dict.get(stage2[0]).get('font-size').replace('px','')) + self.assertEqual(fontStage0 > fontStage1, True) + self.assertEqual(fontStage1 > fontStage2, True) def test_get_biggest_fontSize4styles(self): page = Page(xml_source_file=self.test_file) style_set = { 'st12', 'st2', 'st14', 'st13' } self.assertEqual(page.get_biggest_fontSize4styles(style_set=style_set), 10) def test_get_words(self): page = Page(xml_source_file=self.test_file) words = page.words self.assertEqual(len(words), 440) self.assertEqual(words[0].text, '$') self.assertEqual(words[439].text, 'mußte!') - def test_init_words(self): + def test_create_writing_process(self): page = Page(xml_source_file=self.test_file) - words = page.words - """ word insertions are not tested for the moment!! - self.assertEqual(words[31].is_before_inserted_words, True) - self.assertEqual(words[31].word_insertion_mark.id, 0) - self.assertEqual(words[20].is_head_of_inserted_words, True) - self.assertEqual(words[25].is_tail_of_inserted_words, True) - for i in range(20, 25): - self.assertEqual(words[i].word_insertion_mark is not None, True) - """ + page.create_writing_processes_and_attach2tree() + self.assertEqual(page.words[97] in page.writing_processes[WritingProcess.LATER_INSERTION_AND_ADDITION].words, True) + self.assertEqual(page.words[129] in page.writing_processes[WritingProcess.LATER_INSERTION_AND_ADDITION].words, True) def test_init_line_numbers(self): page = Page(xml_source_file=self.test_file) line_numbers = [ LineNumber(id=2, top=20, bottom=40), LineNumber(id=4, top=50, bottom=60), LineNumber(id=6, top=70, bottom=90) ] page.init_line_numbers(line_numbers, 122.345) self.assertEqual(len(page.line_numbers), 7) self.assertEqual(page.line_numbers[0].id, 1) self.assertEqual(page.line_numbers[6].id, 7) self.assertEqual(page.line_numbers[6].top, 91) self.assertEqual(page.line_numbers[6].bottom, 122.345) self.assertEqual(page.get_line_number(122), 7) self.assertEqual(page.get_line_number(92), 7) self.assertEqual(page.get_line_number(22), 2) def test_get_line_number(self): page = Page(xml_source_file=self.test_file) self.assertEqual(page.get_line_number( (page.words[0].transkription_positions[0].bottom+page.words[0].transkription_positions[0].top)/2), 1) self.assertEqual(page.get_line_number( (page.words[27].transkription_positions[0].bottom+page.words[27].transkription_positions[0].top)/2), 2) self.assertEqual(page.get_line_number( (page.words[105].transkription_positions[0].bottom+page.words[105].transkription_positions[0].top)/2), 7) - - def test_get_word_insertion(self): - """ - PAUSED - """ - pass - #page = Page(self.test_file) - #insertions = page.word_insertion_marks - #self.assertEqual(len(insertions), 8) - #for insertion in insertions: - # self.assertEqual(insertion.inserted_words[0].is_head_of_inserted_words, True) - - def test_get_semanticAndDataDictionaries(self): - page = Page(xml_source_file=self.test_file) - #print(page.get_data_dictionary()) - #self.assertEqual(page.get_data_dictionary()['body'].get('svg_image').get('file_name'), page.svg_file) if __name__ == "__main__": unittest.main()