Index: tests_svgscripts/test_py2ttl.py =================================================================== --- tests_svgscripts/test_py2ttl.py (revision 49) +++ tests_svgscripts/test_py2ttl.py (revision 50) @@ -1,89 +0,0 @@ -import unittest -import lxml.etree as ET -from os import sep, path, remove -from os.path import isfile, dirname -from rdflib import Graph, URIRef -import sys - -sys.path.append('py2ttl') -import py2ttl -try: - from py2ttl import Py2TTLConverter -except ImportError: - from py2ttl.py2ttl import Py2TTLConverter -from config import PROJECT_NAME, PROJECT_ONTOLOGY_FILE - -if dirname(dirname(__file__)) not in sys.path: - sys.path.append(dirname(dirname(__file__))) - -from svgscripts.datatypes.word import Word -from svgscripts.datatypes.word_position import WordPosition - -class TestPy2TTL(unittest.TestCase): - """This is the unittest for py2ttl.py2ttl. - @label unittest - """ - def setUp(self): - self.ttl_target = __file__ + 'test.ttl' - - def test_main(self): - Py2TTLConverter.UNITTESTING = True - argv = ['-t', self.ttl_target ] - try: - self.assertEqual(py2ttl.main(argv), 0) - except AttributeError: - self.assertEqual(py2ttl.py2ttl.main(argv), 0) - - def test_init(self): - converter = Py2TTLConverter(project_ontology_file=PROJECT_ONTOLOGY_FILE) - self.assertEqual(converter.project_name, PROJECT_NAME) - - def test_get_semantic_classes(self): - converter = Py2TTLConverter() - classes = converter.get_semantic_classes('svgscripts/datatypes') - self.assertEqual('FaksimileImage' in [ cls.__name__ for cls in classes ], True) - self.assertEqual('Image' in [ cls.__name__ for cls in classes ], True) - self.assertEqual('SemanticClass' in [ cls.__name__ for cls in classes ], False) - - - def test_createProperty(self): - converter = Py2TTLConverter(project_ontology_file=PROJECT_ONTOLOGY_FILE) - converter.createProperty(converter.base_uriref + "#Test", 'test', str, 1) - name_uri = converter.base_uriref + '#hasTest' - self.assertEqual((name_uri, None, None) in converter.project_graph, True) - - def test_createPropertyName(self): - converter = Py2TTLConverter() - name = converter.createPropertyName(property_name='test_asdf_asdf') - self.assertEqual(name, 'hasTestAsdfAsdf') - name = converter.createPropertyName(object_uri=converter.base_uriref + '#Asdf') - self.assertEqual(name, 'hasAsdf') - name = converter.createPropertyName(subject_uri=converter.base_uriref + '#Test',object_uri=converter.base_uriref + '#Asdf') - self.assertEqual(name, 'testBelongsToAsdf') - name = converter.createPropertyName(subject_uri=converter.base_uriref + '#Test') - self.assertEqual(name, 'testBelongsTo') - - def test_get_comment_label(self): - converter = Py2TTLConverter() - comment, label = converter.get_comment_label(TestPy2TTL) - self.assertEqual(label, 'unittest') - self.assertEqual(comment, self.__doc__.split('\n')[0].lstrip()) - - def test_get_builtin_cls_keys(self): - dictionary = WordPosition.get_semantic_dictionary() - converter = Py2TTLConverter() - builtin_cls_keys = converter._get_builtin_cls_keys(dictionary['properties']) - self.assertEqual('width' in builtin_cls_keys, True) - self.assertEqual('height' in builtin_cls_keys, True) - - def test_get_semantic_dictionary_keys_super_first(self): - dict = Word.get_semantic_dictionary() - converter = Py2TTLConverter() - keys = converter._get_semantic_dictionary_keys_super_first(dict['properties']) - self.assertEqual(keys.index('faksimile_positions') < keys.index('transkription_positions'), True) - - def tearDown(self): - isfile(self.ttl_target) and remove(self.ttl_target) - -if __name__ == "__main__": - unittest.main() Index: tests_svgscripts/test_faksimile_image.py =================================================================== --- tests_svgscripts/test_faksimile_image.py (revision 49) +++ tests_svgscripts/test_faksimile_image.py (revision 50) @@ -1,59 +1,68 @@ import unittest from os import sep, path from os.path import isdir, dirname, basename import lxml.etree as ET import sys import sys sys.path.append('svgscripts') from datatypes.faksimile_image import FaksimileImage from datatypes.image import Image +from datatypes.text_field import TextField class TestFaksimileImage(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.svg_file = DATADIR + sep + 'W-II-1,49et50.svg' def test_init(self): image = FaksimileImage(file_name='test.jpg', height=10, width=10) self.assertEqual(image.tag, FaksimileImage.XML_TAG) self.assertEqual(image.width, 10) def test_attach_object_to_tree(self): image = FaksimileImage(file_name='test.jpg', height=10, width=10, x=-100, y=-200) empty_tree = ET.ElementTree(ET.Element('faksimile')) image.attach_object_to_tree(empty_tree) self.assertEqual(image.tag, FaksimileImage.XML_TAG) for node in empty_tree.getroot().xpath('//' + image.tag): self.assertEqual(node.get('file-name'), 'test.jpg') self.assertEqual(node.get('height'), '10') self.assertEqual(node.get('width'), '10') def test_CREATE_IMAGE(self): svg_tree = ET.parse(self.svg_file) image_node = svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap) - image = FaksimileImage.CREATE_IMAGE(image_node) + image = FaksimileImage.CREATE_IMAGE(image_node, self.svg_file) self.assertEqual(image.file_name, 'W-II-1,49et50.jpg') image_node = ET.Element('image') - file_name = 'test.jpg' + file_name = 'W-II-1,49et50.jpg' width = 10 height = 10 image_node.set('href', file_name) image_node.set('height', str(height)) image_node.set('width', str(width)) image = FaksimileImage.CREATE_IMAGE(image_node) self.assertEqual(image.height, height) self.assertEqual(image.width, width) self.assertEqual(image.file_name, file_name) def test_get_semantic_dict(self): image = FaksimileImage(file_name='test.jpg', URL=FaksimileImage.NIETZSCHE_SOURCES_URL + "N-II-1,2et3", height=10, width=10) self.assertEqual(FaksimileImage.get_semantic_dictionary()['class'].get('this'), FaksimileImage) - #print(Image.get_semantic_dictionary()) + #print(FaksimileImage.get_semantic_dictionary()) + + def text_get_image_joined_with_text_field(self): + tf = TextField() + orig_image = FaksimileImage(file_name='test.jpg', URL=FaksimileImage.NIETZSCHE_SOURCES_URL + "N-II-1,2et3", height=10, width=10) + copy_image = orig_image.get_image_joined_with_text_field(tf) + self.assertEqual(copy_image.text_field.width, tf.width) + self.assertEqual(copy_image.id, orig_image.id) + self.assertEqual(copy_image.file_name, orig_image.file_name) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/test_image.py =================================================================== --- tests_svgscripts/test_image.py (revision 49) +++ tests_svgscripts/test_image.py (revision 50) @@ -1,38 +1,50 @@ import unittest from os import sep, path from os.path import isdir, dirname, basename import lxml.etree as ET import sys import sys sys.path.append('svgscripts') -from datatypes.image import Image +from datatypes.image import Image, SVGImage +from datatypes.text_field import TextField class TestImage(unittest.TestCase): def test_init(self): - image = Image(file_name='test.jpg', height=10, width=10) + tf = TextField() + image = Image(file_name='test.jpg', height=10, width=10, text_field=tf) self.assertEqual(image.tag, 'image') self.assertEqual(image.width, 10) + self.assertEqual(image.text_field.width, 0) + node = ET.Element('svg', attrib={'file': 'test.svg', 'height': '10', 'width': '10'}) + image = SVGImage(node=node) + self.assertEqual(image.tag, 'svg-image') + self.assertEqual(image.width, 10) + self.assertEqual(image.file_name, 'test.svg') def test_attach_object_to_tree(self): tag = 'faksimile-image' - image = Image(file_name='test.jpg', height=10, width=10, tag=tag) + tf = TextField() + image = Image(file_name='test.jpg', URL='https://www.google.com', height=10, width=10, text_field=tf, tag=tag) empty_tree = ET.ElementTree(ET.Element('faksimile')) image.attach_object_to_tree(empty_tree) self.assertEqual(image.tag, tag) for node in empty_tree.getroot().xpath('//' + image.tag): self.assertEqual(node.get('file-name'), 'test.jpg') self.assertEqual(node.get('height'), '10') self.assertEqual(node.get('width'), '10') + self.assertEqual(len(node.findall(TextField.XML_TAG)), 1) def test_get_semantic_dict(self): - image = Image(file_name='test.jpg', height=10, width=10) - #print(Image.get_semantic_dictionary()) + #tf = TextField() + #image = Image(file_name='test.jpg', height=10, width=10, text_field=tf) + pass + #print(SVGImage.get_semantic_dictionary()) #self.assertEqual(image.get_data_dictionary()['body'].get('height'), 10) #self.assertEqual(image.get_data_dictionary()['body'].get('width'), 10) if __name__ == "__main__": unittest.main() Index: svgscripts/join_faksimileAndTranskription.py =================================================================== --- svgscripts/join_faksimileAndTranskription.py (revision 49) +++ svgscripts/join_faksimileAndTranskription.py (revision 50) @@ -1,312 +1,313 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to join the data of a faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style from functools import cmp_to_key import getopt import lxml.etree as ET import re import string import sys from operator import attrgetter from os import listdir, sep, path from os.path import exists, isfile, isdir, dirname, basename import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.faksimile import FaksimilePage from datatypes.lineNumber import LineNumber from datatypes.page import Page from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from process_files import update_svgposfile_status __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False PUNCTUATION_PATTERN = r"[{}]".format(string.punctuation) PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"') SINGLE_PUNCTUATION_PATTERN = r"^[{}]$".format(string.punctuation) def get_filelist_and_manuscript_file(file_a, file_b=None): """Returns a file list and a manuscript file (or None) """ file_list = [] manuscript_file = None if isfile(file_a) and file_a.endswith('svg'): file_list.append(file_a) if file_b is not None and isfile(file_b): manuscript_file = file_b elif isfile(file_a) and file_a.endswith('xml'): manuscript_file = file_a if file_b is not None and isfile(file_b): file_list.append(file_b) elif isdir(file_b): file_list = [ svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ] elif isdir(file_a): file_list = [ file_a + sep + svgfile for svgfile in listdir(file_a) if svgfile.endswith('svg') ] if file_b is not None and isfile(file_b): manuscript_file = file_b return file_list, manuscript_file def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None): """Return svg_pos_file and manuscript_file if they are ready for processing. """ svg_pos_file = None manuscript_tree = None if manuscript_file is not None\ and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')): manuscript_tree = ET.parse(manuscript_file) else: title_string = faksimile_page.title.replace(' ', '_') manuscript_file = '.{}xml'.format(sep) + sep + title_string + '.xml'\ if isdir('.{}xml'.format(sep)) else title_string + '.xml' if isfile(manuscript_file): manuscript_tree = ET.parse(manuscript_file) if manuscript_tree is not None: if len(manuscript_tree.getroot().xpath('//page[@number="%s" and @status="OK"]/@output' % faksimile_page.page_number)) > 0: svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0] else: if not UNITTESTING: msg_color = Fore.CYAN if len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0\ else Fore.MAGENTA msg = 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, faksimile_page.page_number)\ if msg_color == Fore.MAGENTA\ else 'Faksimile already joined!' print(msg_color + msg, end='') print(Style.RESET_ALL) return svg_pos_file, manuscript_file def process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=''): """Joins faksimile_positions with text == word_text with words with text == word_text. """ text = word_text if alt_word_text == '' else alt_word_text fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ] words4word = [ word for word in words if word.text == word_text and not word.joined ] if alt_word_text != '': words4word += [ word for word in words if word.text == text and not word.joined ] words4word = sorted(words4word, key=attrgetter('id')) if len(fposition4word) == len(words4word): for index, faksimile_position in enumerate(fposition4word): faksimile_position.joined = True words4word[index].faksimile_positions = [ faksimile_position ] words[words4word[index].id].joined = True new_words.append(words4word[index]) elif len(words4word) < len(fposition4word): if re.match(r'(.*)ss(.*)', text): alt_word_text = re.sub(r'ss', 'ß', text) process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text) elif re.match(SINGLE_PUNCTUATION_PATTERN, text): if text == '-': alt_word_text = text.replace('-', '–') process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text) else: print('single', word_text, len(fposition4word), len(words4word)) elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text): alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text) if alt_word_text != '': pattern = r'(.*){0}(.*)'.format(alt_word_text) words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ] if len(words4word) < len(fposition4word): process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text) else: words4word = sorted(words4word, key=attrgetter('id')) for index, faksimile_position in enumerate(fposition4word): faksimile_position.joined = True words4word[index].faksimile_positions = [ faksimile_position ] if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\ and words4word[index].id+1 < len(words)\ and words[words4word[index].id+1].text == word_text[len(word_text)-1]: words4word[index].join(words[words4word[index].id+1]) words[words4word[index].id+1].joined = True words[words4word[index].id].joined = True words4word[index].text = word_text new_words.append(words4word[index]) else: if len(text) > 1: new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ] if len(new_words4word) == 0: alt_word_text = text[1:] process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text) else: for new_word in new_words4word: collected_text = new_word.text current_word = new_word while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0: previous_word = words[current_word.id-1] if word_text.endswith(previous_word.text + collected_text): words[current_word.id].joined = True previous_word.join(current_word) current_word = previous_word collected_text = current_word.text else: collected_text = previous_word.text + collected_text words4word.append(current_word) words4word = sorted(words4word, key=attrgetter('id')) for index, faksimile_position in enumerate(fposition4word): faksimile_position.joined = True words4word[index].faksimile_positions = [ faksimile_position ] words4word[index].text = word_text words[words4word[index].id].joined = True new_words.append(words4word[index]) else: print('<{0}> {1}/{2}, ids: {3}'.\ format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ])) else: print(word_text, len(fposition4word), len(words4word)) def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, test_word_text=''): """Joins the data of a faksimile file with the data of svgposfile. """ if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='') print(Style.RESET_ALL) faksimile_tree = ET.parse(faksimile_file) namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() } faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces) exit_status = 0 for faksimile_page in faksimile_pages: svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file) if svg_pos_file is not None: if not UNITTESTING: print(Fore.CYAN + 'joining data with file {} ... '.format(svg_pos_file), end='') - page = Page(xml_source_file=svg_pos_file) + image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field) + page = Page(xml_source_file=svg_pos_file, faksimile_image=image4page) words = sort_words(page) faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions) new_words = [] unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\ key=lambda text: len(text)) for word_text in unique_faksimile_words: process_word_text(new_words, word_text, faksimile_positions, words) if False not in [ word.joined for word in words ]\ and False not in [ position.joined for position in faksimile_positions]\ and not UNITTESTING: page.words = sorted(new_words, key=attrgetter('id')) for word_node in page.page_tree.xpath('//word'): word_node.getparent().remove(word_node) for word in page.words: word.attach_word_to_tree(page.page_tree) write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) print(Fore.GREEN + '[OK]') print(Style.RESET_ALL) elif not UNITTESTING: not_joined_fp = [ (position.id, position.text) for position in faksimile_positions if not position.joined ] plural_fp = '' if len(not_joined_fp) < 2 else 's' not_joined_tw = [ (word.id, word.text) for word in words if not word.joined ] plural_tw = '' if len(not_joined_tw) < 2 else 's' print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp)) print(Fore.MAGENTA + '--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw)) print(Style.RESET_ALL) exit_status = 2 elif False in [ word.joined for word in words ]: print([ (word.id, word.text) for word in words if not word.joined ]) exit_status = 2 elif test_word_text != '': print([ (word.id, word.text) for word in new_words if word.text == test_word_text ]) return exit_status def sort_words(page): """Returns sorted words (from top left to bottom right). """ if -1 in [ word.line_number for word in page.words ]: warnings.warn('{} in page file {} for word with ids {}'.format(LineNumber.WARN_NO_LINE_NUMBER, page.page_tree.docinfo.URL, page.page_tree.xpath('//word[not(@line-number)]/@id'))) words = [] for line_number in page.line_numbers: word_on_line = [ word for word in page.words if word.line_number == line_number.id ] if line_number.id % 2 == 0: words += sorted(word_on_line, key=lambda word: word.transkription_positions[0].left) else: words += sorted(word_on_line, key=cmp_to_key(\ lambda wordA, wordB: -1\ if wordA.transkription_positions[0].left < wordB.transkription_positions[0].left\ and abs(wordA.transkription_positions[0].bottom-wordB.transkription_positions[0].bottom) < wordA.transkription_positions[0].height/2\ else 1)) for index, word in enumerate(words): words[index].id = index words[index].joined = False return words def sort_faksimile_positions(faksimile_positions): """Returns sorted words (from top left to bottom right). """ for faksimile_position in faksimile_positions: faksimile_position.joined = False return sorted(faksimile_positions, key=cmp_to_key(\ lambda positionA, positionB: -1\ if positionA.left+positionA.width/2 <= positionB.left+positionB.width/2\ and positionA.top+positionA.height/2 <= positionB.top+positionB.height/2\ else 1\ )\ ) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to join the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION. svgscripts/join_faksimileAndTranskription.py [OPTIONS] [xmlManuscriptFile] a directory containing a svg file containing information about the word positions on the faksimile. a xml file about a manuscript, containing information about its pages. OPTIONS: -h|--help: show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if exists(file_a): file_b = None if len(args) > 1 and exists(args[1]): file_b = args[1] file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b) for faksimile_file in file_list: join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file) else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/local_config.py =================================================================== --- svgscripts/local_config.py (revision 0) +++ svgscripts/local_config.py (revision 50) @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +FAKSIMILE_LOCATION = '/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile' # location of faksimiles Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 49) +++ svgscripts/datatypes/page.py (revision 50) @@ -1,380 +1,403 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path from .class_spec import SemanticClass -from .image import Image +from .image import Image, SVGImage +from .faksimile_image import FaksimileImage from .lineNumber import LineNumber from .path import Path from .positional_word_part import PositionalWordPart from .transkriptionField import TranskriptionField from .writing_process import WritingProcess from .word import Word from .word_insertion_mark import WordInsertionMark class Page(SemanticClass): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. """ UNITTESTING = False WARNING_MISSING_USE_NODE4PWP = PositionalWordPart.WARN_NO_USE_NODE_FOUND WARNING_MISSING_GLYPH_ID4WIM = WordInsertionMark.WARN_NO_GLYPH_ID - def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, pdfFile=None, svg_file=None, orientation='North', extract_transkription_field_only=False): + def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, faksimile_image=None, pdfFile=None, svg_file=None, orientation='North', extract_transkription_field_only=False): self.title = title self.line_numbers = [] self.style_dict = {} self.sonderzeichen_list = [] self.svg_file = None + self.svg_image = None self.pdfFile = None self.source = None self.number = page_number if page_number is not None else -1 self.orientation = orientation self.word_deletion_paths = [] + self.faksimile_image = faksimile_image if xml_source_file is not None: if isfile(xml_source_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_source_file, parser) self.title = self.page_tree.getroot().get('title') self.number = self.page_tree.getroot().get('number') self.source = self.page_tree.getroot().get('source') self.orientation = self.page_tree.getroot().get('orientation') self.init_words() self.add_style(style_node=self.page_tree.getroot().find('.//style')) + self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ + if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None + self.svg_image = SVGImage(node=self.page_tree.xpath('.//' + SVGImage.XML_TAG)[0])\ + if len(self.page_tree.xpath('.//' + SVGImage.XML_TAG)) > 0 else None + self.faksimile_image = FaksimileImage(node=self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)[0])\ + if len(self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)) > 0 else None self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 - self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ - if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None if pdfFile is not None and self.pdfFile is None: self.pdfFile = pdfFile ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) + if faksimile_image is not None: + self.faksimile_image = faksimile_image + self.faksimile_image.attach_object_to_tree(self.page_tree) if svg_file is not None and self.svg_file is None: self.svg_file = svg_file tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) - ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) + self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height) + self.svg_image.attach_object_to_tree(self.page_tree) + if self.svg_image is not None and self.svg_file is None: + self.svg_file = self.svg_image.file_name + if self.svg_image is not None and self.width == 0.0: + self.width = self.svg_image.width + if self.svg_image is not None and self.height == 0.0: + self.height = self.svg_image.height else: raise Exception('File "{}" does not exist!'.format(xml_source_file)) elif xml_target_file is not None: self.word_insertion_marks = [] self.words = [] self.writing_processes = [] self.svg_file = svg_file self.pdfFile = pdfFile if isfile(xml_target_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_target_file, parser) self.source = self.page_tree.getroot().get('source') if bool(self.page_tree.getroot().get('orientation')): self.orientation = self.page_tree.getroot().get('orientation') elif orientation is not None: self.page_tree.getroot().set('orientation', orientation) if bool(self.page_tree.getroot().get('title')): self.title = self.page_tree.getroot().get('title') elif title is not None: self.page_tree.getroot().set('title', title) if self.svg_file is None: self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 elif len(self.page_tree.xpath('.//svg/@file')) == 0: tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) - ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) + self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height) + self.svg_image.attach_object_to_tree(self.page_tree) + #ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) else: self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 if self.pdfFile is None: self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None elif len(self.page_tree.xpath('.//pdf/@file')) == 0: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG,\ WritingProcess.XML_TAG, Path.WORD_DELETION_PATH_TAG ]: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) else: self.page_tree = ET.ElementTree(ET.Element('page')) self.pdfFile = pdfFile self.svg_file = svg_file if title is not None: self.page_tree.getroot().set('title', title) if orientation is not None: self.page_tree.getroot().set('orientation', orientation) self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower()) if page_number is not None: self.page_tree.getroot().set('number', str(page_number)) if self.pdfFile is not None: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) if self.svg_file is not None: tf = TranskriptionField(self.svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) - ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) - self.svg_image = Image(file_name=self.svg_file, height=self.height, width=self.width) if self.svg_file is not None\ - else None + self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height) + self.svg_image.attach_object_to_tree(self.page_tree) + #ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) + if self.svg_image is None and self.svg_file is not None: + self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height) + self.svg_image.attach_object_to_tree(self.page_tree) def categorize_paths(self, transkription_field=None): """Categorize all paths that are part of the transkription field. """ if self.source is not None and isfile(self.source): MAX_HEIGHT_LINES = 1 max_line = sorted(\ [line_number.bottom-line_number.top for line_number in self.line_numbers if line_number.id % 2 == 0],\ reverse=True)[0] + 2 if len(self.line_numbers) > 0 else 17 tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0 tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0 paths, attributes = svg_to_paths.svg2paths(self.source) allpaths_on_tf = [] if transkription_field is not None: for index in range(0, len(paths)): path = paths[index] attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and path.start.real > tr_xmin\ and path.end.real < transkription_field.xmax: allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class'))) text_area_deletion_paths = [] deletion_or_underline_paths = [] box_paths = [] dots_paths = [] word_connector_paths = [] uncategorized_paths = [] for mypath in allpaths_on_tf: xmin, xmax, ymin, ymax = mypath.path.bbox() start_line_number = self.get_line_number(mypath.path.start.imag-tr_ymin) if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1: dots_paths.append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and not mypath.path.isclosed(): deletion_or_underline_paths.append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed(): box_paths.append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed(): word_connector_paths.append(mypath) elif abs(ymax-ymin) < MAX_HEIGHT_LINES: deletion_or_underline_paths.append(mypath) elif start_line_number != -1 and start_line_number != self.get_line_number(mypath.path.end.imag-tr_ymin): text_area_deletion_paths.append(mypath) else: uncategorized_paths.append(mypath) self.mark_words_intersecting_with_paths_as_deleted(deletion_or_underline_paths, tr_xmin, tr_ymin) elif not Page.UNITTESTING: error_msg = 'Svg source file {} does not exist!'.format(self.source)\ if self.source is not None else 'Page does not contain a source file!' raise FileNotFoundError(error_msg) def init_line_numbers(self, line_numbers, document_bottom): """Init line numbers. """ even_index = 0 MINABOVE = 1 self.line_numbers = [] if len(line_numbers) > 0: first_line_bottom = line_numbers[even_index].top - MINABOVE self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 while even_index < len(line_numbers): self.line_numbers.append(LineNumber(id=line_numbers[even_index].id-1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=line_numbers[even_index].top-MINABOVE)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 self.line_numbers.append(LineNumber(id=line_numbers[even_index-1].id+1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=document_bottom)) for line_number in self.line_numbers: line_number.attach_object_to_tree(self.page_tree) def init_words(self): self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.CREATE_WORD(word_node=word_node) for word_node in self.page_tree.getroot().xpath('//word') ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] self.word_deletion_paths = [ Path(node=node) for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG) ] """ for index, word in enumerate(self.words): for word_insertion_mark in self.word_insertion_marks: self.words[index] = word_insertion_mark.attach_and_update_word_if_involved(word) if self.words[index] != word: break """ def create_writing_processes_and_attach2tree(self): """Creates three stages of Nietzsche's process of writing. """ self.writing_processes = [ WritingProcess(version=WritingProcess.FIRST_VERSION),\ WritingProcess(version=WritingProcess.INSERTION_AND_ADDITION),\ WritingProcess(version=WritingProcess.LATER_INSERTION_AND_ADDITION) ] for writing_process in self.writing_processes: writing_process.attach_object_to_tree(self.page_tree) for word in self.words: for transkription_position in word.transkription_positions: for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): if font_key in self.fontsizekey2stage_mapping.keys(): transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key) def add_style(self, sonderzeichen_list=[], letterspacing_list=[], style_dict={}, style_node=None): """Adds a list of classes that are sonderzeichen and a style dictionary to page. """ self.sonderzeichen_list = sonderzeichen_list self.letterspacing_list = letterspacing_list self.style_dict = style_dict if style_node is not None: self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') } self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ] self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('letterspacing-list')) ] elif bool(self.style_dict): style_node = ET.SubElement(self.page_tree.getroot(), 'style') if len(self.sonderzeichen_list) > 0: style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list)) if len(self.letterspacing_list) > 0: style_node.set('letterspacing-list', ' '.join(self.letterspacing_list)) for key in self.style_dict.keys(): self.style_dict[key]['name'] = key ET.SubElement(style_node, 'class', attrib=self.style_dict[key]) fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value } fontsizes = sorted(fontsize_dict.values(), reverse=True) # create a mapping between fontsizes and word stages self.fontsizekey2stage_mapping = {} for fontsize_key, value in fontsize_dict.items(): if value >= fontsizes[0]-1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION }) elif value <= fontsizes[len(fontsizes)-1]+1: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION }) else: self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION }) def add_source(self, source): """Adds a source to page and attaches it to page_tree. """ self.source = source self.page_tree.getroot().set('source', self.source) def get_biggest_fontSize4styles(self, style_set={}): """Returns biggest font size from style_dict for a set of style class names. [:returns:] (float) biggest font size OR 1 if style_dict is empty """ if bool(self.style_dict): sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True) return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1 else: return 1 def get_line_number(self, y): """Returns line number id for element at y. [:return:] (int) line number id or -1 """ if len(self.line_numbers) > 0: result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ] return result_list[0] if len(result_list) > 0 else -1 else: return -1 @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'title': (str, 1, '/page/@title'), 'number': (str, 1, '/page/@number'),\ + 'image': { 'class': Image, 'cardinality': 1, 'xpath': '/page/{}'.format(FaksimileImage.XML_TAG)},\ 'line_numbers': (LineNumber, SemanticClass.LIST, '/page/@number|/page/@title'),\ 'orientation': { 'class': str, 'cardinality': 1, 'xpath': '/page/@orientation'},\ 'words': (Word, SemanticClass.LIST, '/page/@number|/page/@title'),\ - 'svg_image': (Image, 1, '/page/svg'),\ + 'svg_image': { 'class': SVGImage, 'cardinality': 1, 'xpath': '/page/{}'.format(SVGImage.XML_TAG)},\ 'writing_processes': (WritingProcess, SemanticClass.LIST, '/page/@number|/page/@title'),\ 'word_deletion_paths': (Path, SemanticClass.LIST, '/page/@number|/page/@title'),\ 'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST, '/page/@number|/page/@title')} dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks all words that intersect with deletion paths as deleted and adds these paths to word_deletion_paths. [:return:] list of .path.Path that might be word_underline_paths """ if not Page.UNITTESTING: bar = Bar('mark words that intersect with deletion paths', max=len(self.words)) for word in self.words: not bool(Page.UNITTESTING) and bar.next() for transkription_position in word.transkription_positions: first_pwp = transkription_position.positional_word_parts[0] last_pwp = transkription_position.positional_word_parts[len(transkription_position.positional_word_parts)-1] xmin = tr_xmin + first_pwp.left xmax = tr_xmin + last_pwp.left + last_pwp.width ymin = tr_ymin + sorted(pwp.top for pwp in transkription_position.positional_word_parts)[0] ymax = tr_ymin + sorted([pwp.bottom for pwp in transkription_position.positional_word_parts], reverse=True)[0] word_path = parse_path('M {}, {} L {}, {} L {}, {} L {}, {} z'.format(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax)) intersecting_paths = [ deletion_path for deletion_path in deletion_paths\ if do_paths_intersect_saveMode(deletion_path.path, word_path) ] if len(intersecting_paths) > 0: word.deleted = True for deletion_path in intersecting_paths: if deletion_path not in self.word_deletion_paths: deletion_path.tag = Path.WORD_DELETION_PATH_TAG deletion_path.attach_object_to_tree(self.page_tree) self.word_deletion_paths.append(deletion_path) not bool(Page.UNITTESTING) and bar.finish() # return those paths in deletion_paths that are not in self.word_deletion_paths return [ word_underline_path for word_underline_path in set(deletion_paths) - set(self.word_deletion_paths) ] def do_paths_intersect_saveMode(path1, path2): """Returns true if paths intersect, false if not or if there was an exception. """ try: return path1.intersect(path2, justonemode=True) except AssertionError: return False Index: svgscripts/datatypes/image.py =================================================================== --- svgscripts/datatypes/image.py (revision 49) +++ svgscripts/datatypes/image.py (revision 50) @@ -1,89 +1,113 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent all image types. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from .attachable_object import AttachableObject from .class_spec import SemanticClass +from .text_field import TextField class Image(AttachableObject,SemanticClass): """ This super class represents all types of images. Args: - file_name (str): name of the image file. - node (lxml.etree.Element) node, containing information - URL (str): URL of image file. - height (float): height of image - width (float): width of image + file_name (str): name of the image file. + node (lxml.etree.Element) node, containing information + URL (str): URL of image file. + height (float): height of image + width (float): width of image + text_field (.text_field.TextField) text_field on image representation """ - stringKeys = [ 'file_name', 'URL' ] + stringKeys = [ 'file_name', 'URL', 'local_path' ] floatKeys = [ 'height', 'width' ] XML_TAG = 'image' - def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, tag=XML_TAG): + def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG): + self.text_field = text_field if node is not None: self.file_name = node.get('file-name') - self.URL = node.get('absolute-path') + self.local_path = node.get('local-path') + self.URL = node.get('URL') self.height = float(node.get('height')) self.width = float(node.get('width')) + if len(node.findall(TextField.XML_TAG)) > 0: + self.text_field = TextField(node=node.find(TextField.XML_TAG)) else: self.tag = tag self.file_name = file_name + self.local_path = local_path self.URL = URL self.height = height self.width = width def attach_object_to_tree(self, target_tree): """Attach object to tree. """ obj_node = target_tree.getroot().find('.//' + self.tag) \ if(len(target_tree.getroot().findall('.//' + self.tag)) > 0) \ else ET.SubElement(target_tree.getroot(), self.tag) for key in self.floatKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) for key in self.stringKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), self.__dict__[key]) + if self.text_field is not None: + self.text_field.attach_object_to_tree(obj_node) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {} properties.update(dict(zip(Image.floatKeys, [ (float, 1, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in Image.floatKeys]))) properties.update({'file_name': (str, 1, '{}/@file-name'.format(cls.XML_TAG))}) properties.update({'URL': (str, 0, '{}/@absolute-path'.format(cls.XML_TAG))}) + properties.update({'text_field': (TextField, 0, '{}/{}'.format(cls.XML_TAG, TextField.XML_TAG))}) dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary +class SVGImage(Image): + """This class represents a svg image. + """ + XML_TAG = 'svg-image' + + def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG): + if node is not None and node.tag != self.XML_TAG: + file_name = node.get('file') + height = float(node.get('height')) if bool(node.get('height')) else 0.0 + width = float(node.get('width')) if bool(node.get('width')) else 0.0 + node = None + super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\ + height=height, width=width, text_field=text_field, tag=self.XML_TAG) + Index: svgscripts/datatypes/faksimile.py =================================================================== --- svgscripts/datatypes/faksimile.py (revision 49) +++ svgscripts/datatypes/faksimile.py (revision 50) @@ -1,135 +1,135 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a faksimile page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import re from lxml import etree as ET from os import path from os.path import isdir, isfile, sep, basename from svgpathtools.parser import parse_path from .faksimile_image import FaksimileImage from .matrix import Matrix from .super_page import SuperPage from .text_field import TextField from .word_position import WordPosition class FaksimilePage(SuperPage): """ This class represents a faksimile page. Args: xml_target_file (str): name of the xml file to which page info will be written. xml_source_file (str): name of the xml file that will be instantiated. """ XML_TAG = 'faksimile-page' def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None): xml_file = xml_source_file if xml_source_file is not None else xml_target_file super(FaksimilePage, self).__init__(xml_file=xml_file, title=title, page_number=page_number, tag=self.XML_TAG) if xml_target_file is not None: self.remove_tags_from_page_tree([WordPosition.FAKSIMILE]) if svg_source_file is not None: self.page_tree.getroot().set('svg-source-file', svg_source_file) if faksimile_image is not None: faksimile_image.attach_object_to_tree(self.page_tree) if text_field is not None: text_field.attach_object_to_tree(self.page_tree) self.svg_source_file = self.page_tree.getroot().get('svg-source-file') self.faksimile_image = FaksimileImage(node=self.page_tree.getroot().find('.//' + FaksimileImage.XML_TAG))\ if len(self.page_tree.getroot().findall('.//' + FaksimileImage.XML_TAG)) > 0 else None self.text_field = TextField(node=self.page_tree.getroot().find('.//' + TextField.XML_TAG))\ if len(self.page_tree.getroot().findall('.//' + TextField.XML_TAG)) > 0 else None self.word_positions = [ WordPosition(node=node) for node in self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE) ]\ if len(self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE)) > 0 else [] def append_word_position(self, word_position): """Appends word_position to word_positions and attaches it to page_tree. """ self.word_positions.append(word_position) word_position.attach_object_to_tree(self.page_tree) @staticmethod def GET_FAKSIMILEPAGES(svg_tree, namespaces=None): """Creates and returns text fields contained in a svg_file as a list. """ THRESHOLD_X = 10 if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } - image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap)) source_file_name = svg_tree.docinfo.URL + image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap), source_file_name) xml_dir = '.{}xml'.format(sep) faksimile_pages = list() title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name)) title = title_string.replace('-', ' ') rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap) if rect.get('id', svg_tree.getroot().nsmap).startswith(title_string) ] for text_field_rect in rect_list: tf_x = float(text_field_rect.get('x', svg_tree.getroot().nsmap)) - image.x tf_y = float(text_field_rect.get('y', svg_tree.getroot().nsmap)) - image.y tf_width = float(text_field_rect.get('width', svg_tree.getroot().nsmap)) tf_height = float(text_field_rect.get('height', svg_tree.getroot().nsmap)) id = text_field_rect.get('id', svg_tree.getroot().nsmap) target_file_name = xml_dir + sep + id + '.xml' if isdir(xml_dir) else id + '.xml' page_number = re.sub(r'.*[,_]', '', id) text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y) faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\ title=title, page_number=page_number, faksimile_image=image, text_field=text_field) x_min = text_field.xmin + image.x y_min = text_field.ymin + image.y rect_titles = svg_tree.getroot().xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}"]/ns:title'.format(\ x_min, text_field.xmax + image.x - THRESHOLD_X, y_min, text_field.ymax + image.y, text_field.id), namespaces=namespaces) rect_titles += svg_tree.getroot().xpath('//ns:path/ns:title', namespaces=namespaces) for rect_title in rect_titles: rect = rect_title.getparent() x, y, height, width = 0.0, 0.0, 0.0, 0.0 if rect.tag.endswith('path') and rect.get('d') != 0: path = parse_path(rect.get('d')) x, xmax, y, ymax = path.bbox() width = xmax - x height = ymax - y if x < x_min or x > text_field.xmax + image.x - THRESHOLD_X\ or y < y_min or y > text_field.ymax + image.y\ or rect.get('id') == text_field.id: break else: x = float(rect.get('x', svg_tree.getroot().nsmap)) y = float(rect.get('y', svg_tree.getroot().nsmap)) height = float(rect.get('height', svg_tree.getroot().nsmap)) width = width=float(rect.get('width', svg_tree.getroot().nsmap)) matrix = None if bool(rect.get('transform')): matrix = Matrix(transform_matrix_string=rect.get('transform')) faksimile_page.append_word_position(\ WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=rect_title.text, height=height,\ width=width, x=x-x_min, y=y-y_min, matrix=matrix, tag=WordPosition.FAKSIMILE)) faksimile_pages.append(faksimile_page) return faksimile_pages Index: svgscripts/datatypes/faksimile_image.py =================================================================== --- svgscripts/datatypes/faksimile_image.py (revision 49) +++ svgscripts/datatypes/faksimile_image.py (revision 50) @@ -1,71 +1,92 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent faksimile images. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" +import fnmatch from lxml import etree as ET -from os.path import isfile +import os +from os.path import basename, dirname, isfile, realpath, sep +import sys from .image import Image +sys.path.append('svgscripts') +from local_config import FAKSIMILE_LOCATION class FaksimileImage(Image): """ This class represents a faksimile image. Args: file_name (str): name of the image file. node (lxml.etree.Element) node, containing information URL (str): URL of image file. height (float): height of image width (float): width of image x (float): x y (float): y """ XML_TAG = 'faksimile-image' NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/' - def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0): - super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL,\ - height=height, width=width, tag=self.XML_TAG) + def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, text_field=None): + super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL, local_path=local_path,\ + height=height, width=width, text_field=text_field, tag=self.XML_TAG) self.x = x self.y = y + def get_image_joined_with_text_field(self, text_field): + """Returns a new instance of itself that has a text_field (text_field.TextField). + """ + return FaksimileImage(file_name=self.file_name, local_path=self.local_path, URL=self.URL, height=self.height,\ + width=self.width, x=self.x, y=self.y, text_field=text_field) + @staticmethod - def CREATE_IMAGE(image_node): + def CREATE_IMAGE(image_node, source_file=None): """Instantiates a FaksimileImage from a (lxml.etree.Element) image_node. """ namespaces = image_node.nsmap if len(namespaces) == 0: namespaces = { 'xlink': '' } - file_name = image_node.get('{%s}href' % namespaces['xlink']) + local_path = image_node.get('{%s}href' % namespaces['xlink']) + file_name = basename(local_path) + if file_name != local_path and source_file is not None: + local_path = realpath(dirname(source_file)) + sep + local_path + local_path = realpath(local_path) + if not isfile(local_path): + local_path = None + for path, dirs, files in os.walk(os.path.abspath(FAKSIMILE_LOCATION)): + for filename in fnmatch.filter(files, file_name): + local_path = os.path.join(path, filename) + break URL = FaksimileImage.NIETZSCHE_SOURCES_URL + file_name.replace('.jpg','') height = float(image_node.get('height')) if bool(image_node.get('height')) else 0.0 width = float(image_node.get('width')) if bool(image_node.get('width')) else 0.0 x = float(image_node.get('x')) if bool(image_node.get('x')) else 0.0 y = float(image_node.get('y')) if bool(image_node.get('y')) else 0.0 - return FaksimileImage(file_name=file_name, URL=URL, height=height, width=width, x=x, y=y) + return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y)