Index: svgscripts/test_word_insertion_mark.py =================================================================== --- svgscripts/test_word_insertion_mark.py (revision 27) +++ svgscripts/test_word_insertion_mark.py (revision 28) @@ -1,58 +1,63 @@ import unittest from os import sep, path from os.path import dirname, isdir import lxml.etree as ET from datatypes.transkriptionField import TranskriptionField from datatypes.word_insertion_mark import WordInsertionMark from datatypes.word import Word class TestWordInsertionMark(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.reference_file = DATADIR + sep + 'test_find_word.xml' self.test_svg_file = DATADIR + sep + 'path_svg.svg' def test_WIM_with_input(self): wim = WordInsertionMark(x=1.0, y=1.0, previous_word_id=0, inserted_word_id=1) self.assertEqual(wim.id, '0') self.assertEqual(wim.left, 1.0) self.assertEqual(wim.previous_word_id, 0) def test_WIM_with_node(self): mylist = { 'id': '0', 'left': '1.0', 'top': '1.0', 'height': '0', 'width': '0', 'bottom': '0', 'previous-word-id': '0', 'inserted-word-id': '1' } node = ET.Element(WordInsertionMark.XML_TAG, attrib=mylist) wim = WordInsertionMark(wim_node=node) self.assertEqual(wim.id, '0') self.assertEqual(wim.left, 1.0) self.assertEqual(wim.previous_word_id, 0) def test_WIM_attach_object_to_tree(self): empty_tree= ET.parse(self.reference_file) for node in empty_tree.xpath('//freehand'): node.getparent().remove(node) newWim = WordInsertionMark(x=1.0, y=1.0, previous_word_id=0) newWim.attach_object_to_tree(empty_tree) newWim = WordInsertionMark(id=1,x=1.0, y=1.0, previous_word_id=0) newWim.attach_object_to_tree(empty_tree) self.assertEqual(len(empty_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG)), 2) for wim_node in empty_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG): wim = WordInsertionMark(wim_node=wim_node) self.assertEqual(wim.left, 1.0) self.assertEqual(wim.top, 1.0) self.assertEqual(wim.previous_word_id, 0) def test_CREATE_WIM(self): svg_tree = ET.parse(self.test_svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } xmin = 311.8125 ymin = 158.0117 x = 261.865 y = 15.9 wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_tree, namespaces, x=x, y=y, xmin=xmin, ymin=ymin, line_number=1) self.assertEqual(wim.symbol_id, 'glyph2-1') + + def test_get_semantic_dictionary(self): + dictionary = WordInsertionMark.get_semantic_dictionary() + self.assertEqual('previous_word_id' in dictionary['properties'].keys(), True) + if __name__ == "__main__": unittest.main() Index: svgscripts/test_transkription_position.py =================================================================== --- svgscripts/test_transkription_position.py (revision 27) +++ svgscripts/test_transkription_position.py (revision 28) @@ -1,83 +1,87 @@ import unittest from os import sep, path from os.path import dirname, isdir, isfile import lxml.etree as ET from datatypes.debug_message import DebugMessage from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.positional_word_part import PositionalWordPart from datatypes.transkription_position import TranskriptionPosition from datatypes.transkriptionField import TranskriptionField from datatypes.word_position import WordPosition class TestTranskriptionPosition(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.test_svg_file = DATADIR + sep + 'W_I_8_page125_web.svg' self.test_xml = DATADIR + sep + 'W_I_8_page125.xml' self.dir = DATADIR def test_init(self): dmsg = DebugMessage(message='test') word_position = TranskriptionPosition(id=1, height=10, width=10, x=0, y=10, debug_message=dmsg) self.assertEqual(word_position.tag, WordPosition.TRANSKRIPTION) self.assertEqual(word_position.id, '1') self.assertEqual(word_position.debug_message.message, 'test') self.assertEqual(word_position.height, 10) self.assertEqual(word_position.top, 10) self.assertEqual(word_position.bottom, 20) self.assertEqual(word_position.left, 0) self.assertEqual(word_position.isOnTranskription(), True) self.assertEqual(word_position.isOnFaksimile(), False) def test_attach_object_to_tree(self): matrix = Matrix('matrix(0 0 0 0 0 0)') dmsg = DebugMessage(message='test') pwps = [ PositionalWordPart(text='test') ] word_position = TranskriptionPosition(id=1, height=10, width=10, x=0, y=10, matrix=matrix, debug_message=dmsg, positional_word_parts=pwps) empty_tree = ET.ElementTree(ET.Element('page')) word_position.attach_object_to_tree(empty_tree) #print(ET.dump(empty_tree.getroot())) for node in empty_tree.getroot().xpath('//' + word_position.tag): self.assertEqual(node.get('id'), '1') self.assertEqual(node.get('bottom'), '20') self.assertEqual(node.get('transform'), matrix.toString()) self.assertEqual(node.get('writing-process-id'), '-1') word_position = TranskriptionPosition(node=empty_tree.getroot().find('.//' + word_position.tag)) self.assertEqual(word_position.height, 10) self.assertEqual(word_position.debug_message is not None, True) self.assertEqual(word_position.debug_message.message, 'test') self.assertEqual(len(word_position.positional_word_parts), 1) def test_CREATE_TRANSKRIPTION_POSITION_LIST(self): page = Page(xml_source_file=self.test_xml, svg_file=self.test_svg_file) tf = TranskriptionField(page.svg_file) word_part_objs = [{'text': 'es', 'class': 'st5 st6', 'x': 258.148, 'y': '8.5' }] transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf) self.assertEqual(transkription_positions[0].top, 3.829) self.assertEqual(transkription_positions[0].height, 5.672) word_part_objs = [{'text': 'Meine', 'class': 'st5 st8', 'x': 8.504, 'y': 70.5 }] transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf) self.assertEqual(transkription_positions[0].height, 11.11) self.assertEqual(transkription_positions[0].top, 61.266) self.assertEqual(transkription_positions[0].bottom, 72.376) def test_CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(self): page = Page(xml_source_file=self.test_xml, svg_file=self.test_svg_file) tf = TranskriptionField(page.svg_file) word_part_objs = [{'text': 'Meine', 'class': 'st5 st8', 'x': 8.504, 'y': 70.5 }] transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf) transkription_positions[0].positional_word_parts[2].transform = Matrix('rotate(20)') transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, transkription_positions[0].positional_word_parts) self.assertEqual(len(transkription_positions), 3) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf) transkription_positions[0].positional_word_parts[0].style_class = 'st5 st10' transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, transkription_positions[0].positional_word_parts) self.assertEqual(len(transkription_positions), 2) + + def test_get_semantic_dictionary(self): + dictionary = TranskriptionPosition.get_semantic_dictionary() + self.assertEqual(TranskriptionPosition.XML_TAG in dictionary['properties'].get('writing_process_id').get('xpath'), True) if __name__ == "__main__": unittest.main() Index: svgscripts/test_extractWordPosition.py =================================================================== --- svgscripts/test_extractWordPosition.py (revision 27) +++ svgscripts/test_extractWordPosition.py (revision 28) @@ -1,195 +1,183 @@ import unittest import os from os import sep, path from os.path import isfile, isdir, dirname import re import shutil import tempfile import lxml.etree as ET import extractWordPosition from myxmlwriter import write_pretty from datatypes.transkriptionField import TranskriptionField from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.pdf import PDFText from datatypes.word import Word from datatypes.lineNumber import LineNumber from datatypes.word_insertion_mark import WordInsertionMark class TestExtractor(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' self.test_file_find_word = DATADIR + sep + 'test_find_word.xml' self.test_dir = tempfile.mkdtemp() self.title = 'ABC 111' self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)' self.test_file = DATADIR + sep + 'Mp_XIV_1_mytest_421.svg' self.xml420 = DATADIR + sep + 'Mp_XIV_1_page420.xml' self.pdf420 = DATADIR + sep + 'Mp_XIV_1_online_420.pdf' self.pdf_file = DATADIR + sep + 'W_I_8_page125.pdf' self.faulty_xml = DATADIR + sep + 'W_I_8_faulty_page125.xml' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' def test_main(self): + extractWordPosition.Extractor.UNITTESTING = True argv = ['-d', self.test_dir, '-o', '--title=My Hero', '--page=1', self.test_file] self.assertEqual(extractWordPosition.main(argv), 0) def test_get_page_number(self): extractor = extractWordPosition.Extractor() self.assertEqual(extractor.get_page_number(self.test_file, page_number='1'), '001') self.assertEqual(extractor.get_page_number(self.test_file), '421') def test_get_file_name(self): extractor = extractWordPosition.Extractor() self.assertEqual(extractor.get_file_name(self.test_file), 'xml/Mp_XIV_1_mytest_421.xml') extractor = extractWordPosition.Extractor(title=self.title) self.assertEqual(extractor.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_'))) extractorA = extractWordPosition.Extractor(title=self.title) extractorB = extractWordPosition.Extractor(manuscript_file=extractorA.manuscript_file) self.assertEqual(extractorB.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_'))) def test_get_style(self): extractor = extractWordPosition.Extractor() svg_tree = ET.parse(self.test_file) sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot()) self.assertEqual(sonderzeichen_list, [ 'st21', 'st23']) self.assertEqual(style_dict.get('st11').get('font-family'), 'Frutiger-Europeen') self.assertEqual(style_dict.get('st5').get('stroke'), '#CED5CE') def test_get_word_from_part_obj(self): extractor = extractWordPosition.Extractor() mylist = [{'text': 'a', 'class': 'asdf' }, {'text': 'b', 'endX': 0 }, {'text': 'c'}] self.assertEqual(extractor.get_word_from_part_obj(mylist), 'abc') def test_get_bottoms(self): svg_tree = ET.parse(self.test_file) extractor = extractWordPosition.Extractor() mybottoms = extractor.get_bottoms(svg_tree.getroot()) self.assertEqual(mybottoms[0], '57.1914') self.assertEqual(len(mybottoms), 106) self.assertEqual(mybottoms[len(mybottoms)-1], '1155.6899') mybottoms = extractor.get_bottoms(svg_tree.getroot(), from_position=100.0, to_position=800.0) self.assertEqual(mybottoms[0], '100.5132') self.assertEqual(len(mybottoms), 84) self.assertEqual(mybottoms[len(mybottoms)-1], '792.8218') tf = TranskriptionField(self.test_file) mybottoms = extractor.get_bottoms(svg_tree.getroot(), transkription_field=tf) self.assertEqual(mybottoms[0], '91.7134') self.assertEqual(len(mybottoms), 75) self.assertEqual(mybottoms[len(mybottoms)-1], '681.7134') def test_get_text_items(self): svg_tree = ET.parse(self.test_file) extractor = extractWordPosition.Extractor() mytest_items = [ x for x in extractor.get_text_items(svg_tree.getroot()) ] self.assertEqual(len(mytest_items), 300) self.assertEqual(mytest_items[0].get('transform'), 'matrix(1 0 0 1 386.8218 57.1914)') tf = TranskriptionField(self.test_file) mytest_itemsTF = [ x for x in extractor.get_text_items(svg_tree.getroot(), transkription_field=tf) ] self.assertEqual(mytest_itemsTF[0].get('transform'), 'matrix(1 0 0 1 204.8618 91.7134)') def test_init_tree_and_target_file(self): target_file = 'xml/testA.xml' page = Page(xml_target_file=target_file, title=self.title) tree = page.page_tree self.assertEqual(tree.getroot().get('title'), self.title) self.assertEqual(tree.getroot().findall('./style'), []) write_pretty(xml_element_tree=tree, file_name=target_file) page = Page(xml_target_file=target_file) tree = page.page_tree self.assertEqual(tree.getroot().get('title'), self.title) self.assertEqual(tree.getroot().findall('./style'), []) isfile(target_file) and os.remove(target_file) def test_add_style(self): extractor = extractWordPosition.Extractor() svg_tree = ET.parse(self.test_file) sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot()) target_file = 'xml/testA.xml' page = Page(xml_target_file=target_file,title=self.title) page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict) write_pretty(xml_element_tree=page.page_tree, file_name=target_file) fromTarget_xml_tree = ET.parse(target_file) self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title) self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23") self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE') self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen') page = Page(xml_target_file=target_file) page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict) write_pretty(xml_element_tree=page.page_tree, file_name=target_file) fromTarget_xml_tree = ET.parse(target_file) self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title) self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23") self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE') self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen') isfile(target_file) and os.remove(target_file) def test_add_word(self): extractor = extractWordPosition.Extractor() svg_tree = ET.parse(self.test_file) mylist = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }] matrix = Matrix(self.matrix_string) for dict in mylist: dict['class'] = 'st22' dict['x'] = matrix.add2X(0) dict['y'] = matrix.getY() target_file = self.test_dir + sep + 'asdfasdf.xml' page = Page(xml_target_file=target_file) sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot()) page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict) self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 1) mylist[1]['text'] = 'A' mylist[1]['class'] = 'st21' mylist[1]['x'] = matrix.add2X(1) self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 2) extractor.update_and_attach_words2tree(page) #self.assertEqual(page.word_insertion_marks[0].x, 184.656) #self.assertEqual(page.word_insertion_marks[0].y, 197.913) self.assertEqual(page.page_tree.getroot().xpath('//word[@id="1"]')[0].get('text'), 'a') self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]')[0].get('text'), 'c') self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('left'), '183.506') self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('height'), '8.25') - - def test_find_inserted_words(self): - """PAUSED - """ - """ - reference_tree = ET.parse(self.test_file_find_word) - extractor = extractWordPosition.Extractor() - svg_tree = ET.parse(self.test_file) - page = Page(xml_source_file=self.test_file_find_word) - for word_insertion in [ WordInsertionMark(wim_node=node) for node in reference_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG()) ]: - words = extractor.find_inserted_words(page.page_tree, word_insertion) - self.assertEqual([ str(word.id) for word in words ], [ str(word.id) for word in word_insertion.inserted_words]) - """ def test_extractor(self): extractor = extractWordPosition.Extractor() self.assertEqual(extractor.title, None) self.assertEqual(extractor.manuscript_file, None) self.assertEqual(extractor.xml_dir, 'xml/') self.assertEqual(extractor.manuscript_tree, None) def test_write_title_to_manuscript_file(self): extractor = extractWordPosition.Extractor(xml_dir=self.test_dir, title=self.title) self.assertEqual(isfile(extractor.manuscript_file), True) extractor = extractWordPosition.Extractor(manuscript_file=extractor.manuscript_file) self.assertEqual(extractor.title, self.title) def test_extract_line_numbers(self): svg_tree = ET.parse(self.test_file) tf = TranskriptionField(self.test_file) extractor = extractWordPosition.Extractor() line_numbers = extractor.extract_line_numbers(svg_tree, tf) self.assertEqual(line_numbers[0].id, 2) self.assertEqual(len(line_numbers), 24) self.assertEqual(line_numbers[0].top, 45.163) def tearDown(self): isdir(self.test_dir) and shutil.rmtree(self.test_dir) isfile('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) and os.remove('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) if __name__ == "__main__": unittest.main() Index: svgscripts/extractWordPosition.py =================================================================== --- svgscripts/extractWordPosition.py (revision 27) +++ svgscripts/extractWordPosition.py (revision 28) @@ -1,560 +1,562 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract the position of the words in a svg file and write them to a xml file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import inspect import getopt from lxml import etree as ET from os import sep, listdir, mkdir, path from os.path import exists, isfile, isdir from progress.bar import Bar import re import sys from svgpathtools import svg2paths2 import warnings from myxmlwriter import write_pretty from datatypes.lineNumber import LineNumber from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.pdf import PDFText from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word from datatypes.word_insertion_mark import WordInsertionMark __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class Extractor: """ This class can be used to extract the word positions in a svg file and write it to a xml file. Args: [xml_dir (str): target directory] [title (str): title of document] [manuscript_file (str): xml file containing information about the archival unity to which the current page belongs [extract_transkription_field_only (Boolean): if true extract_word_position will extract word positions only that are part of the transkription field. """ + UNITTESTING = False SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ] def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False, compare2pdf=False): if bool(xml_dir): self.xml_dir = xml_dir not isdir(self.xml_dir) and mkdir(self.xml_dir) else: self.xml_dir = 'xml' if(isdir('xml')) else '' self.compare2pdf = compare2pdf self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else '' self.title = title self.manuscript_file = manuscript_file self.extract_transkription_field_only = extract_transkription_field_only self.manuscript_tree = None if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file): self.manuscript_tree = ET.parse(self.manuscript_file) self.title = self.manuscript_tree.getroot().get('title') elif bool(self.manuscript_file): raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file)) elif bool(self.title): if not bool(self.manuscript_file): self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml' if not isfile(self.manuscript_file): self.manuscript_tree = ET.ElementTree(ET.Element('page', attrib={"title": self.title})) write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile') def get_page_number(self, file_name, page_number=None): """ Returns page number as a string (with leading zero(s) if len(page_number) < 3). """ if not bool(page_number) and bool(re.search(r'\d', file_name)): """if page_number=None and filename contains digits, then split filename into its parts that contain only digits, remove empty strings and return the last part containing only digits. """ page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop() if bool(page_number): leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else '' return leading_zeros + str(page_number) else: return '' def get_file_name(self, file_name, page_number=None): """Returns the file_name of the target xml file. """ dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else '' if bool(self.title): return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml' else: return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml')) def get_style(self, etree_root): """Returns the style specification as a dictionary. :returns: sonderzeichen_list: list of keys for classes that are 'Sonderzeichen' style_dict: dictionary: key = class name (str), value = style specification (dictionary) """ style_dict = {} sonderzeichen_list = [] letterspacing_list = [] style = etree_root.find('style', etree_root.nsmap) if style is not None: for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))): style_key = style_item.split('{')[0].replace('.', '') style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','') \ for item in list(filter(lambda x: x!= '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';')))} style_dict[style_key] = style_value_dict if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'): sonderzeichen_list.append(style_key) if bool(style_value_dict.get('letter-spacing')): letterspacing_list.append(style_key) return sonderzeichen_list, letterspacing_list, style_dict def get_word_from_part_obj(self, word_part_obj): """Extracts all 'text' from a list of dicitonaries and concats it to a string. """ return ''.join([ dict['text'] for dict in word_part_obj]) def find_inserted_words_by_position(self, target_tree, x, y): """Returns an Array with the words that are inserted above the x, y position or [] if not found. """ warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.') MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 if(len(target_tree.getroot().xpath('//word[@id]')) > 0): result_list = [] minus2left = 20.0 minus2top = 19.0 while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX : result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ] minus2left -= 1 minus2top += 1 if len(result_list) > 0: result_bottom = result_list[len(result_list)-1].bottom result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)): result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def find_inserted_words(self, target_tree, word_insertion_mark): """Returns an Array with the words that are inserted above/underneath the word_insertion_mark. """ warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.') if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1: return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y) if(len(target_tree.getroot().xpath('//word[@id]')) > 0): MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 result_list = [] x = word_insertion_mark.x y = word_insertion_mark.y if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line line_number = word_insertion_mark.line_number - 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: minus2top = 1.0 while len(result_list) == 0 and minus2top < MINY: for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y - minus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break minus2top += 1 elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line line_number = word_insertion_mark.line_number + 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: plus2top = 1.0 while len(result_list) == 0 and plus2top < MINY : for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y + plus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break plus2top += 1 if len(result_list) > 0: # now, collect more words that are right of already collected words result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width for item in target_tree.getroot().xpath(\ '//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)): result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None): """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word). If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created. :returns: the new word counter (int) """ break_points = [] if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points for Sonderzeichen in self.SONDERZEICHEN_LIST: contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ] if True in contains_Sonderzeichen: break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]] for sz_point in [i for i, e in break_points]: wim_index = len(page.word_insertion_marks) x = float(word_part_objs[sz_point]['x']) y = float(word_part_objs[sz_point]['y']) if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None: svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } xmin = transkription_field.xmin ymin = transkription_field.ymin wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\ line_number=page.get_line_number(y-1), mark_type=Sonderzeichen) page.word_insertion_marks.append(wim) if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points THRESHOLDX = 20 # Threshold between line number and text last_x = -1 for i, x in enumerate([float(dict['x']) for dict in word_part_objs]): if(last_x > -1 and (x - last_x > THRESHOLDX)): break_points.append((i, i)) last_x = x if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words from_index = 0 for end_point, next_from_index in break_points: new_word_part_objs = word_part_objs[from_index:end_point] new_endX = word_part_objs[end_point]['x'] from_index = next_from_index index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) if from_index > 0 and from_index < len(word_part_objs): new_word_part_objs = word_part_objs[from_index:] index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) return index else: if len(word_part_objs) > 0: transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\ debug_msg_string=debug_msg, transkription_field=transkription_field) text = self.get_word_from_part_obj(word_part_objs) line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2) newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions) #newWord = Word.CREATE_WORD(page=page, word_part_objs=word_part_objs, id=index, endX=endX, endSign=endSign, matrix=matrix, debug_msg=debug_msg) #newWord.attach_word_to_tree(page.page_tree) -> now we attach all words with update_and_attach_words2tree() page.words.append(newWord) return int(index) + 1 else: return int(index) def get_bottoms(self, tree_root, from_position=-1.0, to_position=-1.0, transkription_field=None): """Returns all unique bottom values (Float) as a sorted list. """ bottom_list = sorted(set(item.get('transform').split(' ')[5].replace(')','') for item in tree_root.findall(".//text", tree_root.nsmap)), key=float) if transkription_field is not None: from_position = transkription_field.ymin to_position = transkription_field.ymax if (from_position > 0.0 and to_position > 0.0): return [ item for item in filter(lambda x: float(x) > from_position and float(x) < to_position, bottom_list) ] else: return bottom_list def get_text_items(self, tree_root, transkription_field=None): """Returns all text elements with a matrix or (if transkription_field is specified) all text elements that are located inside the transkription field. """ if transkription_field is not None: return filter(lambda x: Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\ tree_root.iterfind(".//text", tree_root.nsmap)) else: return tree_root.iterfind(".//text", tree_root.nsmap) def extract_line_numbers(self, svg_tree, transkription_field): """Extracts line numbers and write them to a xml file. """ nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] line_numbers = [ LineNumber(raw_text_node=item, transkription_field=transkription_field)\ for item in filter(lambda x: LineNumber.IS_A_LINE_NUMBER(x), nodes_near_tf)] if len(line_numbers) > 0: MINABOVE = 3 last_to_position = transkription_field.ymin for line_number in line_numbers: above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom) last_to_position = above_current_line_bottom if len(bottoms) > 0: current_line_top = float(bottoms[len(bottoms)-1]) - transkription_field.ymin + MINABOVE line_number.setTop(current_line_top) return line_numbers def get_word_object_multi_char_x(self, word_part_obj_dict): """Returns the x of the last char of word_part_object. TODO: get real widths from svg_file!!! """ WIDTHFACTOR = 2.6 return word_part_obj_dict['x'] if len(word_part_obj_dict['text']) < 2 else word_part_obj_dict['x'] + len(word_part_obj_dict['text']) * WIDTHFACTOR def extract_word_position(self, svg_tree, page, transkription_field=None): """Extracts word positions. """ counter = 0 word_part_obj = [] endSign = '%' last_matrix = None MAXBOTTOMDIFF = 5 MAXXDIFF = 6 - bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)])) + if not Extractor.UNITTESTING: + bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)])) for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field): current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field) # check for line breaks if (last_matrix is not None and len(word_part_obj) > 0 and (\ Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\ (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\ (abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\ or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()): endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\ round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\ str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix))) counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field) word_part_obj = [] endX = current_matrix.getX() if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))): word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} ) else: endSign = text_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT endX = current_matrix.add2X(tspan_item.get('x')) if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))): y = current_matrix.add2Y(tspan_item.get('y')) word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix }) if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0: """text_item has letterspacing class (set s & set t = new set with elements common to s and t) """ endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='tspan with letterspacing', transkription_field=transkription_field) word_part_obj = [] else: endSign = tspan_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='svg/text/tspan/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' last_matrix = current_matrix - bar.next() + not bool(Extractor.UNITTESTING) and bar.next() if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\ transkription_field=transkription_field) word_part_obj = [] endSign = '%' - print() + not bool(Extractor.UNITTESTING) and bar.finish() def update_and_attach_words2tree(self, page): """Update word ids and attach them to page.page_tree. """ for node in page.page_tree.xpath('//word'): node.getparent().remove(node) for index, word in enumerate(page.words): word.id = index word.attach_word_to_tree(page.page_tree) def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None): """Extracts information about positions of text elements. [:returns:] (datatypes.page) the Page containing all information. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file transkription_field = TranskriptionField(file_name) if bool(self.extract_transkription_field_only) else None svg_tree = ET.parse(file_name) page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\ svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only) sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot()) page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict) if transkription_field is not None: page.init_line_numbers(self.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax) self.extract_word_position(svg_tree, page, transkription_field=transkription_field) if page.pdfFile is not None and isfile(page.pdfFile): pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST) pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field, split_wrongly_concatenated_words=self.compare2pdf) page.create_writing_processes_and_attach2tree() self.update_and_attach_words2tree(page) for word_insertion_mark in page.word_insertion_marks: # it is not clear if we really need to know this alternative word ordering. See 'TODO.md' #word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark) word_insertion_mark.attach_object_to_tree(page.page_tree) return page else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None): """Extracts information about positions of text elements and writes them to a xml file. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile) write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition') return 0 else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract the position of the words in a svg file and write them to a xml file. svgscripts/extractWordPosition.py [OPTIONS] svg file OR xml target file containing file name of svg file as "/page/@source". directory containing svg files OPTIONS: -h|--help: show help -c|--compare-to-pdf compare words to pdf and autocorrect -d|--xml-dir=xmlDir: target directory for the xml output file(s) -m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s) -o|--only-transkription-field: extract only words that are part of the transkription field. -p|--page=pageNumber: page number of the current page. For use with _one_ file only. -P|--PDF=pdfFile: pdf file - used for word correction -s|--svg=svgFile: svg web file -t|--title=title: title of the manuscript to which the current page(s) belong(s) -x|--xml-target-file=xmlOutputFile: xml target file :return: exit code (int) """ compare2pdf = False extract_transkription_field_only = True manuscript_file = None page_number = None pdfFile = None svg_file = None title = None xml_target_file = None xml_dir = ".{}xml".format(sep) try: opts, args = getopt.getopt(argv, "hocd:m:t:p:s:x:P:", ["help", "only-transkription-field", "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 elif opt in ('-c', '--compare-to-pdf'): compare2pdf = True elif opt in ('-o', '--only-transkription-field'): extract_transkription_field_only = True elif opt in ('-d', '--xml-dir'): xml_dir = arg elif opt in ('-m', '--manuscript-file'): manuscript_file = arg elif opt in ('-t', '--title'): title = arg elif opt in ('-p', '--page'): page_number = str(arg) elif opt in ('-s', '--svg'): svg_file = arg elif opt in ('-P', '--PDF'): pdfFile = arg elif opt in ('-x', '--xml-target-file'): xml_target_file = str(arg) files_to_process = list() for arg in args: if isfile(arg): files_to_process.append(arg) elif isdir(arg): files_to_process = files_to_process + list(filter(lambda file: '.svg' in file, listdir(arg))) else: print("'{}' does not exist!".format(arg)) return 2 if len(files_to_process) < 1 or args[0].endswith('xml'): if xml_target_file is None: xml_target_file = args[0] if len(args) > 0 else None if xml_target_file is not None and isfile(xml_target_file): target_file_tree = ET.parse(xml_target_file) file_name = target_file_tree.getroot().get('source') title = target_file_tree.getroot().get('title') if title is None else title page_number = target_file_tree.getroot().get('number') if page_number is None else page_number extract_transkription_field_only = (target_file_tree.getroot().get('transkription-field-only') == 'true')\ if target_file_tree.getroot().get('transkription-field-only') is not None else False if svg_file is None: svg_file = target_file_tree.xpath('.//svg/@file')[0]\ if len(target_file_tree.xpath('.//svg/@file')) > 0 else None files_to_process.insert(0, file_name) if xml_target_file in files_to_process: files_to_process.remove(xml_target_file) else: usage() return 2 if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)): print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!") usage() return 2 extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, extract_transkription_field_only=extract_transkription_field_only, compare2pdf=compare2pdf) for file in files_to_process: extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/test_word_position.py =================================================================== --- svgscripts/test_word_position.py (revision 27) +++ svgscripts/test_word_position.py (revision 28) @@ -1,34 +1,37 @@ import unittest from os import sep, path import lxml.etree as ET from datatypes.matrix import Matrix from datatypes.word_position import WordPosition class TestWordPosition(unittest.TestCase): def test_init(self): word_position = WordPosition(id=1, height=10, width=10, x=0, y=10) self.assertEqual(word_position.tag, WordPosition.TRANSKRIPTION) self.assertEqual(word_position.id, '1') self.assertEqual(word_position.height, 10) self.assertEqual(word_position.top, 10) self.assertEqual(word_position.bottom, 20) self.assertEqual(word_position.left, 0) self.assertEqual(word_position.isOnTranskription(), True) self.assertEqual(word_position.isOnFaksimile(), False) def test_attach_object_to_tree(self): matrix = Matrix('matrix(0 0 0 0 0 0)') word_position = WordPosition(id=1, height=10, width=10, x=0, y=10, matrix=matrix, tag=WordPosition.FAKSIMILE) empty_tree = ET.ElementTree(ET.Element('page')) word_position.attach_object_to_tree(empty_tree) for node in empty_tree.getroot().xpath('//' + word_position.tag): self.assertEqual(node.get('id'), '1') self.assertEqual(node.get('bottom'), '20') self.assertEqual(node.get('transform'), matrix.toString()) self.assertEqual(node.get('writing-process-id'), '-1') word_position = WordPosition(node=empty_tree.getroot().find('.//' + word_position.tag)) + def test_get_semantic_dictionary(self): + dictionary = WordPosition.get_semantic_dictionary() + self.assertEqual(WordPosition.XML_TAG in dictionary['properties'].get('writing_process_id').get('xpath'), True) if __name__ == "__main__": unittest.main() Index: svgscripts/datatypes/transkription_position.py =================================================================== --- svgscripts/datatypes/transkription_position.py (revision 27) +++ svgscripts/datatypes/transkription_position.py (revision 28) @@ -1,155 +1,155 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a transkription word position. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from .class_spec import SemanticClass from .debug_message import DebugMessage from .positional_word_part import PositionalWordPart from .word_position import WordPosition from .matrix import Matrix class TranskriptionPosition(WordPosition): """ This class represents a transkription word position. Args: id (int): word id matrix (datatypes.Matrix): matrix containing information about transformation. height (float): height of word width (float): width of word x (float): x position of word y (float): y position of word positional_word_parts a list of (datatypes.positional_word_part) PositionalWordPart debug_message a (datatypes.debug_message) DebugMessage """ ADD2X = 0.15 ADD2TOP = 1.0 ADD2BOTTOM = 0.2 HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height XML_TAG = WordPosition.TRANSKRIPTION def __init__(self, id=0, node=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, positional_word_parts=[], debug_message=None): super(TranskriptionPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) self.positional_word_parts = positional_word_parts self.debug_message = debug_message if node is not None: self.debug_message = DebugMessage(node=node.xpath('.//' + DebugMessage.XML_TAG)[0])\ if len(node.xpath('.//' + DebugMessage.XML_TAG)) > 0 else None self.positional_word_parts = [ PositionalWordPart(node=pwp_node) for pwp_node in node.xpath('.//' + PositionalWordPart.XML_TAG) ] self.attachable_objects += self.positional_word_parts if self.debug_message is not None: self.attachable_objects.append(self.debug_message) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ - dictionary = super(cls,cls).get_semantic_dictionary() - dictionary['properties'].update({'positional_word_parts': (PositionalWordPart, SemanticClass.LIST, '{}/@id'.format(WordPosition.TRANSKRIPTION))}) + dictionary = super(TranskriptionPosition,cls).get_semantic_dictionary() + dictionary['properties'].update({'positional_word_parts': (PositionalWordPart, SemanticClass.LIST, '{}/@id'.format(cls.XML_TAG))}) return dictionary @staticmethod def CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, positional_word_parts, debug_message=None, debug_msg_string=None, transkription_position_id=0): """Creates a list of TranskriptionPosition from a list of (datatypes.positional_word_part) PositionalWordPart. [:return:] a list of (datatypes.transkription_position) TranskriptionPosition """ TOPCORRECTION = 1 debug_message = DebugMessage(message=debug_msg_string)\ if debug_msg_string is not None else debug_message transkription_positions = [] if len(positional_word_parts) < 1: return [] matrix = positional_word_parts[0].transform index = 0 matrices_differ = False style_class = positional_word_parts[0].style_class styles_differ = False while index < len(positional_word_parts) and not matrices_differ and not styles_differ: if Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix, positional_word_parts[index].transform): matrices_differ = True elif style_class != positional_word_parts[index].style_class: styles_differ = True else: index += 1 if (matrices_differ or styles_differ) and index < len(positional_word_parts): debug_msg_string = 'matrices differ' if matrices_differ else 'styles differ' transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page,\ positional_word_parts[index:], debug_msg_string=debug_msg_string, transkription_position_id=transkription_position_id+1) positional_word_parts = positional_word_parts[:index] if page.get_line_number((positional_word_parts[0].top + positional_word_parts[0].bottom)/2) % 2 == 0: all_styles = [] for pwp in positional_word_parts: all_styles += pwp.style_class.split(' ') biggest_font_size = page.get_biggest_fontSize4styles(style_set=set(all_styles)) height = round(biggest_font_size * TranskriptionPosition.HEIGHT_FACTOR + TranskriptionPosition.HEIGHT_FACTOR / biggest_font_size, 3) TOPCORRECTION = 2 + TranskriptionPosition.HEIGHT_FACTOR / biggest_font_size else: # take greatest value for height height = [ pwp.height for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.height, reverse=True)][0] + 2*TOPCORRECTION x = positional_word_parts[0].left - TranskriptionPosition.ADD2X y = [ pwp.top for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.top)][0] - TOPCORRECTION width = positional_word_parts[len(positional_word_parts)-1].left - x\ + positional_word_parts[len(positional_word_parts)-1].width + TranskriptionPosition.ADD2X for pwp_index, pwp in enumerate(positional_word_parts): pwp.id = pwp_index transkription_positions.insert(0, TranskriptionPosition(id=transkription_position_id, height=height, width=width, x=x, y=y, matrix=matrix,\ positional_word_parts=positional_word_parts, debug_message=debug_message)) return transkription_positions @staticmethod def CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=None, debug_msg_string=None, transkription_field=None): """Creates a list of TranskriptionPosition from word_part_objs (i.e. a list of dictionaries with the keys: text, x, y, matrix, class). [:return:] a list of (datatypes.transkription_position) TranskriptionPosition """ positional_word_parts = [] debug_message = DebugMessage(message=debug_msg_string)\ if debug_msg_string is not None else None if page.svg_file is not None and isfile(page.svg_file): svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } xmin = 0.0 ymin = 0.0 if transkription_field is not None: xmin = transkription_field.xmin ymin = transkription_field.ymin for part_obj in word_part_objs: positional_word_parts += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(\ part_obj, svg_path_tree, namespaces, page, start_id=len(positional_word_parts),\ xmin=xmin, ymin=ymin) else: positional_word_parts = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) if len(positional_word_parts) > 0: return TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, positional_word_parts, debug_message=debug_message) else: return [ TranskriptionPosition(matrix=matrix, debug_message=debug_message) ] Index: svgscripts/datatypes/word.py =================================================================== --- svgscripts/datatypes/word.py (revision 27) +++ svgscripts/datatypes/word.py (revision 28) @@ -1,214 +1,219 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import warnings from .class_spec import SemanticClass +from .lineNumber import LineNumber from .matrix import Matrix from .word_position import WordPosition from .transkription_position import TranskriptionPosition class Word(SemanticClass): """ This class represents a word. """ DATA = 'debug-data' def __init__(self, id=0, text='', line_number=-1, transkription_positions=[], faksimile_positions=[], word_part_objs=[]): self.id = id self.text = text self.line_number = line_number self.transkription_positions = transkription_positions self.faksimile_positions = faksimile_positions self.word_part_objs = word_part_objs self.is_head_of_inserted_words = False self.is_tail_of_inserted_words = False self.is_before_inserted_words = False self.is_after_inserted_words = False self.word_insertion_mark = None self.debug_msg = None @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() - properties = {'id': (int, 1, 'word/@id'), 'text': (str, 1, 'word/@text'), 'line_number': (int, 1, 'word/@line-number'),\ + properties = {'id': (int, 1, 'word/@id'), 'text': (str, 1, 'word/@text'),\ + 'line_number': {'class': LineNumber, 'cardinality': 1,\ + 'name': 'wordHasLineNumber', 'xpath': 'word/@line-number',\ + 'label': 'word has a line number',\ + 'comment': 'Relating a word to a line number it has.'},\ 'transkription_positions': (TranskriptionPosition, SemanticClass.LIST, 'word/@id'),\ 'faksimile_positions': (WordPosition, SemanticClass.LIST, 'word/@id')} dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. """ word_node = target_tree.getroot().xpath('//word[@id="%s"]' % self.id)[0] \ if(len(target_tree.getroot().xpath('//word[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree.getroot(), 'word', attrib={'id': str(self.id)}) word_node.set('text', self.text) if self.line_number > -1: word_node.set('line-number', str(self.line_number)) for transkription_position in self.transkription_positions: transkription_position.attach_object_to_tree(word_node) """ data_node = word_node.find(self.DATA) if bool(word_node.find(self.DATA)) else ET.SubElement(word_node, self.DATA) for part_index, word_part in enumerate(self.word_part_objs): part_node = data_node.xpath('./part[@index="%s"]' % part_index)[0] \ if(len(data_node.xpath('./part[@index="%s"]' % part_index)) > 0) \ else ET.SubElement(data_node, 'part', attrib={'index': str(part_index)}) part_node.set('text', word_part['text']) part_node.set('class', word_part['class']) part_node.set('x', str(round(float(word_part['x']), 3))) part_node.set('y', str(round(float(word_part['y']), 3))) if self.debug_msg is not None: ET.SubElement(data_node, 'end', attrib={'debug-msg': self.debug_msg}) """ def split(self, page, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. """ previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, next_pwps, debug_msg_string='word.split') next_text = ''.join([ pwp.text for pwp in next_pwps ]) nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) return previousWord, currentWord, nextWord def join(self, other_word, append_at_end_of_new_word=True): """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. """ if append_at_end_of_new_word: self.text = self.text + other_word.text for position in other_word.transkription_positions: position.id = str(len(self.transkription_positions)) self.transkription_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. [:return:] Word """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number text = word_node.get('text') transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ] word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\ if len(word_node.findall('.//' + Word.DATA)) > 0\ else [ item.attrib for item in word_node.findall('.//part')] return Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions, word_part_objs=word_part_objs) elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file WIDTH = 5 TOPCORRECTION = 2.0 FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize height = height x = round(float(word_part_objs[0]['x']), 3) if(page is not None and bool(page.style_dict)): HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' ')) biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set) height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3) TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size if endSign is not None and '%' in endSign: lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\ for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\ if bool(page.style_dict[key].get('font-size'))] lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1 endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR elif endSign is not None and '%' in endSign: endX = float(endX) + WIDTH bottom = round(float(word_part_objs[0]['y']), 3) y = round(bottom - height + TOPCORRECTION, 3) width = round(float(endX) - x, 3) transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ] text = ''.join([ dict['text'] for dict in word_part_objs]) line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs) word.debug_msg = debug_msg return word else: error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty' raise Exception('Error: {}'.format(error_msg)) Index: svgscripts/datatypes/word_insertion_mark.py =================================================================== --- svgscripts/datatypes/word_insertion_mark.py (revision 27) +++ svgscripts/datatypes/word_insertion_mark.py (revision 28) @@ -1,130 +1,135 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word insertion mark. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from svgpathtools.parser import parse_path import warnings +from .lineNumber import LineNumber from .positional_object import PositionalObject from .word import Word class WordInsertionMark(PositionalObject): """ This class represents a word insertion mark. Args: wim_node (etree.Element): element that contains information about a word_insertion_mark. OR id (int): word id x (float) y (float) height (float) width (float) previous_word_id (int): id of the word to which word insertion mark is attached inserted_words: Array->Word of inserted words marked by the word insertion mark. """ XML_TAG = 'word-insertion-mark' extraStringKeys = [ 'mark_type', 'symbol_id' ] def __init__(self, wim_node=None, id=0, x=-1.0, y=-1.0, height=0, width=0, previous_word_id=-1, next_word_id=-1, line_number=-1, symbol_id=None, inserted_words=[], inserted_word_id=-1, mark_type='A'): super(WordInsertionMark, self).__init__(id=id, node=wim_node, height=height, width=width, x=x, y=y, tag=WordInsertionMark.XML_TAG) self.stringKeys += [ 'mark_type', 'symbol_id' ] self.intKeys += [ 'line_number', 'next_word_id', 'previous_word_id' ] self.symbol_id = symbol_id self.mark_type = mark_type self.line_number = line_number self.previous_word_id = previous_word_id self.next_word_id = next_word_id if wim_node is not None: self.mark_type = wim_node.get('mark-type') self.line_number = int(wim_node.get('line-number')) if bool(wim_node.get('line-number')) else -1 self.previous_word_id = int(wim_node.get('previous-word-id')) if bool(wim_node.get('previous-word-id')) else -1 self.next_word_id = int(wim_node.get('next-word-id')) if bool(wim_node.get('next-word-id')) else -1 def init_inserted_words(self, inserted_words=[], wim_node=None, inserted_word_id_string=None): if wim_node is not None and inserted_word_id_string is not None: ids = inserted_word_id_string.split(' ') inserted_words = [ Word.CREATE_WORD(word_node=word_node) for word_node in wim_node.getroottree().getroot().xpath('//word[@id>="{0}" and @id<="{1}"]'.format(ids[0], ids[len(ids)-1])) ] if len(inserted_words) > 0: inserted_words[0].is_head_of_inserted_words = True inserted_words[len(inserted_words)-1].is_tail_of_inserted_words = True for word in inserted_words: word.set_word_insertion_mark(self) return inserted_words def attach_and_update_word_if_involved(self, word): if word.id == self.previous_word_id: word.is_before_inserted_words = True word.word_insertion_mark = self elif word.id == self.next_word_id: word.is_after_inserted_words = True word.word_insertion_mark = self elif word.id in [ inserted.id for inserted in self.inserted_words ]: word = [ inserted for inserted in self.inserted_words if inserted.id == word.id ][0] return word @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = super(cls,cls).get_semantic_dictionary() - dictionary['properties'].update({ 'line_number': (int, 1, '{}/@{}'.format(cls.XML_TAG, 'line_number'.replace('_','-')))}) - dictionary['properties'].update({ 'previous_word_id': (Word, 0, '{}/@{}'.format(cls.XML_TAG, 'previous_word_id'.replace('_','-')))}) - dictionary['properties'].update({ 'next_word_id': (Word, 0, '{}/@{}'.format(cls.XML_TAG, 'next_word_id'.replace('_','-')))}) + word_dicts = { key: { 'class': Word, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality',\ + 'label': 'has a {} word'.format(key.replace('_word_id','')),\ + 'name': 'has{}'.format(key.title().replace('_Id','').replace('_','')),\ + 'xpath': '{}/@{}'.format(cls.XML_TAG, key.replace('_','-')) } for key in [ 'previous_word_id', 'next_word_id' ] } + dictionary['properties'].update(word_dicts) + dictionary['properties'].update({'line_number': {'class': LineNumber, 'cardinality': 1, 'xpath': '{}/@line-number'.format(cls.XML_TAG),\ + 'name': 'wordInsertionMarkHasLineNumber', 'label': 'word insertion mark has a line number'}}) dictionary['properties'].update(dict(zip(cls.extraStringKeys, [ (str, 1, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in cls.extraStringKeys]))) return dictionary @staticmethod def CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=0, x=0.0, y=0.0, xmin=0.0, ymin=0.0, line_number=-1, mark_type='A'): """Creates a (datatypes.word_insertion_mark) WordInsertionMark using a (lxml.ElementTree) svg_path_tree and the corresponding namespaces. """ THRESHOLD = 0.4 svg_x = x + xmin svg_y = y + ymin use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) height = 0.0 width = 0.0 if len(d_strings) > 0: path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin height = ymax - ymin return WordInsertionMark(id=id, x=x, y=y-height, height=height, width=width, line_number=line_number,\ mark_type=mark_type, symbol_id=symbol_id) else: warnings.warn('No glyph_id found for word insertion mark {} on line {}'.format(id, line_number)) return WordInsertionMark(id=id, x=x, y=y, line_number=line_number, mark_type=mark_type) Index: svgscripts/datatypes/word_position.py =================================================================== --- svgscripts/datatypes/word_position.py (revision 27) +++ svgscripts/datatypes/word_position.py (revision 28) @@ -1,70 +1,82 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word position. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from .matrix import Matrix from .positional_object import PositionalObject +from .writing_process import WritingProcess class WordPosition(PositionalObject): """ This class represents a word position. Args: id (int): word id matrix (datatypes.Matrix): matrix containing information about conversion. height (float): height of word width (float): width of word x (float): x position of word y (float): y position of word tag (str) location of the word position: 'WordPosition.TRANSKRIPTION' (default) or 'WordPosition.FAKSIMILE' """ TRANSKRIPTION = 'transkription-position' FAKSIMILE = 'faksimile-position' XML_TAG = 'faksimile-position' def __init__(self, id=0, node=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, tag=TRANSKRIPTION): super(WordPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=tag) self.intKeys.append('writing_process_id') self.writing_process_id = -1 if node is not None: self.writing_process_id = int(node.get('writing-process-id'))\ if bool(node.get('writing-process-id')) else -1 + @classmethod + def get_semantic_dictionary(cls): + """ Creates a semantic dictionary as specified by SemanticClass. + """ + dictionary = super(WordPosition,cls).get_semantic_dictionary() + dictionary['properties'].update({'writing_process_id':\ + { 'class': WritingProcess, 'cardinality': 1, 'cardinality_restriction': 'cardinality',\ + 'name': '{}BelongsTo{}'.format(WordPosition.__name__, WritingProcess.__name__),\ + 'label': "connects a {} with a stage in Nietzsche's process of writing".format(WordPosition.__name__),\ + 'xpath': '{}/@writing-process-id'.format(cls.XML_TAG)}}) + return dictionary + def isOnTranskription(self): """Returns whether position is on transkription. """ return self.tag == self.TRANSKRIPTION def isOnFaksimile(self): """Returns whether position is on transkription. """ return self.tag == self.FAKSIMILE - # TODO: def join(self, ... Index: svgscripts/test_word.py =================================================================== --- svgscripts/test_word.py (revision 27) +++ svgscripts/test_word.py (revision 28) @@ -1,140 +1,142 @@ import unittest from os import sep, path import lxml.etree as ET from datatypes.matrix import Matrix from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word from datatypes.word_position import WordPosition class Page: def __init__(self): self.svg_file = None def get_line_number(self, input=0): return -1 def get_biggest_fontSize4styles(self, style_set={}): return 7 class TestWord(unittest.TestCase): def setUp(self): self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }] x = 0 for dict in self.word_part_objs: dict['class'] = 'st22' dict['x'] = x dict['y'] = 11 x += 1 mylist = {'text': 'abc', 'id': '0', 'line-number': '2' } word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)')) self.transkription_positions = [ word_position ] self.word_node = ET.Element('word', attrib=mylist) word_position.attach_object_to_tree(self.word_node) x = 0 for char in mylist['text']: ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' }) x += 1 def test_Word_with_word_part_objs(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) self.assertEqual(word.id, 0) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') def test_Word_with_word_node(self): word = Word.CREATE_WORD(word_node=self.word_node) self.assertEqual(word.id, 0) self.assertEqual(word.transkription_positions[0].bottom, 11) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 1) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') self.assertEqual(word.line_number, 2) self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True) def test_attach_word_to_tree(self): newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) empty_tree = ET.ElementTree(ET.Element('page')) newWord.attach_word_to_tree(empty_tree) for word_node in empty_tree.getroot().xpath('//word'): word = Word.CREATE_WORD(word_node=word_node) self.assertEqual(word.id, 0) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') def test_split(self): page = Page() pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split(page, 'b') self.assertEqual(previousWord.id, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(currentWord.id, 1) self.assertEqual(nextWord.id, 2) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split(page, 'bc') self.assertEqual(previousWord.id, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(currentWord.id, 1) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split(page, 'ab', start_id=10) self.assertEqual(currentWord.id, 10) self.assertEqual(currentWord.text, 'ab') self.assertEqual(currentWord.transkription_positions[0].width, 2.1) self.assertEqual(nextWord.id, 11) self.assertEqual(nextWord.transkription_positions[0].width, 5.2) word_part_objs=[{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\ {'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\ {'text': 'x', 'class':'st22', 'x': 10, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split(page, 'Insofer') word_part_objs=[{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split(page, 'Insofern') def test_join(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word) self.assertEqual(word.text, 'abc.') other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word, append_at_end_of_new_word=False) self.assertEqual(word.text, '.abc.') """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_get_semanticAndDataDict(self): word = Word.CREATE_WORD(word_node=self.word_node) empty_tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(empty_tree) - dict = Word.get_semantic_dictionary() - for key in dict['properties'].keys(): - cls, cardinality, xpath = dict['properties'].get(key) + dictionary = Word.get_semantic_dictionary() + for key in dictionary['properties'].keys(): + xpath = dictionary['properties'].get(key).get('xpath')\ + if type(dictionary['properties'].get(key)) is dict\ + else dictionary['properties'].get(key)[2] results = empty_tree.xpath(xpath) self.assertEqual(len(results), 1) #print('{}: {}'.format(key, results[0])) #self.assertEqual(word.get_data_dictionary()['body'].get('text'), 'abc') if __name__ == "__main__": unittest.main() Index: svgscripts/test_extractFaksimilePosition.py =================================================================== --- svgscripts/test_extractFaksimilePosition.py (revision 27) +++ svgscripts/test_extractFaksimilePosition.py (revision 28) @@ -1,35 +1,35 @@ import unittest import os from os import sep, path from os.path import isfile, isdir, dirname import shutil import tempfile import lxml.etree as ET import extractWordPosition from myxmlwriter import write_pretty from datatypes.transkriptionField import TranskriptionField from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.lineNumber import LineNumber from datatypes.word_insertion_mark import WordInsertionMark class TestExtractor(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' self.test_file_find_word = DATADIR + sep + 'test_find_word.xml' self.test_dir = tempfile.mkdtemp() self.title = 'ABC 111' self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)' self.test_file = DATADIR + sep + 'Mp_XIV_1_mytest_421.svg' def test_main(self): argv = ['-d', self.test_dir, '-o', '--title=My Hero', '--page=1', self.test_file] - self.assertEqual(extractWordPosition.main(argv), 0) + pass def tearDown(self): isdir(self.test_dir) and shutil.rmtree(self.test_dir) isfile('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) and os.remove('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) if __name__ == "__main__": unittest.main() Index: py2ttl/test_py2ttl.py =================================================================== --- py2ttl/test_py2ttl.py (revision 27) +++ py2ttl/test_py2ttl.py (revision 28) @@ -1,72 +1,79 @@ import unittest import lxml.etree as ET from os import sep, path, remove from os.path import isfile, dirname from rdflib import Graph, URIRef import sys import py2ttl from py2ttl import Py2TTLConverter from config import PROJECT_NAME, PROJECT_ONTOLOGY_FILE if dirname(dirname(__file__)) not in sys.path: sys.path.append(dirname(dirname(__file__))) from svgscripts.datatypes.word import Word +from svgscripts.datatypes.word_position import WordPosition class TestPy2TTL(unittest.TestCase): """This is the unittest for py2ttl.py2ttl. @label unittest """ def setUp(self): self.ttl_target = __file__ + 'test.ttl' def test_main(self): argv = ['-t', self.ttl_target ] self.assertEqual(py2ttl.main(argv), 0) def test_init(self): converter = Py2TTLConverter(project_ontology_file=PROJECT_ONTOLOGY_FILE) self.assertEqual(converter.project_name, PROJECT_NAME) def test_get_semantic_classes(self): converter = Py2TTLConverter() classes = converter.get_semantic_classes('svgscripts/datatypes') self.assertEqual('FaksimileImage' in [ cls.__name__ for cls in classes ], True) self.assertEqual('Image' in [ cls.__name__ for cls in classes ], True) self.assertEqual('SemanticClass' in [ cls.__name__ for cls in classes ], False) def test_createProperty(self): converter = Py2TTLConverter(project_ontology_file=PROJECT_ONTOLOGY_FILE) converter.createProperty(converter.base_uriref + "#Test", 'test', str, 1) name_uri = converter.base_uriref + '#hasTest' self.assertEqual((name_uri, None, None) in converter.project_graph, True) def test_createPropertyName(self): converter = Py2TTLConverter() name = converter.createPropertyName(property_name='test_asdf_asdf') self.assertEqual(name, 'hasTestAsdfAsdf') name = converter.createPropertyName(object_uri=converter.base_uriref + '#Asdf') self.assertEqual(name, 'hasAsdf') name = converter.createPropertyName(subject_uri=converter.base_uriref + '#Test',object_uri=converter.base_uriref + '#Asdf') self.assertEqual(name, 'testBelongsToAsdf') name = converter.createPropertyName(subject_uri=converter.base_uriref + '#Test') self.assertEqual(name, 'testBelongsTo') def test_get_comment_label(self): converter = Py2TTLConverter() comment, label = converter.get_comment_label(TestPy2TTL) self.assertEqual(label, 'unittest') self.assertEqual(comment, self.__doc__.split('\n')[0].lstrip()) + def test_get_builtin_cls_keys(self): + dictionary = WordPosition.get_semantic_dictionary() + converter = Py2TTLConverter() + builtin_cls_keys = converter._get_builtin_cls_keys(dictionary['properties']) + self.assertEqual('id' in builtin_cls_keys, True) + def test_get_semantic_dictionary_keys_super_first(self): dict = Word.get_semantic_dictionary() converter = Py2TTLConverter() - keys = converter.get_semantic_dictionary_keys_super_first(dict['properties']) + keys = converter._get_semantic_dictionary_keys_super_first(dict['properties']) self.assertEqual(keys.index('faksimile_positions') < keys.index('transkription_positions'), True) def tearDown(self): isfile(self.ttl_target) and remove(self.ttl_target) if __name__ == "__main__": unittest.main() Index: py2ttl/py2ttl.py =================================================================== --- py2ttl/py2ttl.py (revision 27) +++ py2ttl/py2ttl.py (revision 28) @@ -1,373 +1,385 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert py classes that are subclasses of DATATYPES_DIR.class_spec.SemanticClass to a owl ontology in turtle format. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import getopt import importlib import importlib.util import inspect import lxml.etree as ET from os import sep, path, listdir from os.path import isfile, isdir, dirname, basename from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD import re import sys if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL, SHARED_ONTOLOGIES_DIR from knora_base import KNORA_BASE __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class Py2TTLConverter: """This class can be used convert semantic_dictionaries to a owl ontology in turtle format. """ def __init__(self, project_ontology_file=None, create_super_cls_for_multi_property=True): self.list_value = -99 self.class_uri_dict = {} self.uri_xpath_mapping = {} self.create_super_cls_for_multi_property = create_super_cls_for_multi_property self.project_graph = Graph() self.base_uriref = URIRef(PROJECT_URL) self.project_name = PROJECT_NAME self.ns = { self.base_uriref + '#': self.project_name } if project_ontology_file is not None and isfile(project_ontology_file): self.project_graph.parse(project_ontology_file, format="turtle") if len(self.project_graph) > 0: self.base_uriref = self.project_graph.value(predicate=RDF.type, object=OWL.Ontology, any=False) self.ns = { uriref: ns for ns, uriref in self.project_graph.namespace_manager.namespaces() } self.project_name = self.ns.get(self.base_uriref + '#') self.project_graph.bind(self.project_name, self.base_uriref + '#') def get_semantic_classes(self, datatypes_dir): """Returns a list of all classes that are contained in datatypes_dir that are subclasses of DATATYPES_DIR.class_spec.SemanticClass. :return: a list of (str_name, class) """ base_dir = dirname(dirname(__file__)) sys.path.append(base_dir) root_modul_name = datatypes_dir.replace('/','.') reference_cls = importlib.import_module('{}.{}'.format(root_modul_name, 'class_spec')) try: self.list_value = reference_cls.LIST except AttributeError: pass files = [ file.replace('.py','') for file in listdir(datatypes_dir) if file.endswith('.py') and not file.startswith('test_') and not file.startswith('_')] all_modules = [] for name in files: all_modules.append(importlib.import_module('{}.{}'.format(root_modul_name, name))) all_classes = [] for modul in all_modules: all_classes += inspect.getmembers(modul, inspect.isclass) all_classes = sorted(set(all_classes)) semantic_classes = [ cls for name, cls in all_classes if issubclass(cls, reference_cls.SemanticClass) and not (cls == reference_cls.SemanticClass)] return semantic_classes def addRestriction2Class(self, cls_uri, property_uri, cardinality=0, comment="", label="", info_dict={}): """Adds restriction on property_uri to class cls_uri. """ if (cls_uri, None, None) not in self.project_graph: self.addClass(cls_uri, comment=comment, label=label) restriction = BNode() if 'cardinality_restriction' in info_dict.keys(): cardinality_restriction = URIRef(OWL + info_dict['cardinality_restriction']) else: cardinality_restriction = OWL.minCardinality if cardinality == 0 else OWL.cardinality self.project_graph.add((cls_uri, RDFS.subClassOf, restriction)) self.project_graph.add((restriction, RDF.type, OWL.Restriction)) self.project_graph.add((restriction, OWL.onProperty, property_uri)) self.project_graph.add((restriction, cardinality_restriction, Literal(str(cardinality), datatype=XSD.nonNegativeInteger))) def createPropertyName(self, property_name=None, subject_uri=None, object_uri=None, connector='BelongsTo', prefix='has'): """Returns a property name. """ if property_name is not None: property_name = ''.join([ property_name.split('_')[0].lower() ] + [ text.capitalize() for text in property_name.split('_')[1:] ]) return prefix + property_name[0].upper() + property_name[1:] if property_name[0].islower()\ else prefix + property_name elif subject_uri is not None: property_name = subject_uri.split('#')[1] + self.createPropertyName(object_uri=object_uri, prefix=connector) return property_name[0].lower() + property_name[1:] elif object_uri is not None: return prefix + object_uri.split('#')[1] else: return prefix def createSuperClassForSubjectClassConstraint(self, property_uri, sub_uri): """Creates a super class for classes that share a property. """ super_uri = URIRef(property_uri.replace('has', '') + 'Holder') self.project_graph.add((sub_uri, RDFS.subClassOf, super_uri)) self.project_graph.remove((sub_uri, RDFS.subClassOf, KNORA_BASE.Resource)) if (super_uri, RDF.type, OWL.Class) not in self.project_graph: label = 'holder of ' + property_uri.split('#')[1].replace('has', '') comment = 'super class for classes that have a ' + property_uri.split('#')[1].replace('has', '') self.addRestriction2Class(super_uri, property_uri, comment=comment, label=label) for object_uri in self.project_graph.objects(subject=property_uri, predicate=KNORA_BASE.subjectClassConstraint): self.project_graph.remove((property_uri, KNORA_BASE.subjectClassConstraint, object_uri)) self.project_graph.add((object_uri, RDFS.subClassOf, super_uri)) self.project_graph.remove((object_uri, RDFS.subClassOf, KNORA_BASE.Resource)) self.project_graph.add((property_uri, KNORA_BASE.subjectClassConstraint, super_uri)) objectClass = self.project_graph.value(subject=property_uri, predicate=KNORA_BASE.objectClassConstraint, any=False) comment = 'connects {} with {}'.format(super_uri.split('#')[1], objectClass.split('#')[1].replace('has', '')) self.project_graph.remove((property_uri, RDFS.comment, None)) self.project_graph.add((property_uri, RDFS.comment, Literal(comment, lang='en'))) def addProperty(self, property_uri, super_uri, subject_uri, object_uri, comment, label, cardinality, info_dict={}): """Add a property to self.project_graph. """ self.project_graph.add((property_uri, RDF.type, OWL.ObjectProperty)) self.project_graph.add((property_uri, RDFS.subPropertyOf, super_uri)) self.project_graph.add((property_uri, KNORA_BASE.objectClassConstraint, object_uri)) self.project_graph.add((property_uri, KNORA_BASE.subjectClassConstraint, subject_uri)) self.project_graph.add((property_uri, RDFS.comment, Literal(comment, lang='en'))) self.project_graph.add((property_uri, RDFS.label, Literal(label, lang='en'))) self.addRestriction2Class(subject_uri, property_uri, cardinality=cardinality, info_dict=info_dict) def createProperty(self, cls_uri, property_name, property_cls, cardinality, info_dict={}): """Creates a owl:ObjectProperty. :return: tuple of subject_uri (rdflib.URIRef) and property_uri (rdflib.URIRef) of created property """ inferredSubClass = RDFS.subClassOf * '*' name = self.createPropertyName(property_name=property_name)\ if 'name' not in info_dict.keys() else info_dict['name'] property_uri = URIRef(self.base_uriref + '#' + name) subject_uri = cls_uri label = 'has ' + name.replace('has','')\ if 'label' not in info_dict.keys() else info_dict['label'] super_uri = KNORA_BASE.hasValue if (property_uri, None, None) not in self.project_graph: if property_cls.__module__ == 'builtins': datatype_mapping = { float: KNORA_BASE.DecimalValue, int: KNORA_BASE.IntValue, str: KNORA_BASE.TextValue } object_uri = datatype_mapping.get(property_cls) if object_uri == KNORA_BASE.TextValue: if property_name == 'URL': object_uri = KNORA_BASE.UriValue elif property_name == 'file_name': object_uri = KNORA_BASE.FileValue else: object_uri = URIRef(self.base_uriref + '#' + property_cls.__name__) # if class X has a list of objects Y, we create a property YbelongsToX. if cardinality == self.list_value: subject_uri = object_uri object_uri = cls_uri result = self.project_graph.query(\ 'select ?p where {'\ + ' ?p <{0}> ?s .'.format(KNORA_BASE.subjectClassConstraint)\ + ' ?p <{0}> <{1}> .'.format(KNORA_BASE.objectClassConstraint, object_uri)\ + ' <{0}> ?s .'.format(subject_uri)\ + ' }') # if subject_uri is a subclass of a uri that is a subjectClassConstraint to a property_uri # that has object_uri as its objectClassConstraint, then we do not create a new property YbelongsToX, # instead we return subject_uri and this already existing property_uri. if len(result) > 0: return subject_uri, [ property_uri for property_uri in result ][0] name = self.createPropertyName(subject_uri=subject_uri, object_uri=object_uri) property_uri = URIRef(self.base_uriref + '#' + name) cardinality = 1 label = subject_uri.split('#')[1] + ' belongs to ' + object_uri.split('#')[1] super_uri = KNORA_BASE.hasLinkTo property_value_uri = URIRef(property_uri + 'Value') comment = 'Reification statement of relation between {} and {}'.format(subject_uri.split('#')[1], object_uri.split('#')[1]) reification_info_dict = {} if 'cardinality_restriction' in info_dict.keys(): reification_info_dict.update({'cardinality_restriction': info_dict['cardinality_restriction']}) self.addProperty(property_value_uri, KNORA_BASE.hasLinkToValue, subject_uri, KNORA_BASE.LinkValue,\ comment, label + ' - statement', cardinality, info_dict=reification_info_dict) - comment = 'connects {} with {}'.format(subject_uri.split('#')[1], object_uri.split('#')[1]) + comment = 'connects {} with {}'.format(subject_uri.split('#')[1], object_uri.split('#')[1])\ + if 'comment' not in info_dict.keys() else info_dict['comment'] self.addProperty(property_uri, super_uri, subject_uri, object_uri, comment, label, cardinality, info_dict=info_dict) elif not True in [\ (cls_uri, inferredSubClass, o) in self.project_graph\ for o in self.project_graph.objects(property_uri, KNORA_BASE.subjectClassConstraint)\ ]: # if cls_uri is NOT a subclass of a cls specified by KNORA_BASE.subjectClassConstraint self.addRestriction2Class(subject_uri, property_uri, cardinality=cardinality, info_dict=info_dict) if self.create_super_cls_for_multi_property: self.createSuperClassForSubjectClassConstraint(property_uri, subject_uri) else: # not sure if Knora accepts this, i.e. several subject_uris specified by KNORA_BASE.subjectClassConstraint. self.project_graph.add((property_uri, KNORA_BASE.subjectClassConstraint, subject_uri)) return subject_uri, property_uri def get_comment_label(self, cls): """Returns comment and label from cls __doc__. """ comment = cls.__doc__.replace('\n','').lstrip() label = cls.__name__ if '.' in cls.__doc__: comment = [ text for text in cls.__doc__.split('\n') if text != '' ][0].lstrip() if '@label' in cls.__doc__: m = re.search('(@label[:]*\s)(.*[\.]*)', cls.__doc__) label_tag, label = m.groups() elif re.search('([A-Z][a-z]+)', label): m = re.search('([A-Z]\w+)([A-Z]\w+)', label) label = ' '.join([ text.lower() for text in re.split(r'([A-Z][a-z]+)', label) if text != '' ]) return comment, label def addClass(self, cls_uri, comment='', label='', super_uri=KNORA_BASE.Resource): """Add a class to project_graph. """ if comment == '' and cls_uri.split('#')[1] in self.class_uri_dict: comment, label = self.get_comment_label(self.class_uri_dict.get(cls_uri.split('#')[1])) - elif comment == '': - label = cls_uri.split('#')[1].lower() - comment = label self.project_graph.add((cls_uri, RDF.type, OWL.Class)) - self.project_graph.add((cls_uri, RDFS.comment, Literal(comment, lang='en'))) - self.project_graph.add((cls_uri, RDFS.label, Literal(label, lang='en'))) + if comment != '': + self.project_graph.add((cls_uri, RDFS.comment, Literal(comment, lang='en'))) + if label != '': + self.project_graph.add((cls_uri, RDFS.label, Literal(label, lang='en'))) self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri)) - def get_semantic_dictionary_keys_super_first(self, property_dict): + def _get_builtin_cls_keys(self, property_dict): + """Returns a list of keys for classes that are builtin. + """ + builtin_cls_keys = [] + for key in property_dict.keys(): + property_cls = property_dict.get(key).get('class')\ + if type(property_dict.get(key)) is dict\ + else property_dict.get(key)[0] + if property_cls.__module__ == 'builtins': + builtin_cls_keys.append(key) + return builtin_cls_keys + + def _get_semantic_dictionary_keys_super_first(self, property_dict): """Sorts the keys of the property part of a semantic dictionary and returns the keys for super classes before keys of subclasses. :return: a sorted list of keys. """ - builtin_cls_keys = [ key for key in property_dict.keys()\ - if int in property_dict.get(key)\ - or str in property_dict.get(key)\ - or float in property_dict.get(key) ] + builtin_cls_keys = self._get_builtin_cls_keys(property_dict) complex_cls_keys = [] for key in [ key for key in property_dict.keys()\ - if int not in property_dict.get(key)\ - and str not in property_dict.get(key)\ - and float not in property_dict.get(key) ]: - current_cls = property_dict.get(key)[0] + if key not in builtin_cls_keys ]: + current_cls = property_dict.get(key).get('class')\ + if type(property_dict.get(key)) is dict\ + else property_dict.get(key)[0] key_inserted = False for index, cls_key in enumerate(complex_cls_keys): - if issubclass(property_dict.get(cls_key)[0], current_cls): + potential_sub_cls = property_dict.get(cls_key).get('class')\ + if type(property_dict.get(cls_key)) is dict\ + else property_dict.get(cls_key)[0] + if issubclass(potential_sub_cls, current_cls): complex_cls_keys.insert(index, key) key_inserted = True break if not key_inserted: complex_cls_keys.append(key) return builtin_cls_keys + complex_cls_keys def createClassAndProperties(self, cls): """Creates a owl:Class and some owl:ObjectProperty from semantic_dictionary of a python class. """ if not cls.__name__ in self.class_uri_dict: self.class_uri_dict.update({cls.__name__: cls}) semantic_dict = cls.get_semantic_dictionary() super_uri = KNORA_BASE.Resource if bool(semantic_dict['class'].get('type')): super_cls = semantic_dict['class'].get('type') self.createClassAndProperties(super_cls) super_uri = URIRef(self.base_uriref + '#' + super_cls.__name__) cls_uri = URIRef(self.base_uriref + '#' + cls.__name__) comment, label = self.get_comment_label(cls) self.addClass(cls_uri, comment, label, super_uri) #print('Mapping for {} <- {}'.format(cls_uri, cls)) - for property_key in self.get_semantic_dictionary_keys_super_first(semantic_dict['properties']): + for property_key in self._get_semantic_dictionary_keys_super_first(semantic_dict['properties']): try: if type(semantic_dict['properties'].get(property_key)) == dict: property_dict4key = semantic_dict['properties'].get(property_key) property_cls = property_dict4key.get('class') cardinality = property_dict4key.get('cardinality') info_dict = { key: value for key, value in property_dict4key.items() if key in\ [ 'cardinality_restriction', 'label', 'name', 'xpath' ]} subject_uri, property_uri = self.createProperty(cls_uri, property_key, property_cls, cardinality, info_dict=info_dict) else: property_cls, cardinality, xpath = semantic_dict['properties'].get(property_key) subject_uri, property_uri = self.createProperty(cls_uri, property_key, property_cls, cardinality) if not subject_uri in self.uri_xpath_mapping: self.uri_xpath_mapping.update({ subject_uri: {}}) self.uri_xpath_mapping.get(subject_uri).update({property_uri: xpath}) except ValueError: raise Exception('Class {} does not have a xpath spec in its get_semantic_dictionary()'.format(cls)) def convert_py2ttl(self, datatypes_dir, target_ontology_file): """Convert all classes contained in datatypes_dir that are subclasses of DATATYPES_DIR.class_spec.SemanticClass to rdf. :return: exit code (int) """ if isdir(datatypes_dir): semantic_classes = self.get_semantic_classes(datatypes_dir) for cls in semantic_classes: self.createClassAndProperties(cls) f = open(target_ontology_file, 'wb+') f.write(self.project_graph.serialize(format="turtle")) f.close() #print(self.uri_xpath_mapping.get(URIRef(self.base_uriref + '#TranskriptionPosition'))) else: print('Error: dir {} does not exist!'.format(datatypes_dir)) usage return 1 return 0 def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert py classes that are subclasses of .class_spec.SemanticClass to owl:Class. py2ttl/py2ttl.py [OPTIONS ] [optional] directory containing datatypes that are subclasses of .class_spec.SemanticClass. Overwrites DATATYPES_DIR in py2ttl/config.py. OPTIONS: -h|--help: show help -s|--source=source_ontology_file source ontology ttl file, option overwrites PROJECT_ONTOLOGY_FILE in py2ttl/config.py -t|--target=target_ontology_file target ontology ttl file, default: 'PROJECT_PREFIX-ontology_autogenerated.ttl' :return: exit code (int) """ check_config_files_exist() datatypes_dir = get_datatypes_dir() source_ontology_file = PROJECT_ONTOLOGY_FILE target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME) try: opts, args = getopt.getopt(argv, "hs:t:", ["help","source=", "target="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-t', '--target'): target_ontology_file = arg elif opt in ('-s', '--source'): source_ontology_file = arg converter = Py2TTLConverter(project_ontology_file=source_ontology_file) if len(args) < 1 and datatypes_dir is not None: return converter.convert_py2ttl(datatypes_dir, target_ontology_file) else: for datatypes_dir in args: if converter.convert_py2ttl(datatypes_dir, target_ontology_file) > 0: return 2 return 0 if len(args) > 1 else 2 if __name__ == "__main__": sys.exit(main(sys.argv[1:]))