Index: tests_svgscripts/ =================================================================== --- tests_svgscripts/ (revision 108) +++ tests_svgscripts/ (revision 109) @@ -1,505 +1,506 @@ import unittest from os import sep, path import lxml.etree as ET import sys sys.path.append('svgscripts') from process_words_post_merging import reset_page, update_writing_process_ids from import Box from datatypes.archival_manuscript import ArchivalManuscriptUnity from datatypes.matrix import Matrix import from datatypes.path import Path from datatypes.positional_word_part import PositionalWordPart from import Style from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word, execute_function_on_parts, update_transkription_position_ids, do_paths_intersect_saveMode from datatypes.word_deletion_path import WordDeletionPath from datatypes.word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass sys.path.append('shared_util') from main_util import extract_paths_on_tf class Page: def __init__(self): self.svg_file = None def get_line_number(self, input=0): return -1 def get_biggest_fontSize4styles(self, style_set={}): return 7 class TestWord(unittest.TestCase): TESTCASE = None def setUp(self): DATADIR = path.dirname(__file__) + sep + 'test_data' self.test_file = DATADIR + sep + 'N_VII_1_page009.xml' self.word_deletion_path_file = DATADIR + sep + 'N_VII_1_page138.xml' self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml' self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg' self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }] x = 0 for dict in self.word_part_objs: dict['class'] = 'st22' dict['x'] = x dict['y'] = 11 x += 1 mylist = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true' } word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)')) 
self.transkription_positions = [ word_position ] self.word_node = ET.Element('word', attrib=mylist) word_position.attach_object_to_tree(self.word_node) x = 0 word_path = Path.create_path_from_transkription_position(word_position, include_pwps=False) word_path.tag = WordDeletionPath.XML_TAG word_path.attach_object_to_tree(self.word_node) for char in mylist['text']: ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' }) x += 1 def test_add_deletion_paths(self): page = word = [ word for word in page.words if word.text == 'AufBau'][0] #self.assertTrue(word.deleted) self.assertTrue(len(word.word_parts) > 0) self.assertTrue(word.word_parts[0].deleted) word.add_deletion_paths(page.word_deletion_paths, tr_xmin=28.347656, tr_ymin=49.921875) self.assertTrue(len(word.word_parts[0].deletion_paths) > 0) #print(word.deletion_paths) page ='xml/Mp_XIV_page420.xml') words = [ word for word in page.words if word.deleted or True in [ part.deleted for part in word.word_parts ]] words[0].add_deletion_paths(extract_paths_on_tf(page)) word_path = Path.create_path_from_transkription_position(words[0].transkription_positions[0], include_pwps=False) #print( words[0].text, words[0].deletion_paths) def test_join_words(self): words = [ Word(id=4, text='asdf-', line_number=1, deleted=True), Word(id=5, text='bsdf', line_number=2, deleted=False) ] new_word = Word.join_words(words) self.assertEqual(, 4) self.assertEqual(new_word.text, 'asdf-bsdf') self.assertEqual(new_word.edited_text, 'asdfbsdf') self.assertEqual(new_word.deleted, False) self.assertEqual(new_word.line_number, -1) words = [ Word(id=1, word_parts=[Word(id=4, text='asdf-', line_number=1, deleted=True), Word(id=5, text='bsdf', line_number=2, deleted=False)]),\ Word(id=4, text='.', line_number=2, deleted=True), Word(id=5, text='.', line_number=2, deleted=False) ] new_word = Word.join_words(words) self.assertEqual(new_word.text, 'asdf-bsdf..') new_word = Word.join_words(words, 
add_white_space_between_words=True) self.assertEqual(new_word.text, 'asdf- bsdf . .') + def test_Word_with_word_part_objs(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) self.assertEqual(, 0) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') def test_Word_with_word_node(self): word = Word.create_cls(self.word_node) self.assertEqual(, 0) self.assertEqual(word.deleted, True) self.assertTrue(len(word.deletion_paths) > 0) self.assertEqual(word.transkription_positions[0].bottom, 11) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 1) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') self.assertEqual(word.line_number, 2) self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True) def test_attach_word_to_tree(self): newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) empty_tree = ET.ElementTree(ET.Element('page')) newWord.attach_word_to_tree(empty_tree) for word_node in empty_tree.getroot().xpath('//word'): word = Word.CREATE_WORD(word_node=word_node) self.assertEqual(, 0) self.assertEqual(word.deleted, False) self.assertEqual(word.transkription_positions[0].bottom, 13) self.assertEqual(word.transkription_positions[0].height, 10) self.assertEqual(word.transkription_positions[0].top, 3) self.assertEqual(word.transkription_positions[0].left, 0) self.assertEqual(word.transkription_positions[0].width, 10) self.assertEqual(word.text, 'abc') @unittest.skipUnless(TESTCASE is None or TESTCASE == 0, 'Not testing this case') def 
test_create_correction_history_case0(self): # Case 1: whole word over box box = Box(earlier_text='XYX') word = Word(text='ASDF', transkription_positions=[TranskriptionPosition()]) word.word_box = box word.create_correction_history() self.assertEqual(word.earlier_version is None, True) self.assertEqual(word.overwrites_word is not None, True) @unittest.skipUnless(TESTCASE is None or TESTCASE == 1, 'Not testing this case') def test_create_correction_history_case1(self): # Case 2: part of word over box box = Box(earlier_text='XYX') partA = Word(text='A', transkription_positions=[TranskriptionPosition()]) partA.word_box = box partB = Word(text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) word.create_correction_history() self.assertEqual(word.earlier_version is None, True) self.assertEqual(word.word_parts[0].overwrites_word is not None, True) @unittest.skipUnless(TESTCASE is None or TESTCASE == 2, 'Not testing this case') def test_create_correction_history_case3(self): # Case 3: part of word over box, word under box is part of earlier version box = Box(earlier_text='XYX') tp0 = TranskriptionPosition() = Style(writing_process_id=0) tp1 = TranskriptionPosition() = Style(writing_process_id=1) partA = Word(id=0, text='Test', transkription_positions=[ tp0]) partB = Word(id=1, text='er', transkription_positions=[ tp1]) partB.word_box = box word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] ) word.create_correction_history( self.assertEqual(word.text, 'Tester') self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.earlier_version.text, 'TestXYX') self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) @unittest.skipUnless(TESTCASE is None or TESTCASE == 3, 'Not testing this case') def test_create_correction_history_case4(self): # Case 4: part of word is deleted partA = Word(id=0, text='A', deleted=True, 
transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.edited_text, 'SDF') @unittest.skipUnless(TESTCASE is None or TESTCASE == 4, 'Not testing this case') def test_create_correction_history_case5(self): tp0 = TranskriptionPosition() = Style(writing_process_id=0) tp1 = TranskriptionPosition() = Style(writing_process_id=1) partA = Word(id=0, text='Test', transkription_positions=[ tp0]) partB = Word(id=1, text='er', transkription_positions=[ tp1]) word = Word(text='Tester', word_parts=[ partA, partB ] ) word.create_correction_history() self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.word_parts[1].extendsEarlierVersion, True) self.assertEqual(word.word_parts[1].isExtensionOfWord, word.earlier_version) #@unittest.skipUnless(TESTCASE is None or TESTCASE == 5, 'Not testing this case') #@unittest.skip('case tested, relies on a local xml file') def test_create_correction_history_case_full(self): page ='xml/N_VII_1_page138.xml') manuscript = ArchivalManuscriptUnity() reset_page(page) update_writing_process_ids(page) word = [ word for word in page.words if word.text == 'Verschiedenes' and word.line_number == 4 ][0] wordAufBau = [ word for word in page.words if word.text == 'AufBau' ][0] #page.words = [ word ] page.update_styles(manuscript=manuscript, partition_according_to_styles=True) word.word_parts[0].transkription_positions[0].has_box = Box(earlier_text='v') self.assertEqual(len(word.word_parts), 2) word_over_box = word._get_partial_word_over_box() update_transkription_position_ids(word) word.create_correction_history(page) 
self.assertEqual(word.writing_process_id, 1) self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.earlier_version.text, 'verschiedenes') #print(, [ (, w.text) for w in word.earlier_version.word_parts ]) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) #print(ET.dump(word_node)) """ self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) self.assertEqual(word.word_parts[1].overwrites_word is not None, True) """ word = wordAufBau page.words = [ word ] page.update_styles(manuscript=manuscript, partition_according_to_styles=True) word.word_parts[0].deleted = True word.word_parts[1].transkription_positions[0].has_box = Box(earlier_text='b') self.assertEqual(len(word.word_parts), 3) word_over_box = word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 3) update_transkription_position_ids(word) word.create_correction_history(page) self.assertEqual(word.writing_process_id, 2) self.assertEqual(word.earlier_version is not None, True) self.assertEqual(word.text, 'AufBau') self.assertEqual(word.edited_text, 'Bau') self.assertEqual(word.earlier_version.text, 'Aufbau') self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0]) self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1]) self.assertEqual(word.word_parts[1].overwrites_word is not None, True) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) #print(ET.dump(word_node)) newWord = Word.create_cls(word_node) #@unittest.skip('') def test_earlier_version(self): partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()]) word = Word(text='ASDF', word_parts=[ partA, partB]) 
earlier_version = word.create_earlier_version() self.assertEqual(earlier_version is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True) self.assertEqual(word.word_parts[0].isDeletionOfWord, earlier_version.word_parts[0]) def test_undo_partitioning(self): tps = [] for i, xy in enumerate([ 3, 4, 5 ]): tps.append(TranskriptionPosition(id=i, x=xy, y=xy, height=10, width=10)) partA = Word(id=0, text='Auf', writing_process_id=1, deleted=True, transkription_positions=[ tps[0]]) partB = Word(id=1, text='B', writing_process_id=2, transkription_positions=[tps[1]]) partC = Word(id=2, text='au', writing_process_id=1,transkription_positions=[tps[2]]) word = Word(text='Aufbau', writing_process_id=2, word_parts=[ partA, partB, partC ] ) word.undo_partitioning() self.assertEqual(len(word.transkription_positions), len(tps)) self.assertEqual(len(word.word_parts), 0) """ page ='xml/N_VII_1_page138.xml') word = page.words[77] word.undo_partitioning() self.assertEqual(len(word.word_parts), 0) self.assertEqual(len(word.transkription_positions), 3) update_transkription_position_ids(word) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) print(ET.dump(word_node)) """ def test_split(self): page = Page() pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('b') self.assertEqual(, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(, 1) self.assertEqual(, 2) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('bc') self.assertEqual(, 0) self.assertEqual(previousWord.text, 'a') self.assertEqual(, 1) word = 
Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) previousWord, currentWord, nextWord = word.split('ab', start_id=10) self.assertEqual(, 10) self.assertEqual(currentWord.text, 'ab') self.assertEqual(currentWord.transkription_positions[0].width, 2.1) self.assertEqual(, 11) self.assertEqual(nextWord.transkription_positions[0].width, 5.2) word_part_objs=[{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\ {'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\ {'text': 'x', 'class':'st22', 'x': 10, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split('Insofer') word_part_objs=[{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}] pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps) word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions) with self.assertWarns(Warning): previousWord, currentWord, nextWord = word.split('Insofern') def test_join(self): word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word, add_white_space_between_words=True) self.assertEqual(word.text, 'abc .') word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10) other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}]) word.join(other_word) self.assertEqual(word.text, 'abc.') other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 
11}]) word.join(other_word, append_at_end_of_new_word=False) self.assertEqual(word.text, '.abc.') """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_get_semanticAndDataDict(self): dictionary = Word.get_semantic_dictionary() #print(dictionary) info_dict = dictionary['properties'].get('isDeletionOfWord') self.assertEqual(SemanticClass.SUPER_PROPERTY in info_dict.keys(), True) super_info_dict = info_dict[SemanticClass.SUPER_PROPERTY] #print(info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME)) def test_simplify_transkription_positions(self): node_string = """ """ nodeA = ET.fromstring(node_string) node_string = """ """ nodeB = ET.fromstring(node_string) word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ]) self.assertEqual(len(word.transkription_positions), 2) word.simplify_transkription_positions() self.assertEqual(len(word.transkription_positions), 1) word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ]) word.transkription_positions[1].writing_process_id = -1 word.simplify_transkription_positions() self.assertEqual(len(word.transkription_positions), 1) self.assertEqual(word.transkription_positions[0].writing_process_id, 0) """ tree = ET.ElementTree(ET.Element('page')) word.attach_word_to_tree(tree) print(ET.dump(tree.getroot())) """ def test_partition(self): page = word = page.words[67] self.assertEqual(word.belongs_to_multiple_writing_processes(), True) word.partition_according_to_writing_process_id() self.assertEqual(len(word.word_parts), 3) self.assertEqual(word.belongs_to_multiple_writing_processes(), False) self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True) empty_tree = ET.ElementTree(ET.Element('page')) word_node = word.attach_word_to_tree(empty_tree) newWord = Word.create_cls(word_node) 
self.assertEqual(len(newWord.word_parts), 3) #print(ET.dump(empty_tree.getroot())) def test_partition_deletion(self): page = word = page.words[67] for transkription_position in word.transkription_positions: transkription_position.deleted = transkription_position.writing_process_id == 1 #print([ transkription_position.deleted for transkription_position in word.transkription_positions]) self.assertEqual(word.has_mixed_status('deleted'), True) word.partition_according_to_deletion() self.assertEqual(len(word.word_parts), 3) self.assertEqual(word.has_mixed_status('deleted'), False) property_key = 'deleted' #print([ w.transkription_positions[0].deleted for w in word.word_parts]) #print(len(set(pword.transkription_positions[0].__dict__[property_key] for pword in word.word_parts\ # if len(pword.transkription_positions) > 0 and property_key in pword.transkription_positions[0].__dict__.keys()))) self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True) page = word = page.words[67] word.partition_according_to_writing_process_id() #print([(word.text, word.deleted) for word in word.word_parts]) word.word_parts[1].transkription_positions[1].deleted = True word.partition_according_to_deletion() self.assertEqual(len(word.word_parts), 4) #print([(word.text, word.deleted) for word in word.word_parts]) partA = Word(text='A', deleted=True) partB = Word(text='SDF', deleted=False) word = Word(text='ASDF', word_parts=[ partA, partB]) self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True) def test_execute_function_on_parts(self): page = word_parts = [ page.words[67], page.words[68] ] word_parts, none = execute_function_on_parts(word_parts, 'partition_according_to_writing_process_id') self.assertEqual(len(word_parts) == 4, True) def test_process_word_boxes(self): page = page.source = self.pdf_xml_source page.update_styles(partition_according_to_styles=True) tr = TranskriptionField(page.source) box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 
603.557,632.565 L 598.11,632.565 L 598.11,626.565',\ 'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\ 'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\ 'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\ 'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315'] box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ] indices = [30, 277, 288, 297, 321] for word_id, index in enumerate(indices): word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin) self.assertEqual(word_over_box is not None, True) self.assertEqual(word_over_box == page.words[index] or word_over_box in page.words[index].word_parts, True) #self.assertEqual(word_over_box in page.words[index].word_parts, True) def test_process_word_several_boxesOn1LIne(self): page = page.source = self.pdf_xml_source for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() tr = TranskriptionField(page.source) box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\ 'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\ 'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\ 'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\ 'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315'] box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ] indices = [30, 277, 288, 297, 321] empty_tree = ET.ElementTree(ET.Element('page')) for word_id, index in enumerate(indices): word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin) 
self.assertEqual(word_over_box is not None, True) def test_split_according_to_status(self): page = word = page.words[67] for transkription_position in word.transkription_positions: transkription_position.text = 'asdf'\ if transkription_position.writing_process_id == 1\ else word.text self.assertEqual(word.has_mixed_status('text'), True) new_words = word.split_according_to_status('text') #print([word.text for word in new_words ]) self.assertEqual(len(new_words) > 1, True) self.assertEqual(new_words[0].id, self.assertEqual(new_words[0].deleted, word.deleted) self.assertEqual(new_words[1].id, manuscript = ArchivalManuscriptUnity() page = word = page.words[67] page.words = [ word ] page.update_styles(manuscript=manuscript) new_words = word.split_according_to_status('style', splits_are_parts=True) self.assertEqual(len(word.word_parts), 3) def test__create_new_word(self): manuscript = ArchivalManuscriptUnity() page = word = page.words[67] page.words = [ word ] page.update_styles(manuscript=manuscript) newWord = word._create_new_word([ word.transkription_positions[0] ], 'style') for key in Word.COPY_PROPERTY_KEY: self.assertEqual(newWord.__dict__[key], word.__dict__[key]) self.assertEqual(len(newWord.styles), 1) def test__get_partial_word_over_box(self): word = Word(text='test', transkription_positions=[ TranskriptionPosition(id=0), TranskriptionPosition(id=1) ]) word.transkription_positions[0].has_box = Box(earlier_text='asdf') word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 2) partA = Word(id=0, text='A', transkription_positions=[TranskriptionPosition()]) partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition(), TranskriptionPosition(id=1)]) partB.transkription_positions[0].has_box = Box(earlier_text='asdf') word = Word(text='ASDF', word_parts=[ partA, partB]) word._get_partial_word_over_box() self.assertEqual(len(word.word_parts), 2) if __name__ == "__main__": unittest.main() Index: tests_svgscripts/ 
=================================================================== --- tests_svgscripts/ (revision 108) +++ tests_svgscripts/ (revision 109) @@ -1,81 +1,81 @@ import unittest from os import sep, path from os.path import dirname, isdir import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.matrix import Matrix from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.mark_foreign_hands import MarkForeignHands from import Page from datatypes.word import Word class TestMarkForeignHands(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' self.xml_file = DATADIR + sep + 'N_VII_1_page008.xml' self.test_content_svg = DATADIR + sep + 'N_VII_1_xp5_4_page5.svg' self.test_content_xml = DATADIR + sep + 'N_VII_1_page005.xml' self.test_contentB_svg = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg' self.test_contentB_xml = DATADIR + sep + 'N_VII_1_page006.xml' - mylist = {'text': '*', 'id': '0', 'line-number': '2' } + mylist = {'text': '*', 'id': '0', 'line-number': '2'} self.node = ET.Element(MarkForeignHands.XML_TAG, attrib=mylist) word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)')) self.transkription_positions = [ word_position ] word_position.attach_object_to_tree(self.node) def test_create_cls(self): mark_foreign_hands = MarkForeignHands.create_cls(self.node) self.assertEqual(, 0) self.assertEqual(mark_foreign_hands.transkription_positions[0].bottom, 11) self.assertEqual(mark_foreign_hands.transkription_positions[0].height, 10) self.assertEqual(mark_foreign_hands.transkription_positions[0].top, 1) self.assertEqual(mark_foreign_hands.transkription_positions[0].left, 0) self.assertEqual(mark_foreign_hands.transkription_positions[0].width, 10) self.assertEqual(mark_foreign_hands.text, '*') self.assertEqual(mark_foreign_hands.line_number, 2) 
self.assertEqual(mark_foreign_hands.transkription_positions[0].transform.isRotationMatrix(), True) def test_attach_word_to_tree(self): mark_foreign_hands = MarkForeignHands.create_cls(self.node) mark_foreign_hands.foreign_hands_text = 'test' mark_foreign_hands.pen= 'Rotstift' empty_tree = ET.ElementTree(ET.Element('page')) mark_foreign_hands.attach_word_to_tree(empty_tree) #print(ET.dump(empty_tree.getroot())) for node in empty_tree.xpath('//' + MarkForeignHands.XML_TAG): mark = MarkForeignHands.create_cls(node) self.assertEqual(mark.pen, 'Rotstift') self.assertEqual(mark.foreign_hands_text.content, 'test') self.assertEqual(, 0) self.assertEqual(mark.transkription_positions[0].bottom, 11) self.assertEqual(mark.transkription_positions[0].height, 10) self.assertEqual(mark.transkription_positions[0].top, 1) self.assertEqual(mark.transkription_positions[0].left, 0) self.assertEqual(mark.transkription_positions[0].width, 10) self.assertEqual(mark.text, '*') self.assertEqual(mark.line_number, 2) self.assertEqual(mark.transkription_positions[0].transform.isRotationMatrix(), True) #print(empty_tree.xpath('//mark-foreign-hands/content/text()')) #print(empty_tree.xpath('//mark-foreign-hands/content/@pen')) def test_get_semanticAndDataDict(self): dictionary = MarkForeignHands.get_semantic_dictionary() #print(dictionary) def test_find_content(self): page = Page(self.test_contentB_xml) transkription_field = TranskriptionField(page.source) svg_tree = ET.parse(page.source) page.update_line_number_area(transkription_field, svg_tree=svg_tree) mark_foreign_hands_word = [ word for word in page.words if word.text == MarkForeignHands.CLASS_MARK ][0] mark_foreign_hands = MarkForeignHands.create_cls_from_word(mark_foreign_hands_word) MarkForeignHands.find_content([ mark_foreign_hands ] , transkription_field, svg_tree, style_dict=page.style_dict) self.assertEqual(mark_foreign_hands.foreign_hands_text, 'W III, 104. 
(MXXIX, 3)') self.assertEqual(mark_foreign_hands.pen, 'Bleistift') if __name__ == "__main__": unittest.main() Index: fixes/ =================================================================== --- fixes/ (revision 108) +++ fixes/ (revision 109) @@ -1,1026 +1,1030 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style from datetime import datetime from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET import re import shutil import string from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.path import Path as SVGPath from svgpathtools.path import Line from svgpathtools.parser import parse_path import sys import tempfile from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from import Bar import warnings from checker_handler import CheckerHandler from fix_old_data import save_page from fix_boxes import attach_box, split_into_parts_and_attach_box sys.path.append('svgscripts') from convert_wordPositions import HTMLConverter, JSONConverter from import Box from datatypes.faksimile import FaksimilePage from datatypes.archival_manuscript import ArchivalManuscriptUnity from datatypes.mark_foreign_hands import MarkForeignHands from import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.path import Path from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word, update_transkription_position_ids from datatypes.word_deletion_path import WordDeletionPath from join_faksimileAndTranskription import sort_words, add_faksimile_image from util import back_up, back_up_svg_file, copy_faksimile_svg_file, change_title_of_svg from process_files import update_svgposfile_status from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from main_util import create_function_dictionary __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = 
class ResponseHandler:
    """Base class for one editor action.

    A handler can be driven in two ways: from a json response
    (`handle_response`) or from a line typed into the interactive shell
    (`handle_interactive_response`). Both end in `run_change`, which
    subclasses override to apply the actual modification.
    """

    def __init__(self, response_starts_with=None, dialog_string=None, action_name=None, description=None):
        self.action_name = action_name
        self.dialog_string = dialog_string
        self.description = description
        self.response_starts_with = response_starts_with

    def create_requirement_list(self) -> list:
        """Return the list of requirement dictionaries of this action.

        The base implementation has no requirements and returns an empty list.
        """
        return []

    def create_json_dict(self) -> dict:
        """Return a json dictionary describing this action.

        The key 'requirements' is only present if the requirement list is not empty.
        """
        json_dict = {'action_name': self.action_name, 'description': self.description}
        requirements = self.create_requirement_list()
        if requirements:
            json_dict['requirements'] = requirements
        return json_dict

    def get_transkription_words(self, json_dict: dict) -> list:
        """Return the json words that have a non-empty 'tp_id'.
        """
        words = json_dict.get('words') or []
        return [w for w in words if w.get('tp_id')]

    def get_requirement(self, json_dict: dict, index=0) -> tuple:
        """Return requirement tuple (name, input).

        Both values are None if the response contains no requirement at
        position index or if that requirement lacks 'name' or 'input'.
        """
        name = requirement = None
        if dict_contains_keys(json_dict, ['response_handler', 'requirements'])\
           and index < len(json_dict['response_handler']['requirements']):
            requirement_dict = json_dict['response_handler']['requirements'][index]
            if dict_contains_keys(requirement_dict, ['name'])\
               and dict_contains_keys(requirement_dict, ['input']):
                name = requirement_dict['name']
                requirement = requirement_dict['input']
        return name, requirement

    def match(self, response: str) -> bool:
        """Return whether response matches this handler.

        A handler without `response_starts_with` matches every response.
        """
        if self.response_starts_with is not None:
            return response.startswith(self.response_starts_with)
        return True

    def print_dialog(self):
        """Print the dialog string in square brackets, if there is one.
        """
        if self.dialog_string is not None:
            print(f'[{self.dialog_string}]')

    def handle_response(self, page: 'Page', json_dict: dict) -> int:
        """Handle a json response and return the exit code of `run_change`.

        The action dictionary contains the page words whose id occurs among
        the transkription words of the response, plus one entry per
        requirement that the response actually provides.
        """
        json_word_ids = {jw.get('id') for jw in self.get_transkription_words(json_dict)}
        action_dictionary = {'words': [word for word in page.words if word.id in json_word_ids]}
        for index, _ in enumerate(self.create_requirement_list()):
            name, requirement = self.get_requirement(json_dict, index=index)
            # A missing requirement comes back as (None, None); do not
            # store it under the meaningless key None.
            if name is not None:
                action_dictionary[name] = requirement
        return self.run_change(page, action_dictionary)

    def handle_interactive_response(self, page: 'Page', response: str, shell) -> int:
        """Handle a response from the interactive shell and return exit code.
        """
        return self.run_change(page, {})

    def run_change(self, page: 'Page', action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        The base implementation changes nothing and returns 0.
        """
        return 0
class SimpleJoinWords(JoinWords):
    """Join words without white space; default handler of the interactive shell.
    """

    def match(self, response: str) -> bool:
        """Return whether response starts with at least one digit.
        """
        # re.match returns a match object or None; convert it so that the
        # result really is the bool the signature promises.
        return re.match(r'\d+', response) is not None
""" self.run_change(page, {}) return shell.run_interactive_editor(page) def _update_transkription_word(self, word, word_dict) ->int: """Update properites of word according to word_dict, return exit_code """ exit_code = 0 for relevant_property in self.RELEVANT_PROPERTIES: if len(word.word_parts) > 0: if len(word_dict['tp_id'].split(':')) == 3: wp_index = int(word_dict['tp_id'].split(':')[1].replace('w','')) word.word_parts[wp_index].__dict__[relevant_property[self.WORD_INDEX]] = word_dict[relevant_property[self.WDICT_INDEX]] else: return 2 else: word.__dict__[relevant_property[self.WORD_INDEX]] = word_dict[relevant_property[self.WDICT_INDEX]] return exit_code def _update_faksimile_word(self, word, word_dict, words) ->int: """Update properites of word according to word_dict, return exit_code """ exit_code = 0 if word_dict.get('old_id') is not None: fp_id = word_dict['fp_id'] old_id = int(word_dict['old_id']) if len([w for w in words if == old_id ]) > 0: old_word = [w for w in words if == old_id ][0] faksimile_position = None if len([ fp for fp in old_word.faksimile_positions if == fp_id ]) > 0: faksimile_position = [ fp for fp in old_word.faksimile_positions if == fp_id ][0] old_word.faksimile_positions.remove(faksimile_position) elif len([ fp for w in old_word.word_parts for fp in w.faksimile_positions if == fp_id ]) > 0: for w in old_word.word_parts: for fp in w.faksimile_positions: if == fp_id: faksimile_position = fp w.faksimile_positions.remove(faksimile_position) break if faksimile_position is not None: word.faksimile_positions.append(faksimile_position) else: return 2 else: return 3 else: fp_id = word_dict['fp_id'] print(, fp_id); return exit_code def _update_word(self, word, word_dict, words) ->int: """Update properites of word according to word_dict, return exit_code """ exit_code = 0 if bool(word_dict.get('tp_id')): exit_code = self._update_transkription_word(word, word_dict) if exit_code > 0: return exit_code elif bool(word_dict.get('fp_id')): exit_code = 
self._update_faksimile_word(word, word_dict, words) if exit_code > 0: print(exit_code) return exit_code else: return 2 return exit_code def handle_response(self, page: Page, json_dict: dict) -> int: """Handle response and return exit code. """ svg_words = [ word for word in json_dict['words'] if str(word.get('id')).startswith('rect') ] if page.faksimile_svgFile is not None: for word in svg_words: word_id = word.get('id') word_text = word.get('text') print(f'Changing rect {word_id} to {word_text}') change_title_of_svg(page.faksimile_svgFile, word_id, word_text) json_word_ids = [ int(jw.get('id')) for jw in json_dict['words'] if not str(jw.get('id')).startswith('rect') ] for word in page.words: if in json_word_ids: print('updating word',, word.text) word_dict = [ jw for jw in json_dict['words'] if int(jw.get('id')) == ][0] if self._update_word(word, word_dict, page.words) > 0: return 2 return self.run_change(page, {}) def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. 
""" exit_code = 0 if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) return exit_code class SavePositions(SaveChanges): def _update_word(self, word, word_dict_list) ->int: """Update properites of word according to word_dict, return exit_code """ exit_code = 0 for word_dict in word_dict_list: if bool(word_dict.get('tp_id')): exit_code = self._update_transkription_position(word, word_dict) if exit_code > 0: return exit_code elif bool(word_dict.get('fp_id')): exit_code = self._update_faksimile_position(word, word_dict) if exit_code > 0: return exit_code return exit_code def _update_transkription_position(self, word, word_dict) ->int: """Update transkription position properites of word according to word_dict, return exit_code """ tp_id_list = word_dict['tp_id'].split(':') if len(tp_id_list) == 3 and len(word.word_parts) > 0: wp_index = int(tp_id_list[1].replace('w','')) tp_index = int(tp_id_list[2].replace('tp','')) if wp_index < len(word.word_parts) and tp_index < len(word.word_parts[wp_index].transkription_positions): word.word_parts[wp_index].transkription_positions[tp_index].left = float(word_dict['left']) word.word_parts[wp_index].transkription_positions[tp_index].top = float(word_dict['top']) word.word_parts[wp_index].transkription_positions[tp_index].bottom = word.word_parts[wp_index].transkription_positions[tp_index].top\ + word.word_parts[wp_index].transkription_positions[tp_index].height else: return 2 elif len(tp_id_list) == 2: tp_index = int(tp_id_list[1].replace('tp','')) if tp_index < len(word.transkription_positions): word.transkription_positions[tp_index].left = float(word_dict['left']) word.transkription_positions[tp_index].top = float(word_dict['top']) word.transkription_positions[tp_index].bottom = word.transkription_positions[tp_index].top\ + 
word.transkription_positions[tp_index].height else: return 2 else: return 2 return 0 def _update_faksimile_position(self, word, word_dict) ->int: """Update faksimile position properites of word according to word_dict, return exit_code """ exit_code = 0 fp_id = word_dict['fp_id'] faksimile_position = None if len([ fp for fp in word.faksimile_positions if == fp_id ]) > 0: faksimile_position = [ fp for fp in word.faksimile_positions if == fp_id ][0] if len([ fp for w in word.word_parts for fp in w.faksimile_positions if == fp_id ]) > 0: faksimile_position = [ fp for w in word.word_parts for fp in w.faksimile_positions if == fp_id ][0] if faksimile_position is not None: faksimile_position.left = float(word_dict['left']) = float(word_dict['top']) faksimile_position.bottom = + faksimile_position.height else: return 2 return exit_code def handle_response(self, page: Page, json_dict: dict) -> int: """Handle response and return exit code. """ json_word_ids = [ jw.get('id') for jw in json_dict['words'] ] for word in page.words: if in json_word_ids: word_dict_list = [ jw for jw in json_dict['words'] if jw.get('id') == ] if self._update_word(word, word_dict_list) > 0: return 2 return self.run_change(page, {}) class AddDeletionPath(SaveChanges): def _add_deletion_path(self, page, word, word_dict_list) ->int: """Update properites of word according to word_dict, return exit_code """ exit_code = 0 for word_dict in word_dict_list: if len([ path for path in word.deletion_paths if path.d_attribute == word_dict['deletion_path']]) == 0: dpath = page.get_word_deletion_path(d_attribute=word_dict['deletion_path']) if dpath is not None: word.deletion_paths.append(dpath) else: exit_code = 2 return exit_code def handle_response(self, page: Page, json_dict: dict) -> int: """Handle response and return exit code. 
""" transkription_words = self.get_transkription_words(json_dict) json_word_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ] for word in page.words: if in json_word_ids: word_dict_list = [ jw for jw in transkription_words if jw.get('id') == ] if self._add_deletion_path(page, word, word_dict_list) > 0: return 2 return self.run_change(page, {}) class RemoveDeletionPath(SaveChanges): def _remove_deletion_path(self, page, word, word_dict_list) ->int: """Update properites of word according to word_dict, return exit_code """ exit_code = 2 if len(word.word_parts) > 0: exit_code = 2 for wpart in word.word_parts: result = self._remove_deletion_path(page, wpart, word_dict_list) if result == 0: exit_code = 0 deletion_paths = [ path for path in word.deletion_paths if path.d_attribute in\ [ word_dict['deletion_path'] for word_dict in word_dict_list ] ] if len(deletion_paths) > 0: for path in deletion_paths: if path in word.deletion_paths: word.deletion_paths.remove(path) for node in page.page_tree.xpath(f'./{WordDeletionPath.XML_TAG}[@d="{path.d_attribute}"]'): node.getparent().remove(node) exit_code = 0 return exit_code def handle_response(self, page: Page, json_dict: dict) -> int: """Handle response and return exit code. 
""" transkription_words = self.get_transkription_words(json_dict) json_word_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ] for word in page.words: if in json_word_ids: word_dict_list = [ jw for jw in transkription_words if jw.get('id') == ] if self._remove_deletion_path(page, word, word_dict_list) > 0: return 2 return self.run_change(page, {}) class JoinDeletionPath(SaveChanges): def _join_deletion_path(self, page, word, word_dict_list) ->int: """Update properites of word according to word_dict, return exit_code """ deletion_paths = [ path for path in word.deletion_paths if path.d_attribute in\ [ word_dict['deletion_path'] for word_dict in word_dict_list ] ] if len(deletion_paths) > 1: path_string = '' for p in deletion_paths: path_string = path_string + ' ' + p.d_attribute.replace('M', 'L')\ if path_string != ''\ else p.d_attribute word.deletion_paths.remove(p) if p in page.word_deletion_paths: page.word_deletion_paths.remove(p) new_path = parse_path(path_string) word.deletion_paths.append(WordDeletionPath(Path(id=deletion_paths[0].id, path=new_path), deletion_paths[0].style)) page.word_deletion_paths.append(word.deletion_paths[-1]) for node in page.page_tree.xpath(f'./{WordDeletionPath.XML_TAG}'): node.getparent().remove(node) for p in page.word_deletion_paths: p.attach_object_to_tree(page.page_tree) return 0 return 2 def handle_response(self, page: Page, json_dict: dict) -> int: """Handle response and return exit code. 
class ChangeLine2Value(ResponseHandler):
    """Set the line number of selected words to a given value.
    """

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.

        Accepted forms of response:
            'l:<line> <ids>'  line number and word ids are given at once,
            'l:<line>'        the word ids are asked for,
            anything else     line number and word ids are both asked for.
        """
        words = []
        line_number = -1
        if re.match(r'l:\d+\s\d+', response):
            line_number = int(response.replace('l:', '').split(' ')[0])
            words = shell._get_words_from_response(re.sub(r'l:\d+\s', '', response), page.words)
        else:
            if re.match(r'l:\d+$', response):
                line_number = int(response.replace('l:', ''))
            else:
                new_response_line = input('Specify new line number>')
                if re.match(r'^\d+$', new_response_line):
                    line_number = int(new_response_line)
            new_response = input(f'Specify ids of words for which line number should be changed to {line_number}>')
            if re.match(r'\d+', new_response):
                # The words have to be looked up by the shell object.
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = {'words': words, 'line_number': line_number}
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        :return: 0 if the line number was set, 2 if no usable line number was given.
        """
        # As before, a missing or falsy line number counts as "not set".
        line_number = action_dictionary.get('line_number') or -1
        words = action_dictionary.get('words') or []
        if line_number == -1:
            return 2
        for word in words:
            word.line_number = line_number
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE(review): this only rebinds the local name, the caller keeps
            # its old page object -- confirm whether the reload is needed.
            page = Page(page.page_tree.docinfo.URL)
        return 0
def handle_interactive_response(self, page: 'Page', response: str, shell) -> int:
    """Handle response interactively and return exit code.

    If response has the form 'D<letters> <ids>' the ids are taken from it,
    otherwise the user is asked for them.

    :return: the exit code of the next shell run if the change succeeded, else 2.
    """
    # Start with an empty selection so that an unusable answer to the
    # follow-up question leads to exit code 2 instead of an unbound name.
    words = []
    if re.match(r'D\w*\s\d+', response):
        words = shell._get_words_from_response(re.sub(r'D\w*\s', '', response), page.words)
    else:
        new_response = input(f'Specify ids of words to delete their correction history. >')
        if re.match(r'\d+', new_response):
            words = shell._get_words_from_response(new_response, page.words)
    action_dictionary = {'words': words}
    if self.run_change(page, action_dictionary) == 0:
        return shell.run_interactive_editor(page)
    return 2
class ChangeDeletionStatus(ResponseHandler):
    """Mark selected words as deleted ('d...') or as not deleted ('u...').
    """

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.

        If response has the form '<d|u><letters> <ids>' the ids are taken from
        it, otherwise the user is asked for them. The new status is 'deleted'
        if response starts with 'd'.
        """
        # Start with an empty selection so that an unusable answer to the
        # follow-up question leads to exit code 2 instead of an unbound name.
        words = []
        if re.match(r'[du]\w*\s\d+', response):
            words = shell._get_words_from_response(re.sub(r'[du]\w*\s', '', response), page.words)
        else:
            deletion_target = 'delete' if response.startswith('d') else 'undelete'
            new_response = input(f'Specify ids of words to {deletion_target}. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = {'words': words, 'deleted': response.startswith('d')}
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        :return: 0 if the status of at least one word was set, 2 if no words were given.
        """
        words = action_dictionary.get('words') or []
        if not words:
            return 2
        word_should_be_deleted = bool(action_dictionary.get('deleted'))
        for word in words:
            word.deleted = word_should_be_deleted
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE(review): this only rebinds the local name, the caller keeps
            # its old page object -- confirm whether the reload is needed.
            page = Page(page.page_tree.docinfo.URL)
        return 0
""" index = page.words.index(word) _, left, right = word.split(split_text) + if left is None: + raise Exception(f'ERROR left word of word.split with split_text {split_text} is None!') + if right is None: + raise Exception(f'ERROR right word of word.split with split_text {split_text} is None!') page.words[index] = left page.words.insert(index+1, right) def create_requirement_list(self) ->list: """Create a requirement dictionary. """ return [{ 'name': 'split_text', 'type': 'string', 'input': None }] def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. """ if re.match(r's\s\w+\s\d+', response): words = shell._get_words_from_response(re.compile('s\s\w+\s').sub('', response), page.words) split_text = response.split(' ')[1] else: split_text = input('Input split text>') new_response = input(f'Specify ids of words to split. >') if re.match(r'\d+', new_response): words = shell._get_words_from_response(new_response, page.words) action_dictionary = { 'words': words, 'split_text': split_text } if self.run_change(page, action_dictionary) == 0: return shell.run_interactive_editor(page) return 2 def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] split_text = action_dictionary['split_text']\ if bool(action_dictionary.get('split_text'))\ else '' if len(words) > 0 and split_text != '': for word in words: self._split_word(page, word, split_text) if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class AddBox(ResponseHandler): def create_requirement_list(self) ->list: """Create a requirement dictionary. 
""" return [{ 'name': 'box_text', 'type': 'string', 'input': None },\ { 'name': 'overwritten_by', 'type': 'string', 'input': None },\ { 'name': 'is_earlier_version', 'type': 'boolean', 'input': False }] def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] missing_text = action_dictionary.get('box_text') is_earlier_version = action_dictionary.get('is_earlier_version') overwritten_by = action_dictionary.get('overwritten_by') if len(words) > 0 and missing_text is not None: for word in words: if overwritten_by is not None: split_into_parts_and_attach_box(word, 0, missing_text, is_earlier_version, overwritten_by) else: attach_box(word, 0, missing_text, False) word.create_correction_history() if len(word.corrections) > 0: for wp in word.word_parts: wp.overwrites_word = None if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class ResponseOrganizer: RESULT = 'result' TIMESTAMP_NOT_SET = -1 def __init__(self, manuscript=None): self.manuscript = manuscript self.do_not_send = [] self.after_faksimile_merged = [] self.join_faksimile_positions = False self.response_handler_dictionary = {} self._add_response_handler(JoinWords(action_name='join words', description='join words')) self._add_response_handler(SplitWords(action_name='split words', description='split word according to split text')) self._add_response_handler(CreateCorrectionHistory(action_name='create correction history', description='creates a correction history for selected words'),\ is_after_faksimile_merged=True) self._add_response_handler(DeleteCorrectionHistory(action_name='delete correction history', description='deletes the correction 
def _add_faksimile_image(self, page, faksimile_page):
    """Attach the faksimile image of faksimile_page to page.

    The image takes over the text field of faksimile_page if it has none of
    its own. Unless the module runs in unittest mode, page is saved afterwards.
    """
    image_lacks_text_field = faksimile_page.faksimile_image.text_field is None
    if image_lacks_text_field and faksimile_page.text_field is not None:
        faksimile_page.faksimile_image.text_field = faksimile_page.text_field
    page.faksimile_image = faksimile_page.faksimile_image
    page.faksimile_image.attach_object_to_tree(page.page_tree)
    page.update_data_source(faksimile_svgFile=faksimile_page.svg_source_file)
    if UNITTESTING:
        return
    print(f'writing to {page.page_tree.docinfo.URL}')
    # The name of the calling function is recorded as part of the script name.
    caller_name = inspect.currentframe().f_back.f_code.co_name
    save_page(page, backup=True, script_name=f'{__file__}:{caller_name}')
""" with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") page = Page(xml_file, add_paths_near_words=True, warn=True) checker = CheckerHandler(page) todos = checker.get_todos() replace_ligatures(page) faksimile_page = None faksimile_source_file = None if svg_file is None and page.faksimile_svgFile is not None: svg_file = page.faksimile_svgFile if svg_file is not None: fps = FaksimilePage.get_faksimile_pages(svg_file, page_number=page.number) if len(fps) > 0: faksimile_page = fps[0] if page.faksimile_image is None: add_faksimile_image(page, faksimile_page) if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') save_page(page, backup=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') if not isfile(xml_file.replace('xml/', 'xml/merged/'))\ and len([ word for word in page.words if len(word.faksimile_positions) == 0 ]) > 0: self.join_faksimile_positions = True faksimile_source_file = svg_file todos = [] converter = JSONConverter(page, faksimile_page=faksimile_page) json_dict = converter.create_json_dict() pages = [] if self.manuscript is not None and isfile(self.manuscript): manuscript_tree = ET.parse(self.manuscript) pages = [ p.replace('./', '') for p in manuscript_tree.xpath('//page/@output') if isfile(p) ] action_dict = { 'target_file': xml_file,\ 'faksimile_source_file': faksimile_source_file,\ 'pages': pages,\ 'date_stamp': os.path.getmtime(xml_file),\ 'join_faksimile_positions': str(self.join_faksimile_positions).lower(),\ 'tasks': todos } if last_operation_result is not None: action_dict.update({self.RESULT: last_operation_result }) if len(w) > 0: msg = str(w[-1].message)\ if last_operation_result is None\ else last_operation_result + '\n' + str(w[-1].message) action_dict.update({self.RESULT: msg }) response_handlers = [] for response_handler in self._get_response_handlers(): response_handlers.append(response_handler.create_json_dict()) action_dict.update({ 'response_handlers': 
response_handlers }) json_dict.update({ 'actions': action_dict}) return json_dict def handle_response(self, json_dict: dict) ->dict: """Handle response in json_dict and return new data json_dict. """ if bool(json_dict.get('target_file')): target_file = json_dict['target_file'] svg_file = json_dict['faksimile_source_file']\ if bool(json_dict.get('faksimile_source_file'))\ else None if bool(json_dict.get('date_stamp')): if json_dict['date_stamp'] == self.TIMESTAMP_NOT_SET\ or os.path.getmtime(target_file) <= json_dict['date_stamp']: exit_code = 2 operation = 'unknown' if bool(json_dict.get('response_handler'))\ and bool(self.response_handler_dictionary.get(json_dict['response_handler']['action_name'])): operation = json_dict['response_handler']['action_name'] response_handler = self.response_handler_dictionary[operation] exit_code = response_handler.handle_response(Page(target_file), json_dict) message = f'Operation "{operation}" succeeded!' if exit_code == 0 else f'Operation "{operation}" failed' return self.create_json_dict(target_file, svg_file=svg_file, last_operation_result=message) else: return self.create_json_dict(target_file,\ last_operation_result=f'FAIL: file {target_file} was changed between operations!') else: return self.create_json_dict(target_file,\ last_operation_result='ERROR: there was no key "date_stamp" in json') else: return { 'actions': { self.RESULT: 'ERROR: there was no key "target_file" in json!' 
class InteractiveShell:
    """Command line dialog that offers all interactive response handlers.
    """

    def __init__(self):
        # The order matters: run_interactive_editor takes the first handler
        # that matches, and the final plain ResponseHandler matches everything.
        self.response_handlers = [
            SimpleJoinWords(dialog_string='specify ids of words to join [default]'),
            RestoreBackup(response_starts_with='b', dialog_string='b=restore backup'),
            CreateCorrectionHistory(response_starts_with='c', dialog_string='c=create correction history [+ ids]'),
            DeleteCorrectionHistory(response_starts_with='D', dialog_string='D=delete correction history [+ ids]'),
            ChangeDeletionStatus(response_starts_with='d', dialog_string='d=mark deleted [+ ids]'),
            SaveChanges(response_starts_with='i', dialog_string='i=fix ids'),
            ChangeLine2Value(response_starts_with='l', dialog_string='l[:value]=change line to value for ids'),
            Reload(response_starts_with='r', dialog_string='r=reload xml file'),
            SplitWords(response_starts_with='s', dialog_string='s=split and join word ("s splittext id")'),
            ChangeDeletionStatus(response_starts_with='u', dialog_string='u=undelete [+ ids]'),
            JoinWords(response_starts_with='w', dialog_string='w=join words with whitespace between them [+ ids]'),
            ResponseHandler(),
        ]

    def _get_words_from_response(self, response, words) -> list:
        """Return the words whose ids are specified by response.

        Accepted forms of response:
            '3 7 9'   single ids separated by white space,
            '3-7'     a range of ids (a descending range like '7-3' is allowed),
            '12-5'    a range with abbreviated upper bound, i.e. 12 to 15,
            '3+'      the id and its successor.

        The result follows the order of the ids; ids without a word are skipped.
        """
        if re.match(r'\d+-\d+', response) or re.match(r'\d+\+', response):
            if response[-1] == '+':
                first = int(response[:response.index('+')])
                last = first + 1
            else:
                parts = response.split('-')
                boundaries = [int(part) for part in parts]
                missing_digits = len(parts[0]) - len(parts[1])
                if missing_digits > 0:
                    # The upper bound was abbreviated: complete it with the
                    # leading digits of the lower bound ('123-5' -> 125).
                    boundaries[1] = int(parts[0][:missing_digits] + parts[1])
                first, last = boundaries[0], boundaries[1]
            if first > last:
                indices = list(range(first, last - 1, -1))
            else:
                indices = list(range(first, last + 1))
        else:
            # split() without argument tolerates repeated or surrounding white space.
            indices = [int(token) for token in response.split()]
        words_by_id = {}
        for word in words:
            words_by_id.setdefault(word.id, []).append(word)
        result_words = []
        for index in indices:
            result_words += words_by_id.get(index, [])
        return result_words

    def run_interactive_editor(self, page) -> int:
        """Run interactive shell on page and return the exit code of the chosen handler.
        """
        replace_ligatures(page)
        HTMLConverter(page).convert()
        for response_handler in self.response_handlers:
            response_handler.print_dialog()
        response = input('>')
        for response_handler in self.response_handlers:
            if response_handler.match(response):
                return response_handler.handle_interactive_response(page, response, self)
        # Not reached with the default handlers, since the last one matches
        # every response; kept so that an int is returned in any case.
        return 2
a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 xml_file = args[0] if isfile(xml_file): counter = 0 shell = InteractiveShell() for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK): if not UNITTESTING: print(Fore.CYAN + f'Processing {page.title}, {page.number} with interactive editor ...' + Style.RESET_ALL) back_up(page, page.xml_file) counter += 1 if shell.run_interactive_editor(page) == 0 else 0 if not UNITTESTING: print(Style.RESET_ALL + f'[{counter} pages changed by interactive shell]') else: raise FileNotFoundError('File {} does not exist!'.format(xml_file)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: py2ttl/ =================================================================== --- py2ttl/ (revision 108) +++ py2ttl/ (revision 109) @@ -1,118 +1,131 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert py objects to ontology and data in turtle format. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style import getopt import lxml.etree as ET from os import sep, path, listdir from os.path import isfile, isdir, dirname, basename from import Bar import re import sys sys.path.append('svgscripts') from datatypes.archival_manuscript import ArchivalManuscriptUnity if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from class_spec import SemanticClass from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL from py2ttl_data import Py2TTLDataConverter from py2ttl_ontology import Py2TTLOntologyConverter sys.path.append('shared_util') from myxmlwriter import xml2dict from main_util import get_manuscript_files_and_include_status __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" FILE_TYPE_XML_PROJECT = "xmlProjectFile" def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert py objects to a owl:Ontology and rdf data in turtle format. py2ttl/ [OPTIONS] [ ...] xml file of type shared_util.myxmlwriter.FILE_TYPE_XML_MANUSCRIPT. OPTIONS: -h|--help: show help -i|--include-status=STATUS include pages with status = STATUS. STATUS is a ':' seperated string of status, e.g. 'OK:faksimile merged'. + -I|--Include-files-only create include files only with suffix INCLUDE_DATA.ttl. 
:return: exit code (int) """ check_config_files_exist() datatypes_dir = get_datatypes_dir() + include_only = False + containsAttr = 'status' source_ontology_file = PROJECT_ONTOLOGY_FILE target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME) manuscript_file = None page_status_list = [ 'OK', 'faksimile merged' ] try: - opts, args = getopt.getopt(argv, "hi:", ["help", "include-status="]) + opts, args = getopt.getopt(argv, "hi:I", ["help", "include-status=", "Include-files-only"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-i', '--include-status'): page_status_list = arg.split(':') + elif opt in ('-I', '----Include-files-only'): + include_only = True + containsAttr = 'include' if len(args) < 1 : usage() return 2 ontology_created = False ontology_converter = Py2TTLOntologyConverter(project_ontology_file=source_ontology_file) output = 2 - for manuscript_file, include_status in get_manuscript_files_and_include_status(args): + status = ':'.join(page_status_list)\ + if not include_only\ + else 'OK' + for arg in get_manuscript_files_and_include_status(args, containsAttr, status): + if type(arg) == str: + manuscript_file, include_status = arg, None + else: + manuscript_file, include_status = arg[0], arg[1] if not isfile(manuscript_file): usage() return 2 if not ontology_created: print(Fore.CYAN + 'Create ontology from "{}" ...'.format(manuscript_file)) if ontology_converter.create_ontology(datatypes_dir, target_ontology_file) == 0: print(Fore.GREEN + '[Ontology file {0} created]'.format(target_ontology_file)) ontology_created = True else: return 2 current_page_status_list = page_status_list\ if include_status is None\ else include_status.split(':') print(Fore.CYAN + f'Create data from "{manuscript_file}" with status "{current_page_status_list}" ...') data_converter = Py2TTLDataConverter(manuscript_file, 
mapping_dictionary=ontology_converter.uri_mapping4cls_and_properties) output = data_converter.convert(page_status_list=current_page_status_list) return output if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: py2ttl/ =================================================================== --- py2ttl/ (revision 108) +++ py2ttl/ (revision 109) @@ -1,143 +1,146 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert py objects to data in turtle format. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style import getopt import lxml.etree as ET from os import sep, path, listdir from os.path import isfile, isdir, dirname, basename from import Bar import re import sys sys.path.append('svgscripts') from datatypes.archival_manuscript import ArchivalManuscriptUnity from datatypes.super_page import SuperPage if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from class_spec import SemanticClass from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL from data_handler import RDFDataHandler sys.path.append('shared_util') from myxmlwriter import xml2dict __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class Py2TTLDataConverter: """This class can be used convert py objects to rdf data in turtle format. """ UNITTESTING = False def __init__(self, manuscript_file, xml_dictionary_file=None, mapping_dictionary=None): if mapping_dictionary is None and xml_dictionary_file is not None: if not Py2TTLDataConverter.UNITTESTING: print(Fore.CYAN + 'initializing mapping dictionary from file "{}" ...'.format(xml_dictionary_file)) self.mapping_dictionary = xml2dict(xml_dictionary_file) if not Py2TTLDataConverter.UNITTESTING: print(Fore.GREEN + '[{} classes added]'.format(str(len(self.mapping_dictionary['classes'])))) elif mapping_dictionary is not None: self.mapping_dictionary = mapping_dictionary else: raise Exception('Error: Py2TTLDataConverter init expects either a xml_dictionary_file or a mapping_dictionary!') self.manuscript_file = manuscript_file def convert(self, page_status_list=None): """Convert manuscript instantiated with manuscript_file to rdf data and write to target_file. 
""" if page_status_list is None or len(page_status_list) < 1: page_status_list = ['OK', SuperPage.STATUS_MERGED_OK] not Py2TTLDataConverter.UNITTESTING and print(Fore.CYAN + 'initializing python objects with file "{}" ...'.format(self.manuscript_file)) manuscript = ArchivalManuscriptUnity.create_cls(self.manuscript_file, page_status_list=page_status_list, update_page_styles=True) - target_data_file = manuscript.title.replace(' ', '_') + '_DATA.ttl' + include_tag = '_INCLUDE'\ + if 'OK' in page_status_list and len(page_status_list) == 1\ + else '' + target_data_file = manuscript.title.replace(' ', '_') + include_tag + '_DATA.ttl' data_handler = RDFDataHandler(target_data_file, self.mapping_dictionary) if not Py2TTLDataConverter.UNITTESTING: print(Fore.GREEN + '[{} pages added]'.format(str(len([ page for page in manuscript.pages if 'xml_file' in page.__dict__.keys()])))) print(Fore.CYAN + 'adding triples to rdf graph ... ') data_handler.add_data(manuscript, '') if not Py2TTLDataConverter.UNITTESTING: print(Fore.GREEN + '[{} statements added]'.format(str(len(data_handler.data_graph)))) print(Fore.CYAN + 'writing graph to file "{}" ...'.format(target_data_file)) data_handler.write() if not Py2TTLDataConverter.UNITTESTING: print(Fore.GREEN + '[OK]') print(Style.RESET_ALL) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert py objects to rdf data in turtle format. py2ttl/ [OPTIONS] xml file of type shared_util.myxmlwriter.FILE_TYPE_XML_MANUSCRIPT. OPTIONS: -h|--help: show help -i|--include-status=STATUS include pages with status = STATUS. STATUS is a ':' seperated string of status, e.g. 'OK:faksimile merged'. -m|--mapping=mapping_dict.xml xml file generated by py2ttl/ containing mapping information for each property of a class. 
:return: exit code (int) """ check_config_files_exist() datatypes_dir = get_datatypes_dir() target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME) xml_dictionary_file = 'mapping_file4' + datatypes_dir.replace(sep, '.') + '2' + target_ontology_file.replace('.' + sep, '').replace(sep, '.').replace('.ttl', '.xml') manuscript_file = None page_status_list = None try: opts, args = getopt.getopt(argv, "hi:m:", ["help", "include-status=", "mapping="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-i', '--include-status'): page_status_list = arg.split(':') elif opt in ('-m', '--mapping'): xml_dictionary_file = arg if len(args) < 1 : usage() return 2 manuscript_file = args[0] if not isfile(xml_dictionary_file) or not isfile(manuscript_file): usage() return 2 converter = Py2TTLDataConverter(manuscript_file, xml_dictionary_file=xml_dictionary_file) converter.convert(page_status_list=page_status_list) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: shared_util/ =================================================================== --- shared_util/ (revision 108) +++ shared_util/ (revision 109) @@ -1,103 +1,105 @@ import lxml.etree as ET from os.path import isfile, isdir, dirname, basename from svgpathtools import svg2paths2, svg_to_paths import sys sys.path.append('svgscripts') from datatypes.path import Path from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition FILE_TYPE_XML_PROJECT = "xmlProjectFile" def create_function_dictionary(list_of_keys, target_function, function_dictionary=None) -> dict: """Create a function_dictionary """ if function_dictionary is None: function_dictionary = {} for key in list_of_keys: function_dictionary.update({key: target_function}) return function_dictionary def get_manuscript_files(args: list) ->list: """Return a list of manuscript files. 
If first element is of type FILE_TYPE_XML_PROJECT read from xml file and return as list of filenames. """ if len(args) == 1\ and args[0].endswith('.xml')\ and ET.parse(args[0]).getroot().find('metadata/type').text == FILE_TYPE_XML_PROJECT: return ET.parse(args[0]).xpath('//manuscript[contains(@status, "OK")]/@file') return args -def get_manuscript_files_and_include_status(args: list) ->list: +def get_manuscript_files_and_include_status(args: list, containsAttr="status", status="OK") ->list: """Return a list tuples of manuscript files and optional include status. If first element is of type FILE_TYPE_XML_PROJECT read from xml file and return as list of tuples of filename (@files) and include status for manuscript pages (@include). """ if len(args) == 1\ and args[0].endswith('.xml')\ and ET.parse(args[0]).getroot().find('metadata/type').text == FILE_TYPE_XML_PROJECT: - return [ (node.get('file'),node.get('include')) for node in ET.parse(args[0]).xpath('//manuscript[contains(@status, "OK")]')] + statusList = status.split(':') + xpath = ' and '.join([ f'contains(@{containsAttr}, "{singleStatus}")' for singleStatus in statusList]) + return [ (node.get('file'),node.get('include')) for node in ET.parse(args[0]).xpath(f'//manuscript[{xpath}]')] return args def extract_paths_on_tf(page, transkription_field=None, new_style_prefix='tln', outsiders=None, outsider_attributes=None) ->list: """Extract all paths on transkription_field. 
:return: a list of datatypes.path.Path """ if page.source is not None and isfile(page.source): if transkription_field is None: transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) paths, attributes = svg_to_paths.svg2paths(page.source) allpaths_on_tf = [] for index, path in enumerate(paths): attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and path.bbox()[0] >= transkription_field.xmin\ and path.bbox()[1] <= transkription_field.xmax\ and path.bbox()[2] >= transkription_field.ymin\ and path.bbox()[3] <= transkription_field.ymax: style_class = attribute.get('class') if style_class is None and attribute.get('style') is not None: style_class = create_new_style(page, attribute.get('style'), new_style_prefix=new_style_prefix) allpaths_on_tf.append(Path.create_cls(id=index, path=path, style_class=style_class, page=page)) elif outsiders is not None\ and len(path) > 0\ and path != transkription_field.path: style_class = attribute.get('class') if style_class is None and attribute.get('style') is not None: style_class = create_new_style(page, attribute.get('style'), new_style_prefix=new_style_prefix) outsiders.append(Path.create_cls(id=index, path=path, style_class=style_class, page=page)) outsider_attributes.append(attribute) return allpaths_on_tf else: return [] def create_new_style(page, style_attribute_string, new_style_prefix='tln') ->str: """Create new style, update page and return new style_class. 
""" style_dict = {} style_class = None for key_content in style_attribute_string.split(';'): if ':' in key_content: key, content = tuple(key_content.split(':')) style_dict.update({ key: content}) if style_dict in page.style_dict.values(): style_class = list(page.style_dict.keys())[list(page.style_dict.values()).index(style_dict)] else: new_style_index = len([ k for k in page.style_dict.keys() if k.startswith(new_style_prefix) ]) style_class = f'{new_style_prefix}{new_style_index}' page.style_dict.update({style_class: style_dict }) page.add_style(sonderzeichen_list=page.sonderzeichen_list, letterspacing_list=page.letterspacing_list,\ style_dict=page.style_dict) return style_class def get_paths_near_position(tp: TranskriptionPosition, paths: list, xmin=0, ymin=0, do_not_include_d_attributes=None) ->list: """Given a transkription position and a list of svgscripts.datatypes.path.Path, return a list of paths near this position. """ tp_x = tp.left + (tp.width/2) + xmin tp_y = + (tp.height/2) + ymin do_not_include_d_attributes = do_not_include_d_attributes if do_not_include_d_attributes is not None else [] return [ path.d_attribute for path in Path.get_nearest_paths(paths, tp_x, tp_y) if path.d_attribute not in do_not_include_d_attributes ] Index: tests_shared_util/ =================================================================== --- tests_shared_util/ (revision 108) +++ tests_shared_util/ (revision 109) @@ -1,49 +1,57 @@ import unittest import os from os.path import isfile, isdir, dirname, sep, realpath from datetime import datetime import shutil import tempfile import xml.etree.ElementTree as ET import lxml.etree as LET from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD from xmldiff import main import sys sys.path.append('svgscripts') from import Page from datatypes.transkriptionField import TranskriptionField sys.path.append('shared_util') -from main_util import extract_paths_on_tf, get_paths_near_position +from main_util import 
extract_paths_on_tf, get_paths_near_position, get_manuscript_files_and_include_status class TestMain(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp() self.title = 'ASDF' DATADIR = dirname(__file__) + sep + 'test_data' = DATADIR + sep + 'N_VII_1_page001.xml' + self.project = DATADIR + sep + 'project.xml' + @unittest.skip('local file') def test_extract_paths_on_tf(self): page = Page('xml/Mp_XV_page74v.xml') outsiders, attributes = [],[] paths = extract_paths_on_tf(page, outsiders=outsiders, outsider_attributes=attributes) self.assertTrue(page.source is not None and isfile(page.source)) self.assertTrue(len(paths) > 0) print([ path.style_class for path in paths]) for path in paths: print(ET.dump(page.page_tree.xpath(f'//style/class[@name="{path.style_class}"]')[0])) + def test_get_manuscript_files(self): + args = [ self.project ] + self.assertEqual(len(get_manuscript_files_and_include_status(args, 'include')), 1) + self.assertEqual(len(get_manuscript_files_and_include_status(args, 'status', 'faksimile merged')), 2) + def test_get_paths_near_position(self): page = Page( transkription_field = TranskriptionField(page.source) paths = extract_paths_on_tf(page) word = [ w for w in page.words if w.deleted ][0] close_paths = get_paths_near_position(word.transkription_positions[0], paths, xmin=transkription_field.xmin, ymin=transkription_field.ymin) self.assertEqual(close_paths[0], word.deletion_paths[0].d_attribute) + if __name__ == "__main__": unittest.main() Index: tests_shared_util/test_data/project.xml =================================================================== --- tests_shared_util/test_data/project.xml (revision 0) +++ tests_shared_util/test_data/project.xml (revision 109) @@ -0,0 +1,39 @@ + + + + xmlProjectFile + + + 2017-07-06_15:50:54 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 3 + + + Index: Friedrich-Nietzsche-late-work-ontology.ttl =================================================================== --- 
Friedrich-Nietzsche-late-work-ontology.ttl (revision 108) +++ Friedrich-Nietzsche-late-work-ontology.ttl (revision 109) @@ -1,143 +1,160 @@ @prefix dct: . @prefix document: . @prefix homotypic: . @prefix stoff: . @prefix text: . @prefix owl: . @prefix rdfs: . @prefix rdf: . +@prefix skos: . @prefix xsd: . @prefix tln: . a owl:Ontology; dct:license ; dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en; dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsches late work."""@en; dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en; dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en; dct:publisher "Basel University, Switzerland"@en. tln:TextGenesis a owl:Class ; rdfs:label "identifies a genetic order of text versions"@en ; rdfs:comment "Identifies a genetic order of text versions, i.e. groups text units as earlier and later versions of each other."@en ; rdfs:isDefinedBy . tln:IdentifiedTextVersion a owl:Class ; rdfs:label "identifies a list of text unities as a text version"@en ; rdfs:comment "Identification of a list of text unities (e.g. pages or parts of pages) as a text version for which there is an earlier or later version."@en ; rdfs:isDefinedBy . tln:PartOfPageTextUnit a owl:Class ; rdfs:label "identifies a part of a page as a text unity"@en ; rdfs:comment "Identification of a part of page as a text unity."@en ; rdfs:isDefinedBy ; rdfs:subClassOf [ a owl:Restriction ; owl:cardinality "1"^^xsd:nonNegativeInteger ; owl:onProperty tln:belongsToPage ], [ a owl:Restriction ; owl:cardinality "1"^^xsd:nonNegativeInteger ; owl:onProperty tln:startLine ], [ a owl:Restriction ; owl:cardinality "1"^^xsd:nonNegativeInteger ; owl:onProperty tln:endLine ] . 
tln:ExternalTextUnit a owl:Class ; rdfs:label "a list text unit that has been published external to the digital edition"@en ; rdfs:comment "A text unit that has been published external to the digital edition."@en ; rdfs:isDefinedBy ; rdfs:subClassOf tln:IdentifiedTextVersion . tln:Page a owl:Class ; rdfs:subClassOf document:Page . tln:belongsToPage a owl:ObjectProperty ; rdfs:label "relates a part of a page with the page it is a part of"@en ; rdfs:comment "Relates a part of a page with the page it is a part of."@en ; rdfs:isDefinedBy ; rdfs:domain tln:PartOfPageTextUnit ; rdfs:range tln:Page. tln:startLine a owl:ObjectProperty ; rdfs:label "relates a part of a page with the line it starts with"@en ; rdfs:comment "Relates a part of a page with the line it starts with."@en ; rdfs:isDefinedBy ; rdfs:domain tln:PartOfPageTextUnit ; rdfs:range tln:Line. tln:endLine a owl:ObjectProperty ; rdfs:label "relates a part of a page with the line it ends with"@en ; rdfs:comment "Relates a part of a page with the line it ends with."@en ; rdfs:isDefinedBy ; rdfs:domain tln:PartOfPageTextUnit ; rdfs:range tln:Line. tln:identifiesAsVersion a owl:ObjectProperty ; rdfs:label "groups a list of text unities together as a identified text version"@en ; rdfs:comment "Groups a list of text unities together as a identified text version for which there is an ealier or later version."@en ; rdfs:isDefinedBy ; rdfs:domain tln:IdentifiedTextVersion ; rdfs:range rdf:List. tln:hasGeneticOrder a owl:ObjectProperty ; rdfs:label "relates a list of text versions to an identified genetic order"@en ; rdfs:comment "Relates a list of text versions to an identified genetic order. The position in the list determines the version of a text unit."@en ; rdfs:isDefinedBy ; rdfs:domain tln:TextGenesis ; rdfs:range rdf:List. 
tln:textUnitHasTitle a owl:ObjectProperty ; rdfs:label "relates a external published text unit with a title"@en ; rdfs:comment "Relates a external published text unit with a title by which it can be identified."@en ; rdfs:isDefinedBy ; rdfs:domain tln:ExternalTextUnit ; rdfs:range xsd:string . tln:textUnitHasUrl a owl:ObjectProperty ; rdfs:label "relates a external published text unit with a URL"@en ; rdfs:comment "Relates a external published text unit with a URL by which it can be visited."@en ; rdfs:isDefinedBy ; rdfs:domain tln:ExternalTextUnit ; rdfs:range xsd:anyURI . tln:hasImage a owl:ObjectProperty ; rdfs:label "relates a page to a image"@en ; rdfs:comment "relates a page to an image that has a textfield that specifies the area where the writing that constitutes the page can be found."@en ; rdfs:isDefinedBy ; rdfs:domain tln:Page ; rdfs:range tln:Image . tln:hasUrl a owl:DatatypeProperty ; rdfs:label "has Url"@en ; rdfs:domain tln:Image ; rdfs:isDefinedBy ; rdfs:range xsd:anyURI . tln:inheritOverwritesWord a owl:ObjectProperty ; rdfs:subPropertyOf tln:overwritesWord; rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ; rdfs:comment "The author has used this word in order to overwrite that word."@en ; rdfs:isDefinedBy ; owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ). tln:lineContinuesOn a owl:ObjectProperty ; rdfs:label "writing from subject line continues on object line"@en ; rdfs:comment "the writing that ends on subject line continues on object line"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Line ; rdfs:range tln:Line . tln:pageIsOnTextField a owl:ObjectProperty ; rdfs:label "page is on text field"@en ; rdfs:comment "the writing that is referred to as subject can be found on object"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Page ; rdfs:range tln:TextField . 
tln:writingContinuesWithWord a owl:ObjectProperty ; rdfs:label "writing continues with next word"@en ; rdfs:isDefinedBy ; rdfs:domain tln:Word ; rdfs:range tln:Word . +tln:selectableWordProperty a owl:ObjectProperty ; + rdfs:label "a property of a word for which it can be selected"@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:Word . + +tln:cardinalityGreaterOne a rdf:Property ; + rdfs:label "whether a tln:selectableWordProperty can have a greater cardinality then one"@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:selectableWordProperty ; + rdfs:range xsd:boolean . + +tln:suggestedMaxCardinality a rdf:Property ; + rdfs:label "the suggested max cardinaltiy of a tln:selectableWordProperty on a word"@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:selectableWordProperty ; + rdfs:range xsd:nonNegativeInteger . Index: svgscripts/datatypes/ =================================================================== --- svgscripts/datatypes/ (revision 108) +++ svgscripts/datatypes/ (revision 109) @@ -1,151 +1,158 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent an archival unity of manuscript pages, i.e. workbooks, notebooks, folders of handwritten pages. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .color import Color from .description import Description from .earlier_description import EarlierDescription from .manuscript import ManuscriptUnity from .page import Page, FILE_TYPE_XML_MANUSCRIPT, FILE_TYPE_SVG_WORD_POSITION from .reconstructed_konvolut import ReconstructedKonvolut sys.path.append('py2ttl') from class_spec import SemanticClass sys.path.append('shared_util') from myxmlwriter import parse_xml_of_type, write_pretty, xml_has_type class ArchivalManuscriptUnity(ManuscriptUnity): """ This class represents an archival unity of manuscript pages (workbooks, notebooks and portfolios of handwritten pages). @label archival unity of manuscript pages Args: title title of archival unity manuscript_type type of manuscript: 'Arbeitsheft', 'Notizheft', 'Mappe' manuscript_tree lxml.ElementTree """ XML_TAG = 'manuscript' XML_COLORS_TAG = 'colors' + XML_GSA_PATH = 'signature/gsa' UNITTESTING = False - def __init__(self, title='', manuscript_type='', manuscript_tree=None): + def __init__(self, title='', gsa_signature=None, manuscript_type='', manuscript_tree=None): super(ArchivalManuscriptUnity,self).__init__(title=title, manuscript_type=manuscript_type,manuscript_tree=manuscript_tree) self.colors = [] self.earlier_descriptions = [] + self.gsa_signature = gsa_signature self.reconstructed_konvoluts = [] self.styles = [] @classmethod def create_cls(cls, xml_manuscript_file, page_status_list=None, page_xpath='', update_page_styles=False): """Create an instance of ArchivalManuscriptUnity from a xml file of type FILE_TYPE_XML_MANUSCRIPT. 
:return: ArchivalManuscriptUnity """ manuscript = super(ArchivalManuscriptUnity,cls).create_cls(xml_manuscript_file) manuscript_tree = manuscript.manuscript_tree manuscript.colors = [ Color.create_cls(node=color_node) for color_node in manuscript_tree.xpath('.//' + cls.XML_COLORS_TAG + '/' + Color.XML_TAG) ] if page_xpath == '': page_status = '' if page_status_list is not None\ and type(page_status_list) is list\ and len(page_status_list) > 0: page_status = '[' + ' and '.join([ f'contains(@status, "{status}")' for status in page_status_list ]) + ']' page_xpath = f'//pages/page{page_status}/@output' included_page_list = [ page_source\ for page_source in manuscript_tree.xpath(page_xpath)\ if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ] manuscript.pages = [ Page.create_cls(page_source, create_dummy_page=(page_source not in included_page_list))\ for page_source in manuscript_tree.xpath('//pages/page/@output')\ if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ] if update_page_styles: for page in manuscript.pages: if 'xml_file' in page.__dict__.keys(): page.update_styles(manuscript=manuscript, add_to_parents=True, create_css=True) description_node = manuscript_tree.xpath(Description.XML_TAG)[0]\ if len(manuscript_tree.xpath(Description.XML_TAG)) > 0\ else None if description_node is not None: manuscript.description = Description.create_cls_from_node(description_node.xpath(Description.ROOT_TAG)[0])\ if len(description_node.xpath(Description.ROOT_TAG)) > 0\ else None for earlier_description_node in description_node.xpath(EarlierDescription.ROOT_TAG): earlier_description = EarlierDescription.create_cls_from_node(earlier_description_node) if earlier_description is not None: manuscript.earlier_descriptions.append(earlier_description) + manuscript.reconstructed_konvoluts = [ ReconstructedKonvolut.create_cls(rk_node.get('output'), page_status_list=page_status_list, 
page_xpath=page_xpath)\ for rk_node in manuscript_tree.xpath(ReconstructedKonvolut.XML_TAG) ] + manuscript.gsa_signature = manuscript.manuscript_tree.xpath(f'//{cls.XML_GSA_PATH}')[0].text\ + if len(manuscript.manuscript_tree.xpath(f'//{cls.XML_GSA_PATH}')) > 0\ + else None return manuscript def get_color(self, hex_color) -> Color: """Return color if it exists or None. """ if hex_color in [ color.hex_color for color in self.colors ]: return [ color for color in self.colors if color.hex_color == hex_color ][0] return None @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = super(ArchivalManuscriptUnity,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', list)) + dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('gsa_signature', str)) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('reconstructed_konvoluts', ReconstructedKonvolut,\ name='partsBelongToReconstructedKonvolut',label='parts of manuscript belong to reconstructed convolut',\ comment='Some of the pages of this manuscript belong to a reconstructed convolut of pages.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_descriptions', EarlierDescription)) return cls.return_dictionary_after_updating_super_classes(dictionary) def update_colors(self, color): """Update manuscript colors if color is not contained. 
""" if self.get_color(color.hex_color) is None: self.colors.append(color) if self.manuscript_tree is not None: if len(self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)) > 0: self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)[0].getparent().remove(self.manuscript_tree.xpath('.//' + self.XML_COLORS_TAG)[0]) colors_node = ET.SubElement(self.manuscript_tree.getroot(), self.XML_COLORS_TAG) for color in self.colors: color.attach_object_to_tree(colors_node) if not self.UNITTESTING: write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_tree.docinfo.URL,\ script_name=__file__, backup=True,\ file_type=FILE_TYPE_XML_MANUSCRIPT) def update_styles(self, *styles): """Update manuscript styles. """ for style in styles: if style not in self.styles: #print(style.css_styles) self.styles.append(style) Index: svgscripts/datatypes/ =================================================================== --- svgscripts/datatypes/ (revision 108) +++ svgscripts/datatypes/ (revision 109) @@ -1,905 +1,907 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy import inspect from lxml import etree as ET from operator import attrgetter import re import string import sys import warnings from .box import Box from .editor_comment import EditorComment from .matrix import Matrix from .path import Path from .simple_word import SimpleWord from .style import Style from .word_deletion_path import WordDeletionPath from .word_position import WordPosition from .transkription_position import TranskriptionPosition from .writing_process import WritingProcess SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation) def execute_function_on_parts(word_parts, func_name): """Execute function on parts and add those parts instead of original word to word_parts. :return: new word_parts, output from func """ copy_parts = word_parts[:] for word in word_parts: output = eval('word.{0}()'.format(func_name)) if len(word.word_parts) > 0: for part_word in word.word_parts: copy_parts.insert(copy_parts.index(word), part_word) copy_parts.remove(word) word.word_parts = [] return copy_parts, output def update_transkription_position_ids(word): """Update transkription_position' ids according to index. """ word_part_ids = [ for wp in word.word_parts ] if len(word_part_ids) != len(set(word_part_ids)): for id, wp in enumerate(word.word_parts): = id for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))): = index transkription_position.has_box = None transkription_position.deleted = False class Word(SimpleWord): """ This class represents a word. 
""" COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ] APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' } DATA = 'debug-data' RDFS_SUBCLASSOF_LIST = [''] XML_TAG = 'word' XML_EARLIER_VERSION = 'earlier-version' XML_OVERWRITES = 'overwrites' XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\ 'isDeletionOfWord': 'deletesEarlierPart',\ 'isExtensionOfWord': 'extendsEarlierVersion',\ 'isTransformationOfWord': 'transformsEarlierPart' } def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None): super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.corrections = [] self.deleted = deleted self.deletion_paths = [] self.deletion_paths_near_word = [] self.debug_container = {} self.debug_msg = None self.earlier_version = earlier_version self.edited_text = None self.editor_comment = None self.isClarificationOfWord = None self.isDeletionOfWord = None self.isExtensionOfWord = None self.isTransformationOfWord = None if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0: self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ]) self.overwrites_word = None self.process_flags = [] self.styles = styles\ if styles is not None\ else [] self.verified = None self.writing_process_id = writing_process_id self.writing_processes = [] self.word_insertion_mark = None self.word_box = None self.word_parts = word_parts if word_parts is not None else [] self.word_part_objs = word_part_objs if word_part_objs is not None else [] def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Add a word deletion path to word. 
""" if len(self.word_parts) > 0: for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) elif self.deleted: index = 0 while len(self.deletion_paths) == 0 and index < len(self.transkription_positions): include_pwps = (len(self.transkription_positions[index].positional_word_parts) > 0 and abs(self.transkription_positions[index].left-self.transkription_positions[index].positional_word_parts[0].left) < 10) word_path = Path.create_path_from_transkription_position(self.transkription_positions[index],\ tr_xmin=tr_xmin, tr_ymin=tr_ymin, include_pwps=include_pwps) self.deletion_paths += [ deletion_path for deletion_path in deletion_paths\ if not Path.is_path_contained(self.deletion_paths, deletion_path)\ and deletion_path.do_paths_intersect(word_path) ] index += 1 def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. """ word_node = super(Word,self).attach_word_to_tree(target_tree) if self.deleted is not None: word_node.set('deleted', str(self.deleted).lower()) if self.verified is not None: word_node.set('verified', str(self.verified).lower()) if self.edited_text is not None: word_node.set('edited-text', self.edited_text) if self.editor_comment is not None: self.editor_comment.attach_object_to_tree(word_node) if self.writing_process_id > -1: word_node.set('writing-process-id', str(self.writing_process_id)) if len(self.process_flags) > 0: word_node.set('process-flags', ' '.join(self.process_flags)) for index, word_part in enumerate(self.word_parts): = index word_part.attach_word_to_tree(word_node) if self.earlier_version is not None: earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION) self.earlier_version.attach_word_to_tree(earlier_node) if self.overwrites_word is not None\ and len(self.overwrites_word.transkription_positions) > 0: overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES) self.overwrites_word.attach_word_to_tree(overwrite_node) if self.word_box is not None: 
self.word_box.attach_object_to_tree(word_node) if len(self.corrections) > 0: word_node.set('corrections', ' '.join(set([ str( for word in self.corrections ]))) for deletion_id, deletion_path in enumerate(self.deletion_paths): = deletion_id deletion_path.tag = WordDeletionPath.XML_TAG deletion_path.attach_object_to_tree(word_node) for key in self.XML_CORRECTION_DICT.keys(): if self.__dict__[key] is not None: word_node.set(self.XML_CORRECTION_DICT[key], 'true') return word_node def belongs_to_multiple_writing_processes(self, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. """ if len(self.word_parts) > 0 and include_parts: return len(set(word.writing_process_id for word in self.word_parts)) > 1 return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1 def set_parent_word_writing_process_id(self): """Set writing_process_id for parent word. """ ids = set(word.transkription_positions[0].style for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None) if len(ids) > 1: self.writing_process_id = max([style.writing_process_id for style in ids]) if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\ for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\ > 1: self.writing_process_id += 1 @classmethod def create_cls(cls, word_node): """Creates a word from a (lxml.Element) node. 
[:return:] Word """ cls = super(Word,cls).create_cls(word_node) cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 cls.split_strings = None cls.join_string = word_node.get('join') if bool(word_node.get('split')): cls.split_strings = word_node.get('split').split(' ') if ''.join(cls.split_strings) != cls.text: error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ format(word_node.getroottree().docinfo.URL, str(\ + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ + 'Text attribute: "{0}".\n'.format(cls.text) raise Exception(error_msg) cls.verified = word_node.get('verified') == 'true'\ if bool(word_node.get('verified')) else None cls.deleted = word_node.get('deleted') == 'true'\ if bool(word_node.get('deleted')) else None cls.edited_text = word_node.get('edited-text') cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\ if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ] if bool(word_node.get('corrections')): for index in [ int(i) for i in word_node.get('corrections').split(' ') ]: if index < len(cls.word_parts): cls.corrections.append(cls.word_parts[index]) cls.earlier_version = None if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0: cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0] for key_value in cls.XML_CORRECTION_DICT.values(): if word_node.get(key_value) == 'true': cls.__dict__[key_value] = True if cls.earlier_version is not None: for word_part in cls.word_parts: for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]: if cls.XML_CORRECTION_DICT[key] in 
word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\ and len(cls.word_parts) <= len(cls.earlier_version.word_parts): try: word_part.__dict__[key] = cls.earlier_version.word_parts[] except Exception: msg = f'{} {cls.text}: {}' raise Exception(msg) for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls.earlier_version for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\ if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\ else None cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\ if len(word_node.xpath('./' + Box.XML_TAG)) > 0\ else None cls.deletion_paths = [ Path(node=node) for node in word_node.xpath(f'./{WordDeletionPath.XML_TAG}') ] cls.process_flags = word_node.get('process-flags').split(' ')\ if bool(word_node.get('process-flags'))\ else [] return cls @classmethod def join_words(cls, list_of_words, add_white_space_between_words=False): """Creates a word from a list of words. 
[:return:] Word """ if len(list_of_words) > 1: deleted = True in [ word.deleted for word in list_of_words ]\ and len(set([ word.deleted for word in list_of_words ])) == 1 line_number = list_of_words[0].line_number\ if len(set([ word.line_number for word in list_of_words ])) == 1\ else -1 + faksimile_positions = [] for word in list_of_words: if len(word.word_parts) > 0: + faksimile_positions += word.faksimile_positions index = list_of_words.index(word) list_of_words.remove(word) for part_word in reversed(word.word_parts): list_of_words.insert(index, part_word) new_word_text = ''.join([word.text for word in list_of_words])\ if not add_white_space_between_words\ else ' '.join([word.text for word in list_of_words]) - new_word = cls(id=list_of_words[0].id, text=new_word_text,\ + new_word = cls(id=list_of_words[0].id, text=new_word_text, faksimile_positions=faksimile_positions,\ line_number=line_number, deleted=deleted, word_parts=list_of_words) if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]: change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0] new_word.edited_text = new_word.text.replace(change_text, change_text[:-1]) for id, word in enumerate(new_word.word_parts): = id return new_word if len(list_of_words) > 0: return list_of_words[0] else: return None def create_earlier_version(self, root_word=None, id=0): """Create an earlier version of word. 
""" if root_word is None: root_word = self root_word.set_parent_word_writing_process_id() word_parts = [] non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\ if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ] non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts) if non_single_punctuation_word_parts_length > 0\ and len([ word_part for word_part in non_single_punctuation_word_parts\ if word_part.deleted ])\ == non_single_punctuation_word_parts_length: self.deleted = True for word_part in non_single_punctuation_word_parts: word_part.deleted = False for id, word_part in enumerate(self.word_parts): earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id) if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) elif word_part.overwrites_word is not None\ and ((len(word_part.transkription_positions) > 0\ and word_part.overwrites_word.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style\ != word_part.overwrites_word.transkription_positions[0].style) or word_part.word_box.earlier_version): = word_parts.append(word_part.overwrites_word) word_part.isTransformationOfWord = word_part.overwrites_word #print(f'transform: {self.text}') if word_part not in self.corrections: self.corrections.append(word_part) elif root_word.writing_process_id > -1\ and (len(word_part.transkription_positions) > 0\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style.writing_process_id\ == root_word.writing_process_id): word_part.extendsEarlierVersion = True #print('extends') if word_part not in self.corrections: self.corrections.append(word_part) else: if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart 
word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) else: #print(f'default: {self.text}') word_parts.append(earlierWordPart) text = ''.join([ word.text for word in word_parts ])\ if len(word_parts) > 0\ else self.text if len(word_parts) == 1: self.transkription_positions += word_parts[0].transkription_positions self.faksimile_positions += word_parts[0].faksimile_positions word_parts = [] new_transkription_positions = copy.deepcopy(self.transkription_positions) if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None: writing_process_id = self.transkription_positions[0].style.writing_process_id for new_tp in new_transkription_positions: = writing_process_id return Word(id=id, text=text, transkription_positions=new_transkription_positions,\ faksimile_positions=self.faksimile_positions, line_number=self.line_number,\ word_parts=word_parts) def create_correction_history(self, page=None, box_style=None): """Create correction history. 
""" if self.word_box is not None: manuscript = self.transkription_positions[0].style.manuscript\ if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None\ else None style = Style() if box_style is not None: style = box_style if page is not None: style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript) for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]: style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions) for transkription_position in transkription_positions: = style self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\ line_number=self.line_number) for word_part in self.word_parts: word_part.create_correction_history(page=page, box_style=box_style) if len(self.word_parts) > 0: earlier_version = self.create_earlier_version() extending_words = self._get_parts_with_property_key('extendsEarlierVersion') if len(extending_words) > 0: for word in extending_words: word.isExtensionOfWord = earlier_version if self.has_mixed_status('deleted', include_parts=True): self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ]) if len(self.corrections) > 0: self.earlier_version = earlier_version @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. 
[:return:] Word """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number text = word_node.get('text') deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true' transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ] word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\ if len(word_node.findall('.//' + Word.DATA)) > 0\ else [ item.attrib for item in word_node.findall('.//part')] return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions, word_part_objs=word_part_objs) elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file WIDTH = 5 TOPCORRECTION = 2.0 FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize height = height x = round(float(word_part_objs[0]['x']), 3) if(page is not None and bool(page.style_dict)): HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' ')) biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set) height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3) TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size if endSign is not None and '%' in endSign: lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\ for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\ if bool(page.style_dict[key].get('font-size'))] lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1 endX = float(endX) + lastCharFontSize * 
FONTWIDTHFACTOR elif endSign is not None and '%' in endSign: endX = float(endX) + WIDTH bottom = round(float(word_part_objs[0]['y']), 3) y = round(bottom - height + TOPCORRECTION, 3) width = round(float(endX) - x, 3) transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ] text = ''.join([ dict['text'] for dict in word_part_objs]) line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs) word.debug_msg = debug_msg return word else: error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = super(Word,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\ cardinality=1, cardinality_restriction='minCardinality',\ name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\ name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\ name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\ comment='Word has been deleted by the author using a deletion path.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comment', EditorComment,\ name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.')) 
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\ name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\ name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\ name='isClarificationOfWord', label='word is a clarification of word',\ comment='The author has used this part of the word in order to clarify the appearance of that word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\ name='isDeletionOfWord', label='word is a deletion of word',\ comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\ name='isExtensionOfWord', label='word is a extension of word',\ comment='The author has used this part of a word in order to extend an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\ name='isTransformationOfWord', label='word is a transformation of word',\ comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\ name='overwritesWord', label='word overwrites word',\ comment='The author has used this word in order to overwrite that word.')) # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING, # 
cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class. dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\ name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\ subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)) super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\ name='isCorrectionOfWord', label='word is a correction of word',\ comment='The author has used this word in order to correct that word.') for key in cls.XML_CORRECTION_DICT.keys(): correction_dict = dictionary[cls.PROPERTIES_KEY].get(key) correction_dict.update(super_property_dictionary) dictionary[cls.PROPERTIES_KEY].update({key: correction_dict}) return cls.return_dictionary_after_updating_super_classes(dictionary) def has_mixed_status(self, property_key, include_parts=False, concerns_word=True): """Returns true if transkription_positions have mixed status concerning the property_key in their __dict__. """ if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions): return False if len(self.word_parts) > 0 and include_parts: if concerns_word: if False in set(property_key in word.__dict__.keys() for word in self.word_parts): return False return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1 else: return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\ if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1 return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1 def init_word(self, page): """Initialize word with objects from page. 
""" super(Word,self).init_word(page) if self.writing_process_id > -1: self.writing_processes += [ wp for wp in page.writing_processes if == self.writing_process_id ] writing_processes = self.writing_processes for word_part in self.word_parts: word_part.init_word(page) self.lines += word_part.lines self.writing_processes += word_part.writing_processes self.lines = [ line for line in set(self.lines) ] self.writing_processes = [ wp for wp in set(self.writing_processes)] if self.overwrites_word is not None: self.overwrites_word.init_word(page) if self.earlier_version is not None: if self.earlier_version.writing_process_id == -1: self.earlier_version.writing_process_id = self.writing_process_id-1 if self.earlier_version.line_number == -1: self.earlier_version.line_number = self.line_number self.earlier_version.init_word(page) self.deletion_paths = [ page.get_word_deletion_path(path) for path in self.deletion_paths if path.path is not None ] def join(self, other_word, append_at_end_of_new_word=True, add_white_space_between_words=False): """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. 
""" if append_at_end_of_new_word: self.text = self.text + other_word.text\ if not add_white_space_between_words\ else self.text + ' ' + other_word.text for position in other_word.transkription_positions: = str(len(self.transkription_positions)) self.transkription_positions.append(position) for position in other_word.faksimile_positions: = str(len(self.faksimile_positions)) self.faksimile_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 index = 0 for position in other_word.faksimile_positions: self.faksimile_positions.insert(indexposition) index += 1 while index < len(self.faksimile_positions): self.faksimile_positions[index].id = str(index) index += 1 self.simplify_transkription_positions() def partition_according_to_deletion(self): """Partition a word according to its transkription_positions' deletion status ->split word and add partial words as its parts. 
""" if self.has_mixed_status('deleted'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.deleted != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) for tp in transkription_positions: newWord.deletion_paths += tp._deletion_paths self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.deleted if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) for tp in transkription_positions: newWord.deletion_paths += tp._deletion_paths self.word_parts.append(newWord) self.transkription_positions = [] self.line_number = -1 self.deleted = False elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion') elif not self.deleted\ and len(self.transkription_positions) > 0\ and self.transkription_positions[0].deleted: self.deleted = True for tp in self.transkription_positions: self.deletion_paths += tp._deletion_paths def partition_according_to_writing_process_id(self): """Partition a word according to its transkription_positions' writing_process_ids ->split word and add partial words as its parts. 
""" if self.belongs_to_multiple_writing_processes(): last_writing_process_id = -1 transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.writing_process_id != last_writing_process_id\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_writing_process_id = transkription_position.writing_process_id if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id') if self.belongs_to_multiple_writing_processes(include_parts=True): self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0] elif len(self.transkription_positions) > 0: self.writing_process_id = self.transkription_positions[0].writing_process_id def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False): """Determines whether word is over a word box. 
""" word_over_box = None if len(self.word_parts) > 0: for word in self.word_parts: current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None)) if current_word is not None and current_word.word_box is not None: word_over_box = current_word else: new_tp_dict = {} for index, transkription_position in enumerate(self.transkription_positions): if previous_word_has_box and index == 0: if len(transkription_position.positional_word_parts) > 0: transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2 #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}') else: transkription_position.left += 1 word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) containing_boxes = [ box_path for box_path in box_paths\ if word_path.is_partially_contained_by(box_path)\ or box_path.do_paths_intersect(word_path) ] if len(containing_boxes) > 0: if previous_word_has_box: print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}') self._set_box_to_transkription_position(containing_boxes[0], word_path,\ transkription_position, new_tp_dict, tr_xmin) box_paths.remove(containing_boxes[0]) for replace_tp in new_tp_dict.keys(): for tp in new_tp_dict.get(replace_tp): self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp) self.transkription_positions.remove(replace_tp) word_over_box = self._get_partial_word_over_box() update_transkription_position_ids(self) return word_over_box def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def set_writing_process_id_to_transkription_positions(self, page): """Determines the writing process id of the transkription_positions. 
""" for transkription_position in self.transkription_positions: if len(transkription_position.positional_word_parts) > 0: for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): if font_key in page.fontsizekey2stage_mapping.keys(): transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) def simplify_transkription_positions(self): """Merge transkription_positions if possible. """ index = len(self.transkription_positions)-1 while index > 0\ and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]: current_tp = self.transkription_positions[index] index -= 1 previous_tp = self.transkription_positions[index] if previous_tp.is_mergebale_with(current_tp): positional_word_parts = previous_tp.positional_word_parts positional_word_parts += current_tp.positional_word_parts transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts, debug_msg_string='simplifying transkription positions', if len(transkription_positions) == 1: transkription_positions[0].writing_process_id = previous_tp.writing_process_id\ if previous_tp.writing_process_id != -1\ else current_tp.writing_process_id self.transkription_positions.pop(index+1) self.transkription_positions[index] = transkription_positions[0] #print(self.text, len(self.transkription_positions)) def split(self, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. 
""" previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) previousWord.faksimile_positions = self.faksimile_positions current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split') next_text = 
''.join([ pwp.text for pwp in next_pwps ]) nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) nextWord.faksimile_positions = self.faksimile_positions all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) currentWord.faksimile_positions = self.faksimile_positions return previousWord, currentWord, nextWord def split_according_to_status(self, status, splits_are_parts=False): """Split a word according to its transkription_positions' text. :return: a list of new word.Word """ new_words = [] if self.has_mixed_status(status): last_status = None transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.__dict__[status] != last_status\ and len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.__dict__[status] if len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, if splits_are_parts: self.word_parts += new_words if len(self.word_parts) > 0: self.transkription_positions = [] return new_words def undo_partitioning(self): """Undo partitioning. 
""" if len(self.word_parts) > 0: for word_part in self.word_parts: word_part.undo_partitioning() if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]): self.transkription_positions += word_part.transkription_positions self.earlier_version = None self.edited_text = None self.word_box = None self.word_parts = [] self.corrections = [] self.earlier_versions = [] self.box_paths = [] def _create_new_word(self, transkription_positions, status, new_id=0): """Create a new word from self and transkription_positions. """ newWord = Word(id=new_id, transkription_positions=transkription_positions) for key in self.COPY_PROPERTY_KEY: if key != status and key in self.__dict__.keys(): newWord.__dict__[key] = self.__dict__[key] if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys(): newWord.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(transkription_positions[0].__dict__[status]) else: newWord.__dict__[status] = transkription_positions[0].__dict__[status] return newWord def _get_parts_with_property_key(self, property_key): """Return a list of word_parts with property == property_key. """ word_parts = [] for word_part in self.word_parts: if property_key in word_part.__dict__.keys(): word_parts.append(word_part) else: word_parts += word_part._get_parts_with_property_key(property_key) return word_parts def _get_partial_word_over_box(self): """Partition a word according to its transkription_positions' has_box ->split word and add partial words as its parts. 
:return: word over box or self """ word_over_box = None if self.has_mixed_status('has_box'): transkription_positions = [] last_word_box = None for transkription_position in self.transkription_positions: if transkription_position.has_box != last_word_box\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_word_box is not None: word_over_box = newWord word_over_box.word_box = last_word_box transkription_positions = [] transkription_positions.append(transkription_position) last_word_box = transkription_position.has_box if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_word_box is not None: word_over_box = newWord word_over_box.word_box = last_word_box self.transkription_positions = [] elif len(self.word_parts) > 0: #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box') for word_part in self.word_parts: if word_over_box is None: word_over_box = word_part._get_partial_word_over_box() else: break elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1: word_over_box = self word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box return word_over_box def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin): """Set box_path to transkription_position that is contained by box_path. Create new transkription_positions by splitting old ones if necessaryand add them to new_transkription_positions_dictionary. 
""" if box_path.contains_path(word_path): transkription_position.has_box = box_path elif box_path.contains_start_of_path(word_path): split_position = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[0].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path elif box_path.contains_end_of_path(word_path): split_position = box_path.path.bbox()[0] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path else: # box_path in the middle of word_pathz split_position1 = box_path.path.bbox()[0] - tr_xmin split_position2 = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position1, split_position2) if len(new_tps) >= 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path def do_paths_intersect_saveMode(mypath1, mypath2): """Returns true if paths intersect, false if not or if there was an exception. """ try: return mypath1.path.intersect(mypath2.path, justonemode=True)\ or mypath1.is_partially_contained_by(mypath2) except AssertionError: return False Index: svgscripts/datatypes/ =================================================================== --- svgscripts/datatypes/ (revision 108) +++ svgscripts/datatypes/ (revision 109) @@ -1,155 +1,160 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent the mark for text by some foreign hand. 
""" # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from .matrix import Matrix from .special_word import SpecialWord from .standoff_tag import StandoffTag from .text import Text class MarkForeignHands(SpecialWord): """ This class represents the mark for text by some foreign hand. """ XML_TAG = 'mark-foreign-hands' XML_SUB_TAG = 'text' CLASS_MARK = '$' REPLACE_DICT = { '+': 'x' } - def __init__(self, id=0, line_number=-1, text=CLASS_MARK, foreign_hands_text=None, pen='', transkription_positions=[], faksimile_positions=[]): + def __init__(self, id=0, line_number=-1, text=CLASS_MARK, foreign_hands_text=None, pen='', resolution=None, transkription_positions=[], faksimile_positions=[]): super(MarkForeignHands, self).__init__(id=id, text=text, line_number=line_number,\ transkription_positions=transkription_positions, faksimile_positions=faksimile_positions) self.foreign_hands_text = foreign_hands_text self.pen = pen + self.resolution = resolution def add_content(self, node): """Adds content to MarkForeignHands. 
""" self.pen = node.get('pen') + self.resolution = node.get('resolution') if node.text is not None: self.foreign_hands_text = Text(content=node.text) else: standoff_markups = [ StandoffTag.create_cls_from_node(stf) for stf in node.xpath('./' + '|./'.join(StandoffTag.MARKUP_STYLES))] content = node.xpath('./content')[0].text if len(node.xpath('./content')) > 0 else None self.foreign_hands_text = Text(content=content, standoff_markups=standoff_markups, tag='content') def attach_word_to_tree(self, target_tree): """Attaches MarkForeignHands to tree target_tree. """ node = super(MarkForeignHands,self).attach_word_to_tree(target_tree) if self.foreign_hands_text is not None: content_node = ET.SubElement(node, MarkForeignHands.XML_SUB_TAG) content_node.text = self.foreign_hands_text if type(self.foreign_hands_text) == str else self.foreign_hands_text.content if self.pen is not None and self.pen != '': content_node.set('pen', self.pen) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = super(MarkForeignHands,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('foreign_hands_text',\ Text, cardinality=1, name='textOfForeignHands', label='text traces of some foreign hand')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('pen',\ str, cardinality=1, cardinality_restriction='maxCardinality',\ name='penOfForeignHands', label='pen used to write text by some foreign hand')) + dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('resolution',\ + str, cardinality=1, cardinality_restriction='maxCardinality',\ + name='resolutionOfAbbreviation', label='resolution of the abbreviation')) return cls.return_dictionary_after_updating_super_classes(dictionary) @classmethod def get_special_char_list(cls): """Returns a list of the chars that define this special word. 
""" return [ cls.CLASS_MARK ] @staticmethod def find_content(list_of_special_words, transkription_field, svg_tree, style_dict=None, italic_classes=None, SonderzeichenList=None, marginals_extra=False, set_to_text_field_zero=True): """Find content for the MarkForeignHands. """ if style_dict is None: style_dict = {} if italic_classes is None: italic_classes = [] if SonderzeichenList is None: SonderzeichenList = [] if len(style_dict) > 0: if len(italic_classes) == 0: italic_classes = [ key for key in style_dict\ if bool(style_dict[key].get('font-family')) and style_dict[key]['font-family'].endswith('Italic') ] if len(SonderzeichenList) == 0: SonderzeichenList = [ key for key in style_dict\ if bool(style_dict[key].get('font-family')) and style_dict[key]['font-family'].startswith('Sonderzeichen') ] nodes_in_margin_field = [ item for item in filter(lambda x: Matrix.IS_IN_MARGIN_FIELD(x.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] provide_tf = transkription_field if set_to_text_field_zero else None for mark_foreign_hands in list_of_special_words: relevant_nodes = [ node for node in nodes_in_margin_field\ if is_close((mark_foreign_hands.transkription_positions[0].bottom+mark_foreign_hands.transkription_positions[0].top)/2,\ node.get('transform'), transkription_field=provide_tf) ] relevant_nodes = sorted(relevant_nodes, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX()) italic_found = False mark_foreign_hands_text = '' pen = '' for node in relevant_nodes: if len(node.getchildren()) == 0: if italic_found: pen += node.text elif any(style in italic_classes for style in node.get('class').split(' ')): italic_found = True pen = node.text else: mark_foreign_hands_text += get_text_from_node(node, SonderzeichenList) else: for tspan in node.getchildren(): if italic_found: pen += tspan.text elif any(style in italic_classes for style in tspan.get('class').split(' 
')): italic_found = True pen = tspan.text else: mark_foreign_hands_text += get_text_from_node(tspan, SonderzeichenList) mark_foreign_hands.foreign_hands_text = mark_foreign_hands_text mark_foreign_hands.pen = pen def get_text_from_node(node, SonderzeichenList): """Returns the text of node. Replaces Sonderzeichen if node has a style class in SonderzeichenList. """ if any(style in SonderzeichenList for style in node.get('class').split(' '))\ and bool(MarkForeignHands.REPLACE_DICT.get(node.text)): return MarkForeignHands.REPLACE_DICT[node.text] else: return node.text def is_close(mark_foreign_hands_position, matrix_string, transkription_field=None): """Return true if mark_foreign_hands_position is == matrix.getY()+-THRESHOLD_Y """ THRESHOLD_Y = 4 matrix = Matrix(transform_matrix_string=matrix_string, transkription_field=transkription_field) return abs(mark_foreign_hands_position-matrix.getY()) < THRESHOLD_Y Index: svgscripts/datatypes/ =================================================================== --- svgscripts/datatypes/ (revision 108) +++ svgscripts/datatypes/ (revision 109) @@ -1,155 +1,157 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent all image types. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .attachable_object import AttachableObject from .matrix import Matrix from .text_field import TextField sys.path.append('py2ttl') from class_spec import SemanticClass class Image(AttachableObject,SemanticClass): """ This super class represents all types of images. Args: file_name (str): name of the image file. node (lxml.etree.Element) node, containing information URL (str): URL of image file. height (float): height of image width (float): width of image text_field (.text_field.TextField) text_field on image representation """ stringKeys = [ 'file_name', 'URL', 'local_path' ] floatKeys = [ 'height', 'width' ] XML_TAG = 'image' SECONDARY_URL = 'http://localhost:8000/' FAKSIMILE_DIR = 'faksimiles/' def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, matrix=None, text_field=None, tag=XML_TAG): self.text_field = text_field self.tag = tag if node is not None: self.file_name = node.get('file-name') self.local_path = node.get('local-path') self.URL = node.get('URL') self.height = float(node.get('height')) self.width = float(node.get('width')) self.transform = Matrix(node.get('transform')) if bool(node.get('transform')) and 'matrix(' in node.get('transform') else None if len(node.findall(TextField.XML_TAG)) > 0: self.text_field = TextField(node=node.find(TextField.XML_TAG)) else: self.file_name = file_name self.local_path = local_path self.URL = URL self.height = height self.width = width self.transform = matrix self.primaryURL = self.URL self.secondaryURL = None if self.file_name is not None: self.secondaryURL = self.SECONDARY_URL + self.file_name.replace('./','')\ if self.file_name is not None and self.file_name.endswith('svg')\ else self.SECONDARY_URL + 
class SVGImage(Image):
    """This class represents a svg image.

    After initialisation by the super class, primaryURL and secondaryURL are
    set to ASSETS_FOLDER respectively URL_PREFIX followed by the file name
    without './'.  Without a file name the URLs set by the super class are kept.
    """
    XML_TAG = 'svg-image'
    ASSETS_FOLDER = '/assets/'
    URL_PREFIX = ''

    def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
        """Create a svg image from node or from the single values.

        A node with a tag other than XML_TAG is only used as source for
        'file', 'height' and 'width' and is not passed on to the super class.
        The tag of the resulting object is always XML_TAG; the argument tag is
        accepted for compatibility but not used.
        """
        if node is not None and node.tag != self.XML_TAG:
            file_name = node.get('file')
            height = float(node.get('height')) if bool(node.get('height')) else 0.0
            width = float(node.get('width')) if bool(node.get('width')) else 0.0
            node = None
        super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\
                height=height, width=width, text_field=text_field, tag=self.XML_TAG)
        # file_name is None for SVGImage() and for nodes without a file
        # attribute; calling .replace on it would raise an AttributeError.
        if self.file_name is not None:
            relative_file_name = self.file_name.replace('./', '')
            self.primaryURL = self.ASSETS_FOLDER + relative_file_name
            self.secondaryURL = self.URL_PREFIX + relative_file_name

    def decontextualize_file_name(self, update_url=None):
        """Decontextualize file name.

        Removes './' from file_name and, if update_url is given, sets URL to
        update_url followed by the new file name.  Does nothing without a
        file name.
        """
        if self.file_name is None:
            return
        self.file_name = self.file_name.replace('./', '')
        if update_url is not None:
            self.URL = update_url + self.file_name

#    @classmethod
#    def get_semantic_dictionary(cls):
#        """ Creates and returns a semantic dictionary as specified by SemanticClass.
#        """
#        dictionary = super(SVGImage,cls).get_semantic_dictionary()
#        return cls.return_dictionary_after_updating_super_classes(dictionary)
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path as PathlibPath from import Bar import re import shutil import string from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.path import Path as SVGPath from svgpathtools.path import Line import sys import tempfile import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from import Box from datatypes.archival_manuscript import ArchivalManuscriptUnity from datatypes.mark_foreign_hands import MarkForeignHands from import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.path import Path from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids from extract_line_continuation import extract_line_continuations from util import back_up, process_warnings4status from process_files import update_svgposfile_status from process_footnotes import categorize_footnotes sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from main_util import extract_paths_on_tf __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" 
__license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False DEBUG_WORD = None MERGED_DIR = 'merged' WARNING_FOOTNOTES_ERROR = 'footnotes not processed' WARNING_LINE_CONTINUATION = 'line continuation fail' def categorize_paths(page, transkription_field=None): """Categorize all paths that are part of the transkription field. :return: a dictionary containig a list for each category of path. """ if page.source is not None and isfile(page.source): MAX_HEIGHT_LINES = 1 max_line = sorted(\ [ for line_number in page.line_numbers if % 2 == 0],\ reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17 tr_xmin = 0.0 tr_ymin = 0.0 if (page.svg_image is None or page.svg_image.text_field is None)\ and transkription_field is not None: tr_xmin = transkription_field.xmin tr_ymin = transkription_field.ymin paths, attributes = svg_to_paths.svg2paths(page.source) allpaths_outside_tf = [] attributes_outside_tf = [] if transkription_field is None: transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) allpaths_on_tf = extract_paths_on_tf(page, outsiders=allpaths_outside_tf, outsider_attributes=attributes_outside_tf, transkription_field=transkription_field) path_dict = { 'text_area_deletion_paths': [],\ 'deletion_or_underline_paths': [],\ 'box_paths': [],\ 'dots_paths': [],\ 'word_connector_paths': [],\ 'uncategorized_paths': [] } for mypath in allpaths_on_tf: xmin, xmax, ymin, ymax = mypath.path.bbox() start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin) if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1: path_dict.get('dots_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed(): path_dict.get('box_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed(): path_dict.get('word_connector_paths').append(mypath) elif abs(ymax-ymin) < 
def copy_page_to_merged_directory(page, manuscript_file=None):
    """Copy page to directory that contains the first version of all svg_pos_files that have been
    merged with the faksimile position data. MERGED_DIR is a subfolder of svg_pos_files-directory.

    :param page: the page to save; its file is taken from page.page_tree.docinfo.URL
    :param manuscript_file: passed on to save_page
    """
    svg_pos_file = PathlibPath(page.page_tree.docinfo.URL)
    target_dir = svg_pos_file.parent / MERGED_DIR
    # exist_ok avoids the window between checking for and creating the directory.
    target_dir.mkdir(exist_ok=True)
    # NOTE(review): the right operand was missing in the source as seen
    # ("target_dir / save_page(...)"), which left target_pos_file undefined;
    # the file name of svg_pos_file is assumed here -- confirm.
    target_pos_file = target_dir / svg_pos_file.name
    save_page(page, str(svg_pos_file), target_svg_pos_file=str(target_pos_file),\
            status=STATUS_MERGED_OK, manuscript_file=manuscript_file)
page.marginals_source is not None: svg_tree = ET.parse(page.marginals_source) italic_classes = [ key for key in page.style_dict\ if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ] if len(page.mark_foreign_hands) > 0: MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\ SonderzeichenList=page.sonderzeichen_list, set_to_text_field_zero=set_to_text_field_zero) if len(page.text_connection_marks) > 0: TextConnectionMark.find_content_in_footnotes(page, transkription_field, svg_tree) def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks all words that intersect with deletion paths as deleted and adds these paths to word_deletion_paths. [:return:] list of .path.Path that might be word_underline_paths """ if not UNITTESTING: bar = Bar('mark words that intersect with deletion paths', max=len(page.words)) for word in page.words: not bool(UNITTESTING) and word = mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) for part_word in word.word_parts: part_word = mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) word.partition_according_to_deletion() not bool(UNITTESTING) and bar.finish() # return those paths in deletion_paths that are not in page.word_deletion_paths return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ] def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks word if it intersects with deletion paths as deleted and adds these paths to word_deletion_paths. 
[:return:] word """ word.deleted = False for transkription_position in word.transkription_positions: word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) intersecting_paths = [ deletion_path for deletion_path in deletion_paths\ if do_paths_intersect_saveMode(deletion_path, word_path) ] if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number: relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ] #print(word.line_number, word_path.path.bbox(), [ path.path.bbox() for path in relevant_paths]) if len(intersecting_paths) > 0: #print(f'{word.line_number}: {}, {word.text}: {intersecting_paths}') transkription_position.deleted = True transkription_position._deletion_paths += intersecting_paths for deletion_path in intersecting_paths: if deletion_path.parent_path is not None: deletion_path = deletion_path.parent_path if deletion_path not in page.word_deletion_paths: deletion_path.tag = Path.WORD_DELETION_PATH_TAG deletion_path.attach_object_to_tree(page.page_tree) page.word_deletion_paths.append(deletion_path) return word def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None): """Process words after merging with faksimile word positions. 
""" if page is None and svg_pos_file is None: raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!') if page is None: page = Page(svg_pos_file) if page.source is None or not isfile(page.source): raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file)) if svg_pos_file is None: svg_pos_file = page.page_tree.docinfo.URL if new_words is not None: page.words = sorted(new_words, key=attrgetter('id')) for word_node in page.page_tree.xpath('.//word'): word_node.getparent().remove(word_node) manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\ if manuscript_file is not None\ else None copy_page_to_merged_directory(page, manuscript_file=manuscript_file) transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) update_faksimile_line_positions(page) status = STATUS_MERGED_OK page.update_styles(manuscript=manuscript, partition_according_to_styles=True) save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) categorize_paths(page, transkription_field=transkription_field) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('default') try: find_special_words(page, transkription_field=transkription_field) + save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) categorize_footnotes(page) + save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) extract_line_continuations(page, warning_message=WARNING_LINE_CONTINUATION) except Exception: warnings.warn(WARNING_FOOTNOTES_ERROR) status = process_warnings4status(w, [ WARNING_FOOTNOTES_ERROR, WARNING_LINE_CONTINUATION ], status, STATUS_POSTMERGED_OK) save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) def 
process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list: """Process word boxes: partition words according to word boxes. [:return:] a list of paths that are not boxes """ MAX_HEIGHT_LINES = 1 not_boxes = [] try: if not UNITTESTING: bar = Bar('process word boxes', max=len(page.words)) svg_tree = ET.parse(page.source) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } allpaths_on_margin_field = [] tr_xmin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\ else transkription_field.xmin tr_ymin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\ else transkription_field.ymin if paths is None or attributes is None: paths = [] raw_paths, attributes = svg_to_paths.svg2paths(page.source) for index, raw_path in enumerate(raw_paths): paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page)) for index, mypath in enumerate(paths): path = mypath.path xmin, xmax, ymin, ymax = path.bbox() attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\ or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\ and abs(ymax-ymin) < max_line: allpaths_on_margin_field.append(mypath)#Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) box_line_number_dict = {} for box_path in sorted(box_paths, key=lambda path: path.get_median_y()): line_number = page.get_line_number(box_path.get_median_y(tr_ymin=tr_ymin)) if line_number > 0: if line_number not in box_line_number_dict.keys(): box_line_number_dict.update({ line_number: [ box_path ]}) else: box_line_number_dict.get(line_number).append(box_path) boxes = [] for line_number in box_line_number_dict.keys(): box_paths_on_line = sorted(box_line_number_dict[line_number], 
key=lambda path: path.get_x()) margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\ if page.get_line_number(margin_box.get_median_y(tr_ymin=tr_ymin)) == line_number ],\ key=lambda path: path.get_x()) threshold = 3 if line_number % 2 == 0 else 1.5 if len(margin_boxes_on_line) > 0: for box_path in box_paths_on_line: #print(line_number, box_path.path.d(), len(margin_boxes_on_line)) box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\ namespaces=namespaces, threshold=threshold) if box is not None: boxes.append(box) else: not_boxes += box_paths_on_line - if len(boxes) > 0: + if len(boxes) > 0 and len(page.words) > 0: print(len(boxes)) - for word in page.words: - word.process_boxes(boxes, tr_xmin=tr_xmin, tr_ymin=tr_ymin) - word.create_correction_history(page) - if not bool(UNITTESTING): - - elif word.earlier_version is not None: - #print(f'{word.text} -> {word.earlier_version.text}') - if word.earlier_version.earlier_version is not None: - print(f'{word.earlier_version.earlier_version.text}') + startIndex = 0 + steps = round(len(page.words)/4) if not bool(UNITTESTING) else len(page.words) + while startIndex+steps <= len(page.words): + for word in page.words[startIndex:startIndex+steps]: + word.process_boxes(boxes, tr_xmin=tr_xmin, tr_ymin=tr_ymin) + word.create_correction_history(page) + if not bool(UNITTESTING): + + elif word.earlier_version is not None: + #print(f'{word.text} -> {word.earlier_version.text}') + if word.earlier_version.earlier_version is not None: + print(f'{word.earlier_version.earlier_version.text}') + save_page(page, page.page_tree.docinfo.URL) + page = Page.create_cls(page.page_tree.docinfo.URL) + startIndex += steps not bool(UNITTESTING) and bar.finish() except Exception as e: print(e) return not_boxes def reset_page(page): """Reset all words that have word_parts in order to run the script a second time. 
""" svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) first_merge_version = svg_pos_file.parent / MERGED_DIR / if first_merge_version.exists(): page = Page(str(first_merge_version)) else: word_with_wordparts = [ word for word in page.words if len(word.word_parts) > 0 ] word_with_wordparts += [ word for word in page.words if word.earlier_version is not None ] page_changed = False if len(word_with_wordparts) > 0: for word in word_with_wordparts: word.undo_partitioning() update_transkription_position_ids(word) page_changed = True no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if len(no_line_numbers) > 0: for word in no_line_numbers: if len(word.transkription_positions) > 0: word.line_number = page.get_line_number((word.transkription_positions[0].top+word.transkription_positions[0].bottom)/2) else: msg = f'Word {} {word.text} has no transkription_position!' warnings.warn(msg) page_changed = True if page_changed: page.update_and_attach_words2tree() def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None): """Save page to target_file and update status of file. 
""" page.update_and_attach_words2tree() if not UNITTESTING: if target_svg_pos_file is None: target_svg_pos_file = svg_pos_file if status is not None: update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status) write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def update_faksimile_line_positions(page): """Update faksimile_positions of the lines """ num_lines = len(page.line_numbers) ymin = page.text_field.ymin\ if page.text_field is not None\ else 0.0 for line_number in page.line_numbers: if len([ word.faksimile_positions[0] for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == ]) > 0: line_number.faksimile_inner_top = min([ word.faksimile_positions[0].top for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == ]) line_number.faksimile_inner_bottom = max([ word.faksimile_positions[0].bottom for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == ]) if % 2 == 0: line_number.faksimile_outer_top = line_number.faksimile_inner_top - ymin line_number.faksimile_outer_bottom = line_number.faksimile_inner_bottom - ymin for index, line_number in enumerate(page.line_numbers): if line_number.faksimile_inner_bottom == 0.0\ or line_number.faksimile_inner_bottom < line_number.faksimile_inner_top: if index == 0 and num_lines > 1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].top elif index == num_lines-1 and page.text_field is not None: line_number.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3) elif index > 0 and index < num_lines-1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].faksimile_inner_top\ if page.line_numbers[index+1].faksimile_inner_top > page.line_numbers[index-1].faksimile_inner_bottom\ else page.line_numbers[index-1].faksimile_inner_bottom line_number.attach_object_to_tree(page.page_tree) def 
update_writing_process_ids(page): """Update the writing_process_ids of the words and split accordingly. """ for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to process words after they have been merged with faksimile data. svgscripts/ [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help -i|--include-missing-line-number run script on files that contain words without line numbers -r|--rerun rerun script on a svg_pos_file that has already been processed :return: exit code (int) """ status_not_contain = STATUS_POSTMERGED_OK include_missing_line_number = False try: opts, args = getopt.getopt(argv, "hir", ["help", "include-missing-line-number", "rerun" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-i', '--include-missing-line-number'): include_missing_line_number = True elif opt in ('-r', '--rerun'): status_not_contain = '' if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain): reset_page(page) no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if not include_missing_line_number and len(no_line_numbers) > 0: not UNITTESTING and print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!') for word in no_line_numbers: not UNITTESTING and print(f'Word {}: {word.text}') else: back_up(page, page.xml_file) not UNITTESTING and print(Fore.CYAN + 
f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL) post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/ =================================================================== --- svgscripts/ (revision 108) +++ svgscripts/ (revision 109) @@ -1,277 +1,282 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style import getopt import lxml.etree as ET import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path as PathlibPath from import Bar import re import shutil import sys import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.archival_manuscript import ArchivalManuscriptUnity from import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.atypical_writing import AtypicalWriting from datatypes.clarification import Clarification from datatypes.editor_comment import EditorComment from datatypes.editor_correction import EditorCorrection from datatypes.footnotes import extract_footnotes from datatypes.line_continuation import LineContinuation from datatypes.standoff_tag import StandoffTag from datatypes.text import Text from datatypes.text_connection_mark import TextConnectionMark from datatypes.uncertain_decipherment import UncertainDecipherment from util import back_up from process_files import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False ATYPICAL_GROUP = re.compile(r'(.*:.*]\s*)(¿)(.*)') CLARIFICATION_GROUP = re.compile(r'(.*:.*]\s*)(Vk)(.*)') CONTINUATION_GROUP = re.compile(r'(.*:\s*)(Fortsetzung\s*)') COMMENT_GROUP = re.compile(r'(.*:.*])') EDITOR_CORRECTION_GROUP = re.compile(r'(.*:.*]\s*)(>[?]*)(.*)') LINE_REFERENCE_GROUP = re.compile(r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)') LINE_REFERENCE_GROUP_START_INDEX = 1 LINE_REFERENCE_GROUP_MID_INDEX = 2 LINE_REFERENCE_GROUP_END_INDEX = 3 LINE_COMMENT_GROUP = re.compile(r'(.*\d+:)') UNCERTAINTY_WORD_GROUP = 
re.compile(r'(.*:.*]\s*)([>]*\?)(.*)') UNCERTAINTY_EDITOR_GROUP = re.compile(r'(.*)(\?)') WORD_REFERENCE_GROUP = re.compile(r'(.*[0-9]+:\s*)(.*)(].*)') DEBUG = False def categorize_footnotes(page, footnotes=None, debug=False, skip_after=-1.0, find_content=False): """Categorize footnotes. """ DEBUG = debug if footnotes is None: footnotes = extract_footnotes(page, skip_after=skip_after) for footnote in footnotes: line_match = re.match(LINE_REFERENCE_GROUP, footnote.content) if line_match is not None: _process_line_match(page, footnote, line_match) else: warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>') if find_content and len(page.text_connection_marks) > 0: TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes) page.update_and_attach_words2tree() for line in page.lines: line.attach_object_to_tree(page.page_tree) DEBUG = False if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def _is_uncertain(footnote) -> bool: """Return whether footnote contains sign for uncertainty. """ uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content) return (uncertain_match is not None\ and len([ markup for markup in footnote.standoff_markups\ if markup.css_string.endswith('italic;')\ and uncertain_match.end() >= markup.startIndex\ and uncertain_match.end() <= markup.endIndex ]) > 0) def _process_line_match(page, footnote, line_match): """Process footnote if reference to a line matches. 
""" word_match = re.match(WORD_REFERENCE_GROUP, footnote.content) end_line_number = int( lines = [] if is not None: if is not None: line_ids = [ int(line_id) for line_id in\'/')\ if line_id != '' ] + [ end_line_number ] lines = [ line for line in page.lines if in line_ids ] else: start_line_number = int([0:-1]) lines = [ line for line in page.lines if >= start_line_number and <= end_line_number ] else: lines = [ line for line in page.lines if == end_line_number ] if word_match is not None: - _process_word_match(page, footnote, line_match,, end_line_number) + _process_word_match(page.words, footnote, line_match,, end_line_number) elif len(lines) > 0: uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content) for line in lines: _process_line_reference(page, footnote, line, _is_uncertain(footnote)) else: warnings.warn(f'Footnote refers to missing line {line_number}: {footnote}') def _process_line_reference(page, footnote, line, is_uncertain): """Process footnote if there is a line reference. 
""" continuation_match = re.match(CONTINUATION_GROUP, footnote.content) if continuation_match is not None: reference_string = footnote.content[continuation_match.end():] if is_uncertain: reference_string = reference_string[:-1] line.editor_comments.append(LineContinuation.create_cls(reference_string=reference_string, is_uncertain=is_uncertain)) else: comment_match = re.match(LINE_COMMENT_GROUP, footnote.content) if comment_match is not None: is_uncertain = _is_uncertain(footnote) comment = footnote.content[comment_match.end():-1].strip()\ if is_uncertain\ else footnote.content[comment_match.end():].strip() line.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain)) else: warnings.warn(f'Unknown editor comment for line "{}": <{footnote}>') -def _process_word_match(page, footnote, line_match, word_text, line_number, parent_word_composition=None): +def _process_word_match(words, footnote, line_match, word_text, line_number, parent_word_composition=None): """Process footnote if there is a word reference. 
""" - referred_words = [ word for word in page.words\ + referred_words = [ word for word in words\ if word.line_number == line_number\ and (word.text == word_text\ or re.match(rf'\W*{word_text}\W', word.text)\ or word.edited_text == word_text) ] - referred_word_parts = [ word.word_parts for word in page.words\ + referred_word_parts = [ word.word_parts for word in words\ if word.line_number == line_number\ and len(word.word_parts) > 0\ and word_text in [ wp.text for wp in word.word_parts ] ] - overwritten_word_matches = [ word for word in page.words\ + overwritten_word_matches = [ word for word in words\ if word.line_number == line_number\ and len(word.word_parts) > 0\ and len([word_part for word_part in word.word_parts\ if word_part.overwrites_word is not None\ and word_part.overwrites_word.text == word_text]) > 0] if len(referred_words) > 0\ or len(overwritten_word_matches) > 0\ or len(referred_word_parts) > 0: word = None if len(referred_words) == 1: word = referred_words[0] elif len(overwritten_word_matches) > 0: word = [ word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts\ if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text][0] elif len(referred_word_parts) > 0: word = [ word_part for word_part in referred_word_parts[0] if word_part.text == word_text ][0] else: word = [ better_word for better_word in referred_words if better_word.text == word_text][0] atypical_match = re.match(ATYPICAL_GROUP, footnote.content) correction_match = re.match(EDITOR_CORRECTION_GROUP, footnote.content) clarification_match = re.match(CLARIFICATION_GROUP, footnote.content) is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, footnote.content) is not None if correction_match is not None: correction = word.editor_comment = EditorCorrection(correction_text=correction, is_uncertain=is_uncertain) if not is_uncertain: word.edited_text = correction elif clarification_match is not None: word.editor_comment = 
Clarification(text=footnote.extract_part(word_text, css_filter='bold;')) elif atypical_match is not None: text = footnote.extract_part(word_text, css_filter='bold;')\ if footnote.markup_contains_css_filter('bold;')\ else None word.editor_comment = AtypicalWriting(text=text) elif is_uncertain: word.editor_comment = UncertainDecipherment() else: comment_match = re.match(COMMENT_GROUP, footnote.content) if comment_match is not None: is_uncertain = _is_uncertain(footnote) comment = footnote.content[comment_match.end():-1].strip()\ if is_uncertain\ else footnote.content[comment_match.end():].strip() word.editor_comment = EditorComment(comment=comment, is_uncertain=is_uncertain) else: warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>') elif re.match(r'.*\s.*', word_text): for word_part in word_text.split(' '): - _process_word_match(page, footnote, line_match, word_part, line_number, parent_word_composition=word_text) + _process_word_match(words, footnote, line_match, word_part, line_number, parent_word_composition=word_text) + elif len([word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]) > 0: + new_words = [] + for word in [word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]: + new_words += word.word_parts + _process_word_match(new_words, footnote, line_match, word_text, line_number) else: warnings.warn(f'No word found with text "{word_text}" on line {line_number}: <{footnote}>') def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to process the footnotes of a page. svgscripts/ [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. 
OPTIONS: -h|--help show help -s|--skip-until=left skip all nodes.get('X') < left :return: exit code (int) """ skip_after=-1.0 try: opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until=" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-s', '--skip-until'): skip_after = float(arg) if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK): if not UNITTESTING: print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL) back_up(page, page.xml_file) categorize_footnotes(page, skip_after=skip_after, find_content=True) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/ =================================================================== --- svgscripts/ (revision 108) +++ svgscripts/ (revision 109) @@ -1,755 +1,760 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert the word positions to HTML for testing purposes. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import cairosvg import getopt import json from lxml.html import builder as E from lxml.html import open_in_browser import lxml from pathlib import Path as PathLibPath from os import sep, listdir, mkdir, path, remove from os.path import exists, isfile, isdir, dirname import re import sys from svgpathtools import svg_to_paths import xml.etree.ElementTree as ET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.matrix import Matrix from import Page from datatypes.page_creator import PageCreator from datatypes.transkriptionField import TranskriptionField from datatypes.text_field import TextField from datatypes.writing_process import WritingProcess from datatypes.word import Word sys.path.append('shared_util') from main_util import extract_paths_on_tf, get_paths_near_position __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" EXIST_DB = '' LOCAL_SERVER = 'http://localhost:8000/' class Converter: """The converter super class. """ def __init__(self, page, non_testing=True, show_word_insertion_mark=False): = page self.non_testing = non_testing self.show_word_insertion_mark = show_word_insertion_mark def _get_transkription_positions(self, transkription_positions, stage_version=''): """Returns the transkription_positions of the indicated stage_version. 
""" convertable_transkription_positions = transkription_positions if stage_version != '': convertable_transkription_positions = [] if re.match(r'^\d$', stage_version): writing_process_id = int(stage_version) for transkription_position in transkription_positions: if transkription_position.writing_process_id == writing_process_id: convertable_transkription_positions.append(transkription_position) elif re.match(r'^\d\+$', stage_version): version_range = [ *range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)) ] for transkription_position in transkription_positions: if transkription_position.writing_process_id in version_range: convertable_transkription_positions.append(transkription_position) elif re.match(r'^\d\-\d$', stage_version): start_stop = [ int(i) for i in re.split(r'-', stage_version) ] version_range = [ *range(start_stop[0], start_stop[1]+1) ] for transkription_position in transkription_positions: if transkription_position.writing_process_id in version_range: convertable_transkription_positions.append(transkription_position) return convertable_transkription_positions def _get_words(self, words, highlighted_words=None): """Return the words that will be hightlighted. """ return highlighted_words if highlighted_words is not None else words def convert(self, output_file=None, stage_version='', highlighted_words=None): """Prints all words. 
""" first_word_of_line = None out = sys.stdout if output_file is not None: out = open(output_file, 'w') for word in if first_word_of_line is None or first_word_of_line.line_number != word.line_number: out.write('\n') first_word_of_line = word - if word.line_number % 2 == 0: - out.write(str(word.line_number).zfill(2) + ' ') - else: - out.write(' ') + if output_file is None: + if word.line_number % 2 == 0: + out.write(str(word.line_number).zfill(2) + ' ') + else: + out.write(' ') if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0: - if word.text is not None: + if word.edited_text is not None: + out.write(word.edited_text + ' ') + elif word.text is not None: out.write(word.text + ' ') out.close() return 0 @classmethod def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False, key=''): """Returns a converter of type converter_type. [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None """ cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() } cls_key = converter_type + 'Converter' if bool(cls_dict.get(cls_key)): converter_cls = cls_dict[cls_key] if converter_cls == JSONConverter: return converter_cls(page, non_testing=non_testing, key=key) return converter_cls(page, non_testing, show_word_insertion_mark) else: return Converter(page, non_testing, show_word_insertion_mark) class JSONConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a json file. """ def __init__(self, page, faksimile_page=None, non_testing=True, key=''): Converter.__init__(self, page, non_testing, False) self.faksimile_page = faksimile_page def _add_word_to_list(self, words, word, text, text_field=None, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1, faksimile_positions=None): """Add word to list. 
""" id =\ if parent_id == -1\ else parent_id edited_text = word.edited_text\ if edited_text is None\ else edited_text earlier_version = word.earlier_version\ if earlier_version is None\ else earlier_version overwrites_word = word.overwrites_word\ if overwrites_word is None\ else overwrites_word line_number = word.line_number for tp in word.transkription_positions: tp_id = f'w{}:tp{}'\ if parent_id == -1\ else f'w{parent_id}:w{}:tp{}' if text_field is not None: word_dict = { 'id': id, 'text': text, 'left': tp.left + text_field.left, 'top': +,\ 'width': tp.width, 'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted } if tp.transform is not None: matrix = tp.transform.clone_transformation_matrix() xmin = text_field.left ymin = matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3) matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3) word_dict.update({ 'transform': matrix.toString() }) if tp.left > 0: word_dict.update({ 'left': round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)}) else: word_dict.update({ 'left': 0}) word_dict.update({ 'top': round((tp.height-1.5)*-1, 3)}) else: word_dict = { 'id': id, 'text': text, 'left': tp.left, 'top':, 'width': tp.width,\ 'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted } if tp.transform is not None: word_dict.update({ 'transform': tp.transform.toString() }) if edited_text is not None: word_dict.update({'edited_text': edited_text}) if earlier_version is not None: word_dict.update({'earlier_version': earlier_version.text }) if overwrites_word is not None: word_dict.update({'overwrites_word': overwrites_word.text }) if parent_id > -1: word_dict.update({'part_text': word.text }) if len(word.deletion_paths) > 0: for dp_index, dp in enumerate(word.deletion_paths): if bool(word_dict.get('deletion_path')): word_dict = word_dict.copy() word_dict.update({'deletion_path': dp.d_attribute}) words.append(word_dict) if 
len(word.deletion_paths_near_word) > 0: word_dict.update({'paths_near_word': word.deletion_paths_near_word }) words.append(word_dict) else: words.append(word_dict) if faksimile_positions is not None: faksimile_dict = {} for fp in word.faksimile_positions: self._add_faksimile_to_list(id, line_number, fp, word.deleted, faksimile_positions, text, edited_text=edited_text,\ earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=parent_id, word_text=word.text) for wp in word.word_parts: self._add_word_to_list(words, wp, text, text_field=text_field, edited_text=edited_text,\ earlier_version=earlier_version, overwrites_word=overwrites_word,, faksimile_positions=faksimile_positions) def _add_faksimile_to_list(self, id, line_number, fp, deleted, faksimile_positions, text, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1, word_text='') ->dict: """Create and return a json dictionary. """ faksimile_dict = { 'id': id, 'text': text, 'left': fp.left, 'top':,\ 'width': fp.width, 'height': fp.height, 'line': line_number, 'fp_id':, 'deleted': deleted } if fp.transform is not None: faksimile_dict.update({ 'transform': fp.transform.toString() }) if len(faksimile_dict) > 0: if edited_text is not None: faksimile_dict.update({'edited_text': edited_text}) if earlier_version is not None: faksimile_dict.update({'earlier_version': earlier_version.text }) if overwrites_word is not None: faksimile_dict.update({'overwrites_word': overwrites_word.text }) if parent_id > -1: faksimile_dict.update({'part_text': word_text }) faksimile_positions.append(faksimile_dict) def create_json_dict(self) ->dict: """Create and return a json dictionary. 
""" words = [] faksimile_positions = [] text_field = None if is not None: if is None: text_field = = TranskriptionField( for word in self._add_word_to_list(words, word, word.text, text_field=text_field, faksimile_positions=faksimile_positions) lines = [] faksimile_lines = [] offset = 0 if text_field is None else text_field.ymin svg_image = self.add_object2dict( if self.faksimile_page is not None: if is None: if self.faksimile_page.faksimile_image.text_field is None\ and self.faksimile_page.text_field is not None: self.faksimile_page.faksimile_image.text_field = self.faksimile_page.text_field = self.faksimile_page.faksimile_image for fp in self.faksimile_page.word_positions: if not in [ f_dict.get('fp_id') for f_dict in faksimile_positions ]: self._add_faksimile_to_list(, -1, fp, False, faksimile_positions, fp.text) faksimile_image = self.add_object2dict( if svg_image is not None: svg_image.update({ 'URL': }) svg_image.update({ 'x': }) svg_image.update({ 'y': }) if faksimile_image is not None: + if bool(faksimile_image.get('transform_string')): + faksimile_image.update({ 'transform': faksimile_image.get('transform_string') }) faksimile_image.update({ 'secondaryURL': LOCAL_SERVER + "faksimiles/" + }) faksimile_image.update({ 'x': 0 }) faksimile_image.update({ 'y': 0 }) for line in lines.append({ 'id':, 'number':, 'top': + offset, 'bottom': line.bottom }) faksimile_lines.append({ 'id':, 'number':, 'top': line.faksimile_inner_top, 'bottom': line.faksimile_inner_bottom }) return { 'title':, 'number':, 'words': words, 'svg': svg_image, 'lines': lines,\ 'faksimile': faksimile_image, 'faksimile_positions': faksimile_positions, 'faksimile_lines': faksimile_lines } def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to JSON. 
""" if output_file is None: output_file = 'output.json' json_file = open(output_file, "w+") try: json.dump(self.create_json_dict(), json_file) except Exception: raise Exception('Error in json.dump') json_file.close() return 0 def add_object2dict(self, object_instance): """Add an object to json_dict and generate json data and interfaces. [:return:] json dict or object_instance """ json_dict = {} object_type = type(object_instance) if object_type.__module__ == 'builtins': if object_type != list: return object_instance else: items = [] for item in object_instance: items.append(self.add_object2dict(item)) if len(items) > 0: return items else: return { self.key: [] } semantic_dictionary = object_type.get_semantic_dictionary() for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]: content = object_instance.__dict__.get(key) if content_type == list\ and content is not None\ and len(content) > 0\ and type(content[0]).__module__ != 'builtins': content_list = [] for content_item in content: content_list.append(self.add_object2dict(content_item)) json_dict.update({key: content_list}) elif content_type.__module__ == 'builtins': if content is not None: json_dict.update({key: content}) else: if content is not None and type(content) == list: content_list = [] for content_item in content: content_list.append(self.add_object2dict(content_item)) json_dict.update({key: content_list}) else: if content is not None: json_dict.update({key: self.add_object2dict(content)}) return json_dict class oldJSONConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a json file. 
""" PY2TS_DICT = { float: 'number', int: 'number', bool: 'boolean', str: 'string' } def __init__(self, page, non_testing=True, key=''): Converter.__init__(self, page, non_testing, False) self.key = key self.interface_output_dir = PathLibPath('ts_interfaces') if not self.interface_output_dir.is_dir(): self.interface_output_dir.mkdir() elif len(list(self.interface_output_dir.glob('*.ts'))) > 0: for ts_file in self.interface_output_dir.glob('*.ts'): remove(ts_file) def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to JSON. """ if output_file is None: output_file = 'output.json' class_dict = {} if self.key != '': object_instance = if object_instance is not None: json_dict = self.add_object2dict(object_instance, class_dict) if type(json_dict) == list: json_dict = { self.key : json_dict } else: print(f'Page initialized from {} does not have an object at "{self.key}"!') return 2 else: json_dict = self.add_object2dict(, class_dict) json_file = open(output_file, "w+") try: json.dump(json_dict, json_file) except Exception: raise Exception('Error in json.dump') json_file.close() self.create_imports(class_dict) return 0 def add_object2dict(self, object_instance, class_dict): """Add an object to json_dict and generate json data and interfaces. 
[:return:] json dict or object_instance """ json_dict = {} interface_list = [] object_type = type(object_instance) if object_type.__module__ == 'builtins': if object_type != list: return object_instance else: items = [] for item in object_instance: items.append(self.add_object2dict(item, class_dict)) if len(items) > 0: return { self.key: items } else: return { self.key: 'null' } semantic_dictionary = object_type.get_semantic_dictionary() for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]: content = object_instance.__dict__.get(key) if content_type == list\ and content is not None\ and len(content) > 0\ and type(content[0]).__module__ != 'builtins': content_list = [] for content_item in content: content_list.append(self.add_object2dict(content_item, class_dict)) json_dict.update({key: content_list}) interface_list.append(f'{key}: {type(content[0]).__name__}[];') elif content_type.__module__ == 'builtins': if content_type != list: ts_type = self.PY2TS_DICT[content_type]\ if content_type in self.PY2TS_DICT.keys()\ else 'string' interface_list.append(f'{key}: {ts_type};') json_dict.update({key: content}) else: if content is not None and type(content) == list: interface_list.append(f'{key}: {content_type.__name__}[];') content_list = [] for content_item in content: content_list.append(self.add_object2dict(content_item, class_dict)) json_dict.update({key: content_list}) else: interface_list.append(f'{key}: {content_type.__name__};') if content is not None: json_dict.update({key: self.add_object2dict(content, class_dict)}) if object_type not in class_dict.keys(): class_dict.update({object_type: self.create_interface(object_type.__name__, interface_list)}) return json_dict def create_imports(self, class_dict): """Create an ts interface from a list of key and content_types. 
[:return:] file_name of interface """ ts_file = PathLibPath('ts_imports.ts') file = open(ts_file, "w+") file.write(f'//import all interfaces from {self.interface_output_dir} ' + '\n') for interface_name, path_name in class_dict.items() : file.write('import {' + interface_name.__name__ + '} from \'./' + str(self.interface_output_dir.joinpath(path_name.stem)) + '\';\n') file.close() return ts_file def create_interface(self, class_name, interface_list) -> PathLibPath: """Create an ts interface from a list of key and content_types. [:return:] file_name of interface """ ts_file = self.interface_output_dir.joinpath(PathLibPath(f'{class_name.lower()}.ts')) import_list = [ import_class_name for import_class_name in\ [ import_class_name.split(': ')[1].replace(';','').replace('[]','') for import_class_name in interface_list ]\ if import_class_name not in set(self.PY2TS_DICT.values()) ] file = open(ts_file, "w") for import_class_name in set(import_list): file.write('import {' + import_class_name + '} from \'./' + import_class_name.lower() + '\';\n') file.write(f'export interface {class_name} ' + '{\n') for interace_string in interface_list: file.write(f'\t' + interace_string + '\n') file.write('}') file.close() return ts_file class SVGConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text. """ BG_COLOR = 'yellow' OPACITY = '0.2' def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY): Converter.__init__(self, page, non_testing, show_word_insertion_mark) self.bg_color = bg_color self.opacity = opacity def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to SVG """ title = if( is not None) else 'Test Page' title = '{}, S. 
{}'.format(title, if ( is not None) else title svg_file = if svg_file is None and is not None: svg_file = elif svg_file is None: msg = f'ERROR: xml_source_file {} does neither have a svg_file nor a svg_image!' raise Exception(msg) transkription_field = TranskriptionField(svg_file) if bool(transkription_field.get_svg_attributes('xmlns')): ET.register_namespace('', transkription_field.get_svg_attributes('xmlns')) if bool(transkription_field.get_svg_attributes('xmlns:xlink')): ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink')) svg_tree = ET.parse(svg_file) transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'}) colors = [ 'yellow', 'orange' ] if self.bg_color == self.BG_COLOR else [ self.bg_color ] if highlighted_words is not None: colors = ['yellow'] else: highlighted_words = [] color_index = 0 for word in word_id = 'word_' + str( for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version): transkription_position_id = word_id + '_' + str( color = colors[color_index] if word not in highlighted_words else self.bg_color rect_node = ET.SubElement(transkription_node, 'rect',\ attrib={'id': transkription_position_id, 'x': str(transkription_position.left + transkription_field.xmin),\ 'y': str( + transkription_field.ymin), 'width': str(transkription_position.width),\ 'height': str(transkription_position.height), 'fill': color, 'opacity': self.opacity}) if transkription_position.transform is not None: matrix = transkription_position.transform.clone_transformation_matrix() matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3) matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3) rect_node.set('transform', matrix.toString()) rect_node.set('x', str(round(transkription_position.left - 
transkription_position.transform.matrix[Matrix.XINDEX], 3))) rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3))) ET.SubElement(rect_node, 'title').text = word.text color_index = (color_index + 1) % len(colors) if output_file is not None: svg_tree.write(output_file) return 0 class HTMLConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file. """ CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; } .highlight1 { background-color: pink; opacity: 0.2; } .highlight2 { background-color: red; opacity: 0.2; } .foreign { background-color: blue; opacity: 0.4; } .overwritten { background-color: green; opacity: 0.4; } .word-insertion-mark { background-color: orange; opacity: 0.2; } .deleted { background-color: grey; opacity: 0.2; } """ def __init__(self, page, non_testing=True, show_word_insertion_mark=False): Converter.__init__(self, page, non_testing, show_word_insertion_mark) self.text_field = TextField() def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to HTML """ title = if( is not None) else 'Test Page' title = '{}, S. 
{}'.format(title, if ( is not None) else title if stage_version != '': title = title + ', Schreibstufe: ' + stage_version if is not None: width = height = svg_file = if is not None: self.text_field = print('Textfield found ->adjusting data') elif is not None: svg_file = transkription_field = TranskriptionField(svg_file) width = transkription_field.getWidth() height = transkription_field.getHeight() style_content = ' position: relative; width: {}px; height: {}px; background-image: url("{}"); background-size: {}px {}px '\ .format(width, height, path.abspath(svg_file), width, height) style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS) head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style) transkription = E.DIV(id="transkription") counter = 0 for word in highlight_class = 'highlight' + str(counter)\ if not word.deleted else 'deleted' if highlighted_words is not None\ and word in highlighted_words: highlight_class = 'highlight2' earlier_text = '' if word.earlier_version is None else word.earlier_version.text if earlier_text == '' and len(word.word_parts) > 0: earlier_versions = [ word for word in word.word_parts if word.earlier_version is not None ] earlier_text = earlier_versions[0].text if len(earlier_versions) > 0 else '' if earlier_text != '': word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(, str(word.line_number), earlier_text, word.text) else: word_title = 'id: {}/line: {}\n{}'.format(str(, str(word.line_number), word.text) if word.edited_text is not None: word_title += f'\n>{word.edited_text}' for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version): self._append2transkription(transkription, highlight_class, word_title, transkription_position) if word.overwrites_word is not None: overwritten_title = f'{word.text} overwrites {word.overwrites_word.text}' for overwritten_transkription_position in word.overwrites_word.transkription_positions: 
self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position) for part_word in word.word_parts: highlight_class = 'highlight' + str(counter)\ if not part_word.deleted else 'deleted' for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version): self._append2transkription(transkription, highlight_class, word_title, part_transkription_position) if part_word.overwrites_word is not None: overwritten_title = f'{word.text} overwrites {part_word.overwrites_word.text}' for overwritten_transkription_position in part_word.overwrites_word.transkription_positions: self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position) counter = (counter + 1) % 2 word_insertion_mark_class = 'word-insertion-mark' counter = 0 for mark_foreign_hands in highlight_class = 'foreign' title = 'id: {}/line: {}\n{} {}'.format(str(, str(mark_foreign_hands.line_number),\ mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen) for transkription_position in mark_foreign_hands.transkription_positions: self._append2transkription(transkription, highlight_class, title, transkription_position) if self.show_word_insertion_mark: for word_insertion_mark in wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(, str(word_insertion_mark.line_number)) style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height) link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content) transkription.append(link) html = E.HTML(head,E.BODY(transkription)) bool(self.non_testing) and open_in_browser(html) if output_file is not None: with open(output_file, 'wb') as f: f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8')) f.closed return 0 def 
_append2transkription(self, transkription, highlight_class, title, transkription_position): """Append content to transkription-div. """ style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\ -, transkription_position.left - self.text_field.left, transkription_position.width, transkription_position.height) if transkription_position.transform is not None: style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString()) transform_origin_x = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1\ if (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1 < 0 else 0 style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height) link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content) transkription.append(link) def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR): """Creates a pdf file highlighting some words. """ if not pdf_file_name.endswith('pdf'): pdf_file_name = pdf_file_name + '.pdf' tmp_svg_file = pdf_file_name.replace('.pdf', '.svg') create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page, highlighted_words=highlighted_words,\ svg_file_name=tmp_svg_file, bg_color=bg_color) if isfile(tmp_svg_file): cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name) remove(tmp_svg_file) def create_svg_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, svg_file_name='output.svg', bg_color=SVGConverter.BG_COLOR): """Creates a svg file highlighting some words. 
""" if page is None and xml_source_file is not None: page = Page(xml_source_file) converter = SVGConverter(page, bg_color=bg_color) if not svg_file_name.endswith('svg'): svg_file_name = svg_file_name + '.svg' converter.convert(output_file=svg_file_name, highlighted_words=highlighted_words) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes. svgscripts/ OPTIONS OPTIONS: -h|--help: show help -H|--HTML [default] convert to HTML test file -k|--key=key option for json converter: only convert object == page.__dict__[key] -o|--output=outputFile save output to file outputFile -P|--PDF convert to PDF test file -S|--SVG convert to SVG test file -s|--svg=svgFile: svg web file -T|--TEXT convert to TEXT output -t|--text=text highlight word -w|--word-insertion-mark show word insertion mark on HTML -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. 
} -x|--testing execute in test mode, do not write to file or open browser :return: exit code (int) """ convert_to_type = None key = '' non_testing = True output_file = None page = None show_word_insertion_mark = False stage_version = '' svg_file = None text = None try: opts, args = getopt.getopt(argv, "hk:t:HPSTws:o:v:x", ["help", "key=", "text=", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version=", "testing"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 elif opt in ('-v', '--version'): if re.match(r'^(\d|\d\+|\d\-\d)$', arg): stage_version = arg else: raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg)) elif opt in ('-w', '--word-insertion-mark'): show_word_insertion_mark = True elif opt in ('-P', '--PDF'): convert_to_type = 'PDF' elif opt in ('-S', '--SVG'): convert_to_type = 'SVG' elif opt in ('-T', '--TEXT'): convert_to_type = 'TEXT' elif opt in ('-H', '--HTML'): convert_to_type = 'HTML' elif opt in ('-x', '--testing'): non_testing = False elif opt in ('-s', '--svg'): svg_file = arg elif opt in ('-o', '--output'): output_file = arg elif opt in ('-k', '--key'): key = arg elif opt in ('-t', '--text'): text = arg print(arg) if len(args) < 1: usage() return 2 if convert_to_type is None: if output_file is not None and len(re.split(r'\.', output_file)) > 1: output_file_part_list = re.split(r'\.', output_file) convert_to_type = output_file_part_list[len(output_file_part_list)-1].upper() else: convert_to_type = 'HTML' exit_code = 0 for word_position_file in args: if not isfile(word_position_file): print("'{}' does not exist!".format(word_position_file)) return 2 if convert_to_type == 'PDF': if output_file is None: output_file = 'output.pdf' highlighted_words = None if text is not None: page = Page(word_position_file) highlighted_words = [ word for word in page.words if word.text == text ] 
create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file, highlighted_words=highlighted_words) else: if svg_file is not None: if isfile(svg_file): page = PageCreator(word_position_file, svg_file=svg_file) else: print("'{}' does not exist!".format(word_position_file)) return 2 else: page = Page(word_position_file) if page.svg_file is None: print('Please specify a svg file!') usage() return 2 highlighted_words = None if text is not None: highlighted_words = [ word for word in page.words if word.text == text ] print([ (, word.text) for word in highlighted_words ]) converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark, key=key) exit_code = converter.convert(output_file=output_file, stage_version=stage_version, highlighted_words=highlighted_words) return exit_code if __name__ == "__main__": sys.exit(main(sys.argv[1:]))