Index: tests_svgscripts/test_fix_old_data.py
===================================================================
--- tests_svgscripts/test_fix_old_data.py	(revision 101)
+++ tests_svgscripts/test_fix_old_data.py	(revision 102)
@@ -1,49 +0,0 @@
-import lxml.etree as ET
-from os import sep, path, remove
-from os.path import isdir, isfile, dirname
-import shutil
-import sys
-import tempfile
-import unittest
-import warnings
-
-sys.path.append('svgscripts')
-
-import fix_old_data
-from datatypes.faksimile import FaksimilePage
-from datatypes.mark_foreign_hands import MarkForeignHands
-from datatypes.page import Page
-from datatypes.path import Path
-from datatypes.positional_word_part import PositionalWordPart
-from datatypes.text_connection_mark import TextConnectionMark
-from datatypes.transkriptionField import TranskriptionField
-from datatypes.word import Word
-from datatypes.word_position import WordPosition
-
-
-class TestFixFaksimile(unittest.TestCase):
-    def setUp(self):
-        fix_old_data.UNITTESTING = True
-        DATADIR = path.dirname(__file__) + sep + 'test_data'
-        self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml'
-
-    def test_fix_faksimile(self):
-        page = Page(self.xml_file)
-        fp = page.words[0].faksimile_positions[0]
-        left = fp.left
-        top = fp.top
-        self.assertEqual(fix_old_data.fix_faksimile_positions(page), True)
-        self.assertEqual(fp.left, left + page.text_field.xmin)
-        self.assertEqual(fp.top, top + page.text_field.ymin)
-
-    def test_fix_faksimile_line_position(self):
-        page = Page(self.xml_file)
-        fix_old_data.fix_faksimile_line_position(page)
-        for line_number in page.line_numbers:
-            #print(line_number.id)
-            self.assertTrue(line_number.faksimile_inner_top < line_number.faksimile_inner_bottom)
-
-
-
-if __name__ == "__main__":
-    unittest.main()
Index: tests_svgscripts/test_extract_line_continuation.py
===================================================================
--- tests_svgscripts/test_extract_line_continuation.py	(revision 101)
+++ tests_svgscripts/test_extract_line_continuation.py	(revision 102)
@@ -1,52 +1,52 @@
 import unittest
 from os import sep, path, remove
 from os.path import isfile
 import lxml.etree as ET
 import warnings
 import sys
 
 sys.path.append('svgscripts')
 
 import extract_line_continuation
 from datatypes.page import Page
 from datatypes.transkriptionField import TranskriptionField
 
 
 class TestExtractLineContinuation(unittest.TestCase):
     def setUp(self):
         extract_line_continuation.UNITTESTING = True
         DATADIR = path.dirname(__file__) + sep + 'test_data'
         self.w_I_8_125_svg = DATADIR + sep + 'W_I_8_neu_125-01.svg'
         self.w_I_8_125_xml = DATADIR + sep + 'W_I_8_new_page125.xml'
 
     def test_get_arrow_y(self):
         arrow = ET.Element('text')
         arrow.set('transform', 'matrix(1 0 0 1 10 20)')
         self.assertEqual(extract_line_continuation._get_arrow_y(arrow), 20.0)
         tspan = ET.SubElement(arrow, 'tspan')
         tspan.set('y', '10.0')
         self.assertEqual(extract_line_continuation._get_arrow_y(tspan), 30.0)
 
     def test_get_line_of_arrow(self):
         svg_tree = ET.parse(self.w_I_8_125_svg)
         page = Page(self.w_I_8_125_xml)
         transkription_field = TranskriptionField(self.w_I_8_125_svg)
         arrows = extract_line_continuation._extract_arrow_nodes(svg_tree, 'st7')
-        line = extract_line_continuation._get_line_of_arrow(arrows[0], page, transkription_field)
+        line = extract_line_continuation._get_line_of_arrow(arrows[0], page, transkription_field.ymin)
         self.assertEqual(line.id, 15)
 
     def test_extract_line_continuations(self):
         page = Page(self.w_I_8_125_xml)
         extract_line_continuation.extract_line_continuations(page, svg_file=self.w_I_8_125_svg)
         lines_with_continuations = [ line for line in page.lines if len(line.editor_comments) > 0]
         self.assertEqual(len(lines_with_continuations), 2)
         page = Page('xml/N_VII_1_page029.xml')
         extract_line_continuation.extract_line_continuations(page)
         lines_with_continuations = [ line for line in page.lines if len(line.editor_comments) > 0]
         #print(lines_with_continuations)
         self.assertEqual(len(lines_with_continuations), 1)
         page = Page('xml/Mp_XV_page75v.xml')
         extract_line_continuation.extract_line_continuations(page)
         lines_with_continuations = [ line for line in page.lines if len(line.editor_comments) > 0]
         self.assertTrue(len(lines_with_continuations) > 0)
 
 
 if __name__ == "__main__":
     unittest.main()
Index: tests_svgscripts/test_data/Mp_XIV_1_page420.xml
===================================================================
--- tests_svgscripts/test_data/Mp_XIV_1_page420.xml	(revision 101)
+++ tests_svgscripts/test_data/Mp_XIV_1_page420.xml	(revision 102)
@@ -1,3310 +1,3295 @@
 [XML test data; surviving metadata: svgWordPosition, 2019-04-16 21:35:00, 2019-04-24 17:03:54, 2019-04-24 17:09:27; 15 element lines removed — markup not recoverable from this extract]
Index: tests_svgscripts/test_data/Mp_XIV_page420.xml
===================================================================
--- tests_svgscripts/test_data/Mp_XIV_page420.xml	(revision 0)
+++ tests_svgscripts/test_data/Mp_XIV_page420.xml	(revision 102)
@@ -0,0 +1,8354 @@
+[XML test data, 8354 lines added — markup not recoverable from this extract; surviving metadata: svgWordPosition; timestamps 2020-08-31 15:57:43 through 2020-10-21 16:28:24; status notes "tp errors", "box errors"; surviving word texts: Pessimismus, eine, Seele, Buchs, Medizin, lehren, nicht, Anrecht, worden, antipes-/simistischer, "Vorrede zu Menschliches II."]
Index: tests_svgscripts/test_page_creator.py
===================================================================
--- tests_svgscripts/test_page_creator.py	(revision 101)
+++ tests_svgscripts/test_page_creator.py	(revision 102)
@@ -1,69 +1,72 @@
 import unittest
 from os import sep, path
 from os.path import isdir, isfile, dirname, basename
 import lxml.etree as ET
 import sys
 import tempfile
 
 sys.path.append('svgscripts')
 dir_changed = False
 if not isdir('datatypes'):
     sys.path.append(dirname(sys.path[0]))
     dir_changed = True
 
+from datatypes.image import SVGImage
 from datatypes.lineNumber import LineNumber
 from datatypes.mark_foreign_hands import MarkForeignHands
 from datatypes.page_creator import PageCreator
 from datatypes.page import STATUS_MERGED_OK, STATUS_POSTMERGED_OK
 from datatypes.path import Path
 from datatypes.text_connection_mark import TextConnectionMark
+from datatypes.text_field import TextField
 from datatypes.transkriptionField import TranskriptionField
 from datatypes.writing_process import WritingProcess
 from datatypes.word import Word
 
 
 class TestPage(unittest.TestCase):
     def setUp(self):
         DATADIR = dirname(__file__) + sep + 'test_data'
         if not isdir(DATADIR):
             DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
         self.test_file = DATADIR + sep + 'test.xml'
         self.test_svg_file = DATADIR + sep + 'test421.svg'
         self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
         self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml'
         self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
         self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page001.xml'
         self.test_manuscript = DATADIR + sep + 'N_VII_1.xml'
         self.svg_file125 = DATADIR + sep + 'W_I_8_page125_web.svg'
         self.xml_file125 = DATADIR + sep + 'W_I_8_page125.xml'
         self.tmp_dir = tempfile.mkdtemp()
 
     def test_init(self):
         xml_target_file = self.tmp_dir + sep + 'asdf.xml'
         page = PageCreator(xml_target_file, svg_file=self.svg_file125)
         self.assertEqual(page.svg_image.file_name, self.svg_file125)
-        page = PageCreator(self.xml_file125, svg_file=self.svg_file125)
+        page = PageCreator(self.xml_file125, svg_file=self.svg_file125, svg_text_field=TextField(width=10, height=10))
         self.assertEqual(page.svg_image.file_name, self.svg_file125)
+        self.assertEqual(page.svg_image.text_field.width, 10)
         self.assertEqual(page.title, 'W I 8')
         self.assertEqual(page.number, '125')
 
     def test_init_line_numbers(self):
         page = PageCreator(self.test_file)
         line_numbers = [ LineNumber(id=2, top=20, bottom=40), LineNumber(id=4, top=50, bottom=60), LineNumber(id=6, top=70, bottom=90) ]
         page.init_line_numbers(line_numbers, 122.345)
         self.assertEqual(len(page.line_numbers), 7)
         self.assertEqual(page.line_numbers[0].id, 1)
         self.assertEqual(page.line_numbers[6].id, 7)
         self.assertEqual(page.line_numbers[6].top, 91)
         self.assertEqual(page.line_numbers[6].bottom, 122.345)
         self.assertEqual(page.get_line_number(122), 7)
         self.assertEqual(page.get_line_number(92), 7)
         self.assertEqual(page.get_line_number(22), 2)
 
     def test_create_writing_process(self):
         page = PageCreator(self.test_file)
         page.create_writing_processes_and_attach2tree()
         self.assertEqual(len(page.writing_processes), 3)
 
 
 if __name__ == "__main__":
     unittest.main()
Index: tests_svgscripts/test_show_highlighted_svg_file.py
===================================================================
--- tests_svgscripts/test_show_highlighted_svg_file.py	(revision 101)
+++ tests_svgscripts/test_show_highlighted_svg_file.py	(revision 102)
@@ -1,30 +1,31 @@
 import unittest
 from os import sep, path, remove, listdir
 from os.path import isdir, isfile, dirname, basename
 import shutil
 import sys
 import lxml.etree as ET
 import sys
 import tempfile
 import warnings
 
 sys.path.append('svgscripts')
 import show_highlighted_svg_file
 
 
 class TestCopy(unittest.TestCase):
     def setUp(self):
         show_highlighted_svg_file.UNITTESTING = True
         DATADIR = path.dirname(__file__) + sep + 'test_data'
         self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
         self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
         self.tmp_dir = tempfile.mkdtemp()
 
+    @unittest.skip('opens external program "inkscape"')
     def test_main(self):
         show_highlighted_svg_file.main([self.faksimile_file, 'Muster'])
 
     def tearDown(self):
         shutil.rmtree(self.tmp_dir, ignore_errors=True)
 
 
 if __name__ == "__main__":
     unittest.main()
Index: tests_svgscripts/test_lineNumber.py
===================================================================
--- tests_svgscripts/test_lineNumber.py	(revision 101)
+++ tests_svgscripts/test_lineNumber.py	(revision 102)
@@ -1,117 +1,117 @@
 import unittest
 from os import sep, path
 from os.path import isdir, dirname
 import lxml.etree as ET
 import sys
 import sys
 
 sys.path.append('svgscripts')
 dir_changed = False
 if not isdir('datatypes'):
     sys.path.append(dirname(sys.path[0]))
     dir_changed = True
 
 from datatypes.lineNumber import LineNumber, get_bottoms
 from datatypes.transkriptionField import TranskriptionField
 
 
 class TestLineNumber(unittest.TestCase):
     def setUp(self):
         DATADIR = dirname(__file__) + sep + 'test_data'
         if not isdir(DATADIR):
             DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
         self.test_target_file = DATADIR + sep + 'test.xml'
         self.test_source_file = DATADIR + sep + 'test_ai.svg'
         self.test_file = DATADIR + sep + 'Mp_XIV_1_mytest_421.svg'
         self.small_lines = f'{DATADIR}{sep}pdfsvg{sep}csv{sep}45.svg'
         self.id = 24
 
     def test_init(self):
         lnr = LineNumber(id=self.id)
         self.assertEqual(lnr.id, self.id)
         svg_tree = ET.parse(self.test_source_file)
         tf = TranskriptionField(self.test_source_file)
         raw_text_node = svg_tree.getroot().xpath('.//text[@transform="%s"]' % 'matrix(1 0 0 1 180.8755 386.7129)')[0]
         lnr = LineNumber(raw_text_node=raw_text_node, transkription_field=tf)
         self.assertEqual(lnr.id, self.id)
         self.assertEqual(lnr.bottom, 311.8129)
         tree = ET.ElementTree(ET.Element('svg'))
         node = ET.SubElement(tree.getroot(), 'text', attrib={'transform': 'matrix(1 0 0 1 180.8755 386.7129)'})
         node.text = str(self.id)
         lnr = LineNumber(raw_text_node=node, transkription_field=tf)
         self.assertEqual(lnr.id, self.id)
         self.assertEqual(lnr.bottom, 311.8129)
         node = ET.SubElement(tree.getroot(), 'text', attrib={'transform': 'matrix(1 0 0 1 180.8755 386.7129)'})
         subnode = ET.SubElement(node, 'tspan')
         subnode.text = '1'
         subnode = ET.SubElement(node, 'tspan')
         subnode.text = '0'
         lnr = LineNumber(raw_text_node=node, transkription_field=tf)
         self.assertEqual(lnr.id, 10)
         self.assertEqual(lnr.bottom, 311.8129)
 
     def test_init_from_xml(self):
         xml_tree = ET.parse(self.test_target_file)
         line_numbers = [ LineNumber(xml_text_node=node) for node in xml_tree.getroot().xpath('.//line-number') ]
         self.assertEqual(len(line_numbers), 49)
         self.assertEqual(line_numbers[0].id, 1)
         self.assertEqual(line_numbers[48].id, 49)
 
     def test_attach_object_to_tree(self):
         empty_tree = ET.ElementTree(ET.Element('page'))
         lnr = LineNumber(id=self.id)
         lnr.attach_object_to_tree(empty_tree)
         lnr_nodes = empty_tree.getroot().xpath('//' + LineNumber.XML_TAG + '[@id="%s"]' % self.id)
         self.assertEqual(len(lnr_nodes), 1)
         self.assertEqual(lnr_nodes[0].get('id'), str(self.id))
 
     def test_extract_line_numbers(self):
         svg_tree = ET.parse(self.test_file)
         tf = TranskriptionField(self.test_file)
-        line_numbers = LineNumber.extract_line_numbers(svg_tree, tf)
+        line_numbers = LineNumber.extract_line_numbers(svg_tree, tf, set_to_text_field_zero=False)
         self.assertEqual(line_numbers[0].id, 2)
         self.assertEqual(len(line_numbers), 24)
-        self.assertEqual(line_numbers[0].top, 45.163)
+        self.assertEqual(line_numbers[0].top, 45.163 + tf.ymin)
         svg_tree = ET.parse(self.small_lines)
         tf = TranskriptionField(self.small_lines)
-        line_numbers = LineNumber.extract_line_numbers(svg_tree, tf)
+        line_numbers = LineNumber.extract_line_numbers(svg_tree, tf, set_to_text_field_zero=False)
 
     def test_IS_A_LINE_NUMBER(self):
         tree = ET.ElementTree(ET.Element('svg'))
         node = ET.SubElement(tree.getroot(), 'text')
         node.text = '2'
         self.assertEqual(LineNumber.IS_A_LINE_NUMBER(node), True)
         node = ET.SubElement(tree.getroot(), 'text')
         subnode = ET.SubElement(node, 'tspan')
         subnode.text = '1'
         subnode = ET.SubElement(node, 'tspan')
         subnode.text = '0'
         self.assertEqual(LineNumber.IS_A_LINE_NUMBER(node), True)
         subnode.text = 'x'
         self.assertEqual(LineNumber.IS_A_LINE_NUMBER(node), False)
 
     def test_get_semanticAndDataDict(self):
         xml_tree = ET.parse(self.test_target_file)
         line_number = [ LineNumber(xml_text_node=node) for node in xml_tree.getroot().xpath('.//line-number') ][1]
         #self.assertEqual(line_number.get_data_dictionary()['body'].get('id'), 2)
         #print(LineNumber.get_semantic_dictionary())
 
     def test_get_bottoms(self):
         svg_tree = ET.parse(self.test_file)
         mybottoms = get_bottoms(svg_tree.getroot())
         self.assertEqual(mybottoms[0], 57.1914)
         self.assertEqual(len(mybottoms), 106)
         self.assertEqual(mybottoms[-1], 1155.6899)
         mybottoms = get_bottoms(svg_tree.getroot(), from_position=100.0, to_position=800.0)
         self.assertEqual(mybottoms[0], 100.5132)
         self.assertEqual(len(mybottoms), 84)
         self.assertEqual(mybottoms[-1], 792.8218)
         tf = TranskriptionField(self.test_file)
         mybottoms = get_bottoms(svg_tree.getroot(), transkription_field=tf)
         self.assertEqual(mybottoms[0], 91.7134)
         self.assertEqual(len(mybottoms), 75)
         self.assertEqual(mybottoms[-1], 681.7134)
 
 
 if __name__ == "__main__":
     unittest.main()
Index: tests_svgscripts/test_matrix.py
===================================================================
--- tests_svgscripts/test_matrix.py	(revision 101)
+++ tests_svgscripts/test_matrix.py	(revision 102)
@@ -1,245 +1,245 @@
 import unittest
 import lxml.etree as ET
 from os import sep, path
 from os.path import isdir, dirname
 import sys
 
 sys.path.append('svgscripts')
 from datatypes.matrix import Matrix
 from datatypes.transkriptionField import TranskriptionField
 
 
 class FakeTF:
     def __init__(self):
         self.xmin = 297.6379999999997
         self.xmax = 765.354
         self.ymin = 157.328
         self.ymax = 752.6040160033832
 
 
 class TestMatrix(unittest.TestCase):
     def setUp(self):
         self.x = 219.4058
         self.y = 106.4634
         self.matrix_string = 'matrix(1 0 0 1 {} {})'.format(str(self.x), str(self.y))
         self.test_data_dir = dirname(__file__) + sep + 'test_data'
         if not isdir(self.test_data_dir):
             self.test_data_dir = dirname(dirname(__file__)) + sep + 'test_data'
         self.test_file = self.test_data_dir + sep + 'test_ai.svg'
         self.rotation_angle = 20
         self.rotation_matrix_string = 'matrix(0.94 0.342 -0.342 0.94 0 0)'
         self.test_margin_field_file = self.test_data_dir + sep + 'W_I_8_neu_125-01.svg'
         self.test_place_printing_verso = self.test_data_dir + sep + 'N_VII_1_xp5_4_page5.svg'
         self.test_place_printing_recto = self.test_data_dir + sep + 'N_VII_1_xp5_4_page6.svg'
         self.multipage = f'{self.test_data_dir}{sep}pdfsvg{sep}csv{sep}15.svg'
         self.marginals_extra = f'{self.test_data_dir}{sep}pdfsvg{sep}csv{sep}45.svg'
         self.marginals_extra_fn = f'{self.test_data_dir}{sep}pdfsvg{sep}csv{sep}44.svg'
 
     def test_Matrix(self):
         matrix = Matrix(self.matrix_string)
         self.assertEqual(matrix.getX(), self.x)
         self.assertEqual(matrix.add2X(1), self.x + 1)
         self.assertEqual(matrix.getY(), self.y)
         matrix = Matrix('matrix(0.98966578,0.1433933,-0.0913015,0.9958233,0,0)')
         self.assertEqual(matrix.getX(), 0)
         matrix = Matrix('matrix(1 2.998719e-04 -2.998719e-04 1 415.3643 476.7988)')
 
     def test_Matrix_rotation(self):
         rotation_string = 'rotate({})'.format(self.rotation_angle)
         rotation_stringC = 'rotate(-{})'.format(self.rotation_angle)
         matrixA = Matrix(rotation_string)
         matrixB = Matrix(self.rotation_matrix_string)
         matrixC = Matrix(rotation_stringC)
         self.assertEqual(matrixA.matrix[Matrix.A], matrixB.matrix[Matrix.A])
         self.assertEqual(matrixA.matrix[Matrix.B], matrixB.matrix[Matrix.B])
         self.assertEqual(matrixA.matrix[Matrix.C], matrixB.matrix[Matrix.C])
         self.assertEqual(matrixA.matrix[Matrix.D], matrixB.matrix[Matrix.D])
         self.assertEqual(matrixA.matrix[Matrix.E], matrixB.matrix[Matrix.E])
         self.assertEqual(matrixA.matrix[Matrix.F], matrixB.matrix[Matrix.F])
         self.assertEqual(matrixA.toString(), self.rotation_matrix_string)
         self.assertEqual(matrixC.toCSSTransformString(), 'rotate(-{}deg)'.format(self.rotation_angle))
 
     def test_get_rotation_direction(self):
         rotation_string = 'rotate(-{})'.format(self.rotation_angle)
         matrixA = Matrix(rotation_string)
         matrixB = Matrix(self.rotation_matrix_string)
         matrixC = Matrix(self.matrix_string)
         self.assertEqual(matrixA.get_rotation_direction(), Matrix.UP)
         self.assertEqual(matrixB.get_rotation_direction(), Matrix.DOWN)
         self.assertEqual(matrixC.get_rotation_direction(), Matrix.STRAIGHT)
 
     def test_isRotationMatrix(self):
         rotation_string = 'rotate({})'.format(self.rotation_angle)
         matrixA = Matrix(rotation_string)
         self.assertEqual(matrixA.isRotationMatrix(), True)
         matrixB = Matrix(self.matrix_string)
         self.assertEqual(matrixB.isRotationMatrix(), False)
 
     def test_toCSSTransformString(self):
         rotation_string = 'rotate({})'.format(self.rotation_angle)
         matrixA = Matrix(rotation_string)
         self.assertEqual(matrixA.toCSSTransformString(), 'rotate({}deg)'.format(self.rotation_angle))
         matrixB = Matrix(self.rotation_matrix_string)
         self.assertEqual(matrixB.toCSSTransformString(), 'rotate({}deg)'.format(self.rotation_angle))
 
     def test_Matrix_Exception(self):
         with self.assertRaises(Exception):
             Matrix('matrix({})'.format(' '.join([ '0.0' for i in range(5)])))
 
     def test_Matrix_TranskriptionField(self):
         tf = TranskriptionField(self.test_file)
         matrix = Matrix(self.matrix_string, transkription_field=tf)
         self.assertEqual(round(matrix.getX(), 3) , 28.706)
         self.assertEqual(round(matrix.getY(), 3) , 31.563)
 
     def test_get_transformed_positions(self):
         # Test relies on the example from "https://developer.mozilla.org/en-US/docs/Web/SVG/Attribute/transform"
         x = 10
         y = 10
         width = 30
         height = 20
         matrix = Matrix(transform_matrix_string='matrix(3 1 -1 3 30 40)')
         new_x, new_y, new_width, new_height = matrix.get_transformed_positions(x=x, y=y, width=width, height=height)
         self.assertEqual(new_x, 50)
         self.assertEqual(new_y, 80)
         self.assertEqual(new_width, 90)
         self.assertEqual(new_height, 60)
 
     def test_is_matrix_horizontal(self):
         matrix = Matrix(transform_matrix_string='matrix(3 1 -1 3 30 40)')
         self.assertEqual(matrix.is_matrix_horizontal(), False)
         matrix = Matrix(transform_matrix_string='matrix(1 0 0 1 30 40)')
         self.assertEqual(matrix.is_matrix_horizontal(), True)
 
     def test_is_part_of_transkription_field(self):
         tf = TranskriptionField(self.test_file)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 244.1211 91.7134)'})
         self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), True)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 244.1211 51.7134)'})
         self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 44.1211 91.7134)'})
         self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 244.1211 891.7134)'})
         self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 844.1211 91.7134)'})
         self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(0.866 -0.5 0.5 0.866 356.4303 753.4836)'})
         tspan_node = ET.SubElement(text_node, 'tspan', attrib={'x': '41.82', 'y': '0'})
         tspan_node.text = 'De'
         fake_tf = FakeTF()
         self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(fake_tf, text_node=text_node), True)
         """
-        local_file = '/home/knister0/ownCloud/myNietzscheDE/KGW-IX_12/Bd_12_XIV-XVI_Druck_als_SVG/03.svg'
+        local_file = '/home/knister0/ownCloud/myNietzscheDE/KGW-IX_12/Bd_12_XIV-XVI_Druck_als_SVG/18.svg'
         tf = TranskriptionField(local_file)
         svg_tree = ET.parse(local_file)
         namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
-        text_node = svg_tree.xpath('//ns:text[@transform="matrix(1 0 0 1 173.7407 144.8535)"]', namespaces=namespaces)[0]
-        self.assertEqual(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node), True)
+        text_node = svg_tree.xpath('//ns:text[@transform="matrix(1 0 0 1 115.6299 719.3535)"]', namespaces=namespaces)[0]
+        self.assertTrue(Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(tf, text_node=text_node))
         """
 
     def test_is_nearx_tf(self):
         tf = TranskriptionField(self.test_file)
         matrix_string = 'matrix(1 0 0 1 180.8755 315.9131)'
         self.assertEqual(Matrix.IS_NEARX_TRANSKRIPTION_FIELD(matrix_string, tf), True)
         matrix_string = 'matrix(1 0 0 1 100.8755 315.9131)'
         self.assertEqual(Matrix.IS_NEARX_TRANSKRIPTION_FIELD(matrix_string, tf), False)
 
     def test_do_conversion_factors_differ(self):
         self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(None, None), False)
         matrix_a = Matrix('matrix(1 0 0 1 180.8755 315.9131)')
         self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, None), True)
         matrix_b = Matrix('matrix(1 0 0 1 100.8755 315.9131)')
         self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), False)
         matrix_b = Matrix('matrix(0 0 0 1 100.8755 315.9131)')
         self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), True)
         matrix_b = Matrix('matrix(1 1 0 1 100.8755 315.9131)')
         self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), True)
         matrix_b = Matrix('matrix(1 0 1 1 100.8755 315.9131)')
         self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), True)
         matrix_b = Matrix('matrix(1 0 0 0 100.8755 315.9131)')
         self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), True)
 
     def test_clone_transformation_matrix(self):
         matrix_a = Matrix(matrix_list=[ 1, 0, 0, 1, 180.8755, 315.9131 ])
         matrix_b = matrix_a.clone_transformation_matrix()
         self.assertEqual(Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b), False)
         self.assertEqual(matrix_b.matrix[Matrix.E], 0)
         self.assertEqual(matrix_b.matrix[Matrix.F], 0)
 
     def test_toString(self):
         matrix_string = 'matrix(1.0 0.0 0.0 1.0 180.8755 315.9131)'
         matrix = Matrix(matrix_string)
         self.assertEqual(matrix.toString(), matrix_string)
 
     def test_get_semanticAndDataDict(self):
         matrix = Matrix('rotate(20)')
         #self.assertEqual(matrix.get_data_dictionary()['body'].get('matrix'), matrix.matrix)
 
     def test_is_in_margin_field(self):
         tf = TranskriptionField(self.test_margin_field_file)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 178.8916 182.0127)'})
         self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf), True)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 357.7339 818.3276)'})
         self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf), False)
         tf = TranskriptionField(self.marginals_extra)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 778.519 407.1094)'})
         self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf, marginals_on_extra_page=True), True)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 51.8503 1056.1182)'})
         self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf, marginals_on_extra_page=True), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 552.9165 1072.1025)'})
         self.assertEqual(Matrix.IS_IN_MARGIN_FIELD(text_node.get('transform'), tf, marginals_on_extra_page=True), False)
 
     def test_is_in_place_of_printing_area(self):
         tf = TranskriptionField(self.test_place_printing_verso)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 42.5195 575.8736)'})
         self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), True)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 109.145 575.8736)'})
         self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), True)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 191.0571 575.8736)'})
         self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), False)
         tf = TranskriptionField(self.test_place_printing_recto)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 28.3462 575.8736)'})
         self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), True)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 28.3462 583.8736)'})
         self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), True)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 170.0791 575.8736)'})
         self.assertEqual(Matrix.IS_IN_PLACE_OF_PRINTING_AREA(text_node.get('transform'), tf), False)
 
     def test_is_in_footnote_area(self):
         tf = TranskriptionField(self.test_place_printing_verso)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 42.5195 575.8736)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 109.145 575.8736)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 191.0571 575.8736)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), True)
         tf = TranskriptionField(self.test_place_printing_recto)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 28.3462 575.8736)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 28.3462 583.8736)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 170.0791 575.8736)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), True)
         tf = TranskriptionField(self.multipage, multipage_index=0)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 395.7141 463.6953)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 395.7141 453.6953)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf), True)
         tf = TranskriptionField(self.marginals_extra)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 552.9165 1072.1025)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf, marginals_on_extra_page=True), True)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 51.8503 1056.1182)'})
         self.assertEqual(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf, x=5.352, marginals_on_extra_page=True), False)
         text_node = ET.Element('text', attrib={'transform': 'matrix(1 0 0 1 215.5483 1056.1182)'})
         self.assertTrue(Matrix.IS_IN_FOOTNOTE_AREA(text_node.get('transform'), tf, x=24.732, marginals_on_extra_page=True))
         svg_tree = ET.parse(self.marginals_extra_fn)
         tf = TranskriptionField(self.marginals_extra)
         namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
         node = svg_tree.xpath('//ns:text[@transform="matrix(1 0 0 1.0101 698.1499 85.3594)"]', namespaces=namespaces)[0]
         self.assertFalse(Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, tf, marginals_on_extra_page=True))
         self.assertFalse(Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), tf, marginals_on_extra_page=True))
         node = svg_tree.xpath('//ns:text[@transform="matrix(1 0 0 1 215.5483 1056.1182)"]', namespaces=namespaces)[0]
         self.assertTrue(Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, tf, marginals_on_extra_page=True))
 
 
 if __name__ == "__main__":
     unittest.main()
Index: tests_svgscripts/test_util.py
===================================================================
--- tests_svgscripts/test_util.py	(revision 101)
+++ tests_svgscripts/test_util.py	(revision 102)
@@ -1,242 +1,259 @@
 import unittest
 from os import sep, path, remove, listdir
 from os.path import isdir, isfile, dirname, basename
 import shutil
 import sys
 import lxml.etree as ET
 import sys
 import tempfile
 import warnings
 
 sys.path.append('svgscripts')
 import util
 from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT
 from datatypes.faksimile import FaksimilePage
 from datatypes.page import Page
+from datatypes.page_creator import PageCreator
 from datatypes.positional_word_part import PositionalWordPart
+from datatypes.text_field import TextField
 from datatypes.transkriptionField import TranskriptionField
 from datatypes.word_position import WordPosition
 from datatypes.word import Word
 
 sys.path.append('shared_util')
 from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
 
+sys.path.append('fixes')
+from fix_old_data import save_page
+
 
 class TestCopy(unittest.TestCase):
     def setUp(self):
         util.UNITTESTING = True
         DATADIR = path.dirname(__file__) + sep + 'test_data'
         self.test_dir = DATADIR
         self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
         self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
         self.image = DATADIR + sep + 'image.jpg'
         self.svg_testrecord = DATADIR + sep + 'TESTRECORD.svg'
         self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
+        self.Mp_XIV_page420 = DATADIR + sep + 'Mp_XIV_page420.xml'
         self.tmp_dir = tempfile.mkdtemp()
 
     def test_copy(self):
         tmp_image = self.tmp_dir + sep + basename(self.image)
         target_file = 'asdf.svg'
         shutil.copy(self.image, self.tmp_dir)
         util.copy_faksimile_svg_file(target_file, faksimile_source_file=self.faksimile_file,\
                 target_directory=self.tmp_dir, local_image_path=tmp_image)
         self.assertEqual(isfile(self.tmp_dir + sep + target_file), True)
         util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_file,\
                 target_directory=self.tmp_dir, local_image_path=tmp_image)
         self.assertEqual(isfile(self.tmp_dir + sep + basename(self.faksimile_file)), True)
         with self.assertRaises(Exception):
             util.copy_faksimile_svg_file()
         with self.assertRaises(Exception):
             util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_source_file)
 
     def test_copy_xml(self):
         old_page = Page(self.xml_file)
         xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
         self.assertEqual(isfile(xml_file), True)
         page = Page(xml_file)
         self.assertEqual(len(page.words), len(old_page.words))
         self.assertEqual(len(page.line_numbers), 0)
 
     def test_create_highlighted_svg_file(self):
         target_file = self.tmp_dir + sep + basename(self.faksimile_file)
         tmp_image = self.tmp_dir + sep + basename(self.image)
         faksimile_tree = ET.parse(self.faksimile_file)
         namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
         node_ids = ['rect947', 'rect951', 'rect953', 'rect955', 'rect959', 'rect961', 'rect963']
         highlight_color = 'blue'
         util.create_highlighted_svg_file(faksimile_tree, node_ids, target_directory=self.tmp_dir, highlight_color=highlight_color, namespaces=namespaces)
         self.assertEqual(isfile(target_file), True)
         new_tree = ET.parse(target_file)
         for node in new_tree.xpath('//ns:rect[@fill="{0}"]|//ns:path[@fill="{0}"]'.format(highlight_color), namespaces=namespaces):
             node_ids.remove(node.get('id'))
         self.assertEqual(len(node_ids), 0)
 
     def test_get_empty_node_ids(self):
         faksimile_tree = ET.parse(self.faksimile_file)
         faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0]
         empty_node_ids = util.get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page)
         self.assertEqual('rect1085' in empty_node_ids, True)
 
     def test_record_changes(self):
         new_tree = ET.parse(self.faksimile_file)
         old_tree = ET.parse(self.faksimile_file)
         empty_node_id = 'rect1085'
         title_node_id = 'test001'
         namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() }
         node = new_tree.xpath('//ns:rect[@id="{0}"]'.format(empty_node_id), namespaces=namespaces)[0]
         title = ET.SubElement(node, 'title', attrib={ 'id': title_node_id })
         title.text = 'test'
         new_file = self.tmp_dir + sep + 'new.svg'
         old_file = self.tmp_dir + sep + 'old.svg'
         util.copy_faksimile_svg_file(target_file=new_file, faksimile_tree=new_tree)
         util.copy_faksimile_svg_file(target_file=old_file, faksimile_tree=old_tree)
         util.record_changes(old_file, new_file, [ empty_node_id ], namespaces=namespaces)
         test_tree = ET.parse(old_file)
         self.assertEqual(len(test_tree.xpath('//ns:rect[@id="{0}"]/ns:title[@id="{1}"]'.format(empty_node_id, title_node_id), namespaces=namespaces)), 1)
 
     def test_replace_chars(self):
         page = Page(self.xml_file)
         faksimile_tree = ET.parse(self.faksimile_file)
         namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
         word_position = WordPosition(id='rect1159', text='„Gedächtniß"')
         wps, texts = util.replace_chars(page.words, [ word_position ])
         self.assertEqual(texts[0].endswith('“'), True)
         self.assertEqual(wps[0].text.endswith('“'), True)
         word_position = WordPosition(id='rect1173', text='-')
         wps, texts = util.replace_chars(page.words, [ word_position ])
         self.assertEqual(wps[0].text.endswith('–'), True)
 
     def test_mismatch_words(self):
         page = Page(self.xml_file)
         faksimile_tree = ET.parse(self.faksimile_file)
         faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0]
         page = Page('xml/N_VII_1_page174.xml')
         faksimile_tree = ET.parse('faksimile_svg/N-VII-1,173et174.svg')
         faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0]
         self.assertEqual('-' in [ tp.text for tp in faksimile_page.word_positions], True)
         wps, texts = util.replace_chars(page.words,faksimile_page.word_positions)
         self.assertEqual('–' in texts, True)
         self.assertEqual(len([ faksimile_position for faksimile_position in wps\
                 if faksimile_position.text == '–' ]), 4)
         mismatching_words, mismatching_faksimile_positions = util.get_mismatching_ids(page.words, faksimile_page.word_positions)
         self.assertEqual(len([word for word in mismatching_words if word.text.endswith('“') ]), 0)
         self.assertEqual(len([word for word in mismatching_words if word.text.endswith('–') ]), 0)
 
     def test_process_warnings(self):
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter('default')
             warnings.warn('Test1: asdf')
             warnings.warn('Test2: asdf')
             status = util.process_warnings4status(w, ['Test1', 'Test2' ], 'asdf', 'OK', status_prefix='with warnings')
             #print(status)
             self.assertTrue('Test1' in status.split(':'))
             self.assertTrue('Test2' in status.split(':'))
 
     @unittest.skip('test uses external program, has been tested')
     def test_show_files(self):
         list_of_files = [ self.test_dir + sep + file for file in listdir(self.test_dir) if file.endswith('pdf') ][0:2]
         util.ExternalViewer.show_files(single_file=self.faksimile_file, list_of_files=list_of_files)
 
     def test_record_changes_to_page(self):
         page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 1 ])
         old_length = len(page.words)
         self.assertEqual(page.words[1].text, 'asdf')
         self.assertEqual(page.words[1].transkription_positions[0].width, 353)
         page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 13 ])
         self.assertEqual(page.words[13].text, 'er')
         self.assertEqual(page.words[14].text, '=')
         self.assertEqual(len(page.words), old_length+1)
         page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 64 ])
         self.assertEqual(page.words[64].text, 'Simplifications-apparat')
         self.assertEqual(len(page.words[64].transkription_positions), 3)
         self.assertEqual(len(page.words), old_length-1)
 
     @unittest.skipUnless(__name__ == "__main__", 'tests all words')
     def test_extended__record_changes_to_page(self):
         page = Page(self.xml_file)
         old_length = len(page.words)
         page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord)
         self.assertEqual(page.words[1].text, 'asdf')
         self.assertEqual(page.words[13].text, 'er')
         self.assertEqual(page.words[14].text, '=')
         self.assertEqual(page.words[65].text, 'Simplifications-apparat')
         self.assertEqual(len(page.words), old_length)
 
     def test_copy_faksimile_update_image_location(self):
         test_dir = self.tmp_dir #FAKSIMILE_LOCATION + '/Myriam/Fertig/'
         util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir)
         with self.assertWarns(UserWarning):
             util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir)
 
     def test_record_changes_on_xml(self):
         old_page = Page(self.xml_file)
         xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
         tree = ET.parse(xml_file)
         node = tree.xpath('//word[@id="135"]')[0]
         counter = 0
         while node.get('text') != 'gar' or counter > 5:
             counter += 1
             nextnode = node.getnext()
             node.set('text', node.get('text') + nextnode.get('text'))
             for element in nextnode.getchildren():
                 node.append(element)
             nextnode.getparent().remove(nextnode)
         write_pretty(xml_element_tree=tree, file_name=xml_file,\
                 script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
         new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
         self.assertEqual(len(new_page.words), len(old_page.words)-2)
         self.assertEqual(len([ word for word in new_page.words if word.text == 'gar']), 1)
         old_page = Page(self.xml_file)
         xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
         tree = ET.parse(xml_file)
         node = tree.xpath('//word[@id="138"]')[0]
         counter = 0
         while node.get('text') != 'nichtvorkommt.' or counter > 5:
             counter += 1
             nextnode = node.getnext()
             node.set('text', node.get('text') + nextnode.get('text'))
             for element in nextnode.getchildren():
                 node.append(element)
             nextnode.getparent().remove(nextnode)
         node.set('split', 'nicht vorkommt.')
         write_pretty(xml_element_tree=tree, file_name=xml_file,\
                 script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
         joined_page = Page(xml_file)
         self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.']), 1)
         self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.'][0].split_strings), 2)
         self.assertEqual(len(joined_page.words), len(old_page.words)-1)
         new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
         self.assertEqual(len(new_page.words), len(old_page.words))
         self.assertEqual(len([word for word in new_page.words if word.text == 'vorkommt.']), 1)
         self.assertEqual(len([word for word in old_page.words if word.text == 'nicht']),\
                 len([word for word in new_page.words if word.text == 'nicht']))
         xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
         tree = ET.parse(xml_file)
         old_page = Page(xml_file)
         nodes = tree.xpath('//word[@id>="85" and @id<="87"]')
         self.assertEqual(len(nodes), 3)
         prevWordText = nodes[0].get('text')
         nodes[0].set('join', prevWordText + 'z')
         nodes[1].set('split', 'z u')
         lastWordText = nodes[2].get('text')
         nodes[2].set('join', 'u' + lastWordText)
         write_pretty(xml_element_tree=tree, file_name=xml_file,\
                 script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
         joined_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
         self.assertEqual(len(joined_page.words), len(old_page.words)-1)
 
+    def test_reset_tp_with_matrix(self):
+        page = Page(self.Mp_XIV_page420)
+        util.reset_tp_with_matrix(page, page.words[0].transkription_positions)
+        self.assertTrue(page.words[0].transkription_positions[0].left > 0 and page.words[0].transkription_positions[0].top > -5)
+        transformed_words = [w for w in page.words if (len(w.transkription_positions) > 0 and w.transkription_positions[0].transform is not None) ]
+        util.reset_tp_with_matrix(page, transformed_words[0].transkription_positions)
+        self.assertEqual(transformed_words[0].transkription_positions[0].left, 0)
+        self.assertEqual(transformed_words[0].transkription_positions[0].top, -5)
+        page.svg_image.text_field = TextField()
+        util.reset_tp_with_matrix(page, transformed_words[1].transkription_positions)
+        self.assertTrue(transformed_words[1].transkription_positions[0].left > 0 and transformed_words[1].transkription_positions[0].top > -5)
+
     def test_back_up(self):
         test_dir = self.tmp_dir
         page = Page(self.xml_file)
         target_file_name = util.back_up(page, self.xml_file, bak_dir=test_dir)
         self.assertEqual(isfile(target_file_name), True)
         svg_tree = ET.parse(page.svg_file)
         namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
         util.back_up_svg_file(svg_tree, namespaces)
 
     def tearDown(self):
         shutil.rmtree(self.tmp_dir, ignore_errors=True)
         pass
 
 
 if __name__ == "__main__":
     unittest.main()
Index: tests_svgscripts/test_extractWordPosition.py
===================================================================
--- tests_svgscripts/test_extractWordPosition.py	(revision 101)
+++ tests_svgscripts/test_extractWordPosition.py	(revision 102)
@@ -1,179 +1,189 @@
 import unittest
 import os
 from os import sep, path
 from os.path import isfile, isdir, dirname
 import re
 import shutil
 import tempfile
 import lxml.etree as ET
 from lxml.etree import XMLSyntaxError
 import sys
 
 sys.path.append('svgscripts')
 import extractWordPosition
 from myxmlwriter import write_pretty
 from datatypes.transkriptionField import TranskriptionField
 from datatypes.matrix import Matrix
 from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION
+from datatypes.page import Page
 from datatypes.pdf import PDFText
 from datatypes.word import Word
 from datatypes.lineNumber import LineNumber
 from datatypes.word_insertion_mark import WordInsertionMark
 
 
 def test_write(xml_element_tree=None, file_name=None):
     write_pretty(xml_element_tree=xml_element_tree, file_name=None, script_name='test', file_type=FILE_TYPE_SVG_WORD_POSITION)
 
 
 class TestExtractor(unittest.TestCase):
     def setUp(self):
         extractWordPosition.Extractor.UNITTESTING = True
         DATADIR = dirname(__file__) + sep + 'test_data'
         self.test_file_find_word = DATADIR + sep + 'test_find_word.xml'
         self.test_dir = tempfile.mkdtemp()
         self.title = 'ABC 111'
         self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)'
         self.test_file = DATADIR + sep + 'Mp_XIV_1_mytest_421.svg'
         self.test_empty_file = DATADIR + sep + 'my_empty_test.svg'
         self.test_source = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml'
         self.xml420 = DATADIR + sep + 'Mp_XIV_1_page420.xml'
         self.pdf420 = DATADIR + sep + 'Mp_XIV_1_online_420.pdf'
         self.pdf_file = DATADIR + sep + 'W_I_8_page125.pdf'
         self.faulty_xml = DATADIR + sep + 'W_I_8_faulty_page125.xml'
         self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
         self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
         self.testA = DATADIR + sep + 'testA.xml'
         self.multipage = DATADIR + sep + 'multipage_small_above.svg'
 
     def test_extract_information(self):
         extractor = extractWordPosition.Extractor()
         page = extractor.extract_information(self.multipage, multipage_index=0)
         self.assertEqual(len(page.words), 59)
         self.assertEqual(page.multipage_index, 0)
         page = extractor.extract_information(self.multipage, multipage_index=1)
         self.assertEqual(page.multipage_index, 1)
         self.assertTrue(len(page.words) > 59)
+        extractor = extractWordPosition.Extractor()
+        source_page = Page('xml/Mp_XV_page78v.xml')
+        extractor = extractWordPosition.Extractor()
+        transkription_field = TranskriptionField(source_page.source)
+        svg_tree = ET.parse(source_page.source)
+        text_items = extractor.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)
+        self.assertTrue('matrix(1 0 0 1 115.6299 719.3535)' in [ item.get('transform') for item in text_items ])
+        page = extractor.extract_information(source_page.source, svg_file=source_page.svg_file)
+        self.assertTrue(page.svg_image.text_field is not None)
 
     def test_update_title(self):
         extractor = extractWordPosition.Extractor(xml_dir=self.test_dir)
         extractor.update_title_and_manuscript('test')
         self.assertEqual(extractor.title, 'test')
         self.assertEqual(extractor.manuscript_file, '{}/test.xml'.format(self.test_dir))
         self.assertEqual(isfile('{}/test.xml'.format(self.test_dir)), True)
 
     def test_get_page_number(self):
         extractor = extractWordPosition.Extractor()
         self.assertEqual(extractor.get_page_number(self.test_file, page_number='1'), '001')
         self.assertEqual(extractor.get_page_number(self.test_file), '421')
 
     def test_get_file_name(self):
         extractor = extractWordPosition.Extractor()
         self.assertEqual(extractor.get_file_name(self.test_file), 'xml/Mp_XIV_1_mytest_421.xml')
         extractor = extractWordPosition.Extractor(title=self.title)
         self.assertEqual(extractor.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_')))
         extractorA = extractWordPosition.Extractor(title=self.title)
         extractorB = extractWordPosition.Extractor(manuscript_file=extractorA.manuscript_file)
         self.assertEqual(extractorB.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_')))
 
     def test_get_style(self):
         extractor = extractWordPosition.Extractor()
         svg_tree = ET.parse(self.test_file)
         sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
         self.assertEqual(sonderzeichen_list, [ 'st21', 'st23'])
         self.assertEqual(style_dict.get('st11').get('font-family'), 'Frutiger-Europeen')
         self.assertEqual(style_dict.get('st5').get('stroke'), '#CED5CE')
 
     def test_get_word_from_part_obj(self):
         extractor = extractWordPosition.Extractor()
         mylist = [{'text': 'a', 'class': 'asdf' }, {'text': 'b', 'endX': 0 }, {'text': 'c'}]
         self.assertEqual(extractor.get_word_from_part_obj(mylist), 'abc')
 
     def test_get_text_items(self):
         svg_tree = ET.parse(self.test_file)
         extractor = extractWordPosition.Extractor()
         mytest_items = [ x for x in extractor.get_text_items(svg_tree.getroot()) ]
         self.assertEqual(len(mytest_items), 300)
         self.assertEqual(mytest_items[0].get('transform'), 'matrix(1 0 0 1 386.8218 57.1914)')
         tf = TranskriptionField(self.test_file)
         mytest_itemsTF = [ x for x in extractor.get_text_items(svg_tree.getroot(), transkription_field=tf) ]
         self.assertEqual(mytest_itemsTF[0].get('transform'), 'matrix(1 0 0 1 204.8618 91.7134)')
 
     def test_init_tree_and_target_file(self):
         target_file = self.testA
         page = PageCreator(target_file, title=self.title)
         tree = page.page_tree
         self.assertEqual(tree.getroot().get('title'), self.title)
         self.assertEqual(tree.getroot().findall('./style'), [])
         test_write(xml_element_tree=tree, file_name=target_file)
         page = PageCreator(target_file)
         tree = page.page_tree
         self.assertEqual(tree.getroot().get('title'), self.title)
         self.assertEqual(tree.getroot().findall('./style'), [])
         isfile(target_file) and os.remove(target_file)
 
     def test_add_style(self):
         extractor = extractWordPosition.Extractor()
         svg_tree = ET.parse(self.test_file)
         sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
         target_file = self.testA
         page = PageCreator(target_file, title=self.title)
         page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
style_dict=style_dict) test_write(xml_element_tree=page.page_tree, file_name=target_file) fromTarget_xml_tree = ET.parse(target_file) self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title) self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23") self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE') self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen') page = PageCreator(target_file) page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict) test_write(xml_element_tree=page.page_tree, file_name=target_file) fromTarget_xml_tree = ET.parse(target_file) self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title) self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23") self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE') self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen') isfile(target_file) and os.remove(target_file) def test_add_word(self): extractor = extractWordPosition.Extractor() svg_tree = ET.parse(self.test_file) mylist = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }] matrix = Matrix(self.matrix_string) for dict in mylist: dict['class'] = 'st22' dict['x'] = matrix.add2X(0) dict['y'] = matrix.getY() target_file = self.test_dir + sep + 'asdfasdf.xml' page = PageCreator(target_file) sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot()) page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict) self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 1) mylist[1]['text'] = 'A' mylist[1]['class'] = 'st21' mylist[1]['x'] = matrix.add2X(1) self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 2) page.update_and_attach_words2tree() self.assertEqual(page.page_tree.getroot().xpath('//word[@id="1"]')[0].get('text'), 'a') self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]')[0].get('text'), 'c') self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('left'), '183.506') self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('height'), '8.25') def test_extractor(self): extractor = extractWordPosition.Extractor() self.assertEqual(extractor.title, None) self.assertEqual(extractor.manuscript_file, None) self.assertEqual(extractor.xml_dir, 'xml/') self.assertEqual(extractor.manuscript_tree, None) def test_write_title_to_manuscript_file(self): extractor = extractWordPosition.Extractor(xml_dir=self.test_dir, title=self.title) self.assertEqual(isfile(extractor.manuscript_file), True) extractor = extractWordPosition.Extractor(manuscript_file=extractor.manuscript_file) self.assertEqual(extractor.title, self.title) def tearDown(self): isdir(self.test_dir) and shutil.rmtree(self.test_dir) isfile('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) and os.remove('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) if __name__ == "__main__": unittest.main() Index: fixes/interactive_editor.py =================================================================== --- fixes/interactive_editor.py (revision 101) +++ fixes/interactive_editor.py (revision 102) @@ -1,676 +1,676 @@ #!/usr/bin/env python3 # -*- coding: 
utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}} from colorama import Fore, Style from datetime import datetime from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET import re import shutil import string from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.path import Path as SVGPath from svgpathtools.path import Line import sys import tempfile from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from progress.bar import Bar import warnings from fix_old_data import save_page from fix_boxes import attach_box, split_into_parts_and_attach_box sys.path.append('svgscripts') from convert_wordPositions import HTMLConverter, JSONConverter from datatypes.box import Box from datatypes.faksimile import FaksimilePage from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.path import Path from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word, update_transkription_position_ids from join_faksimileAndTranskription import sort_words from util import back_up, back_up_svg_file, copy_faksimile_svg_file from process_files import update_svgposfile_status from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from main_util import create_function_dictionary __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False MAX_SVG_XY_THRESHOLD = 10 class ResponseHandler: def __init__(self, response_starts_with=None, dialog_string=None, action_name=None, description=None): self.action_name = action_name self.dialog_string = dialog_string self.description = description self.response_starts_with = response_starts_with def create_requirement_list(self) ->list: """Create a requirement list. """ return [] def create_json_dict(self)->dict: """Create a json dictionary. """ json_dict = { 'action_name': self.action_name, 'description': self.description } requirements = self.create_requirement_list() if len(requirements) > 0: json_dict.update({ 'requirements': requirements }) return json_dict def get_requirement(self, json_dict: dict, index=0) ->tuple: """Return requirement tuple (name, input).
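Illustrative sketch of the json_dict shape this method reads (hypothetical
values; the key path matches the lookup below):

    json_dict = {'response_handler': {'requirements': [{'name': 'split_text', 'input': 'foo'}]}}
    # get_requirement(json_dict, index=0) -> ('split_text', 'foo')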
""" name = requirement = None if dict_contains_keys(json_dict, ['response_handler','requirements'])\ and index < len(json_dict['response_handler']['requirements']): requirement_dict = json_dict['response_handler']['requirements'][index] if dict_contains_keys(requirement_dict, ['name'])\ and dict_contains_keys(requirement_dict, ['input']): name = requirement_dict['name'] requirement = requirement_dict['input'] return name, requirement def match(self, response: str) ->bool: """Return whether response matchs with handler. """ if self.response_starts_with is not None: return response.startswith(self.response_starts_with) return True def print_dialog(self): """Print dialog. """ if self.dialog_string is not None: print(f'[{self.dialog_string}]') def handle_response(self, page: Page, json_dict: dict) -> int: """Handle response and return exit code. """ json_word_ids = [ jw.get('id') for jw in json_dict['words'] ] action_dictionary = { 'words': [ word for word in page.words if word.id in json_word_ids ] } for index, item in enumerate(self.create_requirement_list()): name, requirement = self.get_requirement(json_dict, index=index) action_dictionary.update({name: requirement}) return self.run_change(page, action_dictionary) def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. """ return self.run_change(page, {}) def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 return exit_code class JoinWords(ResponseHandler): def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response interactively and return exit code. """ action_dictionary = { 'words' : shell._get_words_from_response(re.compile('^\D+\s').sub('', response), page.words),\ 'add_white_space_between_words': re.match(r'^\D+\s', response) } if self.run_change(page, action_dictionary) == 0: return shell.run_interactive_editor(page) return 2 def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. 
""" exit_code = 0 add_white_space_between_words = action_dictionary['add_white_space_between_words']\ if bool(action_dictionary.get('add_white_space_between_words'))\ else False words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] if len(words) > 0: if len(set([ word.line_number for word in words ])) == 1\ and len(set([ word.deleted for word in words ])) == 1: new_word = words[0] for word2join in words[1:]: page.words.remove(word2join) new_word.join(word2join, add_white_space_between_words=add_white_space_between_words) else: new_word = Word.join_words(words, add_white_space_between_words=add_white_space_between_words) index = len(page.words) if words[0] in page.words: index = page.words.index(words[0]) elif len([ word for word in page.words if words[0] in word.word_parts ]) > 0: index = page.words.index([ word for word in page.words if words[0] in word.word_parts ][0]) for word2join in words: if word2join in page.words: page.words.remove(word2join) elif len([ word for word in page.words if word2join in word.word_parts ]) > 0: page.words.remove([ word for word in page.words if word2join in word.word_parts ][0]) page.words.insert(index, new_word) if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) + save_page(page, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class SimpleJoinWords(JoinWords): def match(self, response: str) ->bool: """Return whether response matchs with handler. """ return re.match(r'\d+', response) class SaveChanges(ResponseHandler): RELEVANT_PROPERTIES = [ ('deleted','deleted'), ('line_number','line') ] # 0 = word, 1 = word_dict def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. """ self.run_change(page, {}) return shell.run_interactive_editor(page) def _update_word(self, word, word_dict) ->int: """Update properites of word according to word_dict, return exit_code """ exit_code = 0 for relevant_property in self.RELEVANT_PROPERTIES: if len(word.word_parts) > 0: if len(word_dict['tp_id'].split(':')) == 3: wp_index = int(word_dict['tp_id'].split(':')[1].replace('w','')) word.word_parts[wp_index].__dict__[relevant_property[0]] = word_dict[relevant_property[1]] else: return 2 else: word.__dict__[relevant_property[0]] = word_dict[relevant_property[1]] return exit_code def handle_response(self, page: Page, json_dict: dict) -> int: """Handle response and return exit code. """ json_word_ids = [ jw.get('id') for jw in json_dict['words'] ] for word in page.words: if word.id in json_word_ids: word_dict = [ jw for jw in json_dict['words'] if jw.get('id') == word.id ][0] if self._update_word(word, word_dict) > 0: return 2 return self.run_change(page, {}) def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) + save_page(page, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) return exit_code class Reload(ResponseHandler): def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. 
""" return shell.run_interactive_editor(Page(page.page_tree.docinfo.URL)) class RestoreBackup(ResponseHandler): def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. """ if page.bak_file is not None: return shell.run_interactive_editor(Page(page.bak_file)) else: print('Could not restore backup file, please restore manually!') return 2 class ChangeLine2Value(ResponseHandler): def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. """ words = [] line_number = -1 if re.match(r'l:\d+\s\d+', response): line_number = int(response.replace('l:', '').split(' ')[0]) words = shell._get_words_from_response(re.compile('l:\d+\s').sub('', response), page.words) else: if not re.match(r'l:\d+$', response): new_response_line = input('Specify new line number>') if re.match(r'^\d+$', new_response_line): line_number = int(new_response_line) else: line_number = int(response.replace('l:', '')) new_response = input(f'Specify ids of words for which line number should be changed to {line_number}>') if re.match(r'\d+', new_response): words = shell_get_words_from_response(new_response, page.words) action_dictionary = { 'words': words, 'line_number' : line_number } if self.run_change(page, action_dictionary) == 0: return shell.run_interactive_editor(page) return 2 def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 line_number = action_dictionary['line_number']\ if bool(action_dictionary.get('line_number'))\ else -1 words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] if line_number != -1: for word in words: word.line_number = line_number if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) + save_page(page, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class CreateCorrectionHistory(ResponseHandler): def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. """ if re.match(r'c\w*\s\d+', response): words = shell._get_words_from_response(re.compile('c\w*\s').sub('', response), page.words) else: new_response = input(f'Specify ids of words to create a correction history. >') if re.match(r'\d+', new_response): words = shell._get_words_from_response(new_response, page.words) action_dictionary = { 'words': words } if self.run_change(page, action_dictionary) == 0: return shell.run_interactive_editor(page) return 2 def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] if len(words) > 0: for word in words: word.create_correction_history() if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) + save_page(page, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class DeleteCorrectionHistory(ResponseHandler): def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response interactively and return exit code. 
""" if re.match(r'D\w*\s\d+', response): words = shell._get_words_from_response(re.compile('D\w*\s').sub('', response), page.words) else: new_response = input(f'Specify ids of words to delete their correction history. >') if re.match(r'\d+', new_response): words = shell._get_words_from_response(new_response, page.words) action_dictionary = { 'words' : words } if self.run_change(page, action_dictionary) == 0: return shell.run_interactive_editor(page) return 2 def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] if len(words) > 0: for word in words: print(word.text) word.earlier_version = None word.corrections = [] if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) + save_page(page, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class ChangeDeletionStatus(ResponseHandler): def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. """ if re.match(r'[du]\w*\s\d+', response): words = shell._get_words_from_response(re.compile('[du]\w*\s').sub('', response), page.words) else: deletion_target = 'delete' if response.startswith('d') else 'undelete' new_response = input(f'Specify ids of words to {deletion_target}. >') if re.match(r'\d+', new_response): words = shell._get_words_from_response(new_response, page.words) action_dictionary = { 'words': words, 'deleted': response.startswith('d') } if self.run_change(page, action_dictionary) == 0: return shell.run_interactive_editor(page) return 2 def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] word_should_be_deleted = bool(action_dictionary.get('deleted')) if len(words) > 0: for word in words: word.deleted = word_should_be_deleted if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) + save_page(page, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class SplitWords(ResponseHandler): def _split_word(self, page, word, split_text): """Split word. """ index = page.words.index(word) _, left, right = word.split(split_text) page.words[index] = left page.words.insert(index+1, right) def create_requirement_list(self) ->list: """Create a requirement dictionary. """ return [{ 'name': 'split_text', 'type': 'string', 'input': None }] def handle_interactive_response(self, page: Page, response: str, shell) -> int: """Handle response and return exit code. """ if re.match(r's\s\w+\s\d+', response): words = shell._get_words_from_response(re.compile('s\s\w+\s').sub('', response), page.words) split_text = response.split(' ')[1] else: split_text = input('Input split text>') new_response = input(f'Specify ids of words to split. 
>') if re.match(r'\d+', new_response): words = shell._get_words_from_response(new_response, page.words) action_dictionary = { 'words': words, 'split_text': split_text } if self.run_change(page, action_dictionary) == 0: return shell.run_interactive_editor(page) return 2 def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] split_text = action_dictionary['split_text']\ if bool(action_dictionary.get('split_text'))\ else '' if len(words) > 0 and split_text != '': for word in words: self._split_word(page, word, split_text) if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) + save_page(page, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class AddBox(ResponseHandler): def create_requirement_list(self) ->list: """Create a requirement dictionary. """ return [{ 'name': 'box_text', 'type': 'string', 'input': None },\ { 'name': 'overwritten_by', 'type': 'string', 'input': None },\ { 'name': 'is_earlier_version', 'type': 'boolean', 'input': False }] def run_change(self, page: Page, action_dictionary: dict) -> int: """Run changes on page and return exit code. """ exit_code = 0 words = action_dictionary['words']\ if bool(action_dictionary.get('words'))\ else [] missing_text = action_dictionary.get('box_text') is_earlier_version = action_dictionary.get('is_earlier_version') overwritten_by = action_dictionary.get('overwritten_by') if len(words) > 0 and missing_text is not None: for word in words: if overwritten_by is not None: split_into_parts_and_attach_box(word, 0, missing_text, is_earlier_version, overwritten_by) else: attach_box(word, 0, missing_text, False) word.create_correction_history() if len(word.corrections) > 0: for wp in word.word_parts: wp.overwrites_word = None if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) + save_page(page, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}') page = Page(page.page_tree.docinfo.URL) else: exit_code = 2 return exit_code class ResponseOrganizer: RESULT = 'result' def __init__(self): self.response_handler_dictionary = {} self._add_response_handler(JoinWords(action_name='join words', description='join words')) self._add_response_handler(SplitWords(action_name='split words', description='split word according to split text')) self._add_response_handler(CreateCorrectionHistory(action_name='create correction history', description='creates a correction history for selected words')) self._add_response_handler(DeleteCorrectionHistory(action_name='delete correction history', description='deletes the correction history of selected words')) self._add_response_handler(AddBox(action_name='add box', description='add box with overwritten text')) self._add_response_handler(SaveChanges(action_name='save changes', description='save change to line number/deletion status for word(s)' )) def _add_response_handler(self, response_handler: ResponseHandler): """Add response_handler to response_handler_dictionary. """ self.response_handler_dictionary.update({response_handler.action_name: response_handler}) def create_json_dict(self, xml_file: str, last_operation_result=None) ->dict: """Return a json dict of page with information about action. 
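Illustrative sketch of the returned structure (keys taken from the code
below; the word list comes from JSONConverter and its exact shape is assumed,
not shown here):

    { 'words': [...],
      'actions': { 'target_file': xml_file, 'date_stamp': <mtime>,
                   'result': <last_operation_result, if given>,
                   'response_handlers': [ <one dict per registered handler> ] } }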
""" page = Page(xml_file) replace_ligatures(page) converter = JSONConverter(page) json_dict = converter.create_json_dict() action_dict = { 'target_file': xml_file,\ 'date_stamp': os.path.getmtime(xml_file) } if last_operation_result is not None: action_dict.update({self.RESULT: last_operation_result }) response_handlers = [] for response_handler in self.response_handler_dictionary.values(): response_handlers.append(response_handler.create_json_dict()) action_dict.update({ 'response_handlers': response_handlers }) json_dict.update({ 'actions': action_dict}) return json_dict def handle_response(self, json_dict: dict) ->dict: """Handle response in json_dict and return new data json_dict. """ if bool(json_dict.get('target_file')): target_file = json_dict['target_file'] if bool(json_dict.get('date_stamp')): current_stamp = os.path.getmtime(target_file) if current_stamp <= json_dict['date_stamp']: exit_code = 2 operation = 'unknown' if bool(json_dict.get('response_handler'))\ and bool(self.response_handler_dictionary.get(json_dict['response_handler']['action_name'])): operation = json_dict['response_handler']['action_name'] response_handler = self.response_handler_dictionary[operation] exit_code = response_handler.handle_response(Page(target_file), json_dict) message = f'Operation "{operation}" succeeded!' if exit_code == 0 else f'Operation "{operation}" failed' return self.create_json_dict(target_file, last_operation_result=message) else: return self.create_json_dict(target_file,\ last_operation_result=f'FAIL: file {target_file} was changed between operations!') else: return self.create_json_dict(target_file,\ last_operation_result='ERROR: there was no key "date_stamp" in json') else: return { 'actions': { self.RESULT: 'ERROR: there was no key "target_file" in json!' 
}} class InteractiveShell: def __init__(self): self.response_handlers = [] self.response_handlers.append(SimpleJoinWords(dialog_string='specify ids of words to join [default]')) self.response_handlers.append(RestoreBackup(response_starts_with='b', dialog_string='b=restore backup')) self.response_handlers.append(CreateCorrectionHistory(response_starts_with='c', dialog_string='c=create correction history [+ ids]')) self.response_handlers.append(DeleteCorrectionHistory(response_starts_with='D', dialog_string='D=delete correction history [+ ids]')) self.response_handlers.append(ChangeDeletionStatus(response_starts_with='d', dialog_string='d=mark deleted [+ ids]')) self.response_handlers.append(SaveChanges(response_starts_with='i', dialog_string='i=fix ids' )) self.response_handlers.append(ChangeLine2Value(response_starts_with='l', dialog_string='l[:value]=change line to value for ids' )) self.response_handlers.append(Reload(response_starts_with='r', dialog_string='r=reload xml file')) self.response_handlers.append(SplitWords(response_starts_with='s', dialog_string='s=split and join word ("s splittext id")')) self.response_handlers.append(ChangeDeletionStatus(response_starts_with='u', dialog_string='u=undelete [+ ids]')) self.response_handlers.append(JoinWords(response_starts_with='w', dialog_string='w=join words with whitespace between them [+ ids]')) self.response_handlers.append(ResponseHandler()) def _get_words_from_response(self, response, words) ->list: """Return a list of words that correspond to the given indices """ if re.match(r'\d+-\d+', response)\ or re.match(r'\d+\+', response): index_boundaries = [] if response[-1] == '+': index_boundaries.append(int(response[:response.index('+')])) index_boundaries.append(index_boundaries[0]+1) else: index_boundaries = [ int(i) for i in response.split('-') ] index_boundaries_length_diff = len(response.split('-')[0]) - len(response.split('-')[1]) if index_boundaries_length_diff > 0: index_boundaries[1] = int(response.split('-')[0][0-index_boundaries_length_diff-1] + response.split('-')[1]) indices = [ i for i in range(index_boundaries[0], index_boundaries[1]+1) ] if index_boundaries[0] > index_boundaries[1]: indices = [ index_boundaries[0] ] while indices[-1] > index_boundaries[1]: indices.append(indices[-1]-1) else: indices = [ int(i) for i in response.split(' ') ] result_words = [] for index in indices: if len([ word for word in words if word.id == index ]) > 0: result_words += [ word for word in words if word.id == index ] return result_words def run_interactive_editor(self, page) -> int: """Run interactive shell. """ replace_ligatures(page) HTMLConverter(page).convert() for response_handler in self.response_handlers: response_handler.print_dialog() response = input('>') for response_handler in self.response_handlers: if response_handler.match(response): return response_handler.handle_interactive_response(page, response, self) def replace_ligatures(page): """Replace ligatures (ﬁ -> fi, ﬂ -> fl). """ if len([ word for word in page.words if re.match(r'.*[ﬂﬁ]', word.text) ]) > 0: for word in [ word for word in page.words if re.match(r'.*[ﬁ]', word.text) ]: word.text = word.text.replace('ﬁ', 'fi') for word in [ word for word in page.words if re.match(r'.*[ﬂ]', word.text) ]: word.text = word.text.replace('ﬂ', 'fl') def dict_contains_keys(a_dict, key_list)->bool: """Return whether dict a_dict contains key path given by key_list.
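Examples (follow directly from the recursion below):

    dict_contains_keys({'a': {'b': 1}}, ['a', 'b']) -> True
    dict_contains_keys({'a': {'b': 1}}, ['a', 'c']) -> False
    dict_contains_keys({}, []) -> True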
""" if len(key_list) == 0: return True else: if key_list[0] in a_dict.keys(): return dict_contains_keys(a_dict[key_list[0]], key_list[1:]) return False def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to fix faksimile position ->set them to their absolute value. fixes/interactive_editor.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 xml_file = args[0] if isfile(xml_file): counter = 0 shell = InteractiveShell() for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK): if not UNITTESTING: print(Fore.CYAN + f'Processing {page.title}, {page.number} with interactive editor ...' + Style.RESET_ALL) back_up(page, page.xml_file) counter += 1 if shell.run_interactive_editor(page) == 0 else 0 if not UNITTESTING: print(Style.RESET_ALL + f'[{counter} pages changed by interactive shell]') else: raise FileNotFoundError('File {} does not exist!'.format(xml_file)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: fixes/test_fix_old_data.py =================================================================== --- fixes/test_fix_old_data.py (revision 101) +++ fixes/test_fix_old_data.py (revision 102) @@ -1,81 +1,85 @@ import lxml.etree as ET from os import sep, path, remove from os.path import isdir, isfile, dirname, basename import shutil import sys import tempfile import unittest import warnings import fix_old_data sys.path.append('svgscripts') from datatypes.faksimile import FaksimilePage from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.path import Path from datatypes.positional_word_part import PositionalWordPart from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word from datatypes.word_position import WordPosition from process_words_post_merging import MERGED_DIR class TestFixFaksimile(unittest.TestCase): def setUp(self): fix_old_data.UNITTESTING = True DATADIR = path.dirname(__file__) + sep + 'test_data' self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml' self.fix_transkription_positions = DATADIR + sep + 'Mp_XIV_page419a.xml' def test_convert_old_matrix(self): page = Page(self.xml_file) xmin = 28.346 ymin = 49.921 tp = page.words[63].transkription_positions[0] matrix, x, y = fix_old_data.convert_old_matrix(tp, xmin, ymin) #print(matrix.toString(), x, y) def test_fix_faksimile(self): page = Page(self.xml_file) fp = page.words[0].faksimile_positions[0] left = fp.left top = fp.top self.assertEqual(fix_old_data.fix_faksimile_positions(page), True) self.assertEqual(fp.left, left + page.text_field.xmin) self.assertEqual(fp.top, top + page.text_field.ymin) def test_fix_faksimile_line_position(self): page = Page(self.xml_file) fix_old_data.fix_faksimile_line_position(page) for line_number in page.line_numbers: #print(line_number.id) self.assertTrue(line_number.faksimile_inner_top < line_number.faksimile_inner_bottom) @unittest.skip('already tested, interactive') def test_fix_transkription_positions(self): 
page = Page(self.fix_transkription_positions) merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)) fix_old_data.sync_words_linewise(merged_page.words, page.words, page.line_numbers) - self.assertTrue(fix_old_data.fix_transkription_positions(page)) - - @unittest.skip('already tested, interactive') - def test_join_words(self): - page = Page(self.fix_transkription_positions) - fix_old_data.join_words_interactive(page) + self.assertTrue(fix_old_data.merge_transkription_positions(page)) @unittest.skip('already tested, local file') def test_fix_graphical_svg_file(self): fix_old_data.fix_graphical_svg_file(Page('xml/Mp_XIV_page418.xml')) @unittest.skip('already tested, local file') def test_get_words(self): page = Page('xml/Mp_XIV_page418.xml') print([ word.text for word in page.words if word.id == 300]) words = fix_old_data._get_words_from_response('300-310', page.words) print(words) - + + def test_fix_tp_of_word(self): + page = Page(self.fix_transkription_positions) + old_left = page.words[0].transkription_positions[0].left + old_top = page.words[0].transkription_positions[0].top + fix_old_data.fix_transkription_positions(page) + self.assertTrue(page.svg_image.text_field is not None) + self.assertEqual(page.words[0].transkription_positions[0].left, old_left + page.svg_image.text_field.left) + self.assertEqual(page.words[0].transkription_positions[0].top, old_top + page.svg_image.text_field.top) + if __name__ == "__main__": unittest.main() Index: fixes/fix_old_data.py =================================================================== --- fixes/fix_old_data.py (revision 101) +++ fixes/fix_old_data.py (revision 102) @@ -1,567 +1,470 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" This program can be used to process words after they have been merged with faksimile data. +""" This program can be used to fix old data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see <https://www.gnu.org/licenses/>. 1}}} from colorama import Fore, Style from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET import re import shutil import string from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.path import Path as SVGPath from svgpathtools.path import Line import sys import tempfile from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from progress.bar import Bar import warnings sys.path.append('svgscripts') from convert_wordPositions import HTMLConverter from datatypes.box import Box from datatypes.faksimile import FaksimilePage from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.matrix import Matrix -from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK +from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from datatypes.path import Path +from datatypes.svg_image import SVGImage # assumption: module path for the SVGImage class used by fix_transkription_positions below from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word, update_transkription_position_ids from join_faksimileAndTranskription import sort_words -from util import back_up, back_up_svg_file, copy_faksimile_svg_file +from util import back_up, back_up_svg_file, copy_faksimile_svg_file, reset_tp_with_matrix from process_files import update_svgposfile_status from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from main_util import create_function_dictionary __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False MAX_SVG_XY_THRESHOLD = 10 #TODO: fix all svg graphical files: change xlink:href to href!!!! def convert_old_matrix(tp, xmin, ymin) ->(Matrix, float, float): """Return new matrix, x and y for old transkription_position.
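Illustrative sketch (hypothetical numbers): for a transkription position
whose transform matrix has X/Y = (10, 20), xmin=28.346 and ymin=49.921
yield a matrix with X/Y = (38.346, 69.921), i.e. the old offsets relative
to the transkription field are shifted to absolute svg coordinates.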
""" matrix = tp.transform.clone_transformation_matrix() matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3) matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3) x = round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)\ if tp.left > 0\ else 0 y = round((tp.height-1.5)*-1, 3) return matrix, x, y -def save_page(page, attach_first=False, backup=False): +def save_page(page, attach_first=False, backup=False, script_name=None): """Write page to xml file """ if backup: back_up(page, page.xml_file) if attach_first: page.update_and_attach_words2tree() - script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}' + if script_name is None: + script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}' write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=script_name, file_type=FILE_TYPE_SVG_WORD_POSITION) def page_already_changed(page) -> bool: """Return whether page has alreadybeen changed by function """ return len(\ page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}:{inspect.currentframe().f_back.f_code.co_name}"]')\ ) > 0 def fix_faksimile_line_position(page, redo=False) -> bool: """Create a faksimile line position. """ if not redo and page_already_changed(page): return False; update_faksimile_line_positions(page) if not UNITTESTING: save_page(page) return True def check_faksimile_positions(page, redo=False) -> bool: """Check faksimile line position. """ if len(page.page_tree.xpath('//data-source/@file')) > 0: svg_file = page.page_tree.xpath('//data-source/@file')[0] svg_tree = ET.parse(svg_file) positions_are_equal_counter = 0 page_changed = False for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree): if page.title == faksimile_page.title\ and page.number == faksimile_page.page_number: #print([fp.id for fp in faksimile_page.word_positions ]) for word in page.words: for fp in word.faksimile_positions: rect_fps = [ rfp for rfp in faksimile_page.word_positions if rfp.id == fp.id ] if len(rect_fps) > 0: rfp = rect_fps[0] if fp.left != rfp.left or fp.top != rfp.top: #print(f'{fp.id}: {fp.left}/{rfp.left} {fp.top}/{rfp.top}') fp.left = rfp.left fp.top = rfp.top fp.bottom = fp.top + rfp.height word.attach_word_to_tree(page.page_tree) page_changed = True else: positions_are_equal_counter += 1 print(f'{positions_are_equal_counter}/{len(page.words)} are equal') if page_changed and not UNITTESTING: save_page(page) return page_changed def fix_faksimile_positions(page, redo=False) -> bool: """Set faksimile positions to absolute values. 
[:return:] fixed """ if not redo and len(page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}"]')) > 0: return False x_min = page.text_field.xmin y_min = page.text_field.ymin for word in page.words: for fp in word.faksimile_positions: fp.left = fp.left + x_min fp.top = fp.top + y_min fp.bottom = fp.bottom + y_min word.attach_word_to_tree(page.page_tree) if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) return True +def _fix_tp_of_word(page, word, text_field): + """Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top + """ + for tp in word.transkription_positions: + tp.left += text_field.left + tp.top += text_field.top + reset_tp_with_matrix(page, word.transkription_positions) + if type(word) == Word: + words_in_word = word.word_parts + [ item for item in word.__dict__.values() if type(item) == Word ] + for wp in words_in_word: + _fix_tp_of_word(page, wp, text_field) + def fix_transkription_positions(page, redo=False) -> bool: + """Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top + + [:return:] fixed + """ + if page.svg_image is None\ + or page.svg_image.text_field is None: + if page.svg_image is None: + if page.svg_file is not None: + transkription_field = TranskriptionField(page.svg_file) + width = round(transkription_field.documentWidth, 3) + height = round(transkription_field.documentHeight, 3) + page.svg_image = SVGImage(file_name=page.svg_file, width=width,\ + height=height, text_field=transkription_field.convert_to_text_field()) + page.svg_image.attach_object_to_tree(page.page_tree) + else: + raise Exception(f'ERROR page {page.page_tree.docinfo.URL} does not have a svg_file!') + elif page.svg_image.text_field is None: + page.svg_image.text_field = TranskriptionField(page.svg_image.file_name).convert_to_text_field() + page.svg_image.attach_object_to_tree(page.page_tree) + for line_number in page.line_numbers: + line_number.top += page.svg_image.text_field.top + line_number.bottom += page.svg_image.text_field.top + line_number.attach_object_to_tree(page.page_tree) + for word in page.words: + _fix_tp_of_word(page, word, page.svg_image.text_field) + for mark in page.mark_foreign_hands: + _fix_tp_of_word(page, mark, page.svg_image.text_field) + for tcm in page.text_connection_marks: + _fix_tp_of_word(page, tcm, page.svg_image.text_field) + if not UNITTESTING: + print(f'writing to {page.page_tree.docinfo.URL}') + save_page(page, attach_first=True) + return True + return False + +def merge_transkription_positions(page, redo=False) -> bool: """Fix transkription positions of merged words [:return:] fixed """ if not isdir(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR)\ or not isfile(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)): return False merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)) sync_dictionary = sync_words_linewise(merged_page.words, page.words, merged_page.line_numbers) words = [] for source_word in merged_page.words: words.append(source_word) if bool(sync_dictionary.get(source_word)): _sync_transkriptions_with_words(source_word, sync_dictionary) if source_word.text != ''.join([ t.get_text() for t in source_word.transkription_positions ]): text = ''.join([ t.get_text() for t in source_word.transkription_positions ])
print(f'{source_word.line_number}: {source_word.text} has transkription_positions with text "{text}".') response = input('Change? [Y/n]>') if not response.startswith('n'): new_sync_dictionary = sync_words_linewise(merged_page.words, page.words,\ [ line for line in merged_page.line_numbers if line.id == source_word.line_number ], force_sync_on_word=source_word) if bool(new_sync_dictionary.get(source_word)): _sync_transkriptions_with_words(source_word, new_sync_dictionary) else: raise Exception(f'Could not find source_word {source_word.text} in {new_sync_dictionary}!') page.words = words page.update_and_attach_words2tree() if not UNITTESTING: print(f'writing to {page.page_tree.docinfo.URL}') save_page(page) return True def fix_graphical_svg_file(page, redo=False) -> bool: """Fix glyphs of words for which there is a /changed-word or /deleted-word node in page.page_tree """ svg_tree = ET.parse(page.svg_file) transkription_field = TranskriptionField(page.source) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } back_up_svg_file(svg_tree, namespaces=namespaces) + tr_xmin = transkription_field.xmin if (page.svg_image is None or page.svg_image.text_field is None) else 0 + tr_ymin = transkription_field.ymin if (page.svg_image is None or page.svg_image.text_field is None) else 0 for deleted_word_node in page.page_tree.xpath('//deleted-word'): deleted_word = Word.create_cls(deleted_word_node) - _run_function_on_nodes_for_word(svg_tree, namespaces, deleted_word, transkription_field, _set_node_attribute_to, 'visibility', 'hidden') + _run_function_on_nodes_for_word(svg_tree, namespaces, deleted_word, tr_xmin, tr_ymin, _set_node_attribute_to, 'visibility', 'hidden') for changed_word_node in page.page_tree.xpath('//changed-word'): changed_word = Word.create_cls(changed_word_node) try: word = [ word for word in page.words if word.id == changed_word.id and word.text == changed_word.text ][0] left_difference = word.transkription_positions[0].left - changed_word.transkription_positions[0].left - _run_function_on_nodes_for_word(svg_tree, namespaces, word, transkription_field, _add_value2attribute, 'x', left_difference) + _run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin, _add_value2attribute, 'x', left_difference) except IndexError: warnings.warn(f'There is no word for changed_word {changed_word.id}: "{changed_word.text}" in {page.page_tree.docinfo.URL}!') copy_faksimile_svg_file(target_file=page.svg_file, faksimile_tree=svg_tree, namespaces=namespaces) def _add_value2attribute(node, attribute, value): """Add value to attribute of node. """ node.set(attribute, str(float(node.get(attribute)) + value)) node.set('changed', 'true') def _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=0.1) -> list: """Return nodes with symbol_id and x = svg_x and y = svg_y.
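The lookup starts with the given threshold (default 0.1); if nothing matches,
it retries with the threshold increased by 1 until MAX_SVG_XY_THRESHOLD is reached.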
""" nodes = [ node for node in svg_tree.xpath(\ f'//ns:use[@xlink:href="#{symbol_id}" and @x > {svg_x-threshold} and @x < {svg_x+threshold} and @y > {svg_y-threshold} and @y < {svg_y+threshold} ]',\ namespaces=namespaces) if not bool(node.get('changed')) ] if len(nodes) == 0 and threshold < MAX_SVG_XY_THRESHOLD: return _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=threshold+1) return nodes -def _run_function_on_nodes_for_word(svg_tree, namespaces, word, transkription_field, function_on_node, attribute, value): +def _run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin, function_on_node, attribute, value): """Run function on nodes for words. """ for tp in word.transkription_positions: for pwp in tp.positional_word_parts: symbol_id = pwp.symbol_id - svg_x = pwp.left + transkription_field.xmin - svg_y = pwp.bottom + transkription_field.ymin + svg_x = pwp.left + tr_xmin + svg_y = pwp.bottom + tr_ymin nodes = _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y) if len(nodes) > 0: node = nodes[0] function_on_node(node, attribute, value) def _set_node_attribute_to(node, attribute, value): """Set attribute of node to value. """ node.set(attribute, str(value)) node.set('changed', 'true') -def _get_words_from_response(response, words) ->list: - """Return a list of word that correspond to indices - """ - if re.match(r'\d+-\d+', response)\ - or re.match(r'\d+\+', response): - index_boundaries = [] - if response[-1] == '+': - index_boundaries.append(int(response[:response.index('+')])) - index_boundaries.append(index_boundaries[0]+1) - else: - index_boundaries = [ int(i) for i in response.split('-') ] - index_boundaries_length_diff = len(response.split('-')[0]) - len(response.split('-')[1]) - if index_boundaries_length_diff > 0: - index_boundaries[1] = int(response.split('-')[0][0-index_boundaries_length_diff-1] + response.split('-')[1]) - indices = [ i for i in range(index_boundaries[0], index_boundaries[1]+1) ] - if index_boundaries[0] > index_boundaries[1]: - indices = [ index_boundaries[0] ] - while indices[-1] > index_boundaries[1]: - indices.append(indices[-1]-1) - else: - indices = [ int(i) for i in response.split(' ') ] - result_words = [] - for index in indices: - if len([ word for word in words if word.id == index ]) > 0: - result_words += [ word for word in words if word.id == index ] - return result_words - -def _split_word(page, word, split_text): - """Split word. - """ - index = page.words.index(word) - _, left, right = word.split(split_text) - page.words[index] = left - page.words.insert(index+1, right) - -def join_words_interactive(page, redo=False) -> bool: - """Join words interactively. 
- """ - HTMLConverter(page).convert() - print('Specify ids of words to join.') - print('[s=split and join word ("s splittext id")]') - print('[c=create correction history]') - print('[d=mark deleted|i=fix ids|u=undelete|l[:value]=change line to value for ids|r=reload|b=restore backup|q=quit]>') - response = input('>') - if response.startswith('i'): - print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) - return join_words_interactive(Page(page.page_tree.docinfo.URL)) - elif response.startswith('r'): - return join_words_interactive(Page(page.page_tree.docinfo.URL)) - elif response.startswith('b'): - if page.bak_file is not None: - return join_words_interactive(Page(page.bak_file)) - else: - print('Could not restore backup file, please restore manually!') - elif response.startswith('l'): - words = [] - line_number = -1 - if re.match(r'l:\d+\s\d+', response): - line_number = int(response.replace('l:', '').split(' ')[0]) - words = _get_words_from_response(re.compile('l:\d+\s').sub('', response), page.words) - else: - if not re.match(r'l:\d+$', response): - new_response_line = input('Specify new line number>') - if re.match(r'^\d+$', new_response_line): - line_number = int(new_response_line) - else: - line_number = int(response.replace('l:', '')) - new_response = input(f'Specify ids of words for which line number should be changed to {line_number}>') - if re.match(r'\d+', new_response): - words = _get_words_from_response(new_response, page.words) - if line_number != -1: - for word in words: word.line_number = line_number - if not UNITTESTING: - print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) - page = Page(page.page_tree.docinfo.URL) - return join_words_interactive(page) - elif response.startswith('c'): - if re.match(r'c\w*\s\d+', response): - words = _get_words_from_response(re.compile('c\w*\s').sub('', response), page.words) - else: - new_response = input(f'Specify ids of words to create a correction history. >') - if re.match(r'\d+', new_response): - words = _get_words_from_response(new_response, page.words) - if len(words) > 0: - for word in words: word.create_correction_history() - if not UNITTESTING: - print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) - page = Page(page.page_tree.docinfo.URL) - return join_words_interactive(page) - elif response.startswith('d') or response.startswith('u'): - if re.match(r'[du]\w*\s\d+', response): - words = _get_words_from_response(re.compile('[du]\w*\s').sub('', response), page.words) - else: - deletion_target = 'delete' if response.startswith('d') else 'undelete' - new_response = input(f'Specify ids of words to {deletion_target}. >') - if re.match(r'\d+', new_response): - words = _get_words_from_response(new_response, page.words) - if len(words) > 0: - for word in words: word.deleted = response.startswith('d') - if not UNITTESTING: - print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) - page = Page(page.page_tree.docinfo.URL) - return join_words_interactive(page) - elif response.startswith('s'): - if re.match(r's\s\w+\s\d+', response): - words = _get_words_from_response(re.compile('s\s\w+\s').sub('', response), page.words) - split_text = response.split(' ')[1] - else: - split_text = input('Input split text>') - new_response = input(f'Specify ids of words to split. 
>') - if re.match(r'\d+', new_response): - words = _get_words_from_response(new_response, page.words) - if len(words) > 0 and split_text != '': - for word in words: _split_word(page, word, split_text) - if not UNITTESTING: - print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) - page = Page(page.page_tree.docinfo.URL) - return join_words_interactive(page) - elif re.match(r'\d+', response): - words = _get_words_from_response(response, page.words) - if len(words) > 0: - if len(set([ word.line_number for word in words ])) == 1\ - and len(set([ word.deleted for word in words ])) == 1: - new_word = words[0] - for word2join in words[1:]: - page.words.remove(word2join) - new_word.join(word2join) - else: - new_word = Word.join_words(words) - index = len(page.words) - if words[0] in page.words: - index = page.words.index(words[0]) - elif len([ word for word in page.words if words[0] in word.word_parts ]) > 0: - index = page.words.index([ word for word in page.words if words[0] in word.word_parts ][0]) - for word2join in words: - if word2join in page.words: - page.words.remove(word2join) - elif len([ word for word in page.words if word2join in word.word_parts ]) > 0: - page.words.remove([ word for word in page.words if word2join in word.word_parts ][0]) - page.words.insert(index, new_word) - if not UNITTESTING: - print(f'writing to {page.page_tree.docinfo.URL}') - save_page(page, attach_first=True) - page = Page(page.page_tree.docinfo.URL) - return join_words_interactive(page) - return True - def sync_words_linewise(source_words, target_words, lines, force_sync_on_word=None) -> dict: """Sync words and create a dictionary with source_words as keys, referring to a list of corresponding words. """ result_dict = {} for word in target_words + source_words: word.processed = False for line in lines: source_words_on_line = sorted([ word for word in source_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left) target_words_on_line = sorted([ word for word in target_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left) if len(target_words_on_line) == len(source_words_on_line): _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word) elif len(source_words_on_line) < len(target_words_on_line): _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word) else: print(f'Line {line.id}: more source words than target words, nothing synced.') return result_dict def _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict): """Force sync on word.
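Illustrative responses to the interactive prompt (formats follow the parsing
below): '0-2' selects indices 0, 1 and 2; '1 3' selects indices 1 and 3;
an empty response selects all unprocessed target words.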
""" unprocessed_target_words = [t_word for t_word in target_words_on_line if not t_word.processed] if len(unprocessed_target_words) > 0: print([ (i, t_word.text) for i, t_word in enumerate(unprocessed_target_words)]) response = input(f'Please specify indices of words to sync {force_sync_on_word.text} with: [default:0-{len(unprocessed_target_words)-1}]>') indices = [ i for i in range(0, len(unprocessed_target_words)) ] if re.match(r'\d+-\d+', response): index_strings = response.split('-') indices = [ i for i in range(int(index_strings[0]), int(index_strings[1])+1) ] elif response != '': indices = [ int(i) for i in response.split(' ') ] target_words = [] for i in indices: target_words.append(unprocessed_target_words[i]) result_dict.update({ force_sync_on_word: target_words }) else: raise Exception(f'There are no unprocessed target_words for {force_sync_on_word.text} on line {force_sync_on_word.line_number}!') def _sync_transkriptions_with_words(word, sync_dictionary): """Sync transkription_positions of word with syncronized words. """ word.transkription_positions = [] for target_word in sync_dictionary[word]: word.transkription_positions += target_word.transkription_positions def _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None): """Sync if there are more target words. """ current_source_word = None for target_word in target_words_on_line: if current_source_word is not None\ and current_source_word.text.startswith(''.join([ w.text for w in result_dict[current_source_word]]) + target_word.text): result_dict[current_source_word].append(target_word) target_word.processed = True if current_source_word.text == ''.join([ w.text for w in result_dict[current_source_word]]): current_source_word = None elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ]) > 0: source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ][0] target_word.processed = True source_word.processed = True result_dict.update({ source_word: [ target_word ] }) elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ]) > 0: current_source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ][0] current_source_word.processed = True target_word.processed = True result_dict.update({ current_source_word: [ target_word ] }) else: msg = f'On line {target_word.line_number}: target_word "{target_word.text}" does not have a sibling in {[ s.text for s in source_words_on_line if not s.processed ]}' warnings.warn(msg) if force_sync_on_word is not None: _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict) def _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None): """Sync same length """ for i, word in enumerate(source_words_on_line): if word.text == target_words_on_line[i].text: word.processed = True target_words_on_line[i].processed = True result_dict.update({ word: [ target_words_on_line[i] ] }) elif len([ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ]) > 0: target_word = [ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ][0] word.processed = True target_word.processed = True result_dict.update({ word: [ target_word ] }) else: msg = f'On line {word.line_number}: source_word 
"{word.text}" does not have a sibling in {[ s.text for s in target_words_on_line]}' warnings.warn(msg) if force_sync_on_word is not None: _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): - """This program can be used to fix faksimile position ->set them to their absolute value. + """This program can be used to fix old data. svgscripts/fix_old_data.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help -c|--check-faksimile-positions check whether faksimile positions have been updated - -j|--join-words join words by id interactive -l|--faksimile-line-position create faksimile line positions -p|--faksimile-positions fix old faksimile positions -r|--redo rerun -s|--fix-graphical-svg fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file. + -p|--transkription-positions fix old transkription positions ->set to 0,0 instead of text_field.0,0 :return: exit code (int) """ function_list = [] function_dict = create_function_dictionary(['-c', '--check-faksimile-positions'], check_faksimile_positions) function_dict = create_function_dictionary(['-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict) function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict) - function_dict = create_function_dictionary(['-t', '--transkription-positions'], fix_transkription_positions, function_dictionary=function_dict) + function_dict = create_function_dictionary(['-m', '--merge-positions'], merge_transkription_positions, function_dictionary=function_dict) function_dict = create_function_dictionary(['-s', '--fix-graphical-svg'], fix_graphical_svg_file, function_dictionary=function_dict) - function_dict = create_function_dictionary(['default', '-j', '--join-words'], join_words_interactive, function_dictionary=function_dict) + function_dict = create_function_dictionary(['default', '-t', '--transkription-positions'], fix_transkription_positions, function_dictionary=function_dict) redo = False; try: - opts, args = getopt.getopt(argv, "hcplrst", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position", "redo", "fix-graphical-svg", "transkription--positions" ]) + opts, args = getopt.getopt(argv, "hcplrmst", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position",\ + "redo", "merge-positions", "fix-graphical-svg", "transkription-positions" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-r', '--redo'): redo = True; elif opt in function_dict.keys(): function_list.append(function_dict[opt]) if len(function_list) == 0: function_list.append(function_dict['default']) if len(args) < 1: usage() return 2 exit_status = 0 xml_file = args[0] - if isfile(xml_file): - counters = { f.__name__: 0 for f in function_list } - for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK): + for xml_file in args: + if isfile(xml_file): + counters = { f.__name__: 0 for f in function_list } for current_function in function_list: - if not UNITTESTING: - print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' 
+ Style.RESET_ALL) - back_up(page, page.xml_file) - counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0 - if not UNITTESTING: - for function_name, counter in counters.items(): - print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]') - else: - raise FileNotFoundError('File {} does not exist!'.format(xml_file)) + status_contains = STATUS_MERGED_OK if 'faksimile' in current_function.__name__ else 'OK' + for page in Page.get_pages_from_xml_file(xml_file, status_contains=status_contains): + if not UNITTESTING: + print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' + Style.RESET_ALL) + back_up(page, page.xml_file) + counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0 + if not UNITTESTING: + for function_name, counter in counters.items(): + print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]') + else: + raise FileNotFoundError('File {} does not exist!'.format(xml_file)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: py2ttl/class_spec.py =================================================================== --- py2ttl/class_spec.py (revision 101) +++ py2ttl/class_spec.py (revision 102) @@ -1,252 +1,254 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This is an abstract class for all classes that are semantically relevant. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc import inspect import warnings class UnSemanticClass: """ Subclasses of this class are not semantically relevant, even if their superclasses are. """ pass class SemanticClass(metaclass=abc.ABCMeta): """ This is an abstract class for all classes that are semantically relevant. 
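To make the contract below concrete, here is a minimal editor-added sketch (not part of the patch) of how a datatype class implements SemanticClass; the property names echo the project's real Color datatype, while the class itself and the import path are illustrative assumptions:

import sys
sys.path.append('py2ttl')              # assumed repository layout
from class_spec import SemanticClass

class ExampleColor(SemanticClass):
    """This class represents a color (hypothetical example).
    """
    def __init__(self, name='black', hex_value='#000000'):
        self.name = name
        self.hex_value = hex_value

    @classmethod
    def get_semantic_dictionary(cls):
        # build the two-part dictionary described in get_semantic_dictionary's docstring
        dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: {} }
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary(
            'name', str, cardinality=1, name='colorHasName', label='color has name'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary(
            'hex_value', str, cardinality=1, name='hasHexadecimalValue'))
        return cls.return_dictionary_after_updating_super_classes(dictionary)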
""" HAS_PART = 'has_part' HAS_SEQNUM = 'has_seqnum' SINGLE_VALUE = 1 LIST = -99 CLASS_KEY = 'class' CARDINALITY = "cardinality" CARDINALITY_RESTRICTION = "cardinality_restriction" HAS_HOMOTYPIC_PARTS_URL_STRING = 'http://www.nie.org/ontology/homotypic#hasHomotypicParts' + HAS_IMAGE = 'http://www.nie.org/ontology/nietzsche#hasImage' HOMOTYPIC_HAS_TEXT_URL_STRING = 'http://www.nie.org/ontology/homotypic#hasText' STOFF_STYLE_HAS_CSS_URL_STRING = 'http://www.nie.org/ontology/standoff#styleHasCSS' + PAGE_IS_ON_TEXTFIELD = 'http://www.nie.org/ontology/nietzsche#pageIsOnTextField' PROPERTY_NAME = "name" PROPERTY_LABEL = "label" PROPERTY_COMMENT = "comment" PROPERTIES_KEY = "properties" SUBCLASS_OF = "rdfs:subClassOf" SUBPROPERTYOF = "subPropertyOf" SUPER_CLASSES_DICT = { 'http://www.nie.org/ontology/homotypic': 'HomotypicEntity', 'http://www.nie.org/ontology/standoff': 'Style' } SUPER_PROPERTY = "super_property" THIS = "this" TYPE = "type" @classmethod def create_semantic_property_dictionary(cls, property_key, class_type, cardinality=0, cardinality_restriction='cardinality', name='', label='', comment='', subPropertyOf='') -> dict: """Create a semantic property dicitonary. Here is how to make a subproperty: Pass the IRI of the super property as subPropertyOf=IRI, be sure that base_uri of IRI (as key) and Class identifier of super class (as value) are in cls.SUPER_CLASSES_DICT, then call cls.return_dictionary_after_updating_super_classes -> it will subclass the class that owns the subproperty to the super class. :return: semantic property dicitonary (dict) """ property_content = { SemanticClass.CLASS_KEY: class_type } if cardinality > 0: property_content.update({ SemanticClass.CARDINALITY: cardinality}) property_content.update({ SemanticClass.CARDINALITY_RESTRICTION: cardinality_restriction}) if name != '': property_content.update({ SemanticClass.PROPERTY_NAME: name}) if label != '': property_content.update({ SemanticClass.PROPERTY_LABEL: label}) if comment != '': property_content.update({ SemanticClass.PROPERTY_COMMENT: comment}) if subPropertyOf != '': property_content.update({ SemanticClass.SUBPROPERTYOF: subPropertyOf}) return { property_key: property_content } @classmethod def get_class_dictionary(cls): """Creates and returns a class_dictionary with the keys cls.THIS [, cls.SUBCLASS_OF, cls.TYPE]. """ class_dict = {cls.THIS: cls } if cls.__dict__.get('OWL_EQUIVALENTCLASSES') and len(cls.OWL_EQUIVALENTCLASSES) > 0: class_dict.update({'owl:equivalentClass': cls.OWL_EQUIVALENTCLASSES }) if cls.__dict__.get('RDFS_SUBCLASSOF_LIST') and len(cls.RDFS_SUBCLASSOF_LIST) > 0: class_dict.update({cls.SUBCLASS_OF: cls.RDFS_SUBCLASSOF_LIST }) direct_super_class = inspect.getclasstree([cls],unique=True)[0][0] if issubclass(direct_super_class, SemanticClass) and direct_super_class != SemanticClass: class_dict.update({cls.TYPE: direct_super_class}) return class_dict def get_name_and_id(self): """Return an identification for object as 2-tuple. """ id = 0 if 'id' in self.__dict__.keys(): id = self.id elif 'number' in self.__dict__.keys(): id = self.number elif 'title' in self.__dict__.keys(): id = self.title.replace(' ', '_') return type(self).__name__, id def _get_list_of_type(self, list_type): """Return list of type == list_type if list is not empty. 
""" list_of_type = [] for object_list in [ list_obj for list_obj in self.__dict__.values()\ if type(list_obj) == list ]: if len(object_list) > 0 and type(object_list[0]) == list_type: return object_list return list_of_type def get_object_from_list_with_id(self, object_type, object_id): """Return object from list if object has id == object_id, None if not found. """ list_with_object = [ item for item in self._get_list_of_type(object_type)\ if item.id == object_id ] if len(list_with_object) > 0: return list_with_object[0] return None @classmethod def get_cls_hasPart_objectCls_dictionaries(cls, object_cls, xpath, object_seqnum_xpath=None, cardinality=0, cardinality_restriction='minCardinality'): """Return a dictionary containing the information for creating a class that can act as an intermediary between cls and a number of object_cls if object_cls has a position in a sequence of object_classes that belong to cls. """ part_name = object_cls.__name__ + 'Part' has_part_name = object_cls.__name__.lower() + 'PartHas' + object_cls.__name__ has_seqnum_name = object_cls.__name__.lower() + 'HasSeqNum' if object_seqnum_xpath is None: object_seqnum_xpath = xpath + '/@id' object_part_dictionary = { 'class': object_cls, 'cardinality': 1, 'xpath': xpath,\ 'name': has_part_name, 'label': '{0} has a {1}'.format(part_name, object_cls.__name__),\ 'comment': '{0} has a part, that is a {1}'.format(part_name, object_cls.__name__)} object_seqnum_dictionary = { 'class': int, 'cardinality': 1, 'xpath': object_seqnum_xpath,\ 'name': has_seqnum_name, 'label': '{0} has a sequence number'.format(part_name),\ 'comment': '{0} has a part, that stands in a sequence with this number'.format(part_name, object_cls.__name__)} object_dictionary = { 'class_name': part_name, SemanticClass.HAS_PART: object_part_dictionary, SemanticClass.HAS_SEQNUM: object_seqnum_dictionary,\ 'label': '{0} part'.format(object_cls.__name__.lower()),\ 'comment': 'This class servers as a intermediary between {0} and {1}. {0} has some {1} in a specific sequence.'.format(cls.__name__, object_cls.__name__)} dictionary = { 'flag': 'ordered_list' , 'class': object_dictionary, 'cardinality': cardinality, 'cardinality_restriction': cardinality_restriction, 'xpath': xpath,\ 'name': cls.__name__.lower() + 'Has' + part_name, 'label': '{0} has a part that connects it with a {1}'.format(cls.__name__, object_cls.__name__),\ 'comment': '{0} has a part that connects it with a {1}, that has a position in a sequence of {1}'.format(cls.__name__, object_cls.__name__)} return dictionary @classmethod @abc.abstractmethod def get_semantic_dictionary(cls): """Creates a semantic dictionary with cls.CLASS_KEY and cls.PROPERTIES_KEY as its keys. The class-key points to a class_dictionary with the keys: cls.THIS [, cls.SUBCLASS_OF, cls.TYPE]. Create initial dictionary using cls.get_class_dictionary(): dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: {} } The properties_key points to a properties_dictionary with semantically relevant keys of self.__dict__ as keys. Use cls.create_semantic_property_dictionary(...) in order to add a property dictionary for each property as follows: dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary(property_key, ...)) Return dictionary by using: cls.return_dictionary_after_updating_super_classes(dictionary) """ pass def get_xml_conform_key_value_dictionary(self) -> dict: """Return a xml conform key value dictionary. 
""" property_d = self.get_semantic_dictionary()[self.PROPERTIES_KEY] attachable, attachable_list, builtins, builtin_list = 'attachable', 'attachable-list', 'builtins', 'builtin-list' xml_d = { attachable: {}, attachable_list: {}, builtins: {}, builtin_list: {}} for key in property_d.keys(): value = self.__dict__.get(key) if value is not None and (type(value) != list or len(value) > 0): semantic_type = property_d[key][self.CLASS_KEY]\ if type(property_d[key]) is dict\ else property_d[key][0] if type(value) != list and semantic_type.__module__ == builtins: if semantic_type == bool: xml_d[builtins].update({key.replace('_','-'): str(value).lower()}) else: xml_d[builtins].update({key.replace('_','-'): str(value)}) elif semantic_type.__module__ != builtins: attachable_key = attachable if type(value) != list else attachable_list xml_d[attachable_key].update({key.replace('_','-'): value}) else: xml_d[builtin_list].update({key.replace('_','-'): value}) return xml_d @classmethod def return_dictionary_after_updating_super_classes(cls, dictionary): """Return semantic dictionary after updating super classes if necessary. """ if cls.PROPERTIES_KEY not in dictionary.keys(): return dictionary subproperty_base_uri_set = set( value.get(cls.SUBPROPERTYOF).split('#')[0]\ for value in dictionary[cls.PROPERTIES_KEY].values()\ if bool(value.get(cls.SUBPROPERTYOF)) ) for sub_property_base in subproperty_base_uri_set: if bool(cls.SUPER_CLASSES_DICT.get(sub_property_base))\ and (\ cls.SUBCLASS_OF not in dictionary[cls.CLASS_KEY].keys()\ or len(dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]) == 0\ or len([ url for url in dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF] if sub_property_base in url]) == 0\ # above instead of beneath, there might be more than one Class that share a sub_property_base. #or sub_property_base + '#' + cls.SUPER_CLASSES_DICT.get(sub_property_base) not in dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]\ ): subclass_list = dictionary[cls.CLASS_KEY][cls.SUBCLASS_OF]\ if cls.SUBCLASS_OF in dictionary[cls.CLASS_KEY].keys()\ and len(dictionary[cls.CLASS_KEY].get(cls.SUBCLASS_OF)) > 0\ else [] subclass_list.append(sub_property_base + '#' + cls.SUPER_CLASSES_DICT.get(sub_property_base)) dictionary[cls.CLASS_KEY].update({cls.SUBCLASS_OF: subclass_list}) return dictionary def __repr__(self) -> str: """Return a representation of all semantically relevant properties. """ data_string = self.__str__() return f'<{data_string}>' def __str__(self) -> str: """Return a str of all semantically relevant properties. 
""" name = type(self).__name__ data = [] for key in self.get_semantic_dictionary()[self.PROPERTIES_KEY].keys(): if key in self.__dict__.keys() and\ (self.__dict__[key] != None or (type(self.__dict__[key]) == list and len(self.__dict__[key]) > 0)): data.append(f'{key}: {self.__dict__[key]}') data_string = ', '.join(data) return f'{name} {data_string}' Index: py2ttl/config.py =================================================================== --- py2ttl/config.py (revision 101) +++ py2ttl/config.py (revision 102) @@ -1,39 +1,40 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- import getpass from os.path import isfile, isdir, exists import re PROJECT_NAME = 'tln' PROJECT_URL = 'http://www.knora.org/ontology/0068/nietzsche' DATA_URL = 'http://rdfh.ch/projects/0068' ONTOLOGY_DIR = './ontologies' if getpass.getuser() == 'knister0' else './ontologies' # local onotology dir, script will read only KNORA_BASE_ONTOLOGY_FILE = '{}/knora-ontologies/knora-base.ttl'.format(ONTOLOGY_DIR) SHARED_ONTOLOGIES_DIR = '{}/Ontologies-shared'.format(ONTOLOGY_DIR) PROJECT_ONTOLOGY_FILE = './Friedrich-Nietzsche-late-work-ontology.ttl' +PROJECT_ONTOLOGY_FILE_URL = 'https://c4scdn.ch/file/data/v6tjganrzg2nk3fuukgy/PHID-FILE-lcacdm2atc73ladd3ajq/Friedrich-Nietzsche-late-work-ontology.ttl' DATATYPES_DIR = './svgscripts/datatypes' # optional in config file, can be overwritten by passing a to py2ttl/py2ttl.py def check_config_files_exist(): """Checks whether all files exist that are specified in this file by uppercase variables ending in 'DIR' or 'FILE'. :return: exit code (int) """ for key in [ key for key in globals().keys() if re.match(r'^[A-Z_-]+(DIR|FILE)$', key) ]: if not exists(globals().get(key)): raise FileNotFoundError('Key {} does not specify an existing file or directory'.format(key)) if key.endswith('DIR') and not isdir(globals().get(key)): raise NotADirectoryError('Key {} does not specify an existing directory'.format(key)) return 0 def get_datatypes_dir(): """Returns value of DATATYPES_DIR if set, else None. """ if 'DATATYPES_DIR' in globals().keys(): return DATATYPES_DIR.replace('./','') else: None Index: py2ttl/py2ttl_ontology.py =================================================================== --- py2ttl/py2ttl_ontology.py (revision 101) +++ py2ttl/py2ttl_ontology.py (revision 102) @@ -1,363 +1,369 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert py classes that are subclasses of class_spec.SemanticClass to a owl ontology in turtle format. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
# If not, see <http://www.gnu.org/licenses/>. 1}}}

import getopt
import importlib
import importlib.util
import inspect
import lxml.etree as ET
from os import sep, path, listdir
from os.path import isfile, isdir, dirname, basename
from progress.bar import Bar
from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD
import re
+import requests
import sys
import warnings

if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))

from class_spec import SemanticClass, UnSemanticClass
-from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL
+from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_ONTOLOGY_FILE_URL, PROJECT_URL
from data_handler import RDFDataHandler

sys.path.append('shared_util')
from myxmlwriter import dict2xml

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

class Py2TTLOntologyConverter:
    """This class can be used to convert semantic_dictionaries to an owl ontology in turtle format.
    """
    UNITTESTING = False
    INFERRED_SUB_CLASS = RDFS.subClassOf * '*'

    def __init__(self, project_ontology_file=None):
        self.class_uri_dict = {}
        self.uri_mapping4cls_and_properties = {}
        self.project_graph = Graph()
        self.base_uriref = URIRef(PROJECT_URL)
        self.project_name = PROJECT_NAME
        self.ns = { self.base_uriref + '#': self.project_name }
        if project_ontology_file is not None and isfile(project_ontology_file):
+           if project_ontology_file == PROJECT_ONTOLOGY_FILE:
+               # refresh the local copy of the project ontology before parsing it
+               r = requests.get(PROJECT_ONTOLOGY_FILE_URL)
+               with open(project_ontology_file, 'wb') as f:
+                   f.write(r.content)
+               print(f'{project_ontology_file} updated from c4science repository')
            self.project_graph.parse(project_ontology_file, format="turtle")
            if len(self.project_graph) > 0:
                self.base_uriref = self.project_graph.value(predicate=RDF.type, object=OWL.Ontology, any=False)
                self.ns = { uriref: ns for ns, uriref in self.project_graph.namespace_manager.namespaces() }
                self.project_name = self.ns.get(self.base_uriref + '#')
        self.project_graph.bind(self.project_name, self.base_uriref + '#')
        self.uri_mapping4cls_and_properties.update({ 'ontology': { 'project_name': self.project_name, 'project_uri': self.base_uriref + '#' }})
        self.uri_mapping4cls_and_properties.update({ 'classes': {} })

    def addClass2Graph(self, cls, semantic_dict=None) -> (URIRef, type):
        """Add a class to project_graph.
:return: (cls_uri (URIRef), super_cls (cls)) """ if semantic_dict is None: semantic_dict = cls.get_semantic_dictionary() comment, label = self.get_comment_label(cls) cls_uri = URIRef(self.base_uriref + '#' + cls.__name__) self.project_graph.add((cls_uri, RDF.type, OWL.Class)) self.project_graph.add((cls_uri, RDFS.isDefinedBy, self.base_uriref)) if comment != '': self.project_graph.add((cls_uri, RDFS.comment, Literal(comment, lang='en'))) if label != '': self.project_graph.add((cls_uri, RDFS.label, Literal(label, lang='en'))) super_uri = None super_cls = None if bool(semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.TYPE)): super_cls = semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.TYPE) super_uri = self.createClassAndProperties(super_cls) if super_uri is not None: self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri)) if SemanticClass.SUBCLASS_OF in semantic_dict[SemanticClass.CLASS_KEY].keys()\ and len(semantic_dict[SemanticClass.CLASS_KEY][SemanticClass.SUBCLASS_OF]) > 0: for super_uri_string in semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.SUBCLASS_OF): super_uri = URIRef(super_uri_string) if not (cls_uri, self.INFERRED_SUB_CLASS, super_uri) in self.project_graph: self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri)) return cls_uri, super_cls def addProperty2Graph(self, property_uri, domain_uri, range_uri, info_dict, property_type=OWL.ObjectProperty): """Add a property to self.project_graph. """ label = 'has ' + property_uri.split('#')[1].replace('has','')\ if SemanticClass.PROPERTY_LABEL not in info_dict.keys() else info_dict[SemanticClass.PROPERTY_LABEL] self.project_graph.add((property_uri, RDF.type, property_type)) self.project_graph.add((property_uri, RDFS.isDefinedBy, self.base_uriref)) self.project_graph.add((property_uri, RDFS.domain, domain_uri)) self.project_graph.add((property_uri, RDFS.range, range_uri)) if SemanticClass.PROPERTY_COMMENT in info_dict.keys(): comment = info_dict[SemanticClass.PROPERTY_COMMENT] self.project_graph.add((property_uri, RDFS.comment, Literal(comment, lang='en'))) self.project_graph.add((property_uri, RDFS.label, Literal(label, lang='en'))) if SemanticClass.CARDINALITY in info_dict.keys()\ and info_dict[SemanticClass.CARDINALITY] > 0: self.addRestriction2Class(domain_uri, property_uri, info_dict) def addRestriction2Class(self, cls_uri, property_uri, info_dict): """Adds restriction on property_uri to class cls_uri. """ if SemanticClass.CARDINALITY in info_dict.keys()\ and info_dict[SemanticClass.CARDINALITY] > 0: if (cls_uri, None, None) not in self.project_graph: warnings.warn('{} not in graph!'.format(cls_uri)) restriction = BNode() cardinality_restriction = URIRef(OWL + info_dict[SemanticClass.CARDINALITY_RESTRICTION])\ if SemanticClass.CARDINALITY_RESTRICTION in info_dict.keys()\ else OWL.cardinality cardinality = info_dict[SemanticClass.CARDINALITY] self.project_graph.add((cls_uri, RDFS.subClassOf, restriction)) self.project_graph.add((restriction, RDF.type, OWL.Restriction)) self.project_graph.add((restriction, OWL.onProperty, property_uri)) self.project_graph.add((restriction, cardinality_restriction, Literal(str(cardinality), datatype=XSD.nonNegativeInteger))) def create_ontology(self, datatypes_dir, target_ontology_file): """Convert all classes contained in datatypes_dir that are subclasses of class_spec.SemanticClass to rdf. 
        :return: exit code (int)
        """
        if isdir(datatypes_dir):
            semantic_classes = self.get_semantic_classes(datatypes_dir)
            if not Py2TTLOntologyConverter.UNITTESTING:
                bar = Bar('creating classes and properties', max=len(semantic_classes))
            for cls in semantic_classes:
                self.createClassAndProperties(cls)
                not bool(Py2TTLOntologyConverter.UNITTESTING) and bar.next()
            not bool(Py2TTLOntologyConverter.UNITTESTING) and bar.finish()
            self.uri_mapping4cls_and_properties['ontology'].update({'ontology_file': target_ontology_file})
            f = open(target_ontology_file, 'wb+')
            f.write(self.project_graph.serialize(format="turtle"))
            f.close()
            if not Py2TTLOntologyConverter.UNITTESTING:
                xml_file = 'mapping_file4' + datatypes_dir.replace(sep, '.') + '2' + target_ontology_file.replace('.' + sep, '').replace(sep, '.').replace('.ttl', '.xml')
                dict2xml(self.uri_mapping4cls_and_properties, xml_file)
        else:
            print('Error: dir {} does not exist!'.format(datatypes_dir))
            usage()
            return 1
        return 0

    def createClassAndProperties(self, cls):
        """Creates an owl:Class and some owl:ObjectProperty from the semantic_dictionary of a python class.
        """
        if not cls.__name__ in self.class_uri_dict:
            self.class_uri_dict.update({cls.__name__: cls})
            semantic_dict = cls.get_semantic_dictionary()
            cls_uri, super_cls = self.addClass2Graph(cls, semantic_dict)
            uri_mapping4properties = {}
            for property_key in self._get_semantic_dictionary_keys_super_first(semantic_dict['properties']):
                super_semantic_dict = {} if super_cls is None else super_cls.get_semantic_dictionary()
                if len(super_semantic_dict) == 0 or not bool(super_semantic_dict['properties'].get(property_key)):
                    property_dict4key = semantic_dict['properties'].get(property_key)
                    property_cls = property_dict4key.get('class')
                    subject_uri, property_uri = self.createProperty(cls_uri, property_key, property_cls, property_dict4key)
                    uri_mapping4properties.update({ property_key: property_uri })
                elif bool(self.uri_mapping4cls_and_properties.get('classes').get(super_cls.__name__).get('properties').get(property_key)):
                    property_uri = self.uri_mapping4cls_and_properties['classes'][super_cls.__name__]['properties'][property_key]
                    uri_mapping4properties.update({ property_key: property_uri})
            self.uri_mapping4cls_and_properties.get('classes').update({ cls.__name__: { 'class_uri': cls_uri, 'properties': uri_mapping4properties }})
        return URIRef(self.base_uriref + '#' + cls.__name__)

    def createProperty(self, domain_uri, property_name, range_cls, info_dict) -> (URIRef, URIRef):
        """Creates an owl:ObjectProperty.
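A hedged usage sketch of the converter (editor's addition; the file names come from the project's config and mapping files, and the call sequence from main() further below, but the snippet assumes it is run from the repository root with the config files present):

converter = Py2TTLOntologyConverter(project_ontology_file='./Friedrich-Nietzsche-late-work-ontology.ttl')
exit_code = converter.create_ontology('svgscripts/datatypes', './tln-ontology_autogenerated.ttl')
# outside unit tests, a successful run also writes the mapping file
# mapping_file4svgscripts.datatypes2tln-ontology_autogenerated.xml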
:return: tuple of domain_uri (rdflib.URIRef) and property_uri (rdflib.URIRef) of created property """ name = self.createPropertyName(property_name=property_name)\ if SemanticClass.PROPERTY_NAME not in info_dict.keys() else info_dict[SemanticClass.PROPERTY_NAME] property_uri = URIRef(self.base_uriref + '#' + name) inferredSubClass = RDFS.subClassOf * '*' range_uri = URIRef(self.base_uriref + '#' + range_cls.__name__) super_property_uri = None if SemanticClass.SUBPROPERTYOF in info_dict.keys(): super_property_uri = URIRef(info_dict[SemanticClass.SUBPROPERTYOF]) elif SemanticClass.SUPER_PROPERTY in info_dict.keys(): domain_uri, super_property_uri = self.createProperty(domain_uri,\ info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME),\ range_cls, info_dict[SemanticClass.SUPER_PROPERTY]) if (property_uri, None, None) not in self.project_graph: property_type = OWL.ObjectProperty if range_cls.__module__ == 'builtins': if range_cls != list: property_type = OWL.DatatypeProperty range_uri = RDFDataHandler.SIMPLE_DATA_TYPE_MAPPING.get(range_cls) if range_uri == XSD.string and property_name == 'URL': range_uri = XSD.anyURI self.addProperty2Graph(property_uri, domain_uri, range_uri, info_dict, property_type=property_type) elif not True in [\ (domain_uri, inferredSubClass, o) in self.project_graph\ for o in self.project_graph.objects(property_uri, RDFS.domain)\ ]: # if domain_uri is NOT a subclass of a cls specified by RDFS.domain if SemanticClass.CARDINALITY in info_dict.keys()\ and info_dict[SemanticClass.CARDINALITY] > 0: self.addRestriction2Class(domain_uri, property_uri, info_dict) self.project_graph.add((property_uri, RDFS.domain, domain_uri)) if super_property_uri is not None\ and (property_uri, RDFS.subPropertyOf, super_property_uri) not in self.project_graph: self.project_graph.add((property_uri, RDFS.subPropertyOf, super_property_uri)) return domain_uri, property_uri def createPropertyName(self, property_name=None, subject_uri=None, object_uri=None, connector='BelongsTo', prefix='has'): """Returns a property name. """ if property_name is not None: property_name = ''.join([ property_name.split('_')[0].lower() ] + [ text.capitalize() for text in property_name.split('_')[1:] ]) return prefix + property_name[0].upper() + property_name[1:] if property_name[0].islower()\ else prefix + property_name elif subject_uri is not None: property_name = subject_uri.split('#')[1] + self.createPropertyName(object_uri=object_uri, prefix=connector) return property_name[0].lower() + property_name[1:] elif object_uri is not None: return prefix + object_uri.split('#')[1] else: return prefix def get_comment_label(self, cls): """Returns comment and label from cls __doc__. """ comment = cls.__doc__.replace('\n','').lstrip() label = cls.__name__ if '.' in cls.__doc__: comment = [ text for text in cls.__doc__.split('\n') if text != '' ][0].lstrip() if '@label' in cls.__doc__: m = re.search('(@label[:]*\s)(.*[\.]*)', cls.__doc__) label_tag, label = m.groups() elif re.search('([A-Z][a-z]+)', label): m = re.search('([A-Z]\w+)([A-Z]\w+)', label) label = ' '.join([ text.lower() for text in re.split(r'([A-Z][a-z]+)', label) if text != '' ]) return comment, label def get_semantic_classes(self, datatypes_dir): """Returns a list of all classes that are contained in datatypes_dir that are subclasses of class_spec.SemanticClass. 
:return: a list of (str_name, class) """ base_dir = dirname(dirname(__file__)) sys.path.append(base_dir) root_modul_name = datatypes_dir.replace('/','.') files = [ file.replace('.py','') for file in listdir(datatypes_dir) if file.endswith('.py') and not file.startswith('test_') and not file.startswith('_')] all_modules = [] for name in files: all_modules.append(importlib.import_module('{}.{}'.format(root_modul_name, name))) all_classes = [] for modul in all_modules: all_classes += inspect.getmembers(modul, inspect.isclass) #all_classes = sorted(set(all_classes)) all_classes = sorted(set(all_classes), key=lambda current_class: current_class[0]) semantic_classes = [ cls for name, cls in all_classes if issubclass(cls, SemanticClass)\ and not issubclass(cls, UnSemanticClass)\ and not (cls == SemanticClass)] return semantic_classes def _get_builtin_cls_keys(self, property_dict): """Returns a list of keys for classes that are builtin. """ builtin_cls_keys = [] for key in property_dict.keys(): property_cls = property_dict.get(key).get('class')\ if type(property_dict.get(key)) is dict\ else property_dict.get(key)[0] if type(property_cls) != dict\ and property_cls.__module__ == 'builtins': builtin_cls_keys.append(key) return builtin_cls_keys def _get_semantic_dictionary_keys_super_first(self, property_dict): """Sorts the keys of the property part of a semantic dictionary and returns the keys for super classes before keys of subclasses. :return: a sorted list of keys. """ builtin_cls_keys = self._get_builtin_cls_keys(property_dict) complex_cls_keys = [] for key in [ key for key in property_dict.keys()\ if key not in builtin_cls_keys ]: current_cls = property_dict.get(key).get('class') key_inserted = False for index, cls_key in enumerate(complex_cls_keys): potential_sub_cls = property_dict.get(cls_key).get('class') if issubclass(potential_sub_cls, current_cls): complex_cls_keys.insert(index, key) key_inserted = True break if not key_inserted: complex_cls_keys.append(key) return builtin_cls_keys + complex_cls_keys def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert py classes that are subclasses of class_spec.SemanticClass to owl:Class and its properties to owl:ObjectProperty. py2ttl/py2ttl_ontology.py [OPTIONS ] [optional] directory containing datatypes that are subclasses of class_spec.SemanticClass. Overwrites DATATYPES_DIR in py2ttl/config.py. 
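# For orientation, two illustrative invocations of this script (editor's
# addition; the source and target paths match the defaults described below,
# but any paths would do):
#   python3 py2ttl/py2ttl_ontology.py svgscripts/datatypes
#   python3 py2ttl/py2ttl_ontology.py -s ./Friedrich-Nietzsche-late-work-ontology.ttl \
#       -t ./tln-ontology_autogenerated.ttl svgscripts/datatypes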
OPTIONS: -h|--help: show help -s|--source=source_ontology_file source ontology ttl file, option overwrites PROJECT_ONTOLOGY_FILE in py2ttl/config.py -t|--target=target_ontology_file target ontology ttl file, default: 'PROJECT_PREFIX-ontology_autogenerated.ttl' :return: exit code (int) """ check_config_files_exist() datatypes_dir = get_datatypes_dir() source_ontology_file = PROJECT_ONTOLOGY_FILE target_ontology_file = '' try: opts, args = getopt.getopt(argv, "hs:t:", ["help","source=", "target="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-t', '--target'): target_ontology_file = arg elif opt in ('-s', '--source'): source_ontology_file = arg converter = Py2TTLOntologyConverter(project_ontology_file=source_ontology_file) if len(args) > 0: datatypes_dir = args[0] if target_ontology_file == '': target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, converter.project_name) return converter.create_ontology(datatypes_dir, target_ontology_file) if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: tests_py2ttl/test_data/mapping_dict.xml =================================================================== --- tests_py2ttl/test_data/mapping_dict.xml (revision 101) +++ tests_py2ttl/test_data/mapping_dict.xml (revision 102) @@ -1,344 +1,352 @@ tln http://www.nie.org/ontology/nietzsche# ./tln-ontology_autogenerated.ttl http://www.nie.org/ontology/nietzsche#ArchivalManuscriptUnity http://www.nie.org/ontology/nietzsche#hasTitle http://www.nie.org/ontology/nietzsche#hasManuscriptType http://www.nie.org/ontology/nietzsche#hasStyles http://www.nie.org/ontology/nietzsche#hasPages http://www.nie.org/ontology/nietzsche#hasDescription http://www.nie.org/ontology/nietzsche#hasEarlierDescriptions http://www.nie.org/ontology/nietzsche#EditorComment http://www.nie.org/ontology/nietzsche#isUncertain http://www.nie.org/ontology/nietzsche#hasComment http://www.nie.org/ontology/nietzsche#AtypicalWriting http://www.nie.org/ontology/nietzsche#isUncertain http://www.nie.org/ontology/nietzsche#hasComment http://www.nie.org/ontology/nietzsche#atypicalWritingHasText http://www.nie.org/ontology/nietzsche#Path http://www.nie.org/ontology/nietzsche#hasDAttribute http://www.nie.org/ontology/nietzsche#Box http://www.nie.org/ontology/nietzsche#hasDAttribute http://www.nie.org/ontology/nietzsche#hasEarlierText http://www.nie.org/ontology/nietzsche#Clarification http://www.nie.org/ontology/nietzsche#isUncertain http://www.nie.org/ontology/nietzsche#hasComment http://www.nie.org/ontology/nietzsche#clarificationHasText http://www.nie.org/ontology/nietzsche#Color http://www.nie.org/ontology/nietzsche#colorHasName http://www.nie.org/ontology/nietzsche#hasHexadecimalValue http://www.nie.org/ontology/nietzsche#Text http://www.nie.org/ontology/nietzsche#textHasContent http://www.nie.org/ontology/nietzsche#textHasMarkup http://www.nie.org/ontology/nietzsche#Description http://www.nie.org/ontology/nietzsche#textHasContent http://www.nie.org/ontology/nietzsche#textHasMarkup http://www.nie.org/ontology/nietzsche#EarlierDescription http://www.nie.org/ontology/nietzsche#textHasContent http://www.nie.org/ontology/nietzsche#hasAuthor http://www.nie.org/ontology/nietzsche#hasCitation http://www.nie.org/ontology/nietzsche#textHasMarkup http://www.nie.org/ontology/nietzsche#EditorCorrection http://www.nie.org/ontology/nietzsche#isUncertain http://www.nie.org/ontology/nietzsche#hasComment http://www.nie.org/ontology/nietzsche#hasCorrectionText 
http://www.nie.org/ontology/nietzsche#Image http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasFileName + http://www.nie.org/ontology/nietzsche#hasTransform + http://www.nie.org/ontology/nietzsche#hasUrl + http://www.nie.org/ontology/nietzsche#hasTextField http://www.nie.org/ontology/nietzsche#FaksimileImage http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasFileName + http://www.nie.org/ontology/nietzsche#hasTransform http://www.nie.org/ontology/nietzsche#hasUrl http://www.nie.org/ontology/nietzsche#hasTextField http://www.nie.org/ontology/nietzsche#PositionalObject http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasLeft http://www.nie.org/ontology/nietzsche#hasTop http://www.nie.org/ontology/nietzsche#hasBottom http://www.nie.org/ontology/nietzsche#hasTransform http://www.nie.org/ontology/nietzsche#WordPosition http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasLeft http://www.nie.org/ontology/nietzsche#hasTop http://www.nie.org/ontology/nietzsche#hasBottom http://www.nie.org/ontology/nietzsche#hasTransform http://www.nie.org/ontology/nietzsche#FaksimilePosition http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasLeft http://www.nie.org/ontology/nietzsche#hasTop http://www.nie.org/ontology/nietzsche#hasBottom http://www.nie.org/ontology/nietzsche#hasTransform http://www.nie.org/ontology/nietzsche#isOnFaksimileImage http://www.nie.org/ontology/nietzsche#isOnTextField http://www.nie.org/ontology/nietzsche#Line http://www.nie.org/ontology/nietzsche#lineHasNumber http://www.nie.org/ontology/nietzsche#lineHasBottomValueOnTranskription http://www.nie.org/ontology/nietzsche#lineHasTopValueOnTranskription http://www.nie.org/ontology/nietzsche#lineHasInnerBottomValueOnFaksimile http://www.nie.org/ontology/nietzsche#lineHasInnerTopValueOnFaksimile http://www.nie.org/ontology/nietzsche#lineHasOuterBottomValueOnFaksimile http://www.nie.org/ontology/nietzsche#lineHasOuterTopValueOnFaksimile http://www.nie.org/ontology/nietzsche#isMainLine http://www.nie.org/ontology/nietzsche#lineHasEditorComment http://www.nie.org/ontology/nietzsche#LineContinuation http://www.nie.org/ontology/nietzsche#isUncertain http://www.nie.org/ontology/nietzsche#hasComment http://www.nie.org/ontology/nietzsche#isLineAContinuationTo http://www.nie.org/ontology/nietzsche#lineContinuationHasReference http://www.nie.org/ontology/nietzsche#SimpleWord http://www.nie.org/ontology/nietzsche#hasText http://www.nie.org/ontology/nietzsche#wordBelongsToLine http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition http://www.nie.org/ontology/nietzsche#hasFaksimilePosition http://www.nie.org/ontology/nietzsche#SpecialWord http://www.nie.org/ontology/nietzsche#hasText http://www.nie.org/ontology/nietzsche#wordBelongsToLine http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition http://www.nie.org/ontology/nietzsche#hasFaksimilePosition http://www.nie.org/ontology/nietzsche#MarkForeignHands http://www.nie.org/ontology/nietzsche#hasText http://www.nie.org/ontology/nietzsche#textOfForeignHands http://www.nie.org/ontology/nietzsche#penOfForeignHands http://www.nie.org/ontology/nietzsche#wordBelongsToLine 
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition http://www.nie.org/ontology/nietzsche#hasFaksimilePosition http://www.nie.org/ontology/nietzsche#Page http://www.nie.org/ontology/nietzsche#hasNumber http://www.nie.org/ontology/nietzsche#hasOrientation http://www.nie.org/ontology/nietzsche#hasLines http://www.nie.org/ontology/nietzsche#hasMarkForeignHands http://www.nie.org/ontology/nietzsche#hasWords http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks http://www.nie.org/ontology/nietzsche#hasFaksimileImage + http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField + http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField http://www.nie.org/ontology/nietzsche#hasSvgImage - http://www.nie.org/ontology/nietzsche#pageIsOnTextField http://www.nie.org/ontology/nietzsche#Reference http://www.nie.org/ontology/nietzsche#firstLineOfReference http://www.nie.org/ontology/nietzsche#lastLineOfReference http://www.nie.org/ontology/nietzsche#wordReference http://www.nie.org/ontology/nietzsche#IsUncertain http://www.nie.org/ontology/nietzsche#hasTitle http://www.nie.org/ontology/nietzsche#hasPageNumber http://www.nie.org/ontology/nietzsche#SVGImage http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasFileName + http://www.nie.org/ontology/nietzsche#hasTransform + http://www.nie.org/ontology/nietzsche#hasUrl + http://www.nie.org/ontology/nietzsche#hasTextField http://www.nie.org/ontology/nietzsche#StandoffTag http://www.nie.org/ontology/nietzsche#standoffTagHasStartIndex http://www.nie.org/ontology/nietzsche#standoffTagHasEndIndex http://www.nie.org/ontology/nietzsche#standoffTagHasCSS http://www.nie.org/ontology/nietzsche#TextConnectionMark http://www.nie.org/ontology/nietzsche#hasText http://www.nie.org/ontology/nietzsche#wordBelongsToLine http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition http://www.nie.org/ontology/nietzsche#hasFaksimilePosition http://www.nie.org/ontology/nietzsche#textConnectionMarkHasTextSource http://www.nie.org/ontology/nietzsche#TextField http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasLeft http://www.nie.org/ontology/nietzsche#hasTop http://www.nie.org/ontology/nietzsche#hasBottom http://www.nie.org/ontology/nietzsche#hasTransform http://www.nie.org/ontology/nietzsche#TranskriptionPosition http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasLeft http://www.nie.org/ontology/nietzsche#hasTop http://www.nie.org/ontology/nietzsche#hasBottom http://www.nie.org/ontology/nietzsche#hasTransform http://www.nie.org/ontology/nietzsche#isOnSvgImage http://www.nie.org/ontology/nietzsche#UncertainDecipherment http://www.nie.org/ontology/nietzsche#isUncertain http://www.nie.org/ontology/nietzsche#hasComment http://www.nie.org/ontology/nietzsche#Word http://www.nie.org/ontology/nietzsche#hasText http://www.nie.org/ontology/nietzsche#hasEditedText http://www.nie.org/ontology/nietzsche#wordHasWordParts http://www.nie.org/ontology/nietzsche#wordBelongsToLine http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition http://www.nie.org/ontology/nietzsche#hasFaksimilePosition http://www.nie.org/ontology/nietzsche#wordHasStyle http://www.nie.org/ontology/nietzsche#overwritesWord http://www.nie.org/ontology/nietzsche#isTransformationOfWord 
http://www.nie.org/ontology/nietzsche#isExtensionOfWord http://www.nie.org/ontology/nietzsche#isDeletionOfWord http://www.nie.org/ontology/nietzsche#isClarificationOfWord http://www.nie.org/ontology/nietzsche#wordHasEarlierVersion http://www.nie.org/ontology/nietzsche#wordHasCorrection http://www.nie.org/ontology/nietzsche#wordIsDeletedByPath http://www.nie.org/ontology/nietzsche#wordHasEditorComment http://www.nie.org/ontology/nietzsche#WordDeletionPath http://www.nie.org/ontology/nietzsche#hasDAttribute http://www.nie.org/ontology/nietzsche#WordInsertionMark http://www.nie.org/ontology/nietzsche#hasHeight http://www.nie.org/ontology/nietzsche#hasWidth http://www.nie.org/ontology/nietzsche#hasLeft http://www.nie.org/ontology/nietzsche#hasTop http://www.nie.org/ontology/nietzsche#hasBottom http://www.nie.org/ontology/nietzsche#hasTransform http://www.nie.org/ontology/nietzsche#hasMarkType http://www.nie.org/ontology/nietzsche#hasSymbolId http://www.nie.org/ontology/nietzsche#hasNextWord http://www.nie.org/ontology/nietzsche#hasPreviousWord http://www.nie.org/ontology/nietzsche#wordInsertionMarkBelongsToLine xml-dictionary - 2020-07-15 19:50:04 + 2020-11-11 15:04:42 Index: Friedrich-Nietzsche-late-work-ontology.ttl =================================================================== --- Friedrich-Nietzsche-late-work-ontology.ttl (revision 101) +++ Friedrich-Nietzsche-late-work-ontology.ttl (revision 102) @@ -1,26 +1,41 @@ @prefix dct: . +@prefix document: . @prefix homotypic: . @prefix stoff: . @prefix text: . @prefix owl: . @prefix rdfs: . @prefix xsd: . @prefix tln: . a owl:Ontology; dct:license ; dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en; dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsches late work."""@en; dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en; dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en; dct:publisher "Basel University, Switzerland"@en. tln:inheritOverwritesWord a owl:ObjectProperty ; rdfs:subPropertyOf tln:overwritesWord; rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ; rdfs:comment "The author has used this word in order to overwrite that word."@en ; rdfs:isDefinedBy ; owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ). - + +tln:Page a owl:Class ; + rdfs:subClassOf document:Page . + +tln:writingContinuesWithWord a owl:ObjectProperty ; + rdfs:label "writing continues with next word"@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:Word ; + rdfs:range tln:Word . + +tln:lineContinuesOn a owl:ObjectProperty ; + rdfs:label "writing from subject line continues with object line"@en ; + rdfs:isDefinedBy ; + rdfs:domain tln:Line ; + rdfs:range tln:Line . Index: svgscripts/fix_old_data.py =================================================================== --- svgscripts/fix_old_data.py (revision 101) +++ svgscripts/fix_old_data.py (revision 102) @@ -1,212 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -""" This program can be used to process words after they have been merged with faksimile data. 
-""" -# Copyright (C) University of Basel 2019 {{{1 -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see 1}}} - -from colorama import Fore, Style -from deprecated import deprecated -from functools import cmp_to_key -import getopt -import inspect -import lxml.etree as ET -import re -import shutil -import string -from svgpathtools import svg2paths2, svg_to_paths -from svgpathtools.path import Path as SVGPath -from svgpathtools.path import Line -import sys -import tempfile -from operator import attrgetter -import os -from os import listdir, sep, path, setpgrp, devnull -from os.path import exists, isfile, isdir, dirname, basename -from progress.bar import Bar -import warnings - -if dirname(__file__) not in sys.path: - sys.path.append(dirname(__file__)) - -from datatypes.box import Box -from datatypes.faksimile import FaksimilePage -from datatypes.manuscript import ArchivalManuscriptUnity -from datatypes.mark_foreign_hands import MarkForeignHands -from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK -from datatypes.path import Path -from datatypes.text_connection_mark import TextConnectionMark -from datatypes.transkriptionField import TranskriptionField -from datatypes.word import Word, update_transkription_position_ids -from util import back_up -from process_files import update_svgposfile_status -from process_words_post_merging import update_faksimile_line_positions - -sys.path.append('shared_util') -from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT -from main_util import create_function_dictionary - - -__author__ = "Christian Steiner" -__maintainer__ = __author__ -__copyright__ = 'University of Basel' -__email__ = "christian.steiner@unibas.ch" -__status__ = "Development" -__license__ = "GPL v3" -__version__ = "0.0.1" - -UNITTESTING = False - -def save_page(page): - """Write page to xml file - """ - script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}' - write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ - script_name=script_name, file_type=FILE_TYPE_SVG_WORD_POSITION) - -def page_already_changed(page) -> bool: - """Return whether page has alreadybeen changed by function - """ - return len(\ - page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}:{inspect.currentframe().f_back.f_code.co_name}"]')\ - ) > 0 - -def fix_faksimile_line_position(page, redo=False) -> bool: - """Create a faksimile line position. - """ - if not redo and page_already_changed(page): - return False; - update_faksimile_line_positions(page) - if not UNITTESTING: - save_page(page) - return True - -def check_faksimile_positions(page, redo=False) -> bool: - """Check faksimile line position. 
- """ - if len(page.page_tree.xpath('//data-source/@file')) > 0: - svg_file = page.page_tree.xpath('//data-source/@file')[0] - svg_tree = ET.parse(svg_file) - positions_are_equal_counter = 0 - page_changed = False - for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree): - if page.title == faksimile_page.title\ - and page.number == faksimile_page.page_number: - #print([fp.id for fp in faksimile_page.word_positions ]) - for word in page.words: - for fp in word.faksimile_positions: - rect_fps = [ rfp for rfp in faksimile_page.word_positions if rfp.id == fp.id ] - if len(rect_fps) > 0: - rfp = rect_fps[0] - if fp.left != rfp.left or fp.top != rfp.top: - #print(f'{fp.id}: {fp.left}/{rfp.left} {fp.top}/{rfp.top}') - fp.left = rfp.left - fp.top = rfp.top - fp.bottom = fp.top + rfp.height - word.attach_word_to_tree(page.page_tree) - page_changed = True - else: - positions_are_equal_counter += 1 - print(f'{positions_are_equal_counter}/{len(page.words)} are equal') - if page_changed and not UNITTESTING: - save_page(page) - return page_changed - -def fix_faksimile_positions(page, redo=False) -> bool: - """Set faksimile positions to absolute values. - - [:return:] fixed - """ - if not redo and len(page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}"]')) > 0: - return False - x_min = page.text_field.xmin - y_min = page.text_field.ymin - for word in page.words: - for fp in word.faksimile_positions: - fp.left = fp.left + x_min - fp.top = fp.top + y_min - fp.bottom = fp.bottom + y_min - word.attach_word_to_tree(page.page_tree) - if not UNITTESTING: - print(f'writing to {page.page_tree.docinfo.URL}') - write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ - script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) - return True - -def usage(): - """prints information on how to use the script - """ - print(main.__doc__) - -def main(argv): - """This program can be used to fix faksimile position ->set them to their absolute value. - - svgscripts/fix_old_data.py [OPTIONS] - - a xml file about a manuscript, containing information about its pages. - a xml file about a page, containing information about svg word positions. 
- - OPTIONS: - -h|--help show help - -c|--check-faksimile-positions check whether faksimile positions have been updated - -l|--faksimile-line-position create faksimile line positions - -p|--faksimile-positions fix old faksimile positions - -r|--redo rerun - - :return: exit code (int) - """ - function_list = [] - function_dict = create_function_dictionary(['default', '-c', '--check-faksimile-positions'], check_faksimile_positions) - function_dict = create_function_dictionary(['default', '-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict) - function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict) - redo = False; - try: - opts, args = getopt.getopt(argv, "hcplr", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position", "redo" ]) - except getopt.GetoptError: - usage() - return 2 - for opt, arg in opts: - if opt in ('-h', '--help'): - usage() - return 0 - elif opt in ('-r', '--redo'): - redo = True; - elif opt in function_dict.keys(): - function_list.append(function_dict[opt]) - if len(function_list) == 0: - function_list.append(function_dict['default']) - if len(args) < 1: - usage() - return 2 - exit_status = 0 - xml_file = args[0] - if isfile(xml_file): - counters = { f.__name__: 0 for f in function_list } - for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK): - for current_function in function_list: - if not UNITTESTING: - print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' + Style.RESET_ALL) - back_up(page, page.xml_file) - counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0 - if not UNITTESTING: - for function_name, counter in counters.items(): - print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]') - else: - raise FileNotFoundError('File {} does not exist!'.format(xml_file)) - return exit_status - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) Index: svgscripts/extract_line_continuation.py =================================================================== --- svgscripts/extract_line_continuation.py (revision 101) +++ svgscripts/extract_line_continuation.py (revision 102) @@ -1,224 +1,227 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract line continuations. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style import getopt import lxml.etree as ET import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import warnings __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from datatypes.box import text_node_is_inside_match_box, tspan_node_is_inside_match_box from datatypes.line import Line from datatypes.line_continuation import LineContinuation from datatypes.matrix import Matrix from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.reference import Reference from datatypes.transkriptionField import TranskriptionField from util import back_up sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT UNITTESTING = False DEBUG = False def extract_line_continuations(page: Page, svg_file=None, warning_message='WARNING'): """Extract line continuations. """ if svg_file is None: if page.source is None or not isfile(page.source): raise Exception('Function "extract_line_continuations" needs a page with a valid source or a svg_file!') svg_file = page.source if not UNITTESTING: print(Fore.CYAN + f'Extracting line continuations on {page.title}, {page.number} ...' + Style.RESET_ALL) svg_tree = ET.parse(svg_file) transkription_field = TranskriptionField(svg_file, multipage_index=page.multipage_index) - page.update_line_number_area(transkription_field, svg_tree=svg_tree) + set_to_text_field_zero = (page.svg_image is None or page.svg_image.text_field is None) + tr_xmin = transkription_field.xmin if set_to_text_field_zero else 0 + tr_ymin = transkription_field.ymin if set_to_text_field_zero else 0 + page.update_line_number_area(transkription_field, svg_tree=svg_tree, set_to_text_field_zero=set_to_text_field_zero) for line in page.lines: line.editor_comments = [] namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } arrow_style_key = [ key for key, value in page.style_dict.items() if value.get('font-family') == 'Frutiger-Europeen'][0]\ if len([ key for key, value in page.style_dict.items() if value.get('font-family') == 'Frutiger-Europeen']) > 0\ else None if arrow_style_key is not None: if DEBUG: print(arrow_style_key) for arrow in _extract_arrow_nodes(svg_tree, arrow_style_key, transkription_field, namespaces): matrix = Matrix(transform_matrix_string=arrow.get('transform'))\ if not arrow.tag.endswith('tspan')\ else Matrix(transform_matrix_string=arrow.getparent().get('transform')) - line = _get_line_of_arrow(arrow, page, transkription_field) + line = _get_line_of_arrow(arrow, page, tr_ymin) if line is not None: reference_counter = 0 reference = None while reference is None and reference_counter < 2: reference = _get_reference(svg_tree, arrow, matrix, transkription_field, namespaces, is_from_reference=(reference_counter==0)) reference_counter += 1 if reference is not None: line.editor_comments.append(LineContinuation(reference=reference, to_reference=(reference_counter>1))) else: to_reference = (matrix.getX() > transkription_field.xmax) line.editor_comments.append(LineContinuation(reference=Reference(), to_reference=to_reference)) else: - y = round(matrix.getY() - transkription_field.ymin, 2) + y = round(matrix.getY() - tr_ymin, 2) warnings.warn(f'{warning_message}: There is no line for {y}') for line in page.lines: 
line.attach_object_to_tree(page.page_tree) if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def _extract_arrow_nodes(svg_tree: ET.ElementTree, arrow_style_key: str, transkription_field=None, namespaces=None) ->list: """Extract arrow nodes from svg_tree. """ if transkription_field is None: transkription_field = TranskriptionField(svg_tree.docinfo.URL) if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } return [ arrow for arrow in svg_tree.xpath('//ns:text[contains(@class, "{0}")]'.format(arrow_style_key)\ + '|//ns:tspan[contains(@class, "{0}")]'.format(arrow_style_key),\ namespaces=namespaces)\ if arrow.text == ')' and node_is_on_marginalia(arrow, transkription_field) ] def _get_arrow_y(arrow: ET.Element, matrix=None) ->float: """Return y of arrow node. """ if matrix is None: matrix = Matrix(transform_matrix_string=arrow.get('transform'))\ if not arrow.tag.endswith('tspan')\ else Matrix(transform_matrix_string=arrow.getparent().get('transform')) if arrow.tag.endswith('tspan'): return matrix.add2Y(add_to_y=arrow.get('y')) else: return matrix.getY() -def _get_line_of_arrow(arrow: ET.Element, page: Page, transkription_field: TranskriptionField, matrix=None) ->Line: +def _get_line_of_arrow(arrow: ET.Element, page: Page, tr_ymin: float, matrix=None) ->Line: """Return Line next to arrow. """ arrow_y = _get_arrow_y(arrow, matrix=matrix) - line_number = page.get_line_number(round(arrow_y - transkription_field.ymin, 2) -.5) + line_number = page.get_line_number(round(arrow_y - tr_ymin, 2) -.5) lines = [ line for line in page.lines if line.id == line_number ] if len(lines) > 0: return lines[0] return None def _get_reference(svg_tree: ET.ElementTree, arrow: ET.Element, arrow_matrix: Matrix, transkription_field: TranskriptionField, namespaces: dict, is_from_reference=True) ->Reference: """Return reference. """ reference = None arrow_left = arrow_matrix.add2X(add_to_x=arrow.get('x'))\ if arrow.tag.endswith('tspan')\ else arrow_matrix.getX() arrow_y = _get_arrow_y(arrow, matrix=arrow_matrix) xmin = 0\ if arrow_left < transkription_field.xmin\ else transkription_field.xmax + transkription_field.line_number_area_width xmax = arrow_left ymin = arrow_y -5 ymax = arrow_y +5 if not is_from_reference: xmin = xmax xmax = transkription_field.xmin - transkription_field.line_number_area_width\ if arrow_left < transkription_field.xmin\ else transkription_field.documentWidth + transkription_field.line_number_area_width text_nodes_on_arrow_line = sorted([ text_node for text_node in svg_tree.xpath('//ns:text', namespaces=namespaces)\ if text_node != arrow and text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax) ],\ key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX()) reference_string = '' for text_node in text_nodes_on_arrow_line: reference_string += ''.join([ child.text for child in text_node.getchildren()])\ if len(text_node.getchildren()) > 0\ else text_node.text if reference_string != '': try: reference = Reference.create_cls(reference_string=reference_string) except Exception: print(reference_string) return reference def node_is_on_marginalia(node: ET.Element, transkription_field: TranskriptionField) ->bool: """Return true if node is on marginalia. 
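# Illustrative sketch (not part of the diff): _get_reference collects the text nodes
# inside its match box, sorts them left to right by the X component of their transform
# matrix and concatenates their text. Standalone approximation, with split()[4]
# standing in for Matrix.getX():
import lxml.etree as ET
nodes = [ ET.fromstring('<text transform="matrix(1 0 0 1 30 10)">8,</text>'),
          ET.fromstring('<text transform="matrix(1 0 0 1 10 10)">S.</text>') ]
nodes.sort(key=lambda node: float(node.get('transform').split()[4]))
reference_string = ''.join(node.text for node in nodes)   # -> 'S.8,'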
""" if node.tag.endswith('tspan'): return tspan_node_is_inside_match_box(node, 0, transkription_field.xmin, transkription_field.ymin, transkription_field.ymax)\ or tspan_node_is_inside_match_box(node, transkription_field.xmax, transkription_field.documentWidth, transkription_field.ymin, transkription_field.ymax) return text_node_is_inside_match_box(node, 0, transkription_field.xmin, transkription_field.ymin, transkription_field.ymax)\ or text_node_is_inside_match_box(node, transkription_field.xmax, transkription_field.documentWidth, transkription_field.ymin, transkription_field.ymax) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract the line continuations. svgscripts/extract_line_continuation.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK): if not UNITTESTING: back_up(page, page.xml_file) extract_line_continuations(page) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/util.py =================================================================== --- svgscripts/util.py (revision 101) +++ svgscripts/util.py (revision 102) @@ -1,509 +1,522 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to copy a faksimile svg file with the option of highlighting some word boxes. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style from datetime import datetime from functools import cmp_to_key import getopt import inspect import itertools import lxml.etree as ET import re import shutil import signal import string import subprocess from svgpathtools import svg_to_paths import sys import tempfile import os from os import listdir, sep, path, setpgrp, devnull, makedirs from os.path import basename, commonpath, dirname, exists, isfile, isdir, realpath, splitext import warnings import wget import xml.etree.ElementTree as XET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.faksimile import FaksimilePage, get_paths_inside_rect from datatypes.faksimile_image import FaksimileImage from datatypes.lineNumber import LineNumber from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.page import Page from datatypes.page_creator import PageCreator from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition -from datatypes.word import update_transkription_position_ids +from datatypes.word import Word, update_transkription_position_ids from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT sys.path.append('shared_util') from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False HIGHLIGHT_COLOR = 'red' OPACITY = '0.5' class ExternalViewer: """This class can be used to show files with external viewers. """ file_format_viewer_dict = { '.pdf': PDF_READER, '.svg': SVG_EDITOR } @classmethod def show_files(cls, single_file=None, list_of_files=[]): """Opens file(s) with corresponding external viewer(s). """ DEVNULL = None if type(single_file) == list: list_of_files = single_file elif single_file is not None: list_of_files.append(single_file) if len(list_of_files) > 1: DEVNULL = open(devnull, 'wb') process_list = [] list_of_files.reverse() while len(list_of_files) > 0: file2open = list_of_files.pop() viewer = cls.file_format_viewer_dict.get(splitext(file2open)[1]) if viewer is not None: if len(list_of_files) > 0: process_list.append(\ subprocess.Popen([viewer, file2open], stdout=DEVNULL, stderr=DEVNULL, preexec_fn=os.setsid)) else: subprocess.run([viewer, file2open]) for process in process_list: os.killpg(os.getpgid(process.pid), signal.SIGTERM) if DEVNULL is not None: DEVNULL.close() def back_up(page: Page, reference_file, bak_dir='./bak') -> str: """Back up a xml_source_file. :return: target_file_name """ date_string = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') makedirs(bak_dir, exist_ok=True) page.bak_file = bak_dir + sep + basename(page.page_tree.docinfo.URL) + '_' + date_string write_pretty(xml_element_tree=page.page_tree, file_name=page.bak_file,\ script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, reference_file),\ file_type=FILE_TYPE_SVG_WORD_POSITION) return page.bak_file def back_up_svg_file(svg_tree: ET.ElementTree, namespaces=None, bak_dir='./bak') -> str: """Back up a xml_source_file. 
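# Illustrative sketch (not part of the diff): the recurring comprehension
# `{ k if k is not None else 'ns': v ... }` exists because lxml stores a file's
# default namespace under the key None, which xpath() rejects, so it is re-keyed
# as 'ns'. Minimal standalone demo:
import lxml.etree as ET
root = ET.fromstring('<svg xmlns="http://www.w3.org/2000/svg"><text>x</text></svg>')
namespaces = { k if k is not None else 'ns': v for k, v in root.nsmap.items() }
print(root.xpath('//ns:text', namespaces=namespaces)[0].text)   # -> 'x'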
:return: target_file_name """ if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } date_string = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') makedirs(bak_dir, exist_ok=True) bak_file = bak_dir + sep + date_string + '_' + basename(svg_tree.docinfo.URL) copy_faksimile_svg_file(target_file=bak_file, faksimile_tree=svg_tree, namespaces=namespaces) return bak_file def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, abs_image_path=None, local_image_path=None, namespaces=None): """Copy a faksimile_svg_file to target_file. """ if faksimile_source_file is None and faksimile_tree is not None: faksimile_source_file = faksimile_tree.docinfo.URL elif faksimile_source_file is None: raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file') if target_file is not None and target_directory is not None: target_file = target_directory + sep + target_file elif target_file is None and target_directory is not None: target_file = target_directory + sep + basename(faksimile_source_file) elif target_file is None: raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory') paths, attributes, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True) for key in [ key for key in svg_attributes.keys() if key.startswith('xmlns:') ]: try: XET.register_namespace(key.replace('xmlns:', ''), svg_attributes[key]) except ValueError: pass XET.register_namespace('', 'http://www.w3.org/2000/svg') if namespaces is None: namespaces = { 'ns': svg_attributes['xmlns'], 'xlink': svg_attributes['xmlns:xlink'],\ 'sodipodi': svg_attributes['xmlns:sodipodi'] } if faksimile_tree is not None: element = XET.fromstring(ET.tostring(faksimile_tree))\ if type(faksimile_tree) == ET._ElementTree\ else XET.fromstring(XET.tostring(faksimile_tree.getroot())) target_tree = XET.ElementTree(element) else: target_tree = XET.parse(faksimile_source_file) if (local_image_path is not None or abs_image_path is not None)\ and len(target_tree.findall('.//ns:image', namespaces=namespaces)) > 0: image_node = target_tree.findall('.//ns:image', namespaces=namespaces)[0] if local_image_path is not None: image_node.set('{%s}href' % namespaces['xlink'], local_image_path) if abs_image_path is not None: image_node.set('{%s}absref' % namespaces['sodipodi'], abs_image_path) target_tree.write(target_file) def copy_faksimile_update_image_location(faksimile_source_file=None, faksimile_tree=None, target_file=None, target_directory=None, overwrite=False): """Copy a faksimile_svg_file to target_file and update image location. 
""" if faksimile_source_file is None and faksimile_tree is not None: faksimile_source_file = faksimile_tree.docinfo.URL elif faksimile_source_file is None: raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file') if target_file is not None and target_directory is not None: target_file = target_directory + sep + target_file elif target_file is None and target_directory is not None: target_file = target_directory + sep + basename(faksimile_source_file) elif target_directory is None and target_file is not None: target_directory = dirname(target_file) elif target_file is None: raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory') source_tree = ET.parse(faksimile_source_file) if faksimile_tree is None else faksimile_tree namespaces = { k if k is not None else 'ns': v for k, v in source_tree.getroot().nsmap.items() } image_nodes = source_tree.xpath('//ns:image', namespaces=namespaces) local_image_path = None abs_image_path = None user_abs_image_path = None if len(image_nodes) > 0: image = FaksimileImage.CREATE_IMAGE(image_nodes[0], source_file=faksimile_source_file) abs_image_path = image.local_path for user_name in USER_ROOT_LOCATION_DICT.keys(): if user_name in target_directory: user_abs_image_path = abs_image_path.replace(FAKSIMILE_LOCATION, USER_ROOT_LOCATION_DICT[user_name]).replace('//','/') break # if target_directory is subdir of FAKSIMILE_LOCATION if realpath(target_directory).startswith(realpath(FAKSIMILE_LOCATION)): common_path = commonpath([ realpath(target_directory), realpath(dirname(image.local_path)) ]) relative_directory = '/'.join(\ [ '..' for d in realpath(target_directory).replace(common_path + '/', '').split('/') ]) local_image_path = relative_directory + realpath(image.local_path).replace(common_path, '') if not isfile(target_directory + sep + local_image_path): local_image_path = None elif abs_image_path is not None: local_image_path = abs_image_path if abs_image_path is not None and not isfile(abs_image_path): wget.download(image.URL, out=dirname(abs_image_path)) if not isfile(target_file) or overwrite: abs_image_path = user_abs_image_path if user_abs_image_path is not None else abs_image_path copy_faksimile_svg_file(target_file=target_file, faksimile_source_file=faksimile_source_file,\ faksimile_tree=faksimile_tree, abs_image_path=abs_image_path,\ local_image_path=local_image_path, namespaces=namespaces) else: msg = 'File {0} not copied to directory {1}, it already contains a file {2}.'.format(faksimile_source_file, target_directory, target_file) warnings.warn(msg) def copy_xml_file_word_pos_only(xml_source_file, target_directory): """Copy word positions of a xml file to target directory. 
:return: (str) xml_target_file
    """
    xml_target_file = target_directory + sep + basename(xml_source_file)
    source_page = Page(xml_source_file)
    target_page = PageCreator(xml_target_file, title=source_page.title, page_number=source_page.number, orientation=source_page.orientation)
    target_page.words = source_page.words
    target_page.update_and_attach_words2tree()
    write_pretty(xml_element_tree=target_page.page_tree, file_name=xml_target_file,\
            script_name=__file__ + '({})'.format(inspect.currentframe().f_code.co_name), file_type=FILE_TYPE_SVG_WORD_POSITION)
    return xml_target_file

def create_highlighted_svg_file(faksimile_tree, node_ids, nodes_color_dict=None, target_file=None, target_directory=None, local_image_path=None, namespaces=None, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY):
    """Highlights the nodes of a faksimile_tree that are specified by the list of node_ids and writes the tree to a file.
    """
    if namespaces is None:
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    for node in itertools.chain(*[\
            faksimile_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces)\
            for node_id in node_ids\
            ]):
        node.set('fill', highlight_color)
        node.set('opacity', opacity)
        node.set('style', '')
    copy_faksimile_update_image_location(target_file=target_file, faksimile_tree=faksimile_tree, target_directory=target_directory)

def get_empty_node_ids(faksimile_tree, x_min=0.0, x_max=0.0, y_min=0.0, y_max=0.0, text_field_id=None, faksimile_page=None, namespaces={}):
    """Returns a list of ids of rect and path nodes that do not have a title element.
    """
    THRESHOLD_X = 10
    if faksimile_page is not None:
        x_min = faksimile_page.text_field.xmin + faksimile_page.faksimile_image.x
        x_max = faksimile_page.text_field.xmax + faksimile_page.faksimile_image.x - THRESHOLD_X
        y_min = faksimile_page.text_field.ymin + faksimile_page.faksimile_image.y
        y_max = faksimile_page.text_field.ymax + faksimile_page.faksimile_image.y
        text_field_id = faksimile_page.text_field.id
    if len(namespaces) == 0:
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    empty_node_ids = []
    nodes_without_title = faksimile_tree.xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\
            x_min, x_max, y_min, y_max, text_field_id), namespaces=namespaces)
    nodes_without_title += get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]', x_min, x_max, y_min, y_max, text_field_id, namespaces=namespaces)
    for node_without_title in nodes_without_title:
        empty_node_ids.append(node_without_title.get('id'))
    return empty_node_ids

def get_mismatching_ids(words, faksimile_positions):
    """
    Return the list of mismatching words and the list of mismatching faksimile_positions as a 2-tuple.
    """
    mismatching_words = []
    mismatching_faksimile_positions = []
    faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions)
    word_texts = [ word.text for word in words if word.text != '.'
] for word_text in set(word_texts): if word_text not in unique_faksimile_words: mismatching_words += [ word for word in words if word.text == word_text ] for faksimile_position_text in unique_faksimile_words: if faksimile_position_text not in set(word_texts): mismatching_faksimile_positions += [ faksimile_position for faksimile_position in faksimile_positions\ if faksimile_position.text == faksimile_position_text ] return mismatching_words, mismatching_faksimile_positions def process_warnings4status(warnings, warning_messages, current_status, ok_status, status_prefix='') ->str: """Process potential warnings and return actual status. """ if warnings is not None and len(warnings) > 0: status = status_prefix for warning_message in warning_messages: if True in [ str(warn.message).startswith(warning_message) for warn in warnings ]: status += f':{warning_message}:' if status != status_prefix: return status return f'{current_status}:{ok_status}:' else: return f'{current_status}:{ok_status}:' def record_changes(original_svg_file, changed_svg_file, node_ids, namespaces={}): """Copy changes made to changed_svg_file to original_svg_file. """ old_tree = ET.parse(original_svg_file) new_tree = ET.parse(changed_svg_file) if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() } for node_id in node_ids: new_titles = new_tree.xpath('//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=namespaces) old_nodes = old_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces) if len(new_titles) > 0 and len(old_nodes) > 0: if old_nodes[0].find('ns:title', namespaces=namespaces) is not None: old_nodes[0].find('ns:title', namespaces=namespaces).text = new_titles[0].text else: old_title_id_string = new_titles[0].get('id') old_title = ET.SubElement(old_nodes[0], 'title', attrib={ 'id': old_title_id_string }) old_title.text = new_titles[0].text elif len(old_nodes) > 0: for old_node in old_nodes: old_node.getparent().remove(old_node) copy_faksimile_svg_file(target_file=original_svg_file, faksimile_tree=old_tree) def record_changes_on_svg_file_to_page(xml_source_file, svg_file, word_ids=None): """Copy changes made to svg_file to xml_source_file. 
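# Illustrative sketch (not part of the diff): process_warnings4status in action,
# with the warning text modeled on the one extract_line_continuation emits.
import warnings
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    warnings.warn('WARNING: There is no line for 19.5')
status = process_warnings4status(caught, ['WARNING: There is no line'], 'blank', 'OK',\
        status_prefix='needs attention')
# -> 'needs attention:WARNING: There is no line:'; with no warnings caught it would be 'blank:OK:'.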
:return: datatypes.page.Page """ svg_tree = ET.parse(svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } transkription_field = TranskriptionField(svg_file) page = Page(xml_source_file) words = [ word for word in page.words if word.id in word_ids ]\ if word_ids is not None else page.words new_page_words = [] for word in words: word_id = 'word_' + str(word.id) + '_' recorded_ids = [] for transkription_position in word.transkription_positions: transkription_position_id = word_id + str(transkription_position.id) tp_nodes = svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]'.format(transkription_position_id), namespaces=namespaces) if len(tp_nodes) > 0: record_changes_to_transkription_position(tp_nodes[0], transkription_position,\ transkription_field.xmin, transkription_field.ymin, namespaces=namespaces) recorded_ids.append(transkription_position_id) extra_nodes = [ node for node in\ svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[contains(@id, "{0}")]'.format(word_id), namespaces=namespaces)\ if node.get('id') not in recorded_ids ] if len(extra_nodes) > 0: for extra_node in extra_nodes: old_ids = [ inkscape_id.replace('#','') for inkscape_id in\ svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]/@inkscape:label'.format(extra_node.get('id')),\ namespaces=namespaces) ] if len(old_ids) > 0 and re.match(r'word_[0-9]+_[0-9]+', old_ids[0]): old_id_list = old_ids[0].split('_') ref_word_id = int(old_id_list[1]) ref_tp_id = old_id_list[2] ref_words = [ word for word in page.words if word.id == ref_word_id ] if len(ref_words) > 0: ref_tps = [ tp for tp in ref_words[0].transkription_positions\ if tp.id == ref_tp_id ] if len(ref_tps) > 0: ref_words[0].transkription_positions.remove(ref_tps[0]) record_changes_to_transkription_position(extra_node,\ ref_tps[0], transkription_field.xmin, transkription_field.ymin, namespaces=namespaces) word.transkription_positions.append(ref_tps[0]) for word in page.words: if word.has_mixed_status('text'): new_page_words += [ word for word in word.split_according_to_status('text') if word.text is not None and word.text != '' ] elif len(word.transkription_positions) > 0: new_text = [ tp.text for tp in word.transkription_positions if tp.text is not None and tp.text != '' ] if len(new_text) > 0: word.text = new_text[0] new_page_words.append(word) page.words = new_page_words page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids) page.unlock() if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\ script_name=__file__ + ' -> ' + inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_SVG_WORD_POSITION) return page def record_changes_on_xml_file_to_page(xml_source_file, xml_file) -> Page: """Copy changes made to xml_file to xml_source_file. 
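# Illustrative sketch (not part of the diff): record_changes_on_svg_file_to_page keys
# svg rects by 'word_<word-id>_<tp-id>'; a rect moved in Inkscape keeps its origin in
# its inkscape:label, e.g. '#word_12_0'. The parse mirrors the function above:
import re
old_id = '#word_12_0'.replace('#', '')
if re.match(r'word_[0-9]+_[0-9]+', old_id):
    old_id_list = old_id.split('_')
    ref_word_id, ref_tp_id = int(old_id_list[1]), old_id_list[2]   # -> 12, '0'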
:return: datatypes.page.Page
    """
    copy_page = Page(xml_file)
    page = Page(xml_source_file)
    page.unlock()
    back_up(page, xml_file)
    page.words = []
    for word in copy_page.words:
        if word.split_strings is None\
           or len(word.split_strings) == 0:
            page.words.append(word)
        else:
            next_word = word
            for split_string in word.split_strings:
                _, new_word, next_word = next_word.split(split_string)
                page.words.append(new_word)
            if next_word is not None:
                page.words.append(next_word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    remove_words_if_done = []
    for word in page.words:
        if 'join_string' in word.__dict__.keys()\
           and word.join_string is not None:
            if word.id > 0\
               and page.words[word.id-1].text + word.text == word.join_string:
                page.words[word.id-1].join(word)
                remove_words_if_done.append(word)
            elif word.id + 1 < len(page.words)\
               and word.text + page.words[word.id+1].text == word.join_string:
                word.join(page.words[word.id+1])
                remove_words_if_done.append(page.words[word.id+1])
    for word in remove_words_if_done:
        page.words.remove(word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, xml_file), file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page

def record_changes_to_transkription_position(node, transkription_position, xmin=0.0, ymin=0.0, namespaces=None):
    """Record changes made to node to transkription_position.
    """
    if namespaces is None:
        namespaces = { k if k is not None else 'ns': v for k, v in node.nsmap.items() }
    if bool(node.get('x')):
        transkription_position.left = float(node.get('x')) - xmin
    if bool(node.get('y')):
        transkription_position.top = float(node.get('y')) - ymin
    if bool(node.get('width')):
        transkription_position.width = float(node.get('width'))
    if bool(node.get('height')):
        transkription_position.height = float(node.get('height'))
    if len(node.xpath('./ns:title/text()', namespaces=namespaces)) > 0:
        transkription_position.text = node.xpath('./ns:title/text()', namespaces=namespaces)[0]

def replace_chars(words, faksimile_positions, unique_faksimile_words=None):
    """Return unique_faksimile_words and faksimile_positions, with characters changed according to transcription words.
""" if unique_faksimile_words is None: unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\ key=lambda text: len(text)) for index, word_text in enumerate(unique_faksimile_words): if len([ word for word in words if word.text == word_text ]) == 0: if re.match(r'.*".*', word_text)\ and len([ word for word in words if word.text == word_text.replace('"', '“') ]) > 0: unique_faksimile_words[index] = word_text.replace('"', '“') elif re.match(r'.*ss.*', word_text)\ and len([ word for word in words if word.text == word_text.replace('ss', 'ß') ]) > 0: unique_faksimile_words[index] = word_text.replace('ss', 'ß') elif re.match(r'.*-.*', word_text)\ and len([ word for word in words if word.text == word_text.replace('-', '–') ]) > 0: unique_faksimile_words[index] = word_text.replace('-', '–') for faksimile_position in [ faksimile_position for faksimile_position in faksimile_positions\ if faksimile_position.text == word_text ]: faksimile_position.text = unique_faksimile_words[index] elif word_text == '-'\ and len([ word for word in words if word.text == '–' ]) > 0: print([ word.text for word in words if word.text == word_text ]) print([ word.text for word in words if word.text == '–' ]) return faksimile_positions, unique_faksimile_words +def reset_tp_with_matrix(page, transkription_positions, new_left=0, new_top=-5): + """Set left = 0, top = -5 for each transkription_position with transform matrix. + """ + if len(transkription_positions) > 0\ + and (page.svg_image is None\ + or page.svg_image.text_field is None): + for tp in transkription_positions: + if tp.transform is not None\ + and tp.transform.isRotationMatrix()\ + and tp.left > 10 and tp.top > 10: + tp.left = new_left + tp.top = new_top + def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True): """Updates svg position file's status. Changes its status to status if it does not contain 'OK', else it appends new status to old status. """ if isfile(file_name): parser = ET.XMLParser(remove_blank_text=True) file_tree = ET.parse(file_name, parser) old_status = file_tree.getroot().get('status') if old_status is None or 'OK' not in old_status.split(':'): file_tree.getroot().set('status', status) elif append: if status not in old_status.split(':'): new_status = old_status + ':' + status file_tree.getroot().set('status', new_status) else: file_tree.getroot().set('status', new_status) write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) if manuscript_file is not None and isfile(manuscript_file): page_number = file_tree.getroot().get('number') update_manuscript_file(manuscript_file, page_number, file_name, status=status) def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True): """Updates manuscript file: adds status information about page. 
""" if isfile(manuscript_file): parser = ET.XMLParser(remove_blank_text=True) manuscript_tree = ET.parse(manuscript_file, parser) if len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0: node = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0] old_status = node.get('status') if old_status is None or 'OK' not in old_status.split(':'): node.set('status', status) elif append: if status not in old_status.split(':'): new_status = old_status + ':' + status node.set('status', new_status) else: node.set('status', new_status) if not bool(node.get('output')): node.set('output', file_name) else: pages_node = manuscript_tree.getroot().find('pages')\ if manuscript_tree.getroot().find('pages') is not None\ else ET.SubElement(manuscript_tree.getroot(), 'pages') new_id = len(pages_node.findall('page')) + 1 ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name}) write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT) Index: svgscripts/convert_wordPositions.py =================================================================== --- svgscripts/convert_wordPositions.py (revision 101) +++ svgscripts/convert_wordPositions.py (revision 102) @@ -1,690 +1,695 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert the word positions to HTML for testing purposes. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import cairosvg import getopt import json from lxml.html import builder as E from lxml.html import open_in_browser import lxml from pathlib import Path as PathLibPath from os import sep, listdir, mkdir, path, remove from os.path import exists, isfile, isdir, dirname import re import sys from svgpathtools import svg_to_paths import xml.etree.ElementTree as ET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.page_creator import PageCreator from datatypes.transkriptionField import TranskriptionField +from datatypes.text_field import TextField from datatypes.writing_process import WritingProcess from datatypes.word import Word __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" EXIST_DB = 'http://130.60.24.65:8081/exist/rest/db/ProjectData/Nietzsche/' class Converter: """The converter super class. 
""" def __init__(self, page, non_testing=True, show_word_insertion_mark=False): self.page = page self.non_testing = non_testing self.show_word_insertion_mark = show_word_insertion_mark def _get_transkription_positions(self, transkription_positions, stage_version=''): """Returns the transkription_positions of the indicated stage_version. """ convertable_transkription_positions = transkription_positions if stage_version != '': convertable_transkription_positions = [] if re.match(r'^\d$', stage_version): writing_process_id = int(stage_version) for transkription_position in transkription_positions: if transkription_position.writing_process_id == writing_process_id: convertable_transkription_positions.append(transkription_position) elif re.match(r'^\d\+$', stage_version): version_range = [ *range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)) ] for transkription_position in transkription_positions: if transkription_position.writing_process_id in version_range: convertable_transkription_positions.append(transkription_position) elif re.match(r'^\d\-\d$', stage_version): start_stop = [ int(i) for i in re.split(r'-', stage_version) ] version_range = [ *range(start_stop[0], start_stop[1]+1) ] for transkription_position in transkription_positions: if transkription_position.writing_process_id in version_range: convertable_transkription_positions.append(transkription_position) return convertable_transkription_positions def _get_words(self, words, highlighted_words=None): """Return the words that will be hightlighted. """ return highlighted_words if highlighted_words is not None else words def convert(self, output_file=None, stage_version='', highlighted_words=None): """Prints all words. """ first_word_of_line = None out = sys.stdout if output_file is not None: out = open(output_file, 'w') for word in self.page.words: if first_word_of_line is None or first_word_of_line.line_number != word.line_number: out.write('\n') first_word_of_line = word if word.line_number % 2 == 0: out.write(str(word.line_number).zfill(2) + ' ') else: out.write(' ') if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0: if word.text is not None: out.write(word.text + ' ') out.close() return 0 @classmethod def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False, key=''): """Returns a converter of type converter_type. [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None """ cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() } cls_key = converter_type + 'Converter' if bool(cls_dict.get(cls_key)): converter_cls = cls_dict[cls_key] if converter_cls == JSONConverter: return converter_cls(page, non_testing, key=key) return converter_cls(page, non_testing, show_word_insertion_mark) else: return Converter(page, non_testing, show_word_insertion_mark) class JSONConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a json file. """ def __init__(self, page, non_testing=True, key=''): Converter.__init__(self, page, non_testing, False) def _add_word_to_list(self, words, word, text, text_field=None, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1): """Add word to list. 
""" id = word.id\ if parent_id == -1\ else parent_id edited_text = word.edited_text\ if edited_text is None\ else edited_text earlier_version = word.earlier_version\ if earlier_version is None\ else earlier_version overwrites_word = word.overwrites_word\ if overwrites_word is None\ else overwrites_word line_number = word.line_number for tp in word.transkription_positions: tp_id = f'w{word.id}:tp{tp.id}'\ if parent_id == -1\ else f'w{parent_id}:w{word.id}:tp{tp.id}' if text_field is not None: word_dict = { 'id': id, 'text': text, 'left': tp.left + text_field.left, 'top': tp.top + text_field.top,\ 'width': tp.width, 'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted } if tp.transform is not None: matrix = tp.transform.clone_transformation_matrix() xmin = text_field.left ymin = text_field.top matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3) matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3) word_dict.update({ 'transform': matrix.toString() }) if tp.left > 0: word_dict.update({ 'left': round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)}) else: word_dict.update({ 'left': 0}) word_dict.update({ 'top': round((tp.height-1.5)*-1, 3)}) else: word_dict = { 'id': id, 'text': text, 'left': tp.left, 'top': tp.top, 'width': tp.width,\ 'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted } if tp.transform is not None: word_dict.update({ 'transform': tp.transform.toString() }) if edited_text is not None: word_dict.update({'edited_text': edited_text}) if earlier_version is not None: word_dict.update({'earlier_version': earlier_version.text }) if overwrites_word is not None: word_dict.update({'overwrites_word': overwrites_word.text }) if parent_id > -1: word_dict.update({'part_text': word.text }) words.append(word_dict) for wp in word.word_parts: self._add_word_to_list(words, wp, text, text_field=text_field, edited_text=edited_text,\ earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=word.id) def create_json_dict(self) ->dict: """Create and return a json dictionary. """ words = [] text_field = None if self.page.svg_image is not None: if self.page.svg_image.text_field is None: text_field = self.page.svg_image.text_field = TranskriptionField(self.page.svg_image.file_name).convert_to_text_field() self.page.svg_image.decontextualize_file_name(update_url=EXIST_DB) for word in self.page.words: self._add_word_to_list(words, word, word.text, text_field=text_field) lines = [] - for line in self.page.lines: lines.append({ 'id': line.id, 'top': line.top, 'bottom': line.bottom }) + offset = 0 if text_field is None else text_field.ymin + for line in self.page.lines: lines.append({ 'id': line.id, 'top': line.top + offset, 'bottom': line.bottom }) return { 'title': self.page.title, 'number': self.page.number, 'words': words,\ 'svg': self.add_object2dict(self.page.svg_image), 'lines': lines } def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to JSON. """ if output_file is None: output_file = 'output.json' json_file = open(output_file, "w+") try: json.dump(self.create_json_dict(), json_file) except Exception: raise Exception('Error in json.dump') json_file.close() return 0 def add_object2dict(self, object_instance): """Add an object to json_dict and generate json data and interfaces. 
[:return:] json dict or object_instance """ json_dict = {} object_type = type(object_instance) if object_type.__module__ == 'builtins': if object_type != list: return object_instance else: items = [] for item in object_instance: items.append(self.add_object2dict(item)) if len(items) > 0: return items else: return { self.key: [] } semantic_dictionary = object_type.get_semantic_dictionary() for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]: content = object_instance.__dict__.get(key) if content_type == list\ and content is not None\ and len(content) > 0\ and type(content[0]).__module__ != 'builtins': content_list = [] for content_item in content: content_list.append(self.add_object2dict(content_item)) json_dict.update({key: content_list}) elif content_type.__module__ == 'builtins': if content is not None: json_dict.update({key: content}) else: if content is not None and type(content) == list: content_list = [] for content_item in content: content_list.append(self.add_object2dict(content_item)) json_dict.update({key: content_list}) else: if content is not None: json_dict.update({key: self.add_object2dict(content)}) return json_dict class oldJSONConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a json file. """ PY2TS_DICT = { float: 'number', int: 'number', bool: 'boolean', str: 'string' } def __init__(self, page, non_testing=True, key=''): Converter.__init__(self, page, non_testing, False) self.key = key self.interface_output_dir = PathLibPath('ts_interfaces') if not self.interface_output_dir.is_dir(): self.interface_output_dir.mkdir() elif len(list(self.interface_output_dir.glob('*.ts'))) > 0: for ts_file in self.interface_output_dir.glob('*.ts'): remove(ts_file) def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to JSON. """ if output_file is None: output_file = 'output.json' class_dict = {} if self.key != '': object_instance = self.page.__dict__.get(self.key) if object_instance is not None: json_dict = self.add_object2dict(object_instance, class_dict) if type(json_dict) == list: json_dict = { self.key : json_dict } else: print(f'Page initialized from {self.page.page_tree.docinfo.URL} does not have an object at "{self.key}"!') return 2 else: json_dict = self.add_object2dict(self.page, class_dict) json_file = open(output_file, "w+") try: json.dump(json_dict, json_file) except Exception: raise Exception('Error in json.dump') json_file.close() self.create_imports(class_dict) return 0 def add_object2dict(self, object_instance, class_dict): """Add an object to json_dict and generate json data and interfaces. 
[:return:] json dict or object_instance
        """
        json_dict = {}
        interface_list = []
        object_type = type(object_instance)
        if object_type.__module__ == 'builtins':
            if object_type != list:
                return object_instance
            else:
                items = []
                for item in object_instance:
                    items.append(self.add_object2dict(item, class_dict))
                if len(items) > 0:
                    return { self.key: items }
                else:
                    return { self.key: 'null' }
        semantic_dictionary = object_type.get_semantic_dictionary()
        for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]:
            content = object_instance.__dict__.get(key)
            if content_type == list\
               and content is not None\
               and len(content) > 0\
               and type(content[0]).__module__ != 'builtins':
                content_list = []
                for content_item in content:
                    content_list.append(self.add_object2dict(content_item, class_dict))
                json_dict.update({key: content_list})
                interface_list.append(f'{key}: {type(content[0]).__name__}[];')
            elif content_type.__module__ == 'builtins':
                if content_type != list:
                    ts_type = self.PY2TS_DICT[content_type]\
                            if content_type in self.PY2TS_DICT.keys()\
                            else 'string'
                    interface_list.append(f'{key}: {ts_type};')
                    json_dict.update({key: content})
            else:
                if content is not None and type(content) == list:
                    interface_list.append(f'{key}: {content_type.__name__}[];')
                    content_list = []
                    for content_item in content:
                        content_list.append(self.add_object2dict(content_item, class_dict))
                    json_dict.update({key: content_list})
                else:
                    interface_list.append(f'{key}: {content_type.__name__};')
                    if content is not None:
                        json_dict.update({key: self.add_object2dict(content, class_dict)})
        if object_type not in class_dict.keys():
            class_dict.update({object_type: self.create_interface(object_type.__name__, interface_list)})
        return json_dict

    def create_imports(self, class_dict):
        """Create a ts file that imports all generated interfaces.
        [:return:] file_name of the imports file
        """
        ts_file = PathLibPath('ts_imports.ts')
        file = open(ts_file, "w+")
        file.write(f'//import all interfaces from {self.interface_output_dir} ' + '\n')
        for interface_name, path_name in class_dict.items() :
            file.write('import {' + interface_name.__name__ + '} from \'./' + str(self.interface_output_dir.joinpath(path_name.stem)) + '\';\n')
        file.close()
        return ts_file

    def create_interface(self, class_name, interface_list) -> PathLibPath:
        """Create a ts interface from a list of key and content_types.
        [:return:] file_name of interface
        """
        ts_file = self.interface_output_dir.joinpath(PathLibPath(f'{class_name.lower()}.ts'))
        import_list = [ import_class_name for import_class_name in\
                [ import_class_name.split(': ')[1].replace(';','').replace('[]','') for import_class_name in interface_list ]\
                if import_class_name not in set(self.PY2TS_DICT.values()) ]
        file = open(ts_file, "w")
        for import_class_name in set(import_list):
            file.write('import {' + import_class_name + '} from \'./' + import_class_name.lower() + '\';\n')
        file.write(f'export interface {class_name} ' + '{\n')
        for interface_string in interface_list:
            file.write('\t' + interface_string + '\n')
        file.write('}')
        file.close()
        return ts_file

class SVGConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text.
""" BG_COLOR = 'yellow' OPACITY = '0.2' def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY): Converter.__init__(self, page, non_testing, show_word_insertion_mark) self.bg_color = bg_color self.opacity = opacity def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to SVG """ title = self.page.title if(self.page.title is not None) else 'Test Page' title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title svg_file = self.page.svg_file if svg_file is None and self.page.svg_image is not None: svg_file = self.page.svg_image.file_name elif svg_file is None: msg = f'ERROR: xml_source_file {self.page.docinfo.URL} does neither have a svg_file nor a svg_image!' raise Exception(msg) transkription_field = TranskriptionField(svg_file) if bool(transkription_field.get_svg_attributes('xmlns')): ET.register_namespace('', transkription_field.get_svg_attributes('xmlns')) if bool(transkription_field.get_svg_attributes('xmlns:xlink')): ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink')) svg_tree = ET.parse(svg_file) transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'}) colors = [ 'yellow', 'orange' ] if self.bg_color == self.BG_COLOR else [ self.bg_color ] if highlighted_words is not None: colors = ['yellow'] else: highlighted_words = [] color_index = 0 for word in self.page.words: word_id = 'word_' + str(word.id) for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version): transkription_position_id = word_id + '_' + str(transkription_position.id) color = colors[color_index] if word not in highlighted_words else self.bg_color rect_node = ET.SubElement(transkription_node, 'rect',\ attrib={'id': transkription_position_id, 'x': str(transkription_position.left + transkription_field.xmin),\ 'y': str(transkription_position.top + transkription_field.ymin), 'width': str(transkription_position.width),\ 'height': str(transkription_position.height), 'fill': color, 'opacity': self.opacity}) if transkription_position.transform is not None: matrix = transkription_position.transform.clone_transformation_matrix() matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3) matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3) rect_node.set('transform', matrix.toString()) rect_node.set('x', str(round(transkription_position.left - transkription_position.transform.matrix[Matrix.XINDEX], 3))) rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3))) ET.SubElement(rect_node, 'title').text = word.text color_index = (color_index + 1) % len(colors) if output_file is not None: svg_tree.write(output_file) return 0 class HTMLConverter(Converter): """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file. 
""" CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; } .highlight1 { background-color: pink; opacity: 0.2; } .highlight2 { background-color: red; opacity: 0.2; } .foreign { background-color: blue; opacity: 0.4; } .overwritten { background-color: green; opacity: 0.4; } .word-insertion-mark { background-color: orange; opacity: 0.2; } .deleted { background-color: grey; opacity: 0.2; } """ def __init__(self, page, non_testing=True, show_word_insertion_mark=False): Converter.__init__(self, page, non_testing, show_word_insertion_mark) + self.text_field = TextField() def convert(self, output_file=None, stage_version='', highlighted_words=None): """Converts Page to HTML """ title = self.page.title if(self.page.title is not None) else 'Test Page' title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title if stage_version != '': title = title + ', Schreibstufe: ' + stage_version if self.page.svg_image is not None: width = self.page.svg_image.width height = self.page.svg_image.height + self.text_field = self.page.svg_image.text_field svg_file = self.page.svg_image.file_name + print('Textfield found ->adjusting data') elif self.page.svg_file is not None: svg_file = self.page.svg_file transkription_field = TranskriptionField(svg_file) width = transkription_field.getWidth() height = transkription_field.getHeight() style_content = ' position: relative; width: {}px; height: {}px; background-image: url("{}"); background-size: {}px {}px '\ .format(width, height, path.abspath(svg_file), width, height) style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS) head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style) transkription = E.DIV(id="transkription") counter = 0 for word in self.page.words: highlight_class = 'highlight' + str(counter)\ if not word.deleted else 'deleted' if highlighted_words is not None\ and word in highlighted_words: highlight_class = 'highlight2' earlier_text = '' if word.earlier_version is None else word.earlier_version.text if earlier_text == '' and len(word.word_parts) > 0: earlier_versions = [ word for word in word.word_parts if word.earlier_version is not None ] earlier_text = earlier_versions[0].text if len(earlier_versions) > 0 else '' if earlier_text != '': word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(word.id), str(word.line_number), earlier_text, word.text) else: word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text) if word.edited_text is not None: word_title += f'\n>{word.edited_text}' for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version): self._append2transkription(transkription, highlight_class, word_title, transkription_position) if word.overwrites_word is not None: overwritten_title = f'{word.text} overwrites {word.overwrites_word.text}' for overwritten_transkription_position in word.overwrites_word.transkription_positions: self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position) for part_word in word.word_parts: highlight_class = 'highlight' + str(counter)\ if not part_word.deleted else 'deleted' for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version): self._append2transkription(transkription, highlight_class, word_title, part_transkription_position) if part_word.overwrites_word is not None: overwritten_title = f'{word.text} overwrites 
{part_word.overwrites_word.text}' for overwritten_transkription_position in part_word.overwrites_word.transkription_positions: self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position) counter = (counter + 1) % 2 word_insertion_mark_class = 'word-insertion-mark' counter = 0 for mark_foreign_hands in self.page.mark_foreign_hands: highlight_class = 'foreign' title = 'id: {}/line: {}\n{} {}'.format(str(mark_foreign_hands.id), str(mark_foreign_hands.line_number),\ mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen) for transkription_position in mark_foreign_hands.transkription_positions: self._append2transkription(transkription, highlight_class, title, transkription_position) if self.show_word_insertion_mark: for word_insertion_mark in self.page.word_insertion_marks: wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number)) style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\ word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height) link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content) transkription.append(link) html = E.HTML(head,E.BODY(transkription)) bool(self.non_testing) and open_in_browser(html) if output_file is not None: with open(output_file, 'wb') as f: f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8')) f.closed return 0 def _append2transkription(self, transkription, highlight_class, title, transkription_position): """Append content to transkription-div. """ style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\ - transkription_position.top, transkription_position.left, transkription_position.width, transkription_position.height) + transkription_position.top - self.text_field.top, transkription_position.left - self.text_field.left, transkription_position.width, transkription_position.height) if transkription_position.transform is not None: style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString()) transform_origin_x = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1\ if (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1 < 0 else 0 style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height) link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content) transkription.append(link) def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR): """Creates a pdf file highlighting some words. """ if not pdf_file_name.endswith('pdf'): pdf_file_name = pdf_file_name + '.pdf' tmp_svg_file = pdf_file_name.replace('.pdf', '.svg') create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page, highlighted_words=highlighted_words,\ svg_file_name=tmp_svg_file, bg_color=bg_color) if isfile(tmp_svg_file): cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name) remove(tmp_svg_file) def create_svg_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, svg_file_name='output.svg', bg_color=SVGConverter.BG_COLOR): """Creates a svg file highlighting some words. 
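# Illustrative sketch (not part of the diff): the text-field correction now applied
# in _append2transkription, with hypothetical numbers. It assumes TextField()
# initializes top/left to 0, which the unconditional subtraction relies on for
# pages without a text field:
top, left = 120.0, 80.0                        # transkription_position in page coordinates
text_field_top, text_field_left = 100.0, 50.0  # origin of the background image's text field
style_content = 'position:absolute; top:{0}px; left:{1}px;'.format(top - text_field_top, left - text_field_left)
# -> 'position:absolute; top:20.0px; left:30.0px;'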
""" if page is None and xml_source_file is not None: page = Page(xml_source_file) converter = SVGConverter(page, bg_color=bg_color) if not svg_file_name.endswith('svg'): svg_file_name = svg_file_name + '.svg' converter.convert(output_file=svg_file_name, highlighted_words=highlighted_words) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes. svgscripts/convert_wordPositions.py OPTIONS OPTIONS: -h|--help: show help -H|--HTML [default] convert to HTML test file -k|--key=key option for json converter: only convert object == page.__dict__[key] -o|--output=outputFile save output to file outputFile -P|--PDF convert to PDF test file -S|--SVG convert to SVG test file -s|--svg=svgFile: svg web file -T|--TEXT convert to TEXT output -t|--text=text highlight word -w|--word-insertion-mark show word insertion mark on HTML -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. } -x|--testing execute in test mode, do not write to file or open browser :return: exit code (int) """ convert_to_type = None key = '' non_testing = True output_file = None page = None show_word_insertion_mark = False stage_version = '' svg_file = None text = None try: opts, args = getopt.getopt(argv, "hk:t:HPSTws:o:v:x", ["help", "key=", "text=", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version=", "testing"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 elif opt in ('-v', '--version'): if re.match(r'^(\d|\d\+|\d\-\d)$', arg): stage_version = arg else: raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg)) elif opt in ('-w', '--word-insertion-mark'): show_word_insertion_mark = True elif opt in ('-P', '--PDF'): convert_to_type = 'PDF' elif opt in ('-S', '--SVG'): convert_to_type = 'SVG' elif opt in ('-T', '--TEXT'): convert_to_type = 'TEXT' elif opt in ('-H', '--HTML'): convert_to_type = 'HTML' elif opt in ('-x', '--testing'): non_testing = False elif opt in ('-s', '--svg'): svg_file = arg elif opt in ('-o', '--output'): output_file = arg elif opt in ('-k', '--key'): key = arg elif opt in ('-t', '--text'): text = arg print(arg) if len(args) < 1: usage() return 2 if convert_to_type is None: if output_file is not None and len(re.split(r'\.', output_file)) > 1: output_file_part_list = re.split(r'\.', output_file) convert_to_type = output_file_part_list[len(output_file_part_list)-1].upper() else: convert_to_type = 'HTML' exit_code = 0 for word_position_file in args: if not isfile(word_position_file): print("'{}' does not exist!".format(word_position_file)) return 2 if convert_to_type == 'PDF': if output_file is None: output_file = 'output.pdf' highlighted_words = None if text is not None: page = Page(word_position_file) highlighted_words = [ word for word in page.words if word.text == text ] create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file, highlighted_words=highlighted_words) else: if svg_file is not None: if isfile(svg_file): page = PageCreator(word_position_file, svg_file=svg_file) else: print("'{}' does not exist!".format(word_position_file)) return 2 else: page = Page(word_position_file) if page.svg_file is None: print('Please specify a svg file!') usage() return 2 highlighted_words = None if text is not None: highlighted_words = [ word for word in 
page.words if word.text == text ] print([ (word.id, word.text) for word in highlighted_words ]) converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark, key=key) exit_code = converter.convert(output_file=output_file, stage_version=stage_version, highlighted_words=highlighted_words) return exit_code if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/extractWordPosition.py =================================================================== --- svgscripts/extractWordPosition.py (revision 101) +++ svgscripts/extractWordPosition.py (revision 102) @@ -1,544 +1,550 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract the position of the words in a svg file and write them to a xml file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import inspect import getopt from lxml import etree as ET from os import sep, listdir, mkdir, path from os.path import exists, isfile, isdir from progress.bar import Bar import re import sys import warnings from datatypes.lineNumber import LineNumber from datatypes.matrix import Matrix from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from datatypes.pdf import PDFText from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word from datatypes.word_insertion_mark import WordInsertionMark -from util import process_warnings4status +from util import process_warnings4status, reset_tp_with_matrix sys.path.append('shared_util') from myxmlwriter import write_pretty __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class Extractor: """ This class can be used to extract the word positions in a svg file and write it to a xml file. 
Args: [xml_dir (str): target directory] [title (str): title of document] [manuscript_file (str): xml file containing information about the archival unit to which the current page belongs] """ UNITTESTING = False SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ] + SET_POSITIONS_TO_TEXTFIELD_0_0 = False def __init__(self, xml_dir=None, title=None, manuscript_file=None, compare2pdf=False): if bool(xml_dir): self.xml_dir = xml_dir not isdir(self.xml_dir) and mkdir(self.xml_dir) else: self.xml_dir = 'xml' if(isdir('xml')) else '' self.latest_status = None self.compare2pdf = compare2pdf self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else '' self.title = title self.manuscript_file = manuscript_file self.manuscript_tree = None if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file): self.manuscript_tree = ET.parse(self.manuscript_file) self.title = self.manuscript_tree.getroot().get('title') elif bool(self.manuscript_file): raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file)) elif bool(self.title): self.update_title_and_manuscript(self.title, False) - def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None): + def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None) ->int: """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word). If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created. :returns: the new word counter (int) """ break_points = [] if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points for Sonderzeichen in self.SONDERZEICHEN_LIST: contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ] if True in contains_Sonderzeichen: break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]] for sz_point in [i for i, e in break_points]: wim_index = len(page.word_insertion_marks) x = float(word_part_objs[sz_point]['x']) y = float(word_part_objs[sz_point]['y']) - if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None: + if page.svg_file is not None and isfile(page.svg_file)\ + and (not self.SET_POSITIONS_TO_TEXTFIELD_0_0 or transkription_field is not None): svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } - xmin = transkription_field.xmin - ymin = transkription_field.ymin + xmin = 0 if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field.xmin + ymin = 0 if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field.ymin wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\ line_number=page.get_line_number(y-1), mark_type=Sonderzeichen) page.word_insertion_marks.append(wim) if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points THRESHOLDX = 20 # Threshold between line number and text last_x = -1 for i, x in enumerate([float(dict['x']) for dict in word_part_objs]): if(last_x > -1 and (x - last_x > THRESHOLDX)): break_points.append((i, i))
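# Illustrative note (added): with x-values such as [5.0, 9.0, 40.0] and THRESHOLDX = 20,
# the jump from 9.0 to 40.0 yields the break point (2, 2), separating the digits of a
# line number from the word text that follows (values are hypothetical).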
last_x = x if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words from_index = 0 for end_point, next_from_index in break_points: new_word_part_objs = word_part_objs[from_index:end_point] new_endX = word_part_objs[end_point]['x'] from_index = next_from_index index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) if from_index > 0 and from_index < len(word_part_objs): new_word_part_objs = word_part_objs[from_index:] index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) return index else: if len(word_part_objs) > 0: + provide_tf = None if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\ - debug_msg_string=debug_msg, transkription_field=transkription_field) + debug_msg_string=debug_msg, transkription_field=provide_tf) text = self.get_word_from_part_obj(word_part_objs) line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2) if line_number == -1: if transkription_positions[0].transform is not None: line_number = page.get_line_number(transkription_positions[0].transform.getY()) if line_number == -1 and len(page.words) > 0: lastWord = page.words[-1] lastWord_lastTP = lastWord.transkription_positions[-1] lastTP = transkription_positions[-1] if transkription_positions[0].left > lastWord_lastTP.left\ and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2: line_number = lastWord.line_number else: line_number = lastWord.line_number+1 + reset_tp_with_matrix(page, transkription_positions) newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions) page.words.append(newWord) return int(index) + 1 else: return int(index) def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default', multipage_index=-1, marginals_page=None): """Extracts information about positions of text elements and writes them to a xml file. 
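A hedged call sketch (paths and title are hypothetical; the xml target file name is derived from the svg file name unless xml_target_file is given):

    extractor = Extractor(xml_dir='xml', title='Mp XIV')
    exit_status = extractor.extractAndWriteInformation('svg/Mp_XIV_page420_web.svg', page_number='420')

Returns 0 if extraction finished without warnings, 1 otherwise.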
""" if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file exit_status = 0 with warnings.catch_warnings(record=record_warnings) as w: warnings.simplefilter(warning_filter) page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile,\ multipage_index=multipage_index, marginals_page=marginals_page) status_message = process_warnings4status(w, [ PageCreator.WARNING_MISSING_USE_NODE4PWP, PageCreator.WARNING_MISSING_GLYPH_ID4WIM ],\ '', 'OK', 'with warnings') if status_message != 'OK': self.latest_status = status_message exit_status = 1 else: self.latest_status = None page.page_tree.getroot().set('status', status_message) write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) return exit_status else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, multipage_index=-1, marginals_page=None) -> PageCreator: """Extracts information about positions of text elements. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file transkription_field = TranskriptionField(file_name, multipage_index=multipage_index) + text_field = transkription_field.convert_to_text_field() svg_tree = ET.parse(file_name) page = PageCreator(xml_target_file, title=self.title, multipage_index=multipage_index,\ - page_number=page_number, pdfFile=pdfFile,\ - svg_file=svg_file, source=file_name, marginals_source=marginals_page) + page_number=page_number, pdfFile=pdfFile, svg_file=svg_file,\ + svg_text_field=text_field, source=file_name, marginals_source=marginals_page) sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot()) page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict) - if transkription_field is not None: - page.init_line_numbers(LineNumber.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax) + page.init_line_numbers(LineNumber.extract_line_numbers(svg_tree, transkription_field, set_to_text_field_zero=self.SET_POSITIONS_TO_TEXTFIELD_0_0),\ + transkription_field.ymax) self.extract_word_position(svg_tree, page, transkription_field=transkription_field) page.create_writing_processes_and_attach2tree() page.update_and_attach_words2tree() for word_insertion_mark in page.word_insertion_marks: # it is not clear if we really need to know this alternative word ordering. See 'TODO.md' #word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark) word_insertion_mark.attach_object_to_tree(page.page_tree) return page else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extract_word_position(self, svg_tree, page, transkription_field=None): """Extracts word positions. 
""" counter = 0 word_part_obj = [] endSign = '%' last_matrix = None MAXBOTTOMDIFF = 5 MAXXDIFF = 6 if not Extractor.UNITTESTING: bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)])) for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field): - current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field) + provide_tf = None if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field + current_matrix = Matrix(text_item.get('transform'), transkription_field=provide_tf) # check for line breaks if (last_matrix is not None and len(word_part_obj) > 0 and (\ Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\ (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\ (abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\ or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()): endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\ round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\ str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix))) counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field) word_part_obj = [] endX = current_matrix.getX() if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))): word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} ) else: endSign = text_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT endX = current_matrix.add2X(tspan_item.get('x')) if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))): y = current_matrix.add2Y(tspan_item.get('y')) word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix }) if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0: """text_item has letterspacing class (set s & set t = new set with elements common to s and t) """ endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='tspan with letterspacing', transkription_field=transkription_field) word_part_obj = [] else: endSign = tspan_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='svg/text/tspan/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' last_matrix = current_matrix not bool(Extractor.UNITTESTING) and bar.next() if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, 
word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\ transkription_field=transkription_field) word_part_obj = [] endSign = '%' not bool(Extractor.UNITTESTING) and bar.finish() def find_inserted_words_by_position(self, target_tree, x, y): """Returns an Array with the words that are inserted above the x, y position or [] if not found. """ warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.') MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 if(len(target_tree.getroot().xpath('//word[@id]')) > 0): result_list = [] minus2left = 20.0 minus2top = 19.0 while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX : result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ] minus2left -= 1 minus2top += 1 if len(result_list) > 0: result_bottom = result_list[len(result_list)-1].bottom result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)): result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def find_inserted_words(self, target_tree, word_insertion_mark): """Returns an Array with the words that are inserted above/underneath the word_insertion_mark. """ warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.') if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1: return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y) if(len(target_tree.getroot().xpath('//word[@id]')) > 0): MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 result_list = [] x = word_insertion_mark.x y = word_insertion_mark.y if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line line_number = word_insertion_mark.line_number - 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: minus2top = 1.0 while len(result_list) == 0 and minus2top < MINY: for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y - minus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break minus2top += 1 elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line line_number = word_insertion_mark.line_number + 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: plus2top = 1.0 while len(result_list) == 0 and plus2top < MINY : for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y + plus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break plus2top += 1 if len(result_list) > 0: # now, collect more words that are right of already collected words result_bottom = 
result_list[len(result_list)-1].transkription_positions[0].bottom result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width for item in target_tree.getroot().xpath(\ '//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)): result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def get_file_name(self, file_name, page_number=None): """Returns the file_name of the target xml file. """ dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else '' if bool(self.title): return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml' else: return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml')) def get_page_number(self, file_name, page_number=None): """ Returns page number as a string (with leading zero(s) if len(page_number) < 3). """ if not bool(page_number) and bool(re.search(r'\d', file_name)): """if page_number=None and filename contains digits, then split filename into its parts that contain only digits, remove empty strings and return the last part containing only digits. """ page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop() if bool(page_number): leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else '' return leading_zeros + str(page_number) else: return '' def get_style(self, etree_root): """Returns the style specification as a dictionary. :returns: sonderzeichen_list: list of keys for classes that are 'Sonderzeichen' letterspacing_list: list of keys for classes that use letter-spacing style_dict: dictionary: key = class name (str), value = style specification (dictionary) """ style_dict = {} sonderzeichen_list = [] letterspacing_list = [] style = etree_root.find('style', etree_root.nsmap) if style is not None: for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))): style_key = style_item.split('{')[0].replace('.', '') style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','') \ for item in list(filter(lambda x: x!= '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';')))} style_dict[style_key] = style_value_dict if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'): sonderzeichen_list.append(style_key) if bool(style_value_dict.get('letter-spacing')): letterspacing_list.append(style_key) return sonderzeichen_list, letterspacing_list, style_dict def get_text_items(self, tree_root, transkription_field=None): """Returns all text elements with a matrix or (if transkription_field is specified) all text elements that are located inside the transkription field. """ if transkription_field is not None: return filter(lambda x: Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=x),\ tree_root.iterfind(".//text", tree_root.nsmap)) else: return tree_root.iterfind(".//text", tree_root.nsmap) def get_word_from_part_obj(self, word_part_obj): """Extracts all 'text' from a list of dictionaries and concatenates it to a string.
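For example (a hedged doctest-style sketch; extractor is a hypothetical Extractor instance, and only the 'text' key of each dictionary is read):

    >>> extractor.get_word_from_part_obj([{'text': 'Pessi'}, {'text': 'mismus'}])
    'Pessimismus'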
""" return ''.join([ dict['text'] for dict in word_part_obj]) def get_word_object_multi_char_x(self, word_part_obj_dict): """Returns the x of the last char of word_part_object. TODO: get real widths from svg_file!!! """ WIDTHFACTOR = 2.6 return word_part_obj_dict['x'] if len(word_part_obj_dict['text']) < 2 else word_part_obj_dict['x'] + len(word_part_obj_dict['text']) * WIDTHFACTOR def update_title_and_manuscript(self, title, update_manuscript=True): """Updates title and manuscript. """ self.title = title if update_manuscript or not bool(self.manuscript_file): self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml' if not isfile(self.manuscript_file): self.manuscript_tree = ET.ElementTree(ET.Element('manuscript', attrib={"title": self.title})) write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile') def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract the position of the words in a svg file and write them to a xml file. svgscripts/extractWordPosition.py [OPTIONS] svg file OR xml target file containing file name of svg file as "/page/@source". directory containing svg files OPTIONS: -h|--help: show help -c|--compare-to-pdf compare words to pdf and autocorrect -d|--xml-dir=xmlDir: target directory for the xml output file(s) -m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s) -p|--page=pageNumber: page number of the current page. For use with _one_ file only. -P|--PDF=pdfFile: pdf file - used for word correction -s|--svg=svgFile: svg web file -t|--title=title: title of the manuscript to which the current page(s) belong(s) -x|--xml-target-file=xmlOutputFile: xml target file :return: exit code (int) """ compare2pdf = True manuscript_file = None page_number = None pdfFile = None svg_file = None title = None xml_target_file = None xml_dir = ".{}xml".format(sep) try: opts, args = getopt.getopt(argv, "hcd:m:t:p:s:x:P:", ["help", "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 elif opt in ('-c', '--compare-to-pdf'): compare2pdf = True elif opt in ('-d', '--xml-dir'): xml_dir = arg elif opt in ('-m', '--manuscript-file'): manuscript_file = arg elif opt in ('-t', '--title'): title = arg elif opt in ('-p', '--page'): page_number = str(arg) elif opt in ('-s', '--svg'): svg_file = arg elif opt in ('-P', '--PDF'): pdfFile = arg elif opt in ('-x', '--xml-target-file'): xml_target_file = str(arg) files_to_process = list() for arg in args: if isfile(arg): files_to_process.append(arg) elif isdir(arg): files_to_process = files_to_process + list(filter(lambda file: '.svg' in file, listdir(arg))) else: print("'{}' does not exist!".format(arg)) return 2 if len(files_to_process) < 1 or args[0].endswith('xml'): if xml_target_file is None: xml_target_file = args[0] if len(args) > 0 else None if xml_target_file is not None and isfile(xml_target_file): target_file_tree = ET.parse(xml_target_file) file_name = target_file_tree.getroot().get('source') title = target_file_tree.getroot().get('title') if title is None else title page_number = target_file_tree.getroot().get('number') if page_number is None else page_number if svg_file is None: if len(target_file_tree.xpath('//svg-image')) > 0: 
svg_file = target_file_tree.xpath('.//svg-image/@file-name')[0]\ if len(target_file_tree.xpath('.//svg-image/@file-name')) > 0 else None else: svg_file = target_file_tree.xpath('.//svg/@file')[0]\ if len(target_file_tree.xpath('.//svg/@file')) > 0 else None files_to_process.insert(0, file_name) if xml_target_file in files_to_process: files_to_process.remove(xml_target_file) else: usage() return 2 if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)): print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!") usage() return 2 extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, compare2pdf=compare2pdf) for file in files_to_process: extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/fix_missing_glyphs.py =================================================================== --- svgscripts/fix_missing_glyphs.py (revision 101) +++ svgscripts/fix_missing_glyphs.py (revision 102) @@ -1,205 +1,210 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to fix missing glyphs. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style import getopt import re import sys from os import listdir, sep, path from os.path import isfile, isdir, dirname import lxml.etree as ET if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.page import Page, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word from util import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False REMOVE_SVG_WORD_POS_PAGE_ENDING = re.compile('_page[0-9]+\w*') def find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=0.0, ymin=0.0): """Finds missing glyph for a PositionalWordPart. 
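The threshold is widened in 0.1 steps (from -0.5 up to THRESHOLD) until glyphs are found near the stored position. A hedged call sketch mirroring the use in fix_missing_glyphs below (pwp and page are hypothetical):

    svg_path_tree = ET.parse(page.svg_file)
    namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
    pwps = find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=0.0, ymin=0.0)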
:return: list of PositionalWordPart """ THRESHOLD = 15.5 #pwp = PositionalWordPart(node=positional_word_part_node) word_part_obj = { "x": pwp.left, "y": pwp.top, "text": pwp.text, "matrix": pwp.transform, "class": pwp.style_class } start_id = int(pwp.id) threshold = -0.5 positional_word_parts = [] while threshold < THRESHOLD and len(positional_word_parts) < 1: try: positional_word_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces,\ start_id=start_id, xmin=xmin, ymin=ymin, threshold=threshold, throw_error_if_not_found=True) except Exception: threshold += 0.1 return positional_word_parts def update_word(word, old_transkription_position, old_positional_word_part, positional_word_parts): """Updates word according to new positional_word_parts. :return: new transkription_position """ if len(positional_word_parts) > 0: debug_msg_string = 'update word from ' + __file__ old_transkription_position.positional_word_parts.remove(old_positional_word_part) positional_word_parts.reverse() for positional_word_part in positional_word_parts: old_transkription_position.positional_word_parts.insert(int(old_positional_word_part.id), positional_word_part) for index, positional_word_part in enumerate(old_transkription_position.positional_word_parts): positional_word_part.id = index transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ old_transkription_position.positional_word_parts, debug_msg_string=debug_msg_string, transkription_position_id=old_transkription_position.id) word.transkription_positions.remove(old_transkription_position) transkription_positions.reverse() for new_tp in transkription_positions: word.transkription_positions.insert(int(old_transkription_position.id), new_tp) text = '' for index, tp in enumerate(word.transkription_positions): tp.id = index tp.writing_process_id = old_transkription_position.writing_process_id for pwp in tp.positional_word_parts: text += pwp.text if word.text != text: word.text = text return transkription_positions[0] def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None): """Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION. """ if isfile(svg_word_pos_file): if not UNITTESTING: print(Fore.LIGHTBLUE_EX + 'Fixing missing glyphs for file {} ... 
'.format(svg_word_pos_file), end='') print(Style.RESET_ALL) page = Page(svg_word_pos_file) - transkription_field = TranskriptionField(page.svg_file) + xmin = 0 + ymin = 0 + if page.svg_image is None or page.svg_image.text_field is None: + transkription_field = TranskriptionField(page.svg_file) + xmin = transkription_field.xmin + ymin = transkription_field.ymin svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) words_without_glyphs = [ word for word in page.words\ if len([ tp for tp in word.transkription_positions\ if len([ pwp for pwp in tp.positional_word_parts if pwp.symbol_id is None]) > 0]) > 0 ] for word in words_without_glyphs: for transkription_position in word.transkription_positions: positional_word_parts = transkription_position.positional_word_parts[:] for positional_word_part in positional_word_parts: if positional_word_part.symbol_id is None: - pwps = find_missing_glyph_for_pwp(positional_word_part, svg_path_tree, namespaces, xmin=transkription_field.xmin, ymin=transkription_field.ymin) + pwps = find_missing_glyph_for_pwp(positional_word_part, svg_path_tree, namespaces, xmin=xmin, ymin=ymin) new_transkription_position = update_word(word, transkription_position, positional_word_part, pwps) if new_transkription_position is not None: transkription_position = new_transkription_position page.update_and_attach_words2tree() write_pretty(xml_element_tree=page.page_tree, file_name=svg_word_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) page = Page(svg_word_pos_file) new_number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) if not UNITTESTING: result_color = Fore.LIGHTBLUE_EX if new_number_of_missing_glyphs == 0 else Fore.MAGENTA print(result_color + ' {0}/{1}'.format(number_of_missing_glyphs-new_number_of_missing_glyphs, number_of_missing_glyphs), end='') print(Fore.LIGHTBLUE_EX + ' fixed.', end='') print(Style.RESET_ALL) if len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) == 0: update_svgposfile_status(svg_word_pos_file, manuscript_file=manuscript_file, status='OK') def get_filelist_and_manuscript_file(file_a, file_b=None): """Returns a file list and a manuscript file (or None) """ file_list = [] manuscript_file = None source_tree = ET.parse(file_a) if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION\ and len([ word_part for word_part in source_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')]) > 0: # if symbol_ids are missing ... file_list.append(file_a) if file_b is not None: manuscript_file = file_b else: manuscript_file = REMOVE_SVG_WORD_POS_PAGE_ENDING.sub('', file_a) if not isfile(manuscript_file): manuscript_file = None elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: manuscript_file = file_a if file_b is not None: file_list.append(file_b) else: file_list = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower())) return file_list, manuscript_file def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to fix missing glyphs. svgscripts/fix_missing_glyphs.py [OPTIONS] -File [-File] a xml file about a manuscript, containing information about its pages. 
a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help: show help :return: exit code (int) """ try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): file_b = None if len(args) > 1 and isfile(args[1]): file_b = args[1] file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b) for svg_word_pos_file in file_list: fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file) else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/datatypes/image.py =================================================================== --- svgscripts/datatypes/image.py (revision 101) +++ svgscripts/datatypes/image.py (revision 102) @@ -1,137 +1,138 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent all image types. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .attachable_object import AttachableObject from .matrix import Matrix from .text_field import TextField sys.path.append('py2ttl') from class_spec import SemanticClass class Image(AttachableObject,SemanticClass): """ This super class represents all types of images. Args: file_name (str): name of the image file. node (lxml.etree.Element) node, containing information URL (str): URL of image file. height (float): height of image width (float): width of image text_field (.text_field.TextField) text_field on image representation """ stringKeys = [ 'file_name', 'URL', 'local_path' ] floatKeys = [ 'height', 'width' ] XML_TAG = 'image' def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, matrix=None, text_field=None, tag=XML_TAG): self.text_field = text_field self.tag = tag if node is not None: self.file_name = node.get('file-name') self.local_path = node.get('local-path') self.URL = node.get('URL') self.height = float(node.get('height')) self.width = float(node.get('width')) self.transform = Matrix(node.get('transform')) if bool(node.get('transform')) and 'matrix(' in node.get('transform') else None if len(node.findall(TextField.XML_TAG)) > 0: self.text_field = TextField(node=node.find(TextField.XML_TAG)) else: self.file_name = file_name self.local_path = local_path self.URL = URL self.height = height self.width = width self.transform = matrix def attach_object_to_tree(self, target_tree): """Attach object to tree. 
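A minimal sketch (file name and tree are hypothetical; an existing node with the same tag is reused, otherwise a new child node is created):

    image = Image(file_name='page1.svg', height=500.0, width=400.0)
    tree = ET.ElementTree(ET.Element('page'))
    image.attach_object_to_tree(tree)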
""" obj_node = target_tree.getroot().find('.//' + self.tag) \ if(len(target_tree.getroot().findall('.//' + self.tag)) > 0) \ else ET.SubElement(target_tree.getroot(), self.tag) for key in self.floatKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) for key in self.stringKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), self.__dict__[key]) if self.transform is not None and self.transform.isRotationMatrix(): obj_node.set('transform', self.transform.toString()) if self.text_field is not None: self.text_field.attach_object_to_tree(obj_node) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {} for floatKey in Image.floatKeys: properties.update(cls.create_semantic_property_dictionary(floatKey, float, cardinality=1)) properties.update(cls.create_semantic_property_dictionary('file_name', str, cardinality=1)) + properties.update(cls.create_semantic_property_dictionary('text_field', TextField)) properties.update(cls.create_semantic_property_dictionary('transform', str)) properties.update(cls.create_semantic_property_dictionary('URL', str, cardinality=1)) dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary class SVGImage(Image): """This class represents a svg image. """ XML_TAG = 'svg-image' def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG): if node is not None and node.tag != self.XML_TAG: file_name = node.get('file') height = float(node.get('height')) if bool(node.get('height')) else 0.0 width = float(node.get('width')) if bool(node.get('width')) else 0.0 node = None super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\ height=height, width=width, text_field=text_field, tag=self.XML_TAG) def decontextualize_file_name(self, update_url=None): """Decontextualize file name. """ self.file_name = self.file_name.replace('./', '') if update_url is not None: self.URL = update_url + self.file_name - @classmethod - def get_semantic_dictionary(cls): - """ Creates and returns a semantic dictionary as specified by SemanticClass. - """ - dictionary = super(SVGImage,cls).get_semantic_dictionary() - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('text_field', TextField)) - return cls.return_dictionary_after_updating_super_classes(dictionary) + +# @classmethod +# def get_semantic_dictionary(cls): +# """ Creates and returns a semantic dictionary as specified by SemanticClass. +# """ +# dictionary = super(SVGImage,cls).get_semantic_dictionary() +# return cls.return_dictionary_after_updating_super_classes(dictionary) Index: svgscripts/datatypes/simple_word.py =================================================================== --- svgscripts/datatypes/simple_word.py (revision 101) +++ svgscripts/datatypes/simple_word.py (revision 102) @@ -1,125 +1,127 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent a simple word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc from lxml import etree as ET import sys from .line import Line from .faksimile_position import FaksimilePosition from .transkription_position import TranskriptionPosition from .word_position import WordPosition sys.path.append('py2ttl') from class_spec import SemanticClass class SimpleWord(SemanticClass, metaclass=abc.ABCMeta): """ This class represents a simple word. """ XML_TAG = 'simple-word' XML_SUB_TAG = 'content' def __init__(self, id=0, line_number=-1, line=None, text='', deleted=False, transkription_positions=None, faksimile_positions=None): self.id = id self.text = text self.line_number = line_number self.lines = [] if line is not None: self.lines.append(line) self.transkription_positions = transkription_positions if transkription_positions is not None else [] self.faksimile_positions = faksimile_positions if faksimile_positions is not None else [] def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() if len(target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)) > 0: word_node = target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)[0] word_node.getparent().remove(word_node) word_node = ET.SubElement(target_tree, self.XML_TAG, attrib={'id': str(self.id)}) word_node.set('text', self.text) if self.line_number > -1: word_node.set('line-number', str(self.line_number)) for id, transkription_position in enumerate(self.transkription_positions): transkription_position.id = id transkription_position.attach_object_to_tree(word_node) for faksimile_position in self.faksimile_positions: faksimile_position.attach_object_to_tree(word_node) return word_node @classmethod def create_cls(cls, word_node): """Creates a cls from a (lxml.Element) node. [:return:] cls """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else -1 text = word_node.get('text') transkription_positions = [ TranskriptionPosition(id=id, node=node) for id, node in enumerate(word_node.findall('./' + WordPosition.TRANSKRIPTION)) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('./' + WordPosition.FAKSIMILE) ] return cls(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) else: error_msg = 'word_node has not been defined' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. 
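A hedged sketch of the expected shape (key constants are inherited from SemanticClass):

    dictionary = SimpleWord.get_semantic_dictionary()
    properties = dictionary[SimpleWord.PROPERTIES_KEY]
    # properties['lines'][SimpleWord.PROPERTY_NAME] == 'wordBelongsToLine'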
""" dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'lines': {cls.CLASS_KEY: Line,\ cls.CARDINALITY: 1,\ cls.CARDINALITY_RESTRICTION: 'minCardinality',\ cls.PROPERTY_NAME: 'wordBelongsToLine',\ cls.PROPERTY_LABEL: 'word belongs to a line',\ cls.PROPERTY_COMMENT: 'Relating a word to a line.'}} properties.update(cls.create_semantic_property_dictionary('transkription_positions', TranskriptionPosition,\ name='hasTranskriptionPosition', cardinality=1, cardinality_restriction='minCardinality')) properties.update(cls.create_semantic_property_dictionary('faksimile_positions', FaksimilePosition,\ name='hasFaksimilePosition')) #, cardinality=1, cardinality_restriction='minCardinality')) properties.update(cls.create_semantic_property_dictionary('text', str, cardinality=1,\ subPropertyOf=cls.HOMOTYPIC_HAS_TEXT_URL_STRING)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def init_word(self, page): """Initialize word with objects from page. """ - for transkription_position in self.transkription_positions: - transkription_position.svg_image = page.svg_image - self.faksimile_positions = FaksimilePosition.create_list_of_cls(self.faksimile_positions, page.faksimile_image, page.text_field) + #for transkription_position in self.transkription_positions: + # transkription_position.svg_image = page.svg_image + #self.faksimile_positions = FaksimilePosition.create_list_of_cls(self.faksimile_positions, page.faksimile_image, page.text_field) if self.line_number > -1: self.lines += [ line for line in page.lines if line.id == self.line_number ] + elif 'word_parts' in self.__dict__.keys() and len(self.word_parts) > 0: + self.lines += [ line for line in page.lines if line.id in [ wp.line_number for wp in self.word_parts ] ] Index: svgscripts/datatypes/transkription_position.py =================================================================== --- svgscripts/datatypes/transkription_position.py (revision 101) +++ svgscripts/datatypes/transkription_position.py (revision 102) @@ -1,200 +1,191 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a transkription word position. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .debug_message import DebugMessage from .image import SVGImage from .positional_word_part import PositionalWordPart from .word_position import WordPosition from .matrix import Matrix sys.path.append('py2ttl') from class_spec import SemanticClass class TranskriptionPosition(WordPosition): """ This class represents the position of a word on the transkription as it is displayed by a svg image. @label position of a word on the topological transkription Args: id (int): word id matrix (datatypes.Matrix): matrix containing information about transformation. height (float): height of word width (float): width of word x (float): x position of word y (float): y position of word positional_word_parts a list of (datatypes.positional_word_part) PositionalWordPart debug_message a (datatypes.debug_message) DebugMessage """ ADD2X = 0.15 ADD2TOP = 1.0 ADD2BOTTOM = 0.2 HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height XML_TAG = WordPosition.TRANSKRIPTION def __init__(self, id=0, node=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, positional_word_parts=None, debug_message=None): super(TranskriptionPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) self.positional_word_parts = positional_word_parts if positional_word_parts is not None else [] self.debug_message = debug_message self.deleted = False self.has_box = None self.style = None self.svg_image = None if node is not None: self.debug_message = DebugMessage(node=node.xpath('.//' + DebugMessage.XML_TAG)[0])\ if len(node.xpath('.//' + DebugMessage.XML_TAG)) > 0 else None self.positional_word_parts = [ PositionalWordPart(node=pwp_node) for pwp_node in node.xpath('.//' + PositionalWordPart.XML_TAG) ] self.attachable_objects += self.positional_word_parts if self.debug_message is not None: self.attachable_objects.append(self.debug_message) - @classmethod - def get_semantic_dictionary(cls): - """ Creates a semantic dictionary as specified by SemanticClass. - """ - dictionary = super(TranskriptionPosition,cls).get_semantic_dictionary() - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('svg_image', SVGImage, cardinality=1,\ - name='isOnSvgImage', label='transkription position is on svg image')) - return cls.return_dictionary_after_updating_super_classes(dictionary) - def get_text(self): """Returns the concatenated text of all positional_word_parts. """ return ''.join([pwp.text for pwp in self.positional_word_parts]) def is_mergebale_with(self, other) -> bool: """Return whether self and other have same writing_process_id or style. """ if self.writing_process_id == other.writing_process_id: return True if self.writing_process_id == -1 or other.writing_process_id == -1\ and (len(self.positional_word_parts) > 0 and len(other.positional_word_parts) > 0): return self.positional_word_parts[0].style_class == other.positional_word_parts[0].style_class return False def split(self, split_position, second_split=-1) ->list: """Split a transkription_position in two at split_position. 
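Positional word parts that end left of split_position form the first new position; with second_split > -1 the remainder is divided once more. A hedged sketch (the x coordinate is hypothetical):

    new_positions = transkription_position.split(100.0)   # two positions, cut at x = 100.0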
:return: a list of the new transkription_positions """ transkription_positions = [] left_pwp = [ pwp for pwp in self.positional_word_parts if pwp.left + pwp.width < split_position ] transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(left_pwp, transkription_position_id=self.id) if second_split == -1: right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id)) else: middle_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp.left + pwp.width < second_split ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(middle_pwp, transkription_position_id=str(next_id)) right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp not in middle_pwp ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id)) return transkription_positions def update_positional_word_parts(self, positional_word_parts): """Update positional_word_parts. """ if len(self.positional_word_parts) > 0 and self.positional_word_parts in self.attachable_objects: self.attachable_objects.remove(self.positional_word_parts) self.positional_word_parts = positional_word_parts self.attachable_objects += self.positional_word_parts @staticmethod def CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=None, debug_msg_string=None, transkription_position_id=0): """Creates a list of TranskriptionPosition from a list of (datatypes.positional_word_part) PositionalWordPart. 
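Consecutive parts are merged into one position as long as their transform matrices and style classes agree; at the first difference the remaining parts are processed recursively into further positions. A hedged sketch (pwps is a hypothetical list of PositionalWordPart objects):

    tps = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps, debug_msg_string='example')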
[:return:] a list of (datatypes.transkription_position) TranskriptionPosition """ TOPCORRECTION = 1 debug_message = DebugMessage(message=debug_msg_string)\ if debug_msg_string is not None else debug_message transkription_positions = [] if len(positional_word_parts) < 1: return [] matrix = positional_word_parts[0].transform index = 0 matrices_differ = False style_class = positional_word_parts[0].style_class styles_differ = False while index < len(positional_word_parts) and not matrices_differ and not styles_differ: if Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix, positional_word_parts[index].transform): matrices_differ = True elif style_class != positional_word_parts[index].style_class: styles_differ = True else: index += 1 if (matrices_differ or styles_differ) and index < len(positional_word_parts): debug_msg_string = 'matrices differ' if matrices_differ else 'styles differ' transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts[index:], debug_msg_string=debug_msg_string, transkription_position_id=int(transkription_position_id)+1) positional_word_parts = positional_word_parts[:index] height = [ pwp.height for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.height, reverse=True)][0] + 2*TOPCORRECTION x = positional_word_parts[0].left - TranskriptionPosition.ADD2X y = [ pwp.top for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.top)][0] - TOPCORRECTION width = positional_word_parts[len(positional_word_parts)-1].left - x\ + positional_word_parts[len(positional_word_parts)-1].width + TranskriptionPosition.ADD2X for pwp_index, pwp in enumerate(positional_word_parts): pwp.id = pwp_index transkription_positions.insert(0, TranskriptionPosition(id=transkription_position_id, height=height, width=width, x=x, y=y, matrix=matrix,\ positional_word_parts=positional_word_parts, debug_message=debug_message)) return transkription_positions @staticmethod def CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=None, debug_msg_string=None, transkription_field=None): """Creates a list of TranskriptionPosition from word_part_objs (i.e. a list of dictionaries with the keys: text, x, y, matrix, class). 
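A hedged sketch of the expected input (values are hypothetical; 'matrix' may be None or a datatypes.matrix.Matrix):

    word_part_objs = [ {'text': 'Hin', 'x': 10.0, 'y': 20.0, 'class': 'st5', 'matrix': None} ]
    tps = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs)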
[:return:] a list of (datatypes.transkription_position) TranskriptionPosition """ positional_word_parts = [] debug_message = DebugMessage(message=debug_msg_string)\ if debug_msg_string is not None else None if page.svg_file is not None and isfile(page.svg_file): svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } xmin = 0.0 ymin = 0.0 if transkription_field is not None: xmin = transkription_field.xmin ymin = transkription_field.ymin for part_obj in word_part_objs: positional_word_parts += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(\ part_obj, svg_path_tree, namespaces, page, start_id=len(positional_word_parts),\ xmin=xmin, ymin=ymin) else: positional_word_parts = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) if len(positional_word_parts) > 0: return TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=debug_message) else: return [ TranskriptionPosition(matrix=matrix, debug_message=debug_message) ] Index: svgscripts/datatypes/faksimile_image.py =================================================================== --- svgscripts/datatypes/faksimile_image.py (revision 101) +++ svgscripts/datatypes/faksimile_image.py (revision 102) @@ -1,108 +1,108 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent faksimile images. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import fnmatch from lxml import etree as ET import os from os.path import basename, dirname, isfile, realpath, sep import sys from .image import Image from .matrix import Matrix from .text_field import TextField sys.path.append('svgscripts') from local_config import FAKSIMILE_LOCATION class FaksimileImage(Image): """ This class represents a faksimile image. Args: file_name (str): name of the image file. node (lxml.etree.Element) node, containing information URL (str): URL of image file. 
height (float): height of image width (float): width of image x (float): x y (float): y """ XML_TAG = 'faksimile-image' #OLD_NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/' NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/' def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, text_field=None): super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL, local_path=local_path,\ height=height, width=width, matrix=matrix, text_field=text_field, tag=self.XML_TAG) self.x = x self.y = y def get_image_joined_with_text_field(self, text_field): """Returns a new instance of itself that has a text_field (text_field.TextField). """ return FaksimileImage(file_name=self.file_name, local_path=self.local_path, URL=self.URL, height=self.height,\ width=self.width, x=self.x, y=self.y, text_field=text_field) - @classmethod - def get_semantic_dictionary(cls): - """ Creates and returns a semantic dictionary as specified by SemanticClass. - """ - dictionary = super(FaksimileImage,cls).get_semantic_dictionary() - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('text_field', TextField)) - #dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('URL', str, cardinality=1)) - return cls.return_dictionary_after_updating_super_classes(dictionary) +# @classmethod +# def get_semantic_dictionary(cls): +# """ Creates and returns a semantic dictionary as specified by SemanticClass. +# """ +# dictionary = super(FaksimileImage,cls).get_semantic_dictionary() +# dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('text_field', TextField)) +# #dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('URL', str, cardinality=1)) +# return cls.return_dictionary_after_updating_super_classes(dictionary) @staticmethod def CREATE_IMAGE(image_node, source_file=None): """Instantiates a FaksimileImage from a (lxml.etree.Element) image_node. 
""" namespaces = image_node.nsmap if len(namespaces) == 0: namespaces = { 'xlink': '' } local_path = image_node.get('{%s}href' % namespaces['xlink']) file_name = basename(local_path) if file_name != local_path and source_file is not None: local_path = realpath(dirname(source_file)) + sep + local_path local_path = realpath(local_path) if not isfile(local_path): local_path = None for path, dirs, files in os.walk(os.path.abspath(FAKSIMILE_LOCATION)): for filename in fnmatch.filter(files, file_name): local_path = os.path.join(path, filename) break URL = FaksimileImage.NIETZSCHE_SOURCES_URL + file_name.replace('.jpg','') height = float(image_node.get('height')) if bool(image_node.get('height')) else 0.0 width = float(image_node.get('width')) if bool(image_node.get('width')) else 0.0 x = float(image_node.get('x')) if bool(image_node.get('x')) else 0.0 y = float(image_node.get('y')) if bool(image_node.get('y')) else 0.0 matrix = Matrix(transform_matrix_string=image_node.get('transform'))\ if bool(image_node.get('transform'))\ else None return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y, matrix=matrix) Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 101) +++ svgscripts/datatypes/page.py (revision 102) @@ -1,321 +1,333 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile, basename from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import re import sys import warnings from .box import Box from .color import Color from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .faksimile_position import FaksimilePosition from .lineNumber import LineNumber from .line import Line from .mark_foreign_hands import MarkForeignHands from .matrix import Matrix from .path import Path from .positional_word_part import PositionalWordPart from .super_page import SuperPage from .style import Style from .text_connection_mark import TextConnectionMark from .text_field import TextField from .transkriptionField import TranskriptionField from .writing_process import WritingProcess from .word import Word from .word_deletion_path import WordDeletionPath from .word_insertion_mark import WordInsertionMark sys.path.append('py2ttl') from class_spec import SemanticClass FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK class Page(SemanticClass,SuperPage): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. faksimile_image: FaksimileImage. faksimile_svgFile: svg file containing information about word positions. """ UNITTESTING = False def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_deletion_paths_to_words=True, number=None): if xml_source_file is not None: super(Page,self).__init__(xml_source_file) self.update_property_dictionary('faksimile_image', faksimile_image) self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile) self.init_all_properties() self.add_style(style_node=self.page_tree.getroot().find('.//style')) + self.faksimile_text_field = None + self.svg_text_field = None self.init_node_objects() if add_deletion_paths_to_words: self.add_deletion_paths_to_words() else: self.number = number def add_deletion_paths_to_words(self): """Add deletion paths to words. """ if (self.svg_file is not None and isfile(self.svg_file))\ or (self.source is not None and isfile(self.source)): svg_file = self.svg_file if self.svg_file is not None else self.source transkription_field = TranskriptionField(svg_file) words = [ word for word in self.words if word.deleted or True in [ part.deleted for part in word.word_parts ]] + tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0 + tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0 for word in words: - word.add_deletion_paths(self.word_deletion_paths, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin) + word.add_deletion_paths(self.word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) @classmethod def create_cls(cls, xml_source_file, create_dummy_page=False): """Create a Page. 
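The branching just introduced in add_deletion_paths_to_words recurs throughout this revision: pages whose svg_image carries a text_field already store word coordinates relative to the text field's origin, while older files store them relative to the whole svg page, so the transkription field offset must be subtracted only in the old case. A reduced sketch of that dispatch (attribute names as in the diff; page and transkription_field are assumed duck-typed):

def offsets_for(page, transkription_field):
    # New-style pages (svg_image with a text_field) need no offset; old-style
    # pages subtract the transkription field's origin from absolute coordinates.
    if page.svg_image is None or page.svg_image.text_field is None:
        return transkription_field.xmin, transkription_field.ymin
    return 0, 0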
""" if not create_dummy_page: return cls(xml_source_file) else: m = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file) if m is not None and len(m.groups()) > 3: number = m.group(3) else: number = basename(xml_source_file).replace('.xml','') return cls(number=number) @classmethod def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None): """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT. [optional: instantiation depends on the fulfilment of a status_contains and/or on the selection of some words by a word_selection_function]. """ source_tree = ET.parse(xml_file) - if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION: + if source_tree.getroot().find('metadata/type').text == cls.FILE_TYPE_SVG_WORD_POSITION: page = cls(xml_file) if word_selection_function is None or len(word_selection_function(page.words)) > 0: return [ page ] else: return [] elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: pages = [] xpath = '//page/@output' if status_contains != '' and status_not_contain != '': xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain) elif status_contains != '': xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains) elif status_not_contain != '': xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain) for xml_source_file in source_tree.xpath(xpath): if isfile(xml_source_file): pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function) return pages else: return [] @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. 
""" dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'number': { 'class': str, 'cardinality': 1}} - properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage)) + properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE)) + properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\ + name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\ + comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD)) properties.update(cls.create_semantic_property_dictionary('orientation', str)) - properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage)) - properties.update(cls.create_semantic_property_dictionary('text_field', TextField,\ - cardinality=1, name='pageIsOnTextField', label='page is on text field',\ - comment='Relates a page to the text field on a faksimile image.')) + properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE)) + properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\ + name='pageIsOnSVGTextField', label='page is on svg text field',\ + comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD)) for key in [ 'lines', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']: properties.update(cls.create_semantic_property_dictionary(key, list)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def init_node_objects(self): """Initialize all node objects. 
""" self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ] self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ] self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] self.word_deletion_paths = [ WordDeletionPath.create_cls(node, self) for node in self.page_tree.xpath('//' + WordDeletionPath.XML_TAG) ] - if self.faksimile_image is not None and self.text_field is not None: - for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks: - simple_word.init_word(self) + if self.faksimile_image is not None and self.faksimile_image.text_field is not None: + self.faksimile_text_field = self.faksimile_image.text_field + if self.svg_image is not None and self.svg_image.text_field is not None: + self.svg_text_field = self.svg_image.text_field + for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks: + simple_word.init_word(self) for wim in self.word_insertion_marks: if wim.line_number > -1: wim.line = [ line for line in self.lines if line.id == wim.line_number ][0] def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]): """Update word ids and attach them to page.page_tree. """ if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None): """Update the data source of page. 
""" if faksimile_svgFile is not None: self.faksimile_svgFile = faksimile_svgFile data_node = self.page_tree.xpath('.//data-source')[0]\ if len(self.page_tree.xpath('.//data-source')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'data-source') data_node.set('file', self.faksimile_svgFile) if xml_correction_file is not None: data_node.set('xml-corrected-words', xml_correction_file) - def update_line_number_area(self, transkription_field, svg_tree=None): + def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True): """Determines the width of the area where the line numbers are written in the page.source file. """ THRESHOLD = 0.4 if svg_tree is None: svg_tree = ET.parse(self.source) if len(self.line_numbers) > 1: line_number = self.line_numbers[9]\ if transkription_field.is_page_verso() and len(self.line_numbers) > 8\ else self.line_numbers[1] ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\ if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\ and LineNumber.IS_A_LINE_NUMBER(item)\ - and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ] + and LineNumber(raw_text_node=item).id == line_number.id ] if len(ln_nodes) > 0: matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform')) if transkription_field.is_page_verso(): transkription_field.add_line_number_area_width(matrix.getX()) elif self.svg_file is not None and isfile(self.svg_file): svg_path_tree = ET.parse(self.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } svg_x = matrix.getX() - svg_y = self.line_numbers[1].bottom + transkription_field.ymin + svg_y = self.line_numbers[1].bottom + transkription_field.ymin\ + if set_to_text_field_zero\ + else self.line_numbers[1].bottom use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) if len(d_strings) > 0 and d_strings[0] != '': path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin transkription_field.add_line_number_area_width(matrix.getX() + width) def update_page_type(self, transkription_field=None): """Adds a source to page and attaches it to page_tree. """ if self.number.endswith('r')\ or self.number.endswith('v'): self.page_type = Page.PAGE_VERSO\ if self.number.endswith('v')\ else Page.PAGE_RECTO else: if transkription_field is None: if self.source is None or not isfile(self.source): raise FileNotFoundError('Page does not have a source!') transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index) self.page_type = Page.PAGE_VERSO\ if transkription_field.is_page_verso()\ else Page.PAGE_RECTO self.page_tree.getroot().set('pageType', self.page_type) def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False): """Update styles of words and add them to their transkription_positions. Args: add_to_parents: Add styles also to word (and if not None to manuscript). partition_according_to_styles: Partition word if its transkription_positions have different styles. 
""" style_dictionary = {} if words is None: words = self.words for word in words: if len(word.word_parts) > 0: self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\ add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles) for transkription_position in word.transkription_positions: if len(transkription_position.positional_word_parts) > 0: style_class = transkription_position.positional_word_parts[0].style_class writing_process_id = -1 for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]: writing_process_id = self.fontsizekey2stage_mapping.get(font_key) style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id) if create_css: if style_dictionary.get((style_class_key, word.deleted)) is None: color = word.deletion_paths[0].style.color\ if len(word.deletion_paths) > 0 else None style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\ create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] ) transkription_position.style = style_dictionary[(style_class_key, word.deleted)] #print(style_dictionary[(style_class_key, word.deleted)]) else: if style_dictionary.get(style_class_key) is None: style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css) style_dictionary[style_class_key].writing_process_id = style_class_key[1] transkription_position.style = style_dictionary[style_class_key] if add_to_parents and transkription_position.style not in word.styles: word.styles.append(transkription_position.style) if partition_according_to_styles: word.split_according_to_status('style', splits_are_parts=True) if manuscript is not None\ and add_to_parents: manuscript.update_styles(*style_dictionary.values()) Index: svgscripts/datatypes/mark_foreign_hands.py =================================================================== --- svgscripts/datatypes/mark_foreign_hands.py (revision 101) +++ svgscripts/datatypes/mark_foreign_hands.py (revision 102) @@ -1,147 +1,148 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent the mark for text by some foreign hand. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from .matrix import Matrix from .special_word import SpecialWord class MarkForeignHands(SpecialWord): """ This class represents the mark for text by some foreign hand. 
""" XML_TAG = 'mark-foreign-hands' XML_SUB_TAG = 'text' CLASS_MARK = '$' REPLACE_DICT = { '+': 'x' } def __init__(self, id=0, line_number=-1, text=CLASS_MARK, foreign_hands_text='', pen='', transkription_positions=[], faksimile_positions=[]): super(MarkForeignHands, self).__init__(id=id, text=text, line_number=line_number,\ transkription_positions=transkription_positions, faksimile_positions=faksimile_positions) self.foreign_hands_text = foreign_hands_text self.pen = pen def add_content(self, node): """Adds content to MarkForeignHands. """ self.foreign_hands_text = node.text self.pen = node.get('pen') def attach_word_to_tree(self, target_tree): """Attaches MarkForeignHands to tree target_tree. """ node = super(MarkForeignHands,self).attach_word_to_tree(target_tree) if self.foreign_hands_text != '': content_node = ET.SubElement(node, MarkForeignHands.XML_SUB_TAG) content_node.text = self.foreign_hands_text if self.pen != '': content_node.set('pen', self.pen) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = super(MarkForeignHands,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('foreign_hands_text',\ str, cardinality=1, name='textOfForeignHands', label='text traces of some foreign hand')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('pen',\ str, cardinality=1, cardinality_restriction='maxCardinality',\ name='penOfForeignHands', label='pen used to write text by some foreign hand')) return cls.return_dictionary_after_updating_super_classes(dictionary) @classmethod def get_special_char_list(cls): """Returns a list of the chars that define this special word. """ return [ cls.CLASS_MARK ] @staticmethod - def find_content(list_of_special_words, transkription_field, svg_tree, style_dict=None, italic_classes=None, SonderzeichenList=None, marginals_extra=False): + def find_content(list_of_special_words, transkription_field, svg_tree, style_dict=None, italic_classes=None, SonderzeichenList=None, marginals_extra=False, set_to_text_field_zero=True): """Find content for the MarkForeignHands. 
""" if style_dict is None: style_dict = {} if italic_classes is None: italic_classes = [] if SonderzeichenList is None: SonderzeichenList = [] if len(style_dict) > 0: if len(italic_classes) == 0: italic_classes = [ key for key in style_dict\ if bool(style_dict[key].get('font-family')) and style_dict[key]['font-family'].endswith('Italic') ] if len(SonderzeichenList) == 0: SonderzeichenList = [ key for key in style_dict\ if bool(style_dict[key].get('font-family')) and style_dict[key]['font-family'].startswith('Sonderzeichen') ] nodes_in_margin_field = [ item for item in filter(lambda x: Matrix.IS_IN_MARGIN_FIELD(x.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] + provide_tf = transkription_field if set_to_text_field_zero else None for mark_foreign_hands in list_of_special_words: relevant_nodes = [ node for node in nodes_in_margin_field\ if is_close((mark_foreign_hands.transkription_positions[0].bottom+mark_foreign_hands.transkription_positions[0].top)/2,\ - node.get('transform'), transkription_field) ] + node.get('transform'), transkription_field=provide_tf) ] relevant_nodes = sorted(relevant_nodes, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX()) italic_found = False mark_foreign_hands_text = '' pen = '' for node in relevant_nodes: if len(node.getchildren()) == 0: if italic_found: pen += node.text elif any(style in italic_classes for style in node.get('class').split(' ')): italic_found = True pen = node.text else: mark_foreign_hands_text += get_text_from_node(node, SonderzeichenList) else: for tspan in node.getchildren(): if italic_found: pen += tspan.text elif any(style in italic_classes for style in tspan.get('class').split(' ')): italic_found = True pen = tspan.text else: mark_foreign_hands_text += get_text_from_node(tspan, SonderzeichenList) mark_foreign_hands.foreign_hands_text = mark_foreign_hands_text mark_foreign_hands.pen = pen def get_text_from_node(node, SonderzeichenList): """Returns the text of node. Replaces Sonderzeichen if node has a style class in SonderzeichenList. """ if any(style in SonderzeichenList for style in node.get('class').split(' '))\ and bool(MarkForeignHands.REPLACE_DICT.get(node.text)): return MarkForeignHands.REPLACE_DICT[node.text] else: return node.text -def is_close(mark_foreign_hands_position, matrix_string, transkription_field): +def is_close(mark_foreign_hands_position, matrix_string, transkription_field=None): """Return true if mark_foreign_hands_position is == matrix.getY()+-THRESHOLD_Y """ THRESHOLD_Y = 4 matrix = Matrix(transform_matrix_string=matrix_string, transkription_field=transkription_field) return abs(mark_foreign_hands_position-matrix.getY()) < THRESHOLD_Y Index: svgscripts/datatypes/box.py =================================================================== --- svgscripts/datatypes/box.py (revision 101) +++ svgscripts/datatypes/box.py (revision 102) @@ -1,141 +1,138 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent svg paths of type 'box'. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from svgpathtools.parser import parse_path import warnings from .matrix import Matrix from .path import Path from .transkriptionField import TranskriptionField class Box(Path): """ This represents box svg paths. Args: node (lxml.etree.Element) node, containing information path (svgpathtools.path.Path) svg path representation. """ XML_TAG = 'box-path' def __init__(self, id=0, node=None, path=None, d_string=None, style_class='', earlier_text='', text_style_class='', earlier_version=False): super(Box,self).__init__(id=id, node=node, path=path, d_string=d_string, style_class=style_class, tag=Box.XML_TAG) self.stringKeys += [ 'earlier_text', 'text_style_class' ] self.earlier_text = earlier_text self.text_style_class = text_style_class self.earlier_version = earlier_version if node is not None: if bool(node.get('earlier-text')): self.earlier_text = node.get('earlier-text') if bool(node.get('text-style-class')): self.text_style_class = node.get('text-style-class') @classmethod - def create_box(cls, path, margin_boxes_on_line, svg_source=None, svg_tree=None, transkription_field=None, namespaces={}, threshold=1.5): + def create_box(cls, path, margin_boxes_on_line, svg_source=None, svg_tree=None, namespaces={}, threshold=1.5): """Create a Box from a path and find its corresponding earlier_text outside of transkription_field. 
:return: box.Box """ if svg_source is not None: svg_tree = ET.parse(svg_source) if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } - if transkription_field is None: - transkription_field = TranskriptionField(svg_source) if svg_source is not None\ - else TranskriptionField(svg_tree.docinfo.URL) matching_boxes = [ margin_box for margin_box in margin_boxes_on_line\ if abs(margin_box.get_median_y()-path.get_median_y()) < threshold ] box = None if len(matching_boxes) > 0: matching_box = matching_boxes[0] margin_boxes_on_line.remove(matching_box) xmin, xmax, ymin, ymax = matching_box.path.bbox() if ymin == ymax: ymin = path.path.bbox()[2] ymax = path.path.bbox()[3] text_nodes = [ text_node for text_node in svg_tree.xpath('//ns:text', namespaces=namespaces)\ if text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax) ] tspan_nodes = [ tspan_node for tspan_node in svg_tree.xpath('//ns:text/ns:tspan', namespaces=namespaces)\ if tspan_node_is_inside_match_box(tspan_node, xmin, xmax, ymin, ymax) ] box_text = '' text_styles = [] if len(text_nodes) > 0: text_nodes = sorted(text_nodes, key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX()) for text_node in text_nodes: if len(text_node.xpath('./ns:tspan', namespaces=namespaces)) == 0: text_styles += [ text_node.get('class') ] box_text += text_node.text else: matrix = Matrix(transform_matrix_string=text_node.get('transform')) for tspan_node in text_node.xpath('./ns:tspan', namespaces=namespaces): if matrix.add2X(add_to_x=tspan_node.get('x')) < xmax: text_styles.append(tspan_node.get('class')) box_text += tspan_node.text elif len(tspan_nodes) > 0: for tspan_node in tspan_nodes: text_styles.append(tspan_node.get('class')) box_text += tspan_node.text else: warnings.warn('No text_node found for xmin, xmax, ymin, ymax: {0} {1} {2} {3}'.format(xmin, xmax, ymin, ymax)) text_style_class = ' '.join(list(set([ item for style in text_styles for item in style.split(' ') ]))) box = Box(id=path.id, path=path.path, style_class=path.style_class,\ earlier_text=box_text.replace(' ',''), text_style_class=text_style_class) else: #print([ margin_box.path.bbox() for margin_box in margin_boxes_on_line ], len(margin_boxes_on_line)) warnings.warn(f'No margin box found for box with bbox: {path.path.bbox()}, {margin_boxes_on_line} {threshold}') return box @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = super(Box,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_text', str)) return cls.return_dictionary_after_updating_super_classes(dictionary) def text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax): """Return true if text_node is inside xmin, xmax, ymin, ymax. """ if not bool(text_node.get('transform')): return False matrix = Matrix(transform_matrix_string=text_node.get('transform')) return matrix.getY() > ymin and matrix.getY() < ymax\ and matrix.getX() > xmin and matrix.getX() < xmax def tspan_node_is_inside_match_box(tspan_node, xmin, xmax, ymin, ymax): """Return true if tspan_node is inside xmin, xmax, ymin, ymax. 
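create_box pairs each box inside the transkription field with the margin box whose vertical midpoint lies within threshold of its own; the matched margin box then supplies the x/y window in which the earlier text is collected. The pairing step as a toy function:

def match_margin_box(path_median_y, margin_boxes, threshold=1.5):
    # margin_boxes: list of (median_y, box) pairs; mirrors the matching_boxes
    # selection in Box.create_box. Returns None when no margin box is close
    # enough, the case create_box reports with a warning.
    for median_y, box in margin_boxes:
        if abs(median_y - path_median_y) < threshold:
            return box
    return None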
""" if not bool(tspan_node.getparent().get('transform')): return False matrix = Matrix(transform_matrix_string=tspan_node.getparent().get('transform')) tspan_x = matrix.add2X(add_to_x=tspan_node.get('x')) return matrix.getY() > ymin and matrix.getY() < ymax\ and tspan_x > xmin and tspan_x < xmax Index: svgscripts/datatypes/faksimile_position.py =================================================================== --- svgscripts/datatypes/faksimile_position.py (revision 101) +++ svgscripts/datatypes/faksimile_position.py (revision 102) @@ -1,83 +1,70 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a faksimile word position. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from .faksimile_image import FaksimileImage from .matrix import Matrix from .positional_object import PositionalObject from .text_field import TextField from .writing_process import WritingProcess from .word_position import WordPosition class FaksimilePosition(WordPosition): """ This class represents the position of a Word on a TextField on a FaksimileImage. Args: id (int): word id matrix (Matrix): matrix containing information about conversion. height (float): height of word width (float): width of word x (float): x position of word y (float): y position of word faksimile_image (FaksimileImage) the faksimile image text_field (TextField) the text_field on the faksimile_image. """ XML_TAG = 'faksimile-position' def __init__(self, id=0, node=None, text=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, faksimile_image=None, text_field=None): super(FaksimilePosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE) self.faksimile_image = faksimile_image self.text_field = text_field if self.text_field is None\ and self.faksimile_image is not None\ and self.faksimile_image.text_field is not None: self.text_field = self.faksimile_image.text_field @classmethod - def get_semantic_dictionary(cls): - """ Creates a semantic dictionary as specified by SemanticClass. 
- """ - dictionary = super(FaksimilePosition,cls).get_semantic_dictionary() - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('faksimile_image',\ - FaksimileImage, cardinality=1, name='isOnFaksimileImage', label='faksimile position is on faksimile image',\ - comment='Relates the faksimile position of a word to the faksimile image')) - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('text_field',\ - TextField, cardinality=1, name='isOnTextField', label='faksimile position is on text field',\ - comment='Relates the faksimile position of a word to its text field on a faksimile image')) - return cls.return_dictionary_after_updating_super_classes(dictionary) - - @classmethod def create_list_of_cls(cls, word_positions, faksimile_image, text_field): """Instantiate cls from a list of WordPosition by adding FaksimileImage and TextField. """ return [ cls(id=wp.id, height=wp.height, width=wp.width, x=wp.left, y=wp.top, matrix=wp.transform,\ faksimile_image=faksimile_image, text_field=text_field)\ for wp in word_positions ] Index: svgscripts/datatypes/page_creator.py =================================================================== --- svgscripts/datatypes/page_creator.py (revision 101) +++ svgscripts/datatypes/page_creator.py (revision 102) @@ -1,128 +1,131 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to create a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import sys import warnings from .box import Box from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .faksimile_position import FaksimilePosition from .lineNumber import LineNumber from .line import Line from .mark_foreign_hands import MarkForeignHands from .matrix import Matrix from .path import Path from .positional_word_part import PositionalWordPart from .super_page import SuperPage from .text_connection_mark import TextConnectionMark from .text_field import TextField from .transkriptionField import TranskriptionField from .writing_process import WritingProcess from .word import Word from .word_insertion_mark import WordInsertionMark FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT class PageCreator(SuperPage): """ This class represents a page. Args: xml_target_file (str): name of the xml file to which page info will be written. 
""" UNITTESTING = False WARNING_MISSING_USE_NODE4PWP = PositionalWordPart.WARN_NO_USE_NODE_FOUND WARNING_MISSING_GLYPH_ID4WIM = WordInsertionMark.WARN_NO_GLYPH_ID PAGE_RECTO = 'recto' PAGE_VERSO = 'verso' - def __init__(self, xml_target_file, title=None, page_number=None, faksimile_image=None, faksimile_svgFile=None, pdfFile=None, svg_file=None, orientation='North', multipage_index=-1, page_type=PAGE_VERSO, source=None, marginals_source=None): + def __init__(self, xml_target_file, title=None, page_number=None, faksimile_image=None, faksimile_svgFile=None, pdfFile=None, svg_file=None, svg_text_field=None, orientation='North', multipage_index=-1, page_type=PAGE_VERSO, source=None, marginals_source=None): super(PageCreator,self).__init__(xml_target_file, title=title, multipage_index=multipage_index, page_number=page_number, orientation=orientation, page_type=page_type) self.update_property_dictionary('faksimile_image', faksimile_image) self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile) self.update_property_dictionary('marginals_source', marginals_source) self.update_property_dictionary('pdfFile', pdfFile) self.update_property_dictionary('svg_file', svg_file) self.update_property_dictionary('source', source) if svg_file is not None and isfile(svg_file): tf = TranskriptionField(svg_file) width = round(tf.documentWidth, 3) height = round(tf.documentHeight, 3) - self.update_property_dictionary('svg_image', SVGImage(file_name=svg_file, width=width, height=height)) + self.update_property_dictionary('svg_image', SVGImage(file_name=svg_file, width=width, height=height, text_field=svg_text_field)) for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG, WritingProcess.XML_TAG, Path.WORD_DELETION_PATH_TAG]: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) self.init_all_properties() + if self.svg_image is not None and self.svg_image.text_field is None: + self.svg_image.text_field = TranskriptionField(self.svg_image.file_name).convert_to_text_field() + self.svg_image.attach_object_to_tree(self.page_tree) def create_writing_processes_and_attach2tree(self): """Creates three stages of Nietzsche's process of writing. """ self.writing_processes = [ WritingProcess(version=WritingProcess.FIRST_VERSION),\ WritingProcess(version=WritingProcess.INSERTION_AND_ADDITION),\ WritingProcess(version=WritingProcess.LATER_INSERTION_AND_ADDITION) ] for writing_process in self.writing_processes: writing_process.attach_object_to_tree(self.page_tree) #for word in self.words: # for transkription_position in word.transkription_positions: # for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): # if font_key in self.fontsizekey2stage_mapping.keys(): # transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key) def init_line_numbers(self, line_numbers, document_bottom): """Init line numbers. 
""" even_index = 0 MINABOVE = 1 self.line_numbers = [] if len(line_numbers) > 0: first_line_bottom = line_numbers[even_index].top - MINABOVE self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 while even_index < len(line_numbers): self.line_numbers.append(LineNumber(id=line_numbers[even_index].id-1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=line_numbers[even_index].top-MINABOVE)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 self.line_numbers.append(LineNumber(id=line_numbers[even_index-1].id+1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=document_bottom)) for line_number in self.line_numbers: line_number.attach_object_to_tree(self.page_tree) Index: svgscripts/datatypes/lineNumber.py =================================================================== --- svgscripts/datatypes/lineNumber.py (revision 101) +++ svgscripts/datatypes/lineNumber.py (revision 102) @@ -1,136 +1,138 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a line number. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import re from lxml import etree as ET from os.path import isfile import sys from .matrix import Matrix sys.path.append('py2ttl') class LineNumber: """ This class represents a line number. Args: file_name (str): name of the xml file to be instantiated. 
""" XML_TAG = 'line-number' WARN_NO_LINE_NUMBER = 'No line number found' MIN_LINE_HIGHT = 5 def __init__(self, id=0, bottom=0.0, top=0.0, raw_text_node=None, transkription_field=None, xml_text_node=None): self.id = id self.bottom = bottom self.top = top self.faksimile_inner_bottom = 0.0 self.faksimile_inner_top = 0.0 self.faksimile_outer_bottom = 0.0 self.faksimile_outer_top = 0.0 if xml_text_node is not None: self.id = int(xml_text_node.get('id')) self.bottom = float(xml_text_node.get('bottom')) self.top = float(xml_text_node.get('top')) self.faksimile_inner_bottom = float(xml_text_node.get('faksimile-inner-bottom')) if bool(xml_text_node.get('faksimile-inner-bottom')) else 0.0 self.faksimile_inner_top = float(xml_text_node.get('faksimile-inner-top')) if bool(xml_text_node.get('faksimile-inner-top')) else 0.0 self.faksimile_outer_bottom = float(xml_text_node.get('faksimile-outer-bottom')) if bool(xml_text_node.get('faksimile-outer-bottom')) else 0.0 self.faksimile_outer_top = float(xml_text_node.get('faksimile-outer-top')) if bool(xml_text_node.get('faksimile-outer-top')) else 0.0 - if raw_text_node is not None and transkription_field is not None: + if raw_text_node is not None: matrix = Matrix(raw_text_node.get('transform'), transkription_field=transkription_field) self.bottom = matrix.getY() self.id = int(raw_text_node.text) if raw_text_node.text is not None\ else int(''.join([x.text for x in raw_text_node.findall('.//tspan', raw_text_node.nsmap)])) @classmethod - def extract_line_numbers(cls, svg_tree, transkription_field) -> list: + def extract_line_numbers(cls, svg_tree, transkription_field, set_to_text_field_zero=True) -> list: """Extracts line numbers. """ nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\ svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))] - line_numbers = [ cls(raw_text_node=item, transkription_field=transkription_field) for item in nodes_near_tf if cls.IS_A_LINE_NUMBER(item)] + provide_tf = None if not set_to_text_field_zero else transkription_field + line_numbers = [ cls(raw_text_node=item, transkription_field=provide_tf) for item in nodes_near_tf if cls.IS_A_LINE_NUMBER(item)] if len(line_numbers) > 0: MINABOVE = 3 + yoffset = 0 if not set_to_text_field_zero else transkription_field.ymin last_to_position = transkription_field.ymin for line_number in line_numbers: - last_to_position = set_line_number_top(svg_tree.getroot(), transkription_field, line_number, last_to_position) + last_to_position = set_line_number_top(svg_tree.getroot(), yoffset, line_number, last_to_position) return line_numbers @staticmethod def IS_A_LINE_NUMBER(raw_text_node): """Returns whether svg node contains a line number. """ if raw_text_node.text is not None: return bool(re.search(r'^[0-9]+$', raw_text_node.text)) elif len(raw_text_node.findall('.//tspan', raw_text_node.nsmap)) > 0: text = ''.join([x.text for x in raw_text_node.findall('.//tspan', raw_text_node.nsmap)]) return bool(re.search(r'^[0-9]+$', text)) return False def setTop(self, top): """Sets top position of line number. """ self.top = top def attach_object_to_tree(self, target_tree): """Attach object to tree. 
""" obj_node = target_tree.getroot().xpath('//' + LineNumber.XML_TAG + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.getroot().xpath('//' + LineNumber.XML_TAG + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree.getroot(), LineNumber.XML_TAG) for key in self.__dict__.keys(): obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) -def set_line_number_top(tree_root, transkription_field, line_number, last_to_position, minabove=3) -> float: +def set_line_number_top(tree_root, yoffset, line_number, last_to_position, minabove=3) -> float: """Set top position of line_number and return next last_to_position. """ - above_current_line_bottom = line_number.bottom + transkription_field.ymin - minabove + above_current_line_bottom = line_number.bottom + yoffset - minabove bottoms = get_bottoms(tree_root, from_position=last_to_position, to_position=above_current_line_bottom) current_line_top = above_current_line_bottom if len(bottoms) > 0: - current_line_top = bottoms[-1] - transkription_field.ymin + minabove + current_line_top = bottoms[-1] - yoffset + minabove if line_number.bottom-current_line_top >= LineNumber.MIN_LINE_HIGHT: line_number.setTop(current_line_top) else: - return set_line_number_top(tree_root, transkription_field, line_number, last_to_position, minabove=minabove+1) + return set_line_number_top(tree_root, yoffset, line_number, last_to_position, minabove=minabove+1) return current_line_top def get_bottoms(tree_root, from_position=-1.0, to_position=-1.0, transkription_field=None) -> list: """Returns all unique bottom values (Float) as a sorted list. """ bottom_list = sorted(set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in tree_root.findall(".//text", tree_root.nsmap))) if transkription_field is not None: from_position = transkription_field.ymin to_position = transkription_field.ymax if from_position > 0.0 and to_position > 0.0: return [ item for item in bottom_list if item > from_position and item < to_position ] else: return bottom_list Index: svgscripts/process_words_post_merging.py =================================================================== --- svgscripts/process_words_post_merging.py (revision 101) +++ svgscripts/process_words_post_merging.py (revision 102) @@ -1,483 +1,492 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path as PathlibPath from progress.bar import Bar import re import shutil import string from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.path import Path as SVGPath from svgpathtools.path import Line import sys import tempfile import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.box import Box from datatypes.manuscript import ArchivalManuscriptUnity from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.path import Path from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids from extract_line_continuation import extract_line_continuations from util import back_up, process_warnings4status from process_files import update_svgposfile_status from process_footnotes import categorize_footnotes sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False DEBUG_WORD = None MERGED_DIR = 'merged' WARNING_FOOTNOTES_ERROR = 'footnotes not processed' WARNING_LINE_CONTINUATION = 'line continuation fail' def categorize_paths(page, transkription_field=None): """Categorize all paths that are part of the transkription field. :return: a dictionary containig a list for each category of path. 
""" if page.source is not None and isfile(page.source): MAX_HEIGHT_LINES = 1 max_line = sorted(\ [line_number.bottom-line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0],\ reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17 - tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0 - tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0 + tr_xmin = 0.0 + tr_ymin = 0.0 + if (page.svg_image is None or page.svg_image.text_field is None)\ + and transkription_field is not None: + tr_xmin = transkription_field.xmin + tr_ymin = transkription_field.ymin paths, attributes = svg_to_paths.svg2paths(page.source) allpaths_on_tf = [] allpaths_outside_tf = [] attributes_outside_tf = [] if transkription_field is None: transkription_field = TranskriptionField(page.source) for index, path in enumerate(paths): attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and path.bbox()[0] >= tr_xmin\ and path.bbox()[1] <= transkription_field.xmax: allpaths_on_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) elif len(path) > 0\ and path != transkription_field.path: allpaths_outside_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) #print(index, allpaths_outside_tf[len(allpaths_outside_tf)-1].path, path) attributes_outside_tf.append(attribute) path_dict = { 'text_area_deletion_paths': [],\ 'deletion_or_underline_paths': [],\ 'box_paths': [],\ 'dots_paths': [],\ 'word_connector_paths': [],\ 'uncategorized_paths': [] } for mypath in allpaths_on_tf: xmin, xmax, ymin, ymax = mypath.path.bbox() start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin) if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1: path_dict.get('dots_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed(): path_dict.get('box_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed(): path_dict.get('word_connector_paths').append(mypath) elif abs(ymax-ymin) < MAX_HEIGHT_LINES: mypath.start_line_number = start_line_number path_dict.get('deletion_or_underline_paths').append(mypath) elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin): # Check for "ladder", i.e. 
a path with 3 segments (seg0 is horizontal on line x, seg1 moves to line x+1, seg2 is horizontal on line x+1) if start_line_number + 1 == page.get_line_number(mypath.path.end.imag-tr_ymin)\ and len(mypath.path._segments) == 3\ and abs(mypath.path._segments[0].bbox()[3]-mypath.path._segments[0].bbox()[2]) < MAX_HEIGHT_LINES\ and abs(mypath.path._segments[2].bbox()[3]-mypath.path._segments[2].bbox()[2]) < MAX_HEIGHT_LINES: for index in 0, 2: new_path = Path(parent_path=mypath, path=SVGPath(mypath.path._segments[index])) new_path.start_line_number = page.get_line_number(new_path.path.start.imag-tr_ymin) path_dict.get('deletion_or_underline_paths').append(new_path) else: path_dict.get('text_area_deletion_paths').append(mypath) else: path_dict.get('uncategorized_paths').append(mypath) underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin) path_dict.update({'underline_path': underline_path}) path_dict['uncategorized_paths'] += process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\ paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line) return path_dict elif not UNITTESTING: error_msg = 'Svg source file {} does not exist!'.format(page.source)\ if page.source is not None else 'Page does not contain a source file!' raise FileNotFoundError(error_msg) return {} def copy_page_to_merged_directory(page, manuscript_file=None): """Copy page to directory that contains the first version of all svg_pos_files that have been merged with the faksimile position data. MERGED_DIR is a subfolder of svg_pos_files-directory. """ svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) target_dir = svg_pos_file.parent / MERGED_DIR if not target_dir.is_dir(): target_dir.mkdir() target_pos_file = target_dir / svg_pos_file.name save_page(page, str(svg_pos_file), target_svg_pos_file=str(target_pos_file), status=STATUS_MERGED_OK, manuscript_file=manuscript_file) def find_special_words(page, transkription_field=None): """Find special words, remove them from words, process their content. 
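categorize_paths sorts every path inside the transkription field into buckets purely by bounding-box geometry and path topology (MAX_HEIGHT_LINES = 1; max_line is derived from the tallest even-numbered line). The decision rules condensed into a standalone classifier:

def categorize(bbox, iscontinuous, isclosed, max_line, crosses_lines):
    # bbox: (xmin, xmax, ymin, ymax); crosses_lines stands for start and end
    # lying on different line numbers. Mirrors the buckets of categorize_paths.
    MAX_HEIGHT_LINES = 1
    xmin, xmax, ymin, ymax = bbox
    width, height = abs(xmax - xmin), abs(ymax - ymin)
    if width < 1 and height < 1:
        return 'dots_paths'
    if MAX_HEIGHT_LINES < height < max_line and iscontinuous and isclosed:
        return 'box_paths'
    if height > MAX_HEIGHT_LINES and height > max_line and iscontinuous and not isclosed:
        return 'word_connector_paths'
    if height < MAX_HEIGHT_LINES:
        return 'deletion_or_underline_paths'
    if crosses_lines:
        # the real code additionally splits three-segment "ladders" into two
        # deletion/underline paths, one per line
        return 'text_area_deletion_paths'
    return 'uncategorized_paths'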
""" if page.source is None or not isfile(page.source): raise FileNotFoundError('Page does not have a source!') if transkription_field is None: transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) + set_to_text_field_zero = (page.svg_image is None or page.svg_image.text_field is None) special_char_list = MarkForeignHands.get_special_char_list() special_char_list += TextConnectionMark.get_special_char_list() single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ] if not UNITTESTING: bar = Bar('find special words', max=len(single_char_words)) for word in single_char_words: not bool(UNITTESTING) and bar.next() if word.text == MarkForeignHands.CLASS_MARK: id = len(page.mark_foreign_hands) page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id)) page.words.remove(word) elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\ or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\ and any(style in page.sonderzeichen_list for style\ in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))): id = len(page.text_connection_marks) page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id)) page.words.remove(word) not bool(UNITTESTING) and bar.finish() svg_tree = ET.parse(page.source) page.update_page_type(transkription_field=transkription_field) - page.update_line_number_area(transkription_field, svg_tree=svg_tree) + page.update_line_number_area(transkription_field, svg_tree=svg_tree, set_to_text_field_zero=set_to_text_field_zero) if page.marginals_source is not None: svg_tree = ET.parse(page.marginals_source) italic_classes = [ key for key in page.style_dict\ if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ] if len(page.mark_foreign_hands) > 0: MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\ - SonderzeichenList=page.sonderzeichen_list) + SonderzeichenList=page.sonderzeichen_list, set_to_text_field_zero=set_to_text_field_zero) if len(page.text_connection_marks) > 0: TextConnectionMark.find_content_in_footnotes(page, transkription_field, svg_tree) def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks all words that intersect with deletion paths as deleted and adds these paths to word_deletion_paths. [:return:] list of .path.Path that might be word_underline_paths """ if not UNITTESTING: bar = Bar('mark words that intersect with deletion paths', max=len(page.words)) for word in page.words: not bool(UNITTESTING) and bar.next() word = mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) for part_word in word.word_parts: part_word = mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) word.partition_according_to_deletion() not bool(UNITTESTING) and bar.finish() # return those paths in deletion_paths that are not in page.word_deletion_paths return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ] def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks word if it intersects with deletion paths as deleted and adds these paths to word_deletion_paths. 
def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Mark a word as deleted if it intersects with deletion paths
        and add those paths to word_deletion_paths.

    [:return:] word
    """
    word.deleted = False
    for transkription_position in word.transkription_positions:
        word_path = Path.create_path_from_transkription_position(transkription_position,\
                tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        intersecting_paths = [ deletion_path for deletion_path in deletion_paths\
                if do_paths_intersect_saveMode(deletion_path, word_path) ]
        if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number:
            relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ]
            #print(word.line_number, word_path.path.bbox(), [ path.path.bbox() for path in relevant_paths ])
        if len(intersecting_paths) > 0:
            #print(f'{word.line_number}: {word.id}, {word.text}: {intersecting_paths}')
            transkription_position.deleted = True
            for deletion_path in intersecting_paths:
                if deletion_path.parent_path is not None:
                    deletion_path = deletion_path.parent_path
                if deletion_path not in page.word_deletion_paths:
                    deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                    deletion_path.attach_object_to_tree(page.page_tree)
                    page.word_deletion_paths.append(deletion_path)
    return word

def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None):
    """Process words after merging with faksimile word positions.
    """
    if page is None and svg_pos_file is None:
        raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!')
    if page is None:
        page = Page(svg_pos_file)
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file))
    if svg_pos_file is None:
        svg_pos_file = page.page_tree.docinfo.URL
    if new_words is not None:
        page.words = sorted(new_words, key=attrgetter('id'))
        for word_node in page.page_tree.xpath('.//word'):
            word_node.getparent().remove(word_node)
    manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\
            if manuscript_file is not None\
            else None
    copy_page_to_merged_directory(page, manuscript_file=manuscript_file)
    transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index)
    update_faksimile_line_positions(page)
    status = STATUS_MERGED_OK
    page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
    categorize_paths(page, transkription_field=transkription_field)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('default')
        try:
            find_special_words(page, transkription_field=transkription_field)
            categorize_footnotes(page)
            extract_line_continuations(page, warning_message=WARNING_LINE_CONTINUATION)
        except Exception:
            warnings.warn(WARNING_FOOTNOTES_ERROR)
    status = process_warnings4status(w, [ WARNING_FOOTNOTES_ERROR, WARNING_LINE_CONTINUATION ], status, STATUS_POSTMERGED_OK)
    save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file)
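# Editorial sketch: post_merging_processing_and_saving derives its status from
# warnings recorded with the standard-library pattern below (standalone, runnable):
#
#     import warnings
#     with warnings.catch_warnings(record=True) as w:
#         warnings.simplefilter('default')
#         warnings.warn('line continuation')
#     [ str(warning.message) for warning in w ]   # -> ['line continuation']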
def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list:
    """Process word boxes: partition words according to word boxes.

    [:return:] a list of paths that are not boxes
    """
    MAX_HEIGHT_LINES = 1
    not_boxes = []
    if not UNITTESTING:
        bar = Bar('process word boxes', max=len(page.words))
    svg_tree = ET.parse(page.source)
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    allpaths_on_margin_field = []
+   tr_xmin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\
+           else transkription_field.xmin
+   tr_ymin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\
+           else transkription_field.ymin
    if paths is None or attributes is None:
        paths = []
        raw_paths, attributes = svg_to_paths.svg2paths(page.source)
        for index, raw_path in enumerate(raw_paths):
            paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page))
    for index, mypath in enumerate(paths):
        path = mypath.path
        xmin, xmax, ymin, ymax = path.bbox()
        attribute = attributes[index]
        if len(path) > 0\
        and path != transkription_field.path\
        and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
            or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
        and abs(ymax-ymin) < max_line:
            allpaths_on_margin_field.append(mypath)#Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
    box_line_number_dict = {}
    for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
-       line_number = page.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin))
+       line_number = page.get_line_number(box_path.get_median_y(tr_ymin=tr_ymin))
        if line_number > 0:
            if line_number not in box_line_number_dict.keys():
                box_line_number_dict.update({ line_number: [ box_path ]})
            else:
                box_line_number_dict.get(line_number).append(box_path)
    boxes = []
    for line_number in box_line_number_dict.keys():
        box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
        margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
-               if page.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\
+               if page.get_line_number(margin_box.get_median_y(tr_ymin=tr_ymin)) == line_number ],\
                key=lambda path: path.get_x())
        threshold = 3 if line_number % 2 == 0 else 1.5
        if len(margin_boxes_on_line) > 0:
            for box_path in box_paths_on_line:
                #print(line_number, box_path.path.d(), len(margin_boxes_on_line))
                box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
-                       transkription_field=transkription_field, namespaces=namespaces, threshold=threshold)
+                       namespaces=namespaces, threshold=threshold)
                if box is not None:
                    boxes.append(box)
        else:
            not_boxes += box_paths_on_line
    if len(boxes) > 0:
        for word in page.words:
-           word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
+           word.process_boxes(boxes, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            word.create_correction_history(page)
            if not bool(UNITTESTING):
                bar.next()
            elif word.earlier_version is not None:
                #print(f'{word.text} -> {word.earlier_version.text}')
                if word.earlier_version.earlier_version is not None:
                    print(f'{word.earlier_version.earlier_version.text}')
    not bool(UNITTESTING) and bar.finish()
    return not_boxes
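# Editorial sketch: the hunks above swap transkription_field offsets for tr_xmin/tr_ymin
# so that pages whose svg_image already carries a text field (coordinates presumably
# relative to the text field's zero point) are not shifted a second time. With
# hypothetical numbers:
#
#     transkription_field_ymin = 100.0
#     median_y = 120.0    # absolute y of a box in old data
#     # old data: line lookup uses median_y - transkription_field_ymin == 20.0
#     # new data (text-field zero): tr_ymin == 0, so lookup uses median_y == 120.0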
""" svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) first_merge_version = svg_pos_file.parent / MERGED_DIR / svg_pos_file.name if first_merge_version.exists(): page = Page(str(first_merge_version)) else: word_with_wordparts = [ word for word in page.words if len(word.word_parts) > 0 ] word_with_wordparts += [ word for word in page.words if word.earlier_version is not None ] page_changed = False if len(word_with_wordparts) > 0: for word in word_with_wordparts: word.undo_partitioning() update_transkription_position_ids(word) page_changed = True no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if len(no_line_numbers) > 0: for word in no_line_numbers: if len(word.transkription_positions) > 0: word.line_number = page.get_line_number((word.transkription_positions[0].top+word.transkription_positions[0].bottom)/2) else: msg = f'Word {word.id} {word.text} has no transkription_position!' warnings.warn(msg) page_changed = True if page_changed: page.update_and_attach_words2tree() def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None): """Save page to target_file and update status of file. """ page.update_and_attach_words2tree() if not UNITTESTING: if target_svg_pos_file is None: target_svg_pos_file = svg_pos_file if status is not None: update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status) write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def update_faksimile_line_positions(page): """Update faksimile_positions of the lines """ num_lines = len(page.line_numbers) ymin = page.text_field.ymin\ if page.text_field is not None\ else 0.0 for line_number in page.line_numbers: if len([ word.faksimile_positions[0] for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) > 0: line_number.faksimile_inner_top = min([ word.faksimile_positions[0].top for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) line_number.faksimile_inner_bottom = max([ word.faksimile_positions[0].bottom for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) if line_number.id % 2 == 0: line_number.faksimile_outer_top = line_number.faksimile_inner_top - ymin line_number.faksimile_outer_bottom = line_number.faksimile_inner_bottom - ymin for index, line_number in enumerate(page.line_numbers): if line_number.faksimile_inner_bottom == 0.0\ or line_number.faksimile_inner_bottom < line_number.faksimile_inner_top: if index == 0 and num_lines > 1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].top elif index == num_lines-1 and page.text_field is not None: line_number.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3) elif index > 0 and index < num_lines-1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].faksimile_inner_top\ if page.line_numbers[index+1].faksimile_inner_top > page.line_numbers[index-1].faksimile_inner_bottom\ else page.line_numbers[index-1].faksimile_inner_bottom line_number.attach_object_to_tree(page.page_tree) def update_writing_process_ids(page): """Update the writing_process_ids of the words and split accordingly. 
""" for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to process words after they have been merged with faksimile data. svgscripts/process_words_post_merging.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help -i|--include-missing-line-number run script on files that contain words without line numbers -r|--rerun rerun script on a svg_pos_file that has already been processed :return: exit code (int) """ status_not_contain = STATUS_POSTMERGED_OK include_missing_line_number = False try: opts, args = getopt.getopt(argv, "hir", ["help", "include-missing-line-number", "rerun" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-i', '--include-missing-line-number'): include_missing_line_number = True elif opt in ('-r', '--rerun'): status_not_contain = '' if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain): reset_page(page) no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if not include_missing_line_number and len(no_line_numbers) > 0: not UNITTESTING and print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!') for word in no_line_numbers: not UNITTESTING and print(f'Word {word.id}: {word.text}') else: back_up(page, page.xml_file) not UNITTESTING and print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL) post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:]))