Page MenuHomec4science

test_process_words_post_merging.py
No OneTemporary

File Metadata

Created
Sun, Apr 28, 03:14

test_process_words_post_merging.py

import unittest
from os import sep, path, remove
from os.path import isdir, isfile, dirname
import shutil
import sys
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
import process_words_post_merging
from datatypes.faksimile import FaksimilePage
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word_position import WordPosition
class TestPostMerge(unittest.TestCase):
    """Unit tests for the post-merging word-processing steps in
    ``process_words_post_merging`` (path categorization, special-word
    detection, word boxes, writing-process ids, and page reset)."""

    def setUp(self):
        """Enable the module's unittest mode and resolve test-data file paths."""
        process_words_post_merging.UNITTESTING = True
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
        self.manuscript = DATADIR + sep + 'N_VII_1.xml'
        # NOTE: str.replace swaps the extension dot -> 'N_VII_1_copy.xml'
        # (only one '.' occurs in the basename, so replacing all is safe here).
        self.manuscript_copy = self.manuscript.replace('.', '_copy.')
        self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
        self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
        self.Mp_XIV_1_mytest_421 = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml'
        self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page001.xml'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'

    def test_main(self):
        """Smoke test: main() processes the test manuscript without raising."""
        process_words_post_merging.main([self.manuscript])

    def test_categorize_paths(self):
        """categorize_paths() should leave word 269 undeleted and find
        deletion/underline paths plus exactly 5 box paths on line 33."""
        page = Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        tr = TranskriptionField(page.source)
        page.words = [word for word in page.words if word.line_number == 33]
        path_dict = process_words_post_merging.categorize_paths(page, tr)
        self.assertEqual(True in [word.deleted for word in page.words if word.id == 269], False)
        self.assertEqual(len(path_dict.get('deletion_or_underline_paths')) > 0, True)
        self.assertEqual(len(path_dict.get('box_paths')), 5)
        # TODO: re-enable once earlier-version extraction from box paths is fixed:
        # words = [ word for word in page.words if len(word.box_paths) > 0 ]
        # self.assertEqual(len(words), 1)
        # self.assertEqual(words[0].word_parts[0].earlier_version is not None, True)
        # self.assertEqual(words[0].word_parts[0].earlier_version.text, ')')

    def test_find_special_words(self):
        """find_special_words() should detect one foreign-hand mark on page005
        and one text connection mark on page001."""
        page = Page(self.xml_file)
        process_words_post_merging.find_special_words(page)
        self.assertEqual(len(page.mark_foreign_hands), 1)
        self.assertEqual(page.mark_foreign_hands[0].foreign_hands_text, 'x')
        # Attach words back to the tree so serialization of the marks is exercised.
        page.update_and_attach_words2tree()
        page = Page(self.test_tcm_xml)
        process_words_post_merging.find_special_words(page)
        self.assertEqual(len(page.text_connection_marks), 1)
        self.assertEqual(page.text_connection_marks[0].text_source.first_line, 2)
        # Debug helper, kept for reference:
        # page.update_and_attach_words2tree()
        # nodes = page.page_tree.xpath('//' + TextConnectionMark.XML_TAG)
        # print(ET.dump(nodes[0]))

    @unittest.skip('fix word.process_boxes first')
    def test_process_word_boxes(self):
        """process_word_boxes() should attach boxes to five words and recover
        each word's earlier version text (currently skipped pending a fix)."""
        page = Page(self.pdf_xml)  # W_I_8_page125.xml
        page.source = self.pdf_xml_source
        for word in page.words:
            word.partition_according_to_writing_process_id()
        tr = TranskriptionField(page.source)
        box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
            'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
            'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
            'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
            'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
        box_paths = [Path(d_string=d_string) for d_string in box_path_d]
        process_words_post_merging.process_word_boxes(page, box_paths, tr)
        words_with_boxes = [word for word in page.words if len(word.box_paths) > 0]
        expected_values = {'Aber': 'aber', 'seiner': ')', 'mit': ',', '(–': ':', 'Um': 'Denn'}
        self.assertEqual(len(words_with_boxes), len(expected_values.keys()))
        for word in words_with_boxes:
            self.assertEqual(word.earlier_version.text, expected_values.get(word.text))
            #print(f'{word.id} {word.text} "{word.earlier_version.text}" "{word.box_paths[0].earlier_text}"')

    def test_update_writing_process_ids(self):
        """update_writing_process_ids() should split 'Aber' (line 2) into two
        parts with writing process ids 1 and 0."""
        page = Page(self.pdf_xml)
        page.words = [word for word in page.words if word.text == 'Aber' and word.line_number == 2]
        process_words_post_merging.update_writing_process_ids(page)
        self.assertEqual(len(page.words[0].word_parts), 2)
        self.assertEqual(page.words[0].word_parts[0].writing_process_id, 1)
        self.assertEqual(page.words[0].word_parts[1].writing_process_id, 0)

    @unittest.skip('takes long')
    #@unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover')
    def test_reset_page(self):
        """reset_page() should clear earlier versions and word parts after
        post-merging processing has populated them (skipped: slow)."""
        page = Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        process_words_post_merging.post_merging_processing_and_saving(page=page)
        numWordParts = 7
        # Processing twice must be idempotent with respect to word-part count.
        process_words_post_merging.post_merging_processing_and_saving(page=page)
        self.assertEqual(len([word for word in page.words if len(word.word_parts) > 0]), numWordParts)
        process_words_post_merging.reset_page(page)
        self.assertEqual(len([word for word in page.words if word.earlier_version is not None]), 0)
        self.assertEqual(len([word for word in page.words if len(word.word_parts) > 0]), 0)
# Run the test suite when this file is executed directly (rather than discovered).
if __name__ == "__main__":
    unittest.main()

Event Timeline