Page Menu — Home — c4science

test_process_words_post_merging.py
No One — Temporary

File Metadata

Created
Thu, Mar 28, 15:30

test_process_words_post_merging.py

import unittest
from os import sep, path, remove
from os.path import isdir, isfile, dirname
import shutil
import sys
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
import process_words_post_merging
from datatypes.faksimile import FaksimilePage
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word
from datatypes.word_position import WordPosition
class TestPostMerge(unittest.TestCase):
    """Unit tests for svgscripts/process_words_post_merging.

    All fixture files are expected under ``tests_svgscripts/test_data``.
    Tests marked ``@unittest.skip`` are either long-running or rely on
    files that are not part of the repository's test data.
    """

    def setUp(self):
        """Switch the module into unit-testing mode and resolve fixture paths."""
        process_words_post_merging.UNITTESTING = True
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
        self.manuscript = DATADIR + sep + 'N_VII_1.xml'
        # Replace only the file extension; a plain replace('.', ...) would
        # also rewrite any dot occurring earlier in the directory path.
        self.manuscript_copy = self.manuscript.replace('.xml', '_copy.xml')
        self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
        self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
        self.Mp_XIV_1_mytest_421 = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml'
        self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page001.xml'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.xml_merged = DATADIR + sep + 'N_VII_1_page005_faksimile_merged.xml'

    @unittest.skip('takes long')
    def test_main(self):
        """Smoke-test the module's main() on a whole manuscript file."""
        process_words_post_merging.main([self.manuscript])

    def test_categorize_paths(self):
        """categorize_paths should split the svg paths of a transcription
        into deletion/underline paths and word-box paths, and mark the
        affected words accordingly."""
        page = Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        tr = TranskriptionField(page.source)
        # Restrict to a single line to keep the test fast.
        page.words = [word for word in page.words if word.line_number == 33]
        path_dict = process_words_post_merging.categorize_paths(page, tr)
        # Word 269 must not be marked as deleted.
        self.assertEqual(True in [word.deleted for word in page.words if word.id == 269], False)
        self.assertEqual(len(path_dict.get('deletion_or_underline_paths')) > 0, True)
        self.assertEqual(len(path_dict.get('box_paths')), 5)
        words = [word for word in page.words if word.text == 'seiner']
        self.assertEqual(len(words), 1)
        # 'seiner' overwrites an earlier ')' on the page.
        self.assertTrue(words[0].word_parts[0].overwrites_word is not None)
        self.assertEqual(words[0].word_parts[0].overwrites_word.text, ')')

    def test_find_special_words(self):
        """find_special_words should detect marks of foreign hands and
        text connection marks on a page."""
        page = Page(self.xml_file)
        process_words_post_merging.find_special_words(page)
        self.assertEqual(len(page.mark_foreign_hands), 1)
        self.assertEqual(page.mark_foreign_hands[0].foreign_hands_text, 'x')
        # Re-attaching the words must not raise.
        page.update_and_attach_words2tree()
        page = Page(self.test_tcm_xml)
        process_words_post_merging.find_special_words(page)
        self.assertEqual(len(page.text_connection_marks), 1)
        self.assertEqual(page.text_connection_marks[0].text_source.first_line, 2)

    def test_process_word_boxes(self):
        """process_word_boxes should attach earlier versions / overwritten
        words to the words covered by the given box paths."""
        page = Page(self.pdf_xml)  # W_I_8_page125.xml
        page.source = self.pdf_xml_source
        page.update_styles(partition_according_to_styles=True)
        tr = TranskriptionField(page.source)
        box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',
                      'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',
                      'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',
                      'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',
                      'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
        box_paths = [Path(d_string=d_string) for d_string in box_path_d]
        process_words_post_merging.process_word_boxes(page, box_paths, tr)
        words_with_boxes = [word for word in page.words if word.word_box is not None
                            or len([part for part in word.word_parts if part.word_box is not None]) > 0]
        expected_values = {'Aber': {'text': 'aber'}, 'seiner': {'text': ')'},
                           'mit': {'text': ','}, '(–': {'text': ':'}, 'Um': {'text': 'Denn'}}
        self.assertEqual(len(words_with_boxes), len(expected_values.keys()))
        references = [words_with_boxes[0].earlier_version,
                      words_with_boxes[1].word_parts[0].overwrites_word,
                      words_with_boxes[2].word_parts[0].overwrites_word,
                      words_with_boxes[3].word_parts[0].overwrites_word,
                      words_with_boxes[4].overwrites_word]
        for index, key in enumerate(expected_values.keys()):
            expected_values[key].update({'reference': references[index]})
        for word in words_with_boxes:
            self.assertEqual(expected_values[word.text].get('reference') is not None, True)

    @unittest.skip('relies on local file')
    def test_process_word_boxes_multiple_boxes_perLIne(self):
        """Like test_process_word_boxes, but with several boxes on the same
        line (needs xml/N_VII_1_page034.xml, which is not in test_data)."""
        page = Page('xml/N_VII_1_page034.xml')
        page.update_styles(partition_according_to_styles=True)
        page.words[205].word_parts[0].deleted = True
        page.words[205].word_parts[3].deleted = True
        tr = TranskriptionField(page.source)
        box_path_d = ['M 69.497,460.726 L 81.959,460.726 L 81.959,467.404 L 69.497,467.404 L 69.497,460.726', 'M 65.997,461.974 L 68.084,461.974 L 68.084,467.277 L 65.997,467.277 L 65.997,461.974', 'M 191.939,423.806 L 197.602,423.806 L 197.602,431.817 L 191.939,431.817 L 191.939,423.806', 'M 47.048,245.659 L 63.779,245.659 L 63.779,252.795 L 47.048,252.795 L 47.048,245.659', 'M 180.995,89.054 L 188.23000000000002,89.054 L 188.23000000000002,95.515 L 180.995,95.515 L 180.995,89.054', 'M 142.367,90.315 L 149.72799999999998,90.315 L 149.72799999999998,95.515 L 142.367,95.515 L 142.367,90.315', 'M 133.745,90.143 L 137.48000000000002,90.143 L 137.48000000000002,95.554 L 133.745,95.554 L 133.745,90.143']
        box_paths = [Path(d_string=d_string) for d_string in box_path_d]
        process_words_post_merging.process_word_boxes(page, box_paths, tr)
        words_with_boxes = [word for word in page.words if word.word_box is not None
                            or word.has_mixed_status('word_box', include_parts=True)]
        expected_values = {'großen': {'text': 'größtem'}, 'daß': {'text': 'dem'}, 'seine': {'text': 'ihre'},
                           'Rococo-Geschmack': {'text': 'Rococo-geschmack'}, '(:': {'text': '–'}, 'und': {'text': 'es'}}
        self.assertEqual(len(words_with_boxes), len(expected_values.keys()))
        references = [words_with_boxes[0].earlier_version,
                      words_with_boxes[1].earlier_version,
                      words_with_boxes[2].overwrites_word,
                      words_with_boxes[3].earlier_version,
                      words_with_boxes[4].word_parts[1].overwrites_word,
                      words_with_boxes[5].overwrites_word]
        for index, key in enumerate(expected_values.keys()):
            expected_values[key].update({'reference': references[index]})
        for word in words_with_boxes:
            if expected_values[word.text].get('reference') is None:
                print(word.text, len(word.word_parts))
            self.assertEqual(expected_values[word.text].get('reference') is not None, True)
            self.assertEqual(expected_values[word.text].get('reference').text, expected_values[word.text].get('text'))

    def test_update_faksimile_line_positions(self):
        """update_faksimile_line_positions should run without raising on a
        merged page (no return value to assert)."""
        page = Page(self.xml_merged)
        process_words_post_merging.update_faksimile_line_positions(page)

    def test_update_writing_process_ids(self):
        """update_writing_process_ids should partition 'Aber' (line 2) into
        two parts with writing process ids 1 and 0."""
        page = Page(self.pdf_xml)
        page.words = [word for word in page.words if word.text == 'Aber' and word.line_number == 2]
        process_words_post_merging.update_writing_process_ids(page)
        self.assertEqual(len(page.words[0].word_parts), 2)
        self.assertEqual(page.words[0].word_parts[0].writing_process_id, 1)
        self.assertEqual(page.words[0].word_parts[1].writing_process_id, 0)

    @unittest.skip('takes long')
    def test_reset_page(self):
        """reset_page should undo all post-merging processing: no word may
        keep an earlier version or word parts afterwards."""
        page = Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        process_words_post_merging.post_merging_processing_and_saving(page=page)
        numWordParts = 7
        # Processing a second time must be idempotent w.r.t. word parts.
        process_words_post_merging.post_merging_processing_and_saving(page=page)
        self.assertEqual(len([word for word in page.words if len(word.word_parts) > 0]), numWordParts)
        process_words_post_merging.reset_page(page)
        self.assertEqual(len([word for word in page.words if word.earlier_version is not None]), 0)
        self.assertEqual(len([word for word in page.words if len(word.word_parts) > 0]), 0)
# Allow running this test module directly with `python3 <file>`.
if __name__ == "__main__":
    unittest.main()

Event Timeline