Page MenuHomec4science

test_extractWordPosition.py
No OneTemporary

File Metadata

Created
Fri, May 3, 02:59

test_extractWordPosition.py

import unittest
import os
from os import sep, path
from os.path import isfile, isdir, dirname
import re
import shutil
import tempfile
import lxml.etree as ET
from lxml.etree import XMLSyntaxError
import sys
sys.path.append('svgscripts')
import extractWordPosition
from myxmlwriter import write_pretty
from datatypes.transkriptionField import TranskriptionField
from datatypes.matrix import Matrix
from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION
from datatypes.page import Page
from datatypes.pdf import PDFText
from datatypes.word import Word
from datatypes.lineNumber import LineNumber
from datatypes.word_insertion_mark import WordInsertionMark
def test_write(xml_element_tree=None, file_name=None):
    """Write ``xml_element_tree`` via ``write_pretty`` as a word-position file.

    Helper used by the tests below; it always writes with
    ``script_name='test'`` and ``file_type=FILE_TYPE_SVG_WORD_POSITION``.

    Args:
        xml_element_tree: the tree handed on to ``write_pretty``.
        file_name: the target file handed on to ``write_pretty``.
    """
    # The previous version hard-coded ``file_name=None`` and thereby silently
    # dropped the caller's argument, although callers pass the file they read
    # back afterwards.
    # NOTE(review): with the default ``file_name=None`` the behaviour is
    # unchanged; what write_pretty does in that case is not visible here.
    write_pretty(
        xml_element_tree=xml_element_tree,
        file_name=file_name,
        script_name='test',
        file_type=FILE_TYPE_SVG_WORD_POSITION,
    )
class TestExtractor(unittest.TestCase):
    """Tests for ``extractWordPosition.Extractor`` and the page objects it fills."""

    def setUp(self):
        """Switch the extractor to unittest mode and define the fixture paths."""
        extractWordPosition.Extractor.UNITTESTING = True
        # path.join keeps the directory relative when dirname(__file__) is ''
        # (script started from its own directory); plain concatenation with
        # sep would yield the absolute path '/test_data' in that case.
        data_dir = path.join(dirname(__file__), 'test_data')

        def data_file(name):
            """Return the path of fixture ``name`` inside the data directory."""
            return path.join(data_dir, name)

        self.test_file_find_word = data_file('test_find_word.xml')
        self.test_dir = tempfile.mkdtemp()
        self.title = 'ABC 111'
        self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)'
        self.test_file = data_file('Mp_XIV_1_mytest_421.svg')
        self.test_empty_file = data_file('my_empty_test.svg')
        self.test_source = data_file('Mp_XIV_1_mytest_421.xml')
        self.xml420 = data_file('Mp_XIV_1_page420.xml')
        self.pdf420 = data_file('Mp_XIV_1_online_420.pdf')
        self.pdf_file = data_file('W_I_8_page125.pdf')
        self.faulty_xml = data_file('W_I_8_faulty_page125.xml')
        self.pdf_xml = data_file('W_I_8_page125.xml')
        self.pdf_xml_source = data_file('W_I_8_neu_125-01.svg')
        self.testA = data_file('testA.xml')
        self.multipage = data_file('multipage_small_above.svg')

    @staticmethod
    def _remove_file(file_name):
        """Delete ``file_name`` if it exists; used as a cleanup callback."""
        if isfile(file_name):
            os.remove(file_name)

    def test_extract_information(self):
        """Check extract_information on a multipage SVG and on a page source."""
        extractor = extractWordPosition.Extractor()
        page = extractor.extract_information(self.multipage, multipage_index=0)
        self.assertEqual(len(page.words), 59)
        self.assertEqual(page.multipage_index, 0)
        page = extractor.extract_information(self.multipage, multipage_index=1)
        self.assertEqual(page.multipage_index, 1)
        self.assertGreater(len(page.words), 59)

        # NOTE: this path is relative to the current working directory.
        source_page = Page('xml/Mp_XV_page78v.xml')
        # Fresh extractor for the second scenario.
        extractor = extractWordPosition.Extractor()
        transkription_field = TranskriptionField(source_page.source)
        svg_tree = ET.parse(source_page.source)
        text_items = extractor.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)
        self.assertIn(
            'matrix(1 0 0 1 115.6299 719.3535)',
            [item.get('transform') for item in text_items],
        )
        page = extractor.extract_information(source_page.source, svg_file=source_page.svg_file)
        self.assertIsNotNone(page.svg_image.text_field)

    def test_update_title(self):
        """Check that update_title_and_manuscript sets title and manuscript file."""
        extractor = extractWordPosition.Extractor(xml_dir=self.test_dir)
        extractor.update_title_and_manuscript('test')
        self.assertEqual(extractor.title, 'test')
        self.assertEqual(extractor.manuscript_file, '{}/test.xml'.format(self.test_dir))
        self.assertTrue(isfile('{}/test.xml'.format(self.test_dir)))

    def test_get_page_number(self):
        """Check get_page_number with and without an explicit page_number."""
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.get_page_number(self.test_file, page_number='1'), '001')
        self.assertEqual(extractor.get_page_number(self.test_file), '421')

    def test_get_file_name(self):
        """Check get_file_name without title, with title and with a manuscript file."""
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.get_file_name(self.test_file), 'xml/Mp_XIV_1_mytest_421.xml')
        expected_with_title = 'xml/{}_page421.xml'.format(self.title.replace(' ', '_'))
        extractor = extractWordPosition.Extractor(title=self.title)
        self.assertEqual(extractor.get_file_name(self.test_file), expected_with_title)
        extractorA = extractWordPosition.Extractor(title=self.title)
        extractorB = extractWordPosition.Extractor(manuscript_file=extractorA.manuscript_file)
        self.assertEqual(extractorB.get_file_name(self.test_file), expected_with_title)

    def test_get_style(self):
        """Check the style information get_style reads from the test SVG."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        sonderzeichen_list, _, style_dict = extractor.get_style(svg_tree.getroot())
        self.assertEqual(sonderzeichen_list, ['st21', 'st23'])
        self.assertEqual(style_dict.get('st11').get('font-family'), 'Frutiger-Europeen')
        self.assertEqual(style_dict.get('st5').get('stroke'), '#CED5CE')

    def test_get_word_from_part_obj(self):
        """Check that get_word_from_part_obj returns 'abc' for parts a, b, c."""
        extractor = extractWordPosition.Extractor()
        mylist = [{'text': 'a', 'class': 'asdf'}, {'text': 'b', 'endX': 0}, {'text': 'c'}]
        self.assertEqual(extractor.get_word_from_part_obj(mylist), 'abc')

    def test_get_break_points(self):
        """Check that _get_break_points returns at least one break point."""
        extractor = extractWordPosition.Extractor()
        page = Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        matrix = Matrix('matrix(1 0 0 1 543.8164 173.9126)')
        matrixB = Matrix('matrix(1 0 0 1 573.6758 173.9126)')
        matrixC = Matrix('matrix(1 0 0 1 575.9873 173.9126)')
        mylist = [
            {'text': 'es', 'class': 'st5 st6', 'x': matrix.add2X(23.968), 'y': matrix.getY()},
            {'text': 'A', 'class': 'st9 st10', 'x': matrixB.getX(), 'y': matrixB.getY()},
            {'text': 'sich', 'class': "st5 st6", 'x': matrixC.getX(), 'y': matrixC.getY()},
        ]
        break_points = extractor._get_break_points(page, mylist)
        self.assertGreater(len(break_points), 0)

    def test_get_text_items(self):
        """Check get_text_items with and without a transkription field."""
        svg_tree = ET.parse(self.test_file)
        extractor = extractWordPosition.Extractor()
        mytest_items = list(extractor.get_text_items(svg_tree.getroot()))
        self.assertEqual(len(mytest_items), 300)
        self.assertEqual(mytest_items[0].get('transform'), 'matrix(1 0 0 1 386.8218 57.1914)')
        tf = TranskriptionField(self.test_file)
        mytest_itemsTF = list(extractor.get_text_items(svg_tree.getroot(), transkription_field=tf))
        self.assertEqual(mytest_itemsTF[0].get('transform'), 'matrix(1 0 0 1 204.8618 91.7134)')

    def test_init_tree_and_target_file(self):
        """Check the tree of a new PageCreator and of one re-read after writing."""
        target_file = self.testA
        # Registered up front so the file is removed even if an assertion fails.
        self.addCleanup(self._remove_file, target_file)
        page = PageCreator(target_file, title=self.title)
        tree = page.page_tree
        self.assertEqual(tree.getroot().get('title'), self.title)
        self.assertEqual(tree.getroot().findall('./style'), [])
        test_write(xml_element_tree=tree, file_name=target_file)
        page = PageCreator(target_file)
        tree = page.page_tree
        self.assertEqual(tree.getroot().get('title'), self.title)
        self.assertEqual(tree.getroot().findall('./style'), [])

    def test_add_style(self):
        """Check that add_style output is found in the written target file."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        sonderzeichen_list, _, style_dict = extractor.get_style(svg_tree.getroot())
        target_file = self.testA
        # Registered up front so the file is removed even if an assertion fails.
        self.addCleanup(self._remove_file, target_file)

        def assert_style_written():
            """Parse the target file and check title and style attributes."""
            root = ET.parse(target_file).getroot()
            self.assertEqual(root.get('title'), self.title)
            self.assertEqual(root.find("style").get('Sonderzeichen'), "st21 st23")
            self.assertEqual(root.find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
            self.assertEqual(
                root.find("style").find("class[@name='st11']").get('font-family'),
                'Frutiger-Europeen',
            )

        # First pass: page created with a title.
        page = PageCreator(target_file, title=self.title)
        page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
        test_write(xml_element_tree=page.page_tree, file_name=target_file)
        assert_style_written()

        # Second pass: page created from the target file only, style added again.
        page = PageCreator(target_file)
        page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
        test_write(xml_element_tree=page.page_tree, file_name=target_file)
        assert_style_written()

    def test_add_word(self):
        """Check the return values of add_word and the words attached to the tree."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        word_parts = [{'text': 'a'}, {'text': 'b'}, {'text': 'c'}]
        matrix = Matrix(self.matrix_string)
        for part in word_parts:  # formerly named ``dict``, shadowing the builtin
            part['class'] = 'st22'
            part['x'] = matrix.add2X(0)
            part['y'] = matrix.getY()
        target_file = path.join(self.test_dir, 'asdfasdf.xml')
        page = PageCreator(target_file)
        sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
        page.add_style(
            sonderzeichen_list=sonderzeichen_list,
            letterspacing_list=letterspacing_list,
            style_dict=style_dict,
        )
        self.assertEqual(extractor.add_word(page, 0, word_parts, '%', 0), 1)
        # Change the middle part before adding the same list a second time.
        word_parts[1]['text'] = 'A'
        word_parts[1]['class'] = 'st21'
        word_parts[1]['x'] = matrix.add2X(1)
        self.assertEqual(extractor.add_word(page, 0, word_parts, '%', 0), 2)
        page.update_and_attach_words2tree()
        root = page.page_tree.getroot()
        self.assertEqual(root.xpath('//word[@id="1"]')[0].get('text'), 'a')
        self.assertEqual(root.xpath('//word[@id="2"]')[0].get('text'), 'c')
        self.assertEqual(root.xpath('//word[@id="2"]/transkription-position')[0].get('left'), '183.506')
        self.assertEqual(root.xpath('//word[@id="2"]/transkription-position')[0].get('height'), '8.25')

    def test_extractor(self):
        """Check the attributes of an Extractor created without arguments."""
        extractor = extractWordPosition.Extractor()
        self.assertIsNone(extractor.title)
        self.assertIsNone(extractor.manuscript_file)
        self.assertEqual(extractor.xml_dir, 'xml/')
        self.assertIsNone(extractor.manuscript_tree)

    def test_write_title_to_manuscript_file(self):
        """Check that the title can be read back from the manuscript file."""
        extractor = extractWordPosition.Extractor(xml_dir=self.test_dir, title=self.title)
        self.assertTrue(isfile(extractor.manuscript_file))
        extractor = extractWordPosition.Extractor(manuscript_file=extractor.manuscript_file)
        self.assertEqual(extractor.title, self.title)

    def tearDown(self):
        """Remove the temporary directory and a manuscript file left in 'xml'."""
        if isdir(self.test_dir):
            shutil.rmtree(self.test_dir)
        manuscript_file = '{}/{}.xml'.format('xml', self.title.replace(' ', '_'))
        if isfile(manuscript_file):
            os.remove(manuscript_file)
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()

Event Timeline