Page Menu | Home | c4science

test_extractWordPosition.py
No One | Temporary

File Metadata

Created
Tue, May 14, 03:50

test_extractWordPosition.py

import unittest
import os
from os import sep, path
from os.path import isfile, isdir, dirname
import re
import shutil
import tempfile
import lxml.etree as ET
from lxml.etree import XMLSyntaxError
import sys
sys.path.append('svgscripts')
import extractWordPosition
from myxmlwriter import write_pretty
from datatypes.transkriptionField import TranskriptionField
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.pdf import PDFText
from datatypes.word import Word
from datatypes.lineNumber import LineNumber
from datatypes.word_insertion_mark import WordInsertionMark
class TestExtractor(unittest.TestCase):
    """Tests for ``extractWordPosition.Extractor`` and the ``Page`` calls it relies on.

    Fixtures are read from the ``test_data`` directory next to this file.
    Output is written either to a per-test temporary directory or to ``xml/``
    relative to the current working directory.
    """

    def setUp(self):
        """Create the temporary output directory and resolve all fixture paths."""
        # abspath() keeps the fixture paths valid when __file__ has no
        # directory component: dirname() would return '' and a plain
        # "dirname + sep + name" concatenation would point at the
        # filesystem root.
        data_dir = path.join(dirname(path.abspath(__file__)), 'test_data')
        self.test_file_find_word = path.join(data_dir, 'test_find_word.xml')
        self.test_dir = tempfile.mkdtemp()
        self.title = 'ABC 111'
        self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)'
        self.test_file = path.join(data_dir, 'Mp_XIV_1_mytest_421.svg')
        self.test_empty_file = path.join(data_dir, 'my_empty_test.svg')
        self.test_source = path.join(data_dir, 'Mp_XIV_1_mytest_421.xml')
        self.xml420 = path.join(data_dir, 'Mp_XIV_1_page420.xml')
        self.pdf420 = path.join(data_dir, 'Mp_XIV_1_online_420.pdf')
        self.pdf_file = path.join(data_dir, 'W_I_8_page125.pdf')
        self.faulty_xml = path.join(data_dir, 'W_I_8_faulty_page125.xml')
        self.pdf_xml = path.join(data_dir, 'W_I_8_page125.xml')
        self.pdf_xml_source = path.join(data_dir, 'W_I_8_neu_125-01.svg')

    @staticmethod
    def _remove_if_exists(file_name):
        """Delete *file_name* if it is an existing regular file."""
        if isfile(file_name):
            os.remove(file_name)

    def test_main(self):
        """extract_information raises XMLSyntaxError for ``my_empty_test.svg``."""
        extractWordPosition.Extractor.UNITTESTING = True
        Page.UNITTESTING = True
        # Build the extractor outside the assertRaises block so that only
        # extract_information() can satisfy the expected exception.
        extractor = extractWordPosition.Extractor()
        with self.assertRaises(XMLSyntaxError):
            extractor.extract_information(self.test_empty_file)

    def test_update_title(self):
        """update_title_and_manuscript sets the title and creates the manuscript file."""
        extractor = extractWordPosition.Extractor(xml_dir=self.test_dir)
        extractor.update_title_and_manuscript('test')
        manuscript_file = '{}/test.xml'.format(self.test_dir)
        self.assertEqual(extractor.title, 'test')
        self.assertEqual(extractor.manuscript_file, manuscript_file)
        self.assertTrue(isfile(manuscript_file))

    def test_get_page_number(self):
        """get_page_number returns a three-character page number string."""
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.get_page_number(self.test_file, page_number='1'), '001')
        self.assertEqual(extractor.get_page_number(self.test_file), '421')

    def test_get_file_name(self):
        """get_file_name builds the target name from the source file or the title."""
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.get_file_name(self.test_file), 'xml/Mp_XIV_1_mytest_421.xml')

        titled_name = 'xml/{}_page421.xml'.format(self.title.replace(' ', '_'))
        extractor = extractWordPosition.Extractor(title=self.title)
        self.assertEqual(extractor.get_file_name(self.test_file), titled_name)

        # An extractor initialised from a manuscript file must yield the
        # same file name as one initialised with the title.
        extractorA = extractWordPosition.Extractor(title=self.title)
        extractorB = extractWordPosition.Extractor(manuscript_file=extractorA.manuscript_file)
        self.assertEqual(extractorB.get_file_name(self.test_file), titled_name)

    def test_get_style(self):
        """get_style returns the Sonderzeichen classes and the style dictionary."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        sonderzeichen_list, _, style_dict = extractor.get_style(svg_tree.getroot())
        self.assertEqual(sonderzeichen_list, ['st21', 'st23'])
        self.assertEqual(style_dict.get('st11').get('font-family'), 'Frutiger-Europeen')
        self.assertEqual(style_dict.get('st5').get('stroke'), '#CED5CE')

    def test_get_word_from_part_obj(self):
        """get_word_from_part_obj concatenates the 'text' entries of the parts."""
        extractor = extractWordPosition.Extractor()
        mylist = [{'text': 'a', 'class': 'asdf'}, {'text': 'b', 'endX': 0}, {'text': 'c'}]
        self.assertEqual(extractor.get_word_from_part_obj(mylist), 'abc')

    def test_get_bottoms(self):
        """get_bottoms honours position limits and the transkription field."""
        svg_tree = ET.parse(self.test_file)
        extractor = extractWordPosition.Extractor()

        mybottoms = extractor.get_bottoms(svg_tree.getroot())
        self.assertEqual(mybottoms[0], '57.1914')
        self.assertEqual(len(mybottoms), 106)
        self.assertEqual(mybottoms[-1], '1155.6899')

        mybottoms = extractor.get_bottoms(svg_tree.getroot(), from_position=100.0, to_position=800.0)
        self.assertEqual(mybottoms[0], '100.5132')
        self.assertEqual(len(mybottoms), 84)
        self.assertEqual(mybottoms[-1], '792.8218')

        tf = TranskriptionField(self.test_file)
        mybottoms = extractor.get_bottoms(svg_tree.getroot(), transkription_field=tf)
        self.assertEqual(mybottoms[0], '91.7134')
        self.assertEqual(len(mybottoms), 75)
        self.assertEqual(mybottoms[-1], '681.7134')

    def test_get_text_items(self):
        """get_text_items yields the text nodes, optionally limited by the transkription field."""
        svg_tree = ET.parse(self.test_file)
        extractor = extractWordPosition.Extractor()
        mytest_items = list(extractor.get_text_items(svg_tree.getroot()))
        self.assertEqual(len(mytest_items), 300)
        self.assertEqual(mytest_items[0].get('transform'), 'matrix(1 0 0 1 386.8218 57.1914)')

        tf = TranskriptionField(self.test_file)
        mytest_itemsTF = list(extractor.get_text_items(svg_tree.getroot(), transkription_field=tf))
        self.assertEqual(mytest_itemsTF[0].get('transform'), 'matrix(1 0 0 1 204.8618 91.7134)')

    def test_init_tree_and_target_file(self):
        """A Page keeps its title and has no style node, before and after a round trip to disk."""
        target_file = 'xml/testA.xml'
        # Registered up front so the file is removed even when an assertion fails.
        self.addCleanup(self._remove_if_exists, target_file)

        page = Page(xml_target_file=target_file, title=self.title)
        tree = page.page_tree
        self.assertEqual(tree.getroot().get('title'), self.title)
        self.assertEqual(tree.getroot().findall('./style'), [])

        write_pretty(xml_element_tree=tree, file_name=target_file)
        page = Page(xml_target_file=target_file)
        tree = page.page_tree
        self.assertEqual(tree.getroot().get('title'), self.title)
        self.assertEqual(tree.getroot().findall('./style'), [])

    def test_add_style(self):
        """add_style writes the style node; adding it again to the reloaded page gives the same result."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        sonderzeichen_list, _, style_dict = extractor.get_style(svg_tree.getroot())
        target_file = 'xml/testA.xml'
        # Registered up front so the file is removed even when an assertion fails.
        self.addCleanup(self._remove_if_exists, target_file)

        # First pass: new page created with a title.
        # Second pass: page loaded from the file written by the first pass.
        for page_kwargs in ({'title': self.title}, {}):
            page = Page(xml_target_file=target_file, **page_kwargs)
            page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
            write_pretty(xml_element_tree=page.page_tree, file_name=target_file)

            root = ET.parse(target_file).getroot()
            self.assertEqual(root.get('title'), self.title)
            self.assertEqual(root.find("style").get('Sonderzeichen'), "st21 st23")
            self.assertEqual(root.find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
            self.assertEqual(root.find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen')

    def test_add_word(self):
        """add_word returns the next word id and the words are attached to the page tree."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        mylist = [{'text': 'a'}, {'text': 'b'}, {'text': 'c'}]
        matrix = Matrix(self.matrix_string)
        for part in mylist:
            part['class'] = 'st22'
            part['x'] = matrix.add2X(0)
            part['y'] = matrix.getY()

        target_file = path.join(self.test_dir, 'asdfasdf.xml')
        page = Page(xml_target_file=target_file)
        sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
        page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
        self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 1)

        # Change the middle part to class 'st21' (one of the Sonderzeichen
        # classes asserted in test_get_style) and add the parts again.
        mylist[1]['text'] = 'A'
        mylist[1]['class'] = 'st21'
        mylist[1]['x'] = matrix.add2X(1)
        self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 2)

        extractor.update_and_attach_words2tree(page)
        # NOTE(review): these two assertions were already disabled in the
        # original; re-enable or delete once their status is confirmed.
        #self.assertEqual(page.word_insertion_marks[0].x, 184.656)
        #self.assertEqual(page.word_insertion_marks[0].y, 197.913)
        root = page.page_tree.getroot()
        self.assertEqual(root.xpath('//word[@id="1"]')[0].get('text'), 'a')
        self.assertEqual(root.xpath('//word[@id="2"]')[0].get('text'), 'c')
        self.assertEqual(root.xpath('//word[@id="2"]/transkription-position')[0].get('left'), '183.506')
        self.assertEqual(root.xpath('//word[@id="2"]/transkription-position')[0].get('height'), '8.25')

    def test_extractor(self):
        """An Extractor created without arguments has the expected defaults."""
        extractor = extractWordPosition.Extractor()
        self.assertIsNone(extractor.title)
        self.assertIsNone(extractor.manuscript_file)
        self.assertEqual(extractor.xml_dir, 'xml/')
        self.assertIsNone(extractor.manuscript_tree)

    def test_write_title_to_manuscript_file(self):
        """The title given at construction can be read back from the manuscript file."""
        extractor = extractWordPosition.Extractor(xml_dir=self.test_dir, title=self.title)
        self.assertTrue(isfile(extractor.manuscript_file))
        extractor = extractWordPosition.Extractor(manuscript_file=extractor.manuscript_file)
        self.assertEqual(extractor.title, self.title)

    def test_extract_line_numbers(self):
        """extract_line_numbers returns the line numbers found in the test SVG."""
        svg_tree = ET.parse(self.test_file)
        tf = TranskriptionField(self.test_file)
        extractor = extractWordPosition.Extractor()
        line_numbers = extractor.extract_line_numbers(svg_tree, tf)
        self.assertEqual(line_numbers[0].id, 2)
        self.assertEqual(len(line_numbers), 24)
        self.assertEqual(line_numbers[0].top, 45.163)

    def tearDown(self):
        """Remove the temporary directory and the manuscript file under ``xml/``, if present."""
        if isdir(self.test_dir):
            shutil.rmtree(self.test_dir)
        manuscript_file = '{}/{}.xml'.format('xml', self.title.replace(' ', '_'))
        if isfile(manuscript_file):
            os.remove(manuscript_file)
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

Event Timeline