Page MenuHomec4science

test_pdf.py
No OneTemporary

File Metadata

Created
Wed, May 15, 17:04

test_pdf.py

import unittest
from os import sep, path
from os.path import isdir, dirname, basename
import lxml.etree as ET
import sys
import re
import sys
sys.path.append('svgscripts')
from datatypes.pdf import PDFText
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word
from extractWordPosition import Extractor
class TestPDFText(unittest.TestCase):
    """Tests for PDFText, run against the PDF/XML/SVG fixtures in ``test_data``."""

    def setUp(self):
        """Locate the ``test_data`` directory and store the fixture paths.

        ``test_data`` is looked up next to this file first and, if it is not a
        directory there, in the parent directory.
        """
        # Resolve __file__ to an absolute path and build paths with path.join:
        # plain concatenation with ``sep`` turns an empty dirname (relative
        # __file__) into the root path '/test_data', and dirname(dirname(...))
        # of a relative path does not reliably reach the parent directory.
        test_dir = dirname(path.abspath(__file__))
        data_dir = path.join(test_dir, 'test_data')
        if not isdir(data_dir):
            data_dir = path.join(dirname(test_dir), 'test_data')
        self.pdf_file = path.join(data_dir, 'Mp_XIV_1_online_420.pdf')
        self.pdf_fileB = path.join(data_dir, 'W_I_8_page125.pdf')
        self.xml420 = path.join(data_dir, 'Mp_XIV_1_page420.xml')
        self.xml420_source = path.join(data_dir, 'Mp_XIV_1_online_420.svg')
        self.pdf420 = path.join(data_dir, 'Mp_XIV_1_online_420.pdf')
        self.faulty_xml = path.join(data_dir, 'W_I_8_faulty_page125.xml')
        self.pdf_xml = path.join(data_dir, 'W_I_8_page125.xml')
        self.pdf_source = path.join(data_dir, "W_I_8_neu_125-01.svg")
        self.dir = data_dir

    def test_init(self):
        """Check the number of text nodes, an id lookup, and that page number 1 raises."""
        pdftext = PDFText(self.pdf_file)
        self.assertEqual(len(pdftext.text_tree.xpath('.//text')), 102)
        self.assertEqual(len(pdftext.text_tree.xpath('.//text[@id="{0}"]'.format(101))), 1)
        with self.assertRaises(Exception):
            PDFText(self.pdf_file, current_page_number=1)

    def test_tree_contains_text_at(self):
        """Check that 'nicht' is reported at position (146.1, 81)."""
        x = 146.1
        y = 81
        pdftext = PDFText(self.pdf_file)
        self.assertEqual(pdftext.tree_contains_text_at('nicht', x, y), True)

    def test_tree_contains_text(self):
        """Check text lookups with and without a position on the second PDF."""
        pdftext = PDFText(self.pdf_fileB)
        self.assertEqual(pdftext.tree_contains_text('richtiger(richtiger'), False)
        self.assertEqual(pdftext.tree_contains_text('2ter'), True)
        self.assertEqual(pdftext.tree_contains_text_at('$', 320, 183), True)

    def test_split_str_according_to_pdf_tree(self):
        """Check that leading '.' and trailing '(' are removed from the returned string."""
        pdftext = PDFText(self.pdf_fileB)
        self.assertEqual(pdftext.split_str_according_to_pdf_tree('.Insofern'), 'Insofern')
        self.assertEqual(pdftext.split_str_according_to_pdf_tree('sticht('), 'sticht')
        self.assertEqual(pdftext.split_str_according_to_pdf_tree('.sticht('), 'sticht')

    def test_split_wrongly_concatenated_words(self):
        """Check that the single concatenated word of the faulty page becomes two words."""
        page = Page(self.faulty_xml)
        self.assertIn('wünschtheißt.', [item.text for item in page.words])
        self.assertEqual(len(page.words), 1)
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        page.words = pdftext.split_wrongly_concatenated_words(page)
        self.assertNotIn('wünschtheißt.', [item.text for item in page.words])
        self.assertEqual(len(page.words), 2)

    @unittest.skip("have to fix PDFText.add_punctuation2words")
    def test_add_punctuation2words(self):
        """Check the counts of punctuation-only words and of all words before and after."""
        page = Page(self.pdf_xml)
        tr = TranskriptionField(self.pdf_source)
        pat = r'^[-.=,:;?]$'
        punctuations = [word for word in page.words if re.match(pat, word.text)]
        self.assertEqual(len(punctuations), 5)
        self.assertEqual(len(page.words), 430)
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        pdftext.add_punctuation2words(page, transkription_field=tr)
        punctuations = [word for word in page.words if re.match(pat, word.text)]
        self.assertEqual(len(punctuations), 1)
        self.assertEqual(len(page.words), 426)

    @unittest.skip("have to fix PDFText.join_composita")
    def test_add_composita(self):
        """Check that no word matching the composita pattern is left after join_composita."""
        page = Page(self.pdf_xml)
        tr = TranskriptionField(self.pdf_source)
        pat = r'^[=-]\s*[A-Z]'
        composita_part = [word for word in page.words if re.match(pat, word.text)]
        self.assertEqual(len(composita_part), 1)
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        pdftext.join_composita(page, transkription_field=tr)
        composita_part = [word for word in page.words if re.match(pat, word.text)]
        self.assertEqual(len(composita_part), 0)
        self.assertEqual(len(page.words), 429)

    @unittest.skip("have to fix PDFText.join_single_char_words")
    def test_join_single_char_words(self):
        """Check that line 19 has no single-character words left after joining."""
        pat = r'^\w$'
        # Disabled variant of this test using the xml420/pdf420 fixtures,
        # kept for reference:
        #
        # page = PageCreator(self.xml420, pdfFile=self.pdf420)
        # tr = TranskriptionField(page.source) if page.source is not None else None
        # page.words[:] = [ word for word in page.words if word.line_number == 13 ]
        # singles = [ word for word in page.words if re.match(pat, word.text) ]
        # #print(['{}/{}: {}'.format(word.line_number, word.id, word.text) for word in singles])
        # self.assertEqual(len(singles), 8)
        # pdftext = PDFText(page.pdfFile, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        # pdftext.join_single_char_words(page, transkription_field=tr)
        # singles = [ word for word in page.words if re.match(pat, word.text) ]
        # #print(['----->{}/{}: {}'.format(word.line_number, word.id, word.text) for word in singles])
        # self.assertEqual(len(singles), 0)
        page = PageCreator(self.pdf_xml, pdfFile=self.pdf_fileB)
        # Restrict the page to line 19 (in place) before counting.
        page.words[:] = [word for word in page.words if word.line_number == 19]
        tr = TranskriptionField(self.dir + sep + page.source) if page.source is not None else None
        singles = [word for word in page.words if re.match(pat, word.text)]
        self.assertEqual(len(singles), 26)
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        pdftext.join_single_char_words(page, transkription_field=tr)
        singles = [word for word in page.words if re.match(pat, word.text)]
        self.assertEqual(len(singles), 0)
        self.assertIn(':', [word.text for word in page.words])

    @unittest.skip("have to fix PDFText.find_word_path")
    def test_find_word_path(self):
        """Check that find_word_path returns as many words as the expected line-19 sequence."""
        page = PageCreator(self.pdf_xml, pdfFile=self.pdf_fileB)
        full_line19 = [word for word in page.words if word.line_number == 19]
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        words_on_path = pdftext.find_word_path(full_line19)
        # Only the length of this sequence is compared, not its contents.
        expected_words = [
            ':', 'aber', 'schon', 'in', 'der', 'Gebur', 't',
            'd', 'e', 'r', 'T', 'r', 'a', 'g', 'ö', 'd', 'i', 'e',
            'u', '.', 'i', 'h', 'r', 'e', 'r', 'L', 'e', 'h', 'r', 'e',
            'v', 'o', 'm', 'Dionys.', 'ist', 'der', 'Schop.',
            'Pessimismus', 'überwunden.',
        ]
        self.assertEqual(len(words_on_path), len(expected_words))
if __name__ == "__main__":
    # Run the tests of this module when the file is executed as a script.
    unittest.main()

Event Timeline