Index: svgscripts/test_word_insertion_mark.py
===================================================================
--- svgscripts/test_word_insertion_mark.py (revision 27)
+++ svgscripts/test_word_insertion_mark.py (revision 28)
@@ -1,58 +1,63 @@
import unittest
from os import sep, path
from os.path import dirname, isdir
import lxml.etree as ET
from datatypes.transkriptionField import TranskriptionField
from datatypes.word_insertion_mark import WordInsertionMark
from datatypes.word import Word
class TestWordInsertionMark(unittest.TestCase):
def setUp(self):
DATADIR = dirname(__file__) + sep + 'test_data'
if not isdir(DATADIR):
DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
self.reference_file = DATADIR + sep + 'test_find_word.xml'
self.test_svg_file = DATADIR + sep + 'path_svg.svg'
def test_WIM_with_input(self):
wim = WordInsertionMark(x=1.0, y=1.0, previous_word_id=0, inserted_word_id=1)
self.assertEqual(wim.id, '0')
self.assertEqual(wim.left, 1.0)
self.assertEqual(wim.previous_word_id, 0)
def test_WIM_with_node(self):
mylist = { 'id': '0', 'left': '1.0', 'top': '1.0', 'height': '0', 'width': '0', 'bottom': '0', 'previous-word-id': '0', 'inserted-word-id': '1' }
node = ET.Element(WordInsertionMark.XML_TAG, attrib=mylist)
wim = WordInsertionMark(wim_node=node)
self.assertEqual(wim.id, '0')
self.assertEqual(wim.left, 1.0)
self.assertEqual(wim.previous_word_id, 0)
def test_WIM_attach_object_to_tree(self):
empty_tree= ET.parse(self.reference_file)
for node in empty_tree.xpath('//freehand'):
node.getparent().remove(node)
newWim = WordInsertionMark(x=1.0, y=1.0, previous_word_id=0)
newWim.attach_object_to_tree(empty_tree)
newWim = WordInsertionMark(id=1,x=1.0, y=1.0, previous_word_id=0)
newWim.attach_object_to_tree(empty_tree)
self.assertEqual(len(empty_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG)), 2)
for wim_node in empty_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG):
wim = WordInsertionMark(wim_node=wim_node)
self.assertEqual(wim.left, 1.0)
self.assertEqual(wim.top, 1.0)
self.assertEqual(wim.previous_word_id, 0)
def test_CREATE_WIM(self):
svg_tree = ET.parse(self.test_svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
xmin = 311.8125
ymin = 158.0117
x = 261.865
y = 15.9
wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_tree, namespaces, x=x, y=y, xmin=xmin, ymin=ymin, line_number=1)
self.assertEqual(wim.symbol_id, 'glyph2-1')
+
+ def test_get_semantic_dictionary(self):
+ dictionary = WordInsertionMark.get_semantic_dictionary()
+ self.assertEqual('previous_word_id' in dictionary['properties'].keys(), True)
+
if __name__ == "__main__":
unittest.main()
Index: svgscripts/test_transkription_position.py
===================================================================
--- svgscripts/test_transkription_position.py (revision 27)
+++ svgscripts/test_transkription_position.py (revision 28)
@@ -1,83 +1,87 @@
import unittest
from os import sep, path
from os.path import dirname, isdir, isfile
import lxml.etree as ET
from datatypes.debug_message import DebugMessage
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkription_position import TranskriptionPosition
from datatypes.transkriptionField import TranskriptionField
from datatypes.word_position import WordPosition
class TestTranskriptionPosition(unittest.TestCase):
def setUp(self):
DATADIR = dirname(__file__) + sep + 'test_data'
if not isdir(DATADIR):
DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
self.test_svg_file = DATADIR + sep + 'W_I_8_page125_web.svg'
self.test_xml = DATADIR + sep + 'W_I_8_page125.xml'
self.dir = DATADIR
def test_init(self):
dmsg = DebugMessage(message='test')
word_position = TranskriptionPosition(id=1, height=10, width=10, x=0, y=10, debug_message=dmsg)
self.assertEqual(word_position.tag, WordPosition.TRANSKRIPTION)
self.assertEqual(word_position.id, '1')
self.assertEqual(word_position.debug_message.message, 'test')
self.assertEqual(word_position.height, 10)
self.assertEqual(word_position.top, 10)
self.assertEqual(word_position.bottom, 20)
self.assertEqual(word_position.left, 0)
self.assertEqual(word_position.isOnTranskription(), True)
self.assertEqual(word_position.isOnFaksimile(), False)
def test_attach_object_to_tree(self):
matrix = Matrix('matrix(0 0 0 0 0 0)')
dmsg = DebugMessage(message='test')
pwps = [ PositionalWordPart(text='test') ]
word_position = TranskriptionPosition(id=1, height=10, width=10, x=0, y=10, matrix=matrix, debug_message=dmsg, positional_word_parts=pwps)
empty_tree = ET.ElementTree(ET.Element('page'))
word_position.attach_object_to_tree(empty_tree)
#print(ET.dump(empty_tree.getroot()))
for node in empty_tree.getroot().xpath('//' + word_position.tag):
self.assertEqual(node.get('id'), '1')
self.assertEqual(node.get('bottom'), '20')
self.assertEqual(node.get('transform'), matrix.toString())
self.assertEqual(node.get('writing-process-id'), '-1')
word_position = TranskriptionPosition(node=empty_tree.getroot().find('.//' + word_position.tag))
self.assertEqual(word_position.height, 10)
self.assertEqual(word_position.debug_message is not None, True)
self.assertEqual(word_position.debug_message.message, 'test')
self.assertEqual(len(word_position.positional_word_parts), 1)
def test_CREATE_TRANSKRIPTION_POSITION_LIST(self):
page = Page(xml_source_file=self.test_xml, svg_file=self.test_svg_file)
tf = TranskriptionField(page.svg_file)
word_part_objs = [{'text': 'es', 'class': 'st5 st6', 'x': 258.148, 'y': '8.5' }]
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf)
self.assertEqual(transkription_positions[0].top, 3.829)
self.assertEqual(transkription_positions[0].height, 5.672)
word_part_objs = [{'text': 'Meine', 'class': 'st5 st8', 'x': 8.504, 'y': 70.5 }]
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf)
self.assertEqual(transkription_positions[0].height, 11.11)
self.assertEqual(transkription_positions[0].top, 61.266)
self.assertEqual(transkription_positions[0].bottom, 72.376)
def test_CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(self):
page = Page(xml_source_file=self.test_xml, svg_file=self.test_svg_file)
tf = TranskriptionField(page.svg_file)
word_part_objs = [{'text': 'Meine', 'class': 'st5 st8', 'x': 8.504, 'y': 70.5 }]
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf)
transkription_positions[0].positional_word_parts[2].transform = Matrix('rotate(20)')
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, transkription_positions[0].positional_word_parts)
self.assertEqual(len(transkription_positions), 3)
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, transkription_field=tf)
transkription_positions[0].positional_word_parts[0].style_class = 'st5 st10'
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, transkription_positions[0].positional_word_parts)
self.assertEqual(len(transkription_positions), 2)
+
+ def test_get_semantic_dictionary(self):
+ dictionary = TranskriptionPosition.get_semantic_dictionary()
+ self.assertEqual(TranskriptionPosition.XML_TAG in dictionary['properties'].get('writing_process_id').get('xpath'), True)
if __name__ == "__main__":
unittest.main()
Index: svgscripts/test_extractWordPosition.py
===================================================================
--- svgscripts/test_extractWordPosition.py (revision 27)
+++ svgscripts/test_extractWordPosition.py (revision 28)
@@ -1,195 +1,183 @@
import unittest
import os
from os import sep, path
from os.path import isfile, isdir, dirname
import re
import shutil
import tempfile
import lxml.etree as ET
import extractWordPosition
from myxmlwriter import write_pretty
from datatypes.transkriptionField import TranskriptionField
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.pdf import PDFText
from datatypes.word import Word
from datatypes.lineNumber import LineNumber
from datatypes.word_insertion_mark import WordInsertionMark
class TestExtractor(unittest.TestCase):
def setUp(self):
DATADIR = dirname(__file__) + sep + 'test_data'
self.test_file_find_word = DATADIR + sep + 'test_find_word.xml'
self.test_dir = tempfile.mkdtemp()
self.title = 'ABC 111'
self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)'
self.test_file = DATADIR + sep + 'Mp_XIV_1_mytest_421.svg'
self.xml420 = DATADIR + sep + 'Mp_XIV_1_page420.xml'
self.pdf420 = DATADIR + sep + 'Mp_XIV_1_online_420.pdf'
self.pdf_file = DATADIR + sep + 'W_I_8_page125.pdf'
self.faulty_xml = DATADIR + sep + 'W_I_8_faulty_page125.xml'
self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
def test_main(self):
+ extractWordPosition.Extractor.UNITTESTING = True
argv = ['-d', self.test_dir, '-o', '--title=My Hero', '--page=1', self.test_file]
self.assertEqual(extractWordPosition.main(argv), 0)
def test_get_page_number(self):
extractor = extractWordPosition.Extractor()
self.assertEqual(extractor.get_page_number(self.test_file, page_number='1'), '001')
self.assertEqual(extractor.get_page_number(self.test_file), '421')
def test_get_file_name(self):
extractor = extractWordPosition.Extractor()
self.assertEqual(extractor.get_file_name(self.test_file), 'xml/Mp_XIV_1_mytest_421.xml')
extractor = extractWordPosition.Extractor(title=self.title)
self.assertEqual(extractor.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_')))
extractorA = extractWordPosition.Extractor(title=self.title)
extractorB = extractWordPosition.Extractor(manuscript_file=extractorA.manuscript_file)
self.assertEqual(extractorB.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_')))
def test_get_style(self):
extractor = extractWordPosition.Extractor()
svg_tree = ET.parse(self.test_file)
sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
self.assertEqual(sonderzeichen_list, [ 'st21', 'st23'])
self.assertEqual(style_dict.get('st11').get('font-family'), 'Frutiger-Europeen')
self.assertEqual(style_dict.get('st5').get('stroke'), '#CED5CE')
def test_get_word_from_part_obj(self):
extractor = extractWordPosition.Extractor()
mylist = [{'text': 'a', 'class': 'asdf' }, {'text': 'b', 'endX': 0 }, {'text': 'c'}]
self.assertEqual(extractor.get_word_from_part_obj(mylist), 'abc')
def test_get_bottoms(self):
svg_tree = ET.parse(self.test_file)
extractor = extractWordPosition.Extractor()
mybottoms = extractor.get_bottoms(svg_tree.getroot())
self.assertEqual(mybottoms[0], '57.1914')
self.assertEqual(len(mybottoms), 106)
self.assertEqual(mybottoms[len(mybottoms)-1], '1155.6899')
mybottoms = extractor.get_bottoms(svg_tree.getroot(), from_position=100.0, to_position=800.0)
self.assertEqual(mybottoms[0], '100.5132')
self.assertEqual(len(mybottoms), 84)
self.assertEqual(mybottoms[len(mybottoms)-1], '792.8218')
tf = TranskriptionField(self.test_file)
mybottoms = extractor.get_bottoms(svg_tree.getroot(), transkription_field=tf)
self.assertEqual(mybottoms[0], '91.7134')
self.assertEqual(len(mybottoms), 75)
self.assertEqual(mybottoms[len(mybottoms)-1], '681.7134')
def test_get_text_items(self):
svg_tree = ET.parse(self.test_file)
extractor = extractWordPosition.Extractor()
mytest_items = [ x for x in extractor.get_text_items(svg_tree.getroot()) ]
self.assertEqual(len(mytest_items), 300)
self.assertEqual(mytest_items[0].get('transform'), 'matrix(1 0 0 1 386.8218 57.1914)')
tf = TranskriptionField(self.test_file)
mytest_itemsTF = [ x for x in extractor.get_text_items(svg_tree.getroot(), transkription_field=tf) ]
self.assertEqual(mytest_itemsTF[0].get('transform'), 'matrix(1 0 0 1 204.8618 91.7134)')
def test_init_tree_and_target_file(self):
target_file = 'xml/testA.xml'
page = Page(xml_target_file=target_file, title=self.title)
tree = page.page_tree
self.assertEqual(tree.getroot().get('title'), self.title)
self.assertEqual(tree.getroot().findall('./style'), [])
write_pretty(xml_element_tree=tree, file_name=target_file)
page = Page(xml_target_file=target_file)
tree = page.page_tree
self.assertEqual(tree.getroot().get('title'), self.title)
self.assertEqual(tree.getroot().findall('./style'), [])
isfile(target_file) and os.remove(target_file)
def test_add_style(self):
extractor = extractWordPosition.Extractor()
svg_tree = ET.parse(self.test_file)
sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
target_file = 'xml/testA.xml'
page = Page(xml_target_file=target_file,title=self.title)
page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
write_pretty(xml_element_tree=page.page_tree, file_name=target_file)
fromTarget_xml_tree = ET.parse(target_file)
self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title)
self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23")
self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen')
page = Page(xml_target_file=target_file)
page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
write_pretty(xml_element_tree=page.page_tree, file_name=target_file)
fromTarget_xml_tree = ET.parse(target_file)
self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title)
self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23")
self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen')
isfile(target_file) and os.remove(target_file)
def test_add_word(self):
extractor = extractWordPosition.Extractor()
svg_tree = ET.parse(self.test_file)
mylist = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }]
matrix = Matrix(self.matrix_string)
for dict in mylist:
dict['class'] = 'st22'
dict['x'] = matrix.add2X(0)
dict['y'] = matrix.getY()
target_file = self.test_dir + sep + 'asdfasdf.xml'
page = Page(xml_target_file=target_file)
sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 1)
mylist[1]['text'] = 'A'
mylist[1]['class'] = 'st21'
mylist[1]['x'] = matrix.add2X(1)
self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 2)
extractor.update_and_attach_words2tree(page)
#self.assertEqual(page.word_insertion_marks[0].x, 184.656)
#self.assertEqual(page.word_insertion_marks[0].y, 197.913)
self.assertEqual(page.page_tree.getroot().xpath('//word[@id="1"]')[0].get('text'), 'a')
self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]')[0].get('text'), 'c')
self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('left'), '183.506')
self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('height'), '8.25')
-
- def test_find_inserted_words(self):
- """PAUSED
- """
- """
- reference_tree = ET.parse(self.test_file_find_word)
- extractor = extractWordPosition.Extractor()
- svg_tree = ET.parse(self.test_file)
- page = Page(xml_source_file=self.test_file_find_word)
- for word_insertion in [ WordInsertionMark(wim_node=node) for node in reference_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG()) ]:
- words = extractor.find_inserted_words(page.page_tree, word_insertion)
- self.assertEqual([ str(word.id) for word in words ], [ str(word.id) for word in word_insertion.inserted_words])
- """
def test_extractor(self):
extractor = extractWordPosition.Extractor()
self.assertEqual(extractor.title, None)
self.assertEqual(extractor.manuscript_file, None)
self.assertEqual(extractor.xml_dir, 'xml/')
self.assertEqual(extractor.manuscript_tree, None)
def test_write_title_to_manuscript_file(self):
extractor = extractWordPosition.Extractor(xml_dir=self.test_dir, title=self.title)
self.assertEqual(isfile(extractor.manuscript_file), True)
extractor = extractWordPosition.Extractor(manuscript_file=extractor.manuscript_file)
self.assertEqual(extractor.title, self.title)
def test_extract_line_numbers(self):
svg_tree = ET.parse(self.test_file)
tf = TranskriptionField(self.test_file)
extractor = extractWordPosition.Extractor()
line_numbers = extractor.extract_line_numbers(svg_tree, tf)
self.assertEqual(line_numbers[0].id, 2)
self.assertEqual(len(line_numbers), 24)
self.assertEqual(line_numbers[0].top, 45.163)
def tearDown(self):
isdir(self.test_dir) and shutil.rmtree(self.test_dir)
isfile('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) and os.remove('{}/{}.xml'.format('xml', self.title.replace(' ', '_')))
if __name__ == "__main__":
unittest.main()
Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py (revision 27)
+++ svgscripts/extractWordPosition.py (revision 28)
@@ -1,560 +1,562 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
import inspect
import getopt
from lxml import etree as ET
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from progress.bar import Bar
import re
import sys
from svgpathtools import svg2paths2
import warnings
from myxmlwriter import write_pretty
from datatypes.lineNumber import LineNumber
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.pdf import PDFText
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from datatypes.word_insertion_mark import WordInsertionMark
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Extractor:
"""
This class can be used to extract the word positions in a svg file and write it to a xml file.
Args:
[xml_dir (str): target directory]
[title (str): title of document]
[manuscript_file (str): xml file containing information about the archival unity to which the current page belongs
[extract_transkription_field_only (Boolean): if true extract_word_position will extract word positions only that
are part of the transkription field.
"""
+ UNITTESTING = False
SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]
def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False, compare2pdf=False):
if bool(xml_dir):
self.xml_dir = xml_dir
not isdir(self.xml_dir) and mkdir(self.xml_dir)
else:
self.xml_dir = 'xml' if(isdir('xml')) else ''
self.compare2pdf = compare2pdf
self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
self.title = title
self.manuscript_file = manuscript_file
self.extract_transkription_field_only = extract_transkription_field_only
self.manuscript_tree = None
if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
self.manuscript_tree = ET.parse(self.manuscript_file)
self.title = self.manuscript_tree.getroot().get('title')
elif bool(self.manuscript_file):
raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
elif bool(self.title):
if not bool(self.manuscript_file):
self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml'
if not isfile(self.manuscript_file):
self.manuscript_tree = ET.ElementTree(ET.Element('page', attrib={"title": self.title}))
write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile')
def get_page_number(self, file_name, page_number=None):
""" Returns page number as a string (with leading zero(s) if len(page_number) < 3).
"""
if not bool(page_number) and bool(re.search(r'\d', file_name)):
"""if page_number=None and filename contains digits,
then split filename into its parts that contain only digits, remove empty strings
and return the last part containing only digits.
"""
page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop()
if bool(page_number):
leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else ''
return leading_zeros + str(page_number)
else:
return ''
def get_file_name(self, file_name, page_number=None):
"""Returns the file_name of the target xml file.
"""
dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else ''
if bool(self.title):
return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml'
else:
return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml'))
def get_style(self, etree_root):
"""Returns the style specification as a dictionary.
:returns:
sonderzeichen_list: list of keys for classes that are 'Sonderzeichen'
style_dict: dictionary: key = class name (str), value = style specification (dictionary)
"""
style_dict = {}
sonderzeichen_list = []
letterspacing_list = []
style = etree_root.find('style', etree_root.nsmap)
if style is not None:
for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))):
style_key = style_item.split('{')[0].replace('.', '')
style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','') \
for item in list(filter(lambda x: x!= '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';')))}
style_dict[style_key] = style_value_dict
if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'):
sonderzeichen_list.append(style_key)
if bool(style_value_dict.get('letter-spacing')):
letterspacing_list.append(style_key)
return sonderzeichen_list, letterspacing_list, style_dict
def get_word_from_part_obj(self, word_part_obj):
"""Extracts all 'text' from a list of dicitonaries and concats it to a string.
"""
return ''.join([ dict['text'] for dict in word_part_obj])
def find_inserted_words_by_position(self, target_tree, x, y):
"""Returns an Array with the words that are inserted above the x, y position or [] if not found.
"""
warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.')
MINY = 31.0
MAXY = 10.0
DIFFX = 9.0
if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
result_list = []
minus2left = 20.0
minus2top = 19.0
while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX :
result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ]
minus2left -= 1
minus2top += 1
if len(result_list) > 0:
result_bottom = result_list[len(result_list)-1].bottom
result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)):
result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
result_left_max = result_left_min + DIFFX
if float(item.get('left')) - result_left_max < DIFFX:
result_list.append(Word.CREATE_WORD(item))
else:
break
return result_list
else:
return []
def find_inserted_words(self, target_tree, word_insertion_mark):
"""Returns an Array with the words that are inserted above/underneath the word_insertion_mark.
"""
warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.')
if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1:
return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y)
if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
MINY = 31.0
MAXY = 10.0
DIFFX = 9.0
result_list = []
x = word_insertion_mark.x
y = word_insertion_mark.y
if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line
line_number = word_insertion_mark.line_number - 1
words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@line-number={0}]'.format(line_number)) ]
if len(words_on_line) > 0:
minus2top = 1.0
while len(result_list) == 0 and minus2top < MINY:
for word in words_on_line:
for transkription_position in word.transkription_positions:
if transkription_position.top > y - minus2top\
and transkription_position.left > x - DIFFX\
and transkription_position.left < x + DIFFX:
result_list.append(word)
break
minus2top += 1
elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line
line_number = word_insertion_mark.line_number + 1
words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@line-number={0}]'.format(line_number)) ]
if len(words_on_line) > 0:
plus2top = 1.0
while len(result_list) == 0 and plus2top < MINY :
for word in words_on_line:
for transkription_position in word.transkription_positions:
if transkription_position.top > y + plus2top\
and transkription_position.left > x - DIFFX\
and transkription_position.left < x + DIFFX:
result_list.append(word)
break
plus2top += 1
if len(result_list) > 0: # now, collect more words that are right of already collected words
result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom
result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
+ result_list[len(result_list)-1].transkription_positions[0].width
for item in target_tree.getroot().xpath(\
'//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)):
result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
+ result_list[len(result_list)-1].transkription_positions[0].width
result_left_max = result_left_min + DIFFX
if float(item.get('left')) - result_left_max < DIFFX:
result_list.append(Word.CREATE_WORD(item))
else:
break
return result_list
else:
return []
def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None):
"""Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).
If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.
:returns: the new word counter (int)
"""
break_points = []
if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
for Sonderzeichen in self.SONDERZEICHEN_LIST:
contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
if True in contains_Sonderzeichen:
break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]]
for sz_point in [i for i, e in break_points]:
wim_index = len(page.word_insertion_marks)
x = float(word_part_objs[sz_point]['x'])
y = float(word_part_objs[sz_point]['y'])
if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None:
svg_path_tree = ET.parse(page.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
xmin = transkription_field.xmin
ymin = transkription_field.ymin
wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
page.word_insertion_marks.append(wim)
if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points
THRESHOLDX = 20 # Threshold between line number and text
last_x = -1
for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
if(last_x > -1 and (x - last_x > THRESHOLDX)):
break_points.append((i, i))
last_x = x
if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
from_index = 0
for end_point, next_from_index in break_points:
new_word_part_objs = word_part_objs[from_index:end_point]
new_endX = word_part_objs[end_point]['x']
from_index = next_from_index
index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
if from_index > 0 and from_index < len(word_part_objs):
new_word_part_objs = word_part_objs[from_index:]
index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
return index
else:
if len(word_part_objs) > 0:
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
debug_msg_string=debug_msg, transkription_field=transkription_field)
text = self.get_word_from_part_obj(word_part_objs)
line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
#newWord = Word.CREATE_WORD(page=page, word_part_objs=word_part_objs, id=index, endX=endX, endSign=endSign, matrix=matrix, debug_msg=debug_msg)
#newWord.attach_word_to_tree(page.page_tree) -> now we attach all words with update_and_attach_words2tree()
page.words.append(newWord)
return int(index) + 1
else:
return int(index)
def get_bottoms(self, tree_root, from_position=-1.0, to_position=-1.0, transkription_field=None):
"""Returns all unique bottom values (Float) as a sorted list.
"""
bottom_list = sorted(set(item.get('transform').split(' ')[5].replace(')','') for item in tree_root.findall(".//text", tree_root.nsmap)), key=float)
if transkription_field is not None:
from_position = transkription_field.ymin
to_position = transkription_field.ymax
if (from_position > 0.0 and to_position > 0.0):
return [ item for item in filter(lambda x: float(x) > from_position and float(x) < to_position, bottom_list) ]
else:
return bottom_list
def get_text_items(self, tree_root, transkription_field=None):
"""Returns all text elements with a matrix or (if transkription_field is specified)
all text elements that are located inside the transkription field.
"""
if transkription_field is not None:
return filter(lambda x: Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\
tree_root.iterfind(".//text", tree_root.nsmap))
else:
return tree_root.iterfind(".//text", tree_root.nsmap)
def extract_line_numbers(self, svg_tree, transkription_field):
"""Extracts line numbers and write them to a xml file.
"""
nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
line_numbers = [ LineNumber(raw_text_node=item, transkription_field=transkription_field)\
for item in filter(lambda x: LineNumber.IS_A_LINE_NUMBER(x), nodes_near_tf)]
if len(line_numbers) > 0:
MINABOVE = 3
last_to_position = transkription_field.ymin
for line_number in line_numbers:
above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE
bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom)
last_to_position = above_current_line_bottom
if len(bottoms) > 0:
current_line_top = float(bottoms[len(bottoms)-1]) - transkription_field.ymin + MINABOVE
line_number.setTop(current_line_top)
return line_numbers
def get_word_object_multi_char_x(self, word_part_obj_dict):
"""Returns the x of the last char of word_part_object.
TODO: get real widths from svg_file!!!
"""
WIDTHFACTOR = 2.6
return word_part_obj_dict['x'] if len(word_part_obj_dict['text']) < 2 else word_part_obj_dict['x'] + len(word_part_obj_dict['text']) * WIDTHFACTOR
def extract_word_position(self, svg_tree, page, transkription_field=None):
"""Extracts word positions.
"""
counter = 0
word_part_obj = []
endSign = '%'
last_matrix = None
MAXBOTTOMDIFF = 5
MAXXDIFF = 6
- bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
+ if not Extractor.UNITTESTING:
+ bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
# check for line breaks
if (last_matrix is not None and len(word_part_obj) > 0 and (\
Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
endSign = '%'
if(self.get_word_from_part_obj(word_part_obj) != ''):
debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
word_part_obj = []
endX = current_matrix.getX()
if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: