Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py (revision 111)
+++ svgscripts/extractWordPosition.py (revision 112)
@@ -1,710 +1,711 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
import inspect
import getopt
from lxml import etree as ET
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from progress.bar import Bar
import re
import sys
import warnings
from datatypes.lineNumber import LineNumber
from datatypes.matrix import Matrix
from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.pdf import PDFText
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from datatypes.word_insertion_mark import WordInsertionMark
from util import process_warnings4status, reset_tp_with_matrix
sys.path.append('shared_util')
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Extractor:
"""
This class can be used to extract the word positions in an svg file and write them to an xml file.
Args:
[xml_dir (str): target directory]
[title (str): title of document]
[manuscript_file (str): xml file containing information about the archival unit to which the current page belongs]
"""
UNITTESTING = False
SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]
SET_POSITIONS_TO_TEXTFIELD_0_0 = False
def __init__(self, xml_dir=None, title=None, manuscript_file=None, compare2pdf=False):
if bool(xml_dir):
self.xml_dir = xml_dir
not isdir(self.xml_dir) and mkdir(self.xml_dir)
else:
self.xml_dir = 'xml' if(isdir('xml')) else ''
self.latest_status = None
self.compare2pdf = compare2pdf
self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
self.title = title
self.manuscript_file = manuscript_file
self.manuscript_tree = None
self.svg_tree = None
if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
self.manuscript_tree = ET.parse(self.manuscript_file)
self.title = self.manuscript_tree.getroot().get('title')
elif bool(self.manuscript_file):
raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
elif bool(self.title):
self.update_title_and_manuscript(self.title, False)
def _get_pwps_break_points(self, page, pwps) ->list:
"""Return a list of break points from word_part_objs.
"""
break_points = []
if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
for Sonderzeichen in self.SONDERZEICHEN_LIST:
- sonderzeichen_pwps = [ pwp for pwp in pwps if pwp.text == Sonderzeichen and any(sz in pwp.style_class for sz in page.sonderzeichen_list) ]
- if len(sonderzeichen_pwps) > 0:
- break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(sonderzeichen_pwps) ]]
- for pwp in sonderzeichen_pwps:
+ sonderzeichen_break_points = [ (i, pwp) for i, pwp in enumerate(pwps) if pwp.text == Sonderzeichen and any(sz in pwp.style_class for sz in page.sonderzeichen_list) ]
+ if len(sonderzeichen_break_points) > 0:
+ #break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(sonderzeichen_pwps) ]]
+ for i, pwp in sonderzeichen_break_points:
+ break_points.append((i, i+1))
wim_index = len(page.word_insertion_marks)
wim = WordInsertionMark(id=wim_index, x=pwp.left, y=pwp.top-pwp.height, height=pwp.height, width=pwp.width,\
line_number=page.get_line_number(pwp.top-pwp.height-1), mark_type=Sonderzeichen)
page.word_insertion_marks.append(wim)
if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_pwps(pwps)))): # case: digits from line number and chars from words -> create break points
THRESHOLDX = 20 # Threshold between line number and text
last_x = -1
for i, x in enumerate([float(pwp.left) for pwp in pwps]):
if(last_x > -1 and (x - last_x > THRESHOLDX)):
break_points.append((i, i))
last_x = x
return break_points
def _get_break_points(self, page, word_part_objs, transkription_field=None) ->list:
"""Return a list of break points from word_part_objs.
"""
break_points = []
if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
for Sonderzeichen in self.SONDERZEICHEN_LIST:
contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
if True in contains_Sonderzeichen:
break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]]
for sz_point in [i for i, e in break_points]:
wim_index = len(page.word_insertion_marks)
x = float(word_part_objs[sz_point]['x'])
y = float(word_part_objs[sz_point]['y'])
if page.svg_file is not None and isfile(page.svg_file)\
and (not self.SET_POSITIONS_TO_TEXTFIELD_0_0 or transkription_field is not None):
svg_path_tree = ET.parse(page.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
xmin = 0 if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field.xmin
ymin = 0 if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field.ymin
wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
page.word_insertion_marks.append(wim)
if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points
THRESHOLDX = 20 # Threshold between line number and text
last_x = -1
for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
if(last_x > -1 and (x - last_x > THRESHOLDX)):
break_points.append((i, i))
last_x = x
return break_points
def _process_pwps_break_points(self, break_points, page, index, pwps) ->int:
"""Process break points on pwps and return new index.
"""
from_index = 0
debug_msg = 'process break points'
for end_point, next_from_index in break_points:
new_pwps = pwps[from_index:end_point]
from_index = next_from_index
index = self.create_word_from_pwps(page, index, new_pwps, debug_msg=debug_msg)
if from_index > 0 and from_index < len(pwps):
new_pwps = pwps[from_index:]
index = self.create_word_from_pwps(page, index, new_pwps, debug_msg=debug_msg + ' ... end point')
if len(page.words) > 1\
and re.match(r'[^\w\s]', page.words[-1].text):
last_word = page.words.pop()
page.words[-1].join(last_word)
return last_word.id
return index
def _process_break_points(self, break_points, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None) ->int:
"""Process break points on word_part_objs and return new index.
"""
from_index = 0
for end_point, next_from_index in break_points:
new_word_part_objs = word_part_objs[from_index:end_point]
new_endX = word_part_objs[end_point]['x']
from_index = next_from_index
index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
if from_index > 0 and from_index < len(word_part_objs):
new_word_part_objs = word_part_objs[from_index:]
index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
return index
def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None) ->int:
"""Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).
If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.
:returns: the new word counter (int)
"""
break_points = self._get_break_points(page, word_part_objs, transkription_field=transkription_field)
if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
return self._process_break_points(break_points, page, index, word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
else:
if len(word_part_objs) > 0:
provide_tf = None if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
debug_msg_string=debug_msg, transkription_field=provide_tf, svg_path_tree=self.svg_tree)
text = self.get_word_from_part_obj(word_part_objs)
line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
if line_number == -1:
if transkription_positions[0].transform is not None:
line_number = page.get_line_number(transkription_positions[0].transform.getY())
if line_number == -1 and len(page.words) > 0:
lastWord = page.words[-1]
lastWord_lastTP = lastWord.transkription_positions[-1]
lastTP = transkription_positions[-1]
if transkription_positions[0].left > lastWord_lastTP.left\
and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2:
line_number = lastWord.line_number
else:
line_number = lastWord.line_number+1
#reset_tp_with_matrix(transkription_positions)
newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
page.words.append(newWord)
return int(index) + 1
else:
return int(index)
def create_word_from_pwps(self, page, index, pwps, debug_msg=None) ->int:
"""Creates transkription_positions and a new word from pwps (i.e. a list of PositionalWordPart).
If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, pwps will be split and several words are created.
:returns: the new word counter (int)
"""
break_points = self._get_pwps_break_points(page, pwps)
if(len(break_points) > 0): # if there are break points -> split pwps and add the corresponding words
return self._process_pwps_break_points(break_points, page, index, pwps)
else:
if len(pwps) > 0:
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps, debug_msg_string=debug_msg)
text = self.get_word_from_pwps(pwps)
line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
if line_number == -1:
if transkription_positions[0].transform is not None:
line_number = page.get_line_number(transkription_positions[0].transform.getY())
if line_number == -1 and len(page.words) > 0:
lastWord = page.words[-1]
lastWord_lastTP = lastWord.transkription_positions[-1]
lastTP = transkription_positions[-1]
if transkription_positions[0].left > lastWord_lastTP.left\
and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2:
line_number = lastWord.line_number
else:
line_number = lastWord.line_number+1
#reset_tp_with_matrix(transkription_positions)
newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
page.words.append(newWord)
return int(index) + 1
else:
return int(index)
def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default', multipage_index=-1, marginals_page=None):
"""Extracts information about positions of text elements and writes them to a xml file.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
exit_status = 0
with warnings.catch_warnings(record=record_warnings) as w:
warnings.simplefilter(warning_filter)
page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile,\
multipage_index=multipage_index, marginals_page=marginals_page)
status_message = process_warnings4status(w, [ PageCreator.WARNING_MISSING_USE_NODE4PWP, PageCreator.WARNING_MISSING_GLYPH_ID4WIM ],\
'', 'OK', 'with warnings')
if status_message != 'OK':
self.latest_status = status_message
exit_status = 1
else:
self.latest_status = None
page.page_tree.getroot().set('status', status_message)
write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
return exit_status
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, multipage_index=-1, marginals_page=None) -> PageCreator:
"""Extracts information about positions of text elements.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
transkription_field = TranskriptionField(file_name, multipage_index=multipage_index)
text_field = transkription_field.convert_to_text_field()
self.svg_tree = ET.parse(file_name)
page = PageCreator(xml_target_file, title=self.title, multipage_index=multipage_index,\
page_number=page_number, pdfFile=pdfFile, svg_file=svg_file,\
svg_text_field=text_field, source=file_name, marginals_source=marginals_page)
sonderzeichen_list, letterspacing_list, style_dict = self.get_style(self.svg_tree.getroot())
page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
page.init_line_numbers(LineNumber.extract_line_numbers(self.svg_tree, transkription_field, set_to_text_field_zero=self.SET_POSITIONS_TO_TEXTFIELD_0_0),\
transkription_field.ymax)
self.improved_extract_word_position(self.svg_tree, page, transkription_field=transkription_field)
page.create_writing_processes_and_attach2tree()
page.update_and_attach_words2tree()
for word_insertion_mark in page.word_insertion_marks:
# it is not clear if we really need to know this alternative word ordering. See 'TODO.md'
#word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark)
word_insertion_mark.attach_object_to_tree(page.page_tree)
return page
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def improved_extract_word_position(self, svg_tree, page, transkription_field=None):
"""Extracts word positions.
"""
if page.svg_file is None or not isfile(page.svg_file):
warnings.warn('There is no page.svg_file or it does not exist ... using old function "extract_word_position"!')
self.extract_word_position(svg_tree, page, transkription_field=transkription_field)
else:
svg_path_tree = ET.parse(page.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
counter = 0
word_part_obj = []
pwps = []
endSign = '%'
last_matrix = None
MAXBOTTOMDIFF = 5
MAXXDIFF = 11
INTERCHARSPACE = 1.1
if not Extractor.UNITTESTING:
bar = Bar('(improved) extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
current_matrix = Matrix(text_item.get('transform'))
# check for line breaks
if last_matrix is not None and len(pwps) > 0 and (\
(current_matrix.getX() > pwps[-1].left+pwps[-1].width + INTERCHARSPACE or last_matrix.getX()-current_matrix.getX() > MAXXDIFF) or\
(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF)):
endSign = '%'
if(self.get_word_from_pwps(pwps) != ''):
debug_msg = 'check for line breaks, diffx: {}, diffy: {}, current_matrix: {}, last_matrix: {}'.format(\
round(current_matrix.getX() - (pwps[-1].left+pwps[-1].width), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
current_matrix.toString(), last_matrix.toString())
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg=debug_msg)
pwps = []
endX = current_matrix.getX()
if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT
if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))):
x = current_matrix.getX() if not current_matrix.isRotationMatrix() else 0.0
y = current_matrix.getY() if not current_matrix.isRotationMatrix() else 0.0
pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST({ "text": text_item.text, "x": x, "y": y, "class": text_item.get('class'), "matrix": current_matrix},\
svg_path_tree, namespaces, page=page)
else:
if(self.get_word_from_pwps(pwps) != ''):
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="text: next string empty")
pwps = []
for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT
endX = current_matrix.add2X(tspan_item.get('x'))
if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))):
y = current_matrix.add2Y(tspan_item.get('y'))
pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST({ "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'),\
"matrix": current_matrix }, svg_path_tree, namespaces, page=page)
if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0:
"""text_item has letterspacing class
(set s & set t = new set with elements common to s and t)
"""
if(self.get_word_from_pwps(pwps) != ''):
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="letterspacing class")
pwps = []
else:
if(self.get_word_from_pwps(pwps) != ''):
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="tspan: next string empty")
pwps = []
last_matrix = current_matrix
not bool(Extractor.UNITTESTING) and bar.next()
if(self.get_word_from_pwps(pwps) != ''):
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg='end of loop')
pwps = []
not bool(Extractor.UNITTESTING) and bar.finish()
def extract_word_position(self, svg_tree, page, transkription_field=None):
"""Extracts word positions.
"""
counter = 0
word_part_obj = []
endSign = '%'
last_matrix = None
MAXBOTTOMDIFF = 5
MAXXDIFF = 6
if not Extractor.UNITTESTING:
bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
provide_tf = None if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field
current_matrix = Matrix(text_item.get('transform'), transkription_field=provide_tf)
# check for line breaks
if (last_matrix is not None and len(word_part_obj) > 0 and (\
Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
endSign = '%'
if(self.get_word_from_part_obj(word_part_obj) != ''):
debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
word_part_obj = []
endX = current_matrix.getX()
if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT
if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))):
word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} )
else:
endSign = text_item.text
if(self.get_word_from_part_obj(word_part_obj) != ''):
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field)
word_part_obj = []
endSign = '%'
for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT
endX = current_matrix.add2X(tspan_item.get('x'))
if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))):
y = current_matrix.add2Y(tspan_item.get('y'))
word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix })
if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0:
"""text_item has letterspacing class
(set s & set t = new set with elements common to s and t)
"""
endSign = '%'
if(self.get_word_from_part_obj(word_part_obj) != ''):
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
debug_msg='tspan with letterspacing', transkription_field=transkription_field)
word_part_obj = []
else:
endSign = tspan_item.text
if(self.get_word_from_part_obj(word_part_obj) != ''):
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
debug_msg='svg/text/tspan/\s', transkription_field=transkription_field)
word_part_obj = []
endSign = '%'
last_matrix = current_matrix
not bool(Extractor.UNITTESTING) and bar.next()
if(self.get_word_from_part_obj(word_part_obj) != ''):
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\
transkription_field=transkription_field)
word_part_obj = []
endSign = '%'
not bool(Extractor.UNITTESTING) and bar.finish()
def find_inserted_words_by_position(self, target_tree, x, y):
"""Returns an Array with the words that are inserted above the x, y position or [] if not found.
"""
warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.')
MINY = 31.0
MAXY = 10.0
DIFFX = 9.0
if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
result_list = []
minus2left = 20.0
minus2top = 19.0
while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX :
result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ]
minus2left -= 1
minus2top += 1
if len(result_list) > 0:
result_bottom = result_list[len(result_list)-1].bottom
result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)):
result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
result_left_max = result_left_min + DIFFX
if float(item.get('left')) - result_left_max < DIFFX:
result_list.append(Word.CREATE_WORD(item))
else:
break
return result_list
else:
return []
def find_inserted_words(self, target_tree, word_insertion_mark):
"""Returns an Array with the words that are inserted above/underneath the word_insertion_mark.
"""
warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.')
if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1:
return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y)
if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
MINY = 31.0
MAXY = 10.0
DIFFX = 9.0
result_list = []
x = word_insertion_mark.x
y = word_insertion_mark.y
if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line
line_number = word_insertion_mark.line_number - 1
words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@line-number={0}]'.format(line_number)) ]
if len(words_on_line) > 0:
minus2top = 1.0
while len(result_list) == 0 and minus2top < MINY:
for word in words_on_line:
for transkription_position in word.transkription_positions:
if transkription_position.top > y - minus2top\
and transkription_position.left > x - DIFFX\
and transkription_position.left < x + DIFFX:
result_list.append(word)
break
minus2top += 1
elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line
line_number = word_insertion_mark.line_number + 1
words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@line-number={0}]'.format(line_number)) ]
if len(words_on_line) > 0:
plus2top = 1.0
while len(result_list) == 0 and plus2top < MINY :
for word in words_on_line:
for transkription_position in word.transkription_positions:
if transkription_position.top > y + plus2top\
and transkription_position.left > x - DIFFX\
and transkription_position.left < x + DIFFX:
result_list.append(word)
break
plus2top += 1
if len(result_list) > 0: # now, collect more words that are right of already collected words
result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom
result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
+ result_list[len(result_list)-1].transkription_positions[0].width
for item in target_tree.getroot().xpath(\
'//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)):
result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
+ result_list[len(result_list)-1].transkription_positions[0].width
result_left_max = result_left_min + DIFFX
if float(item.get('left')) - result_left_max < DIFFX:
result_list.append(Word.CREATE_WORD(item))
else:
break
return result_list
else:
return []
def get_file_name(self, file_name, page_number=None):
"""Returns the file_name of the target xml file.
"""
dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else ''
if bool(self.title):
return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml'
else:
return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml'))
def get_page_number(self, file_name, page_number=None):
""" Returns page number as a string (with leading zero(s) if len(page_number) < 3).
"""
if not bool(page_number) and bool(re.search(r'\d', file_name)):
"""if page_number=None and filename contains digits,
then split filename into its parts that contain only digits, remove empty strings
and return the last part containing only digits.
"""
page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop()
if bool(page_number):
leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else ''
return leading_zeros + str(page_number)
else:
return ''
def get_style(self, etree_root):
"""Returns the style specification as a dictionary.
:returns:
sonderzeichen_list: list of keys for classes that are 'Sonderzeichen'
style_dict: dictionary: key = class name (str), value = style specification (dictionary)
"""
style_dict = {}
sonderzeichen_list = []
letterspacing_list = []
style = etree_root.find('style', etree_root.nsmap)
if style is not None:
for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))):
style_key = style_item.split('{')[0].replace('.', '')
style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','') \
for item in list(filter(lambda x: x!= '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';')))}
style_dict[style_key] = style_value_dict
if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'):
sonderzeichen_list.append(style_key)
if bool(style_value_dict.get('letter-spacing')):
letterspacing_list.append(style_key)
return sonderzeichen_list, letterspacing_list, style_dict
def get_text_items(self, tree_root, transkription_field=None):
"""Returns all text elements with a matrix or (if transkription_field is specified)
all text elements that are located inside the transkription field.
"""
if transkription_field is not None:
return filter(lambda x: Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=x),\
tree_root.iterfind(".//text", tree_root.nsmap))
else:
return tree_root.iterfind(".//text", tree_root.nsmap)
def get_word_from_pwps(self, pwps):
"""Extracts all 'text' from a list of dicitonaries and concats it to a string.
"""
return ''.join([ pwp.text for pwp in pwps ])
def get_word_from_part_obj(self, word_part_obj):
"""Extracts all 'text' from a list of dicitonaries and concats it to a string.
"""
return ''.join([ dict['text'] for dict in word_part_obj])
def get_word_object_multi_char_x(self, word_part_obj_dict):
"""Returns the x of the last char of word_part_object.
TODO: get real widths from svg_file!!!
"""
WIDTHFACTOR = 2.6
return word_part_obj_dict['x'] if len(word_part_obj_dict['text']) < 2 else word_part_obj_dict['x'] + len(word_part_obj_dict['text']) * WIDTHFACTOR
def update_title_and_manuscript(self, title, update_manuscript=True):
"""Updates title and manuscript.
"""
self.title = title
if update_manuscript or not bool(self.manuscript_file):
self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml'
if not isfile(self.manuscript_file):
self.manuscript_tree = ET.ElementTree(ET.Element('manuscript', attrib={"title": self.title}))
write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile')
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to extract the position of the words in a svg file and write them to a xml file.
svgscripts/extractWordPosition.py [OPTIONS] <file|directory>
<file>: svg file OR xml target file containing file name of svg file as "/page/@source".
<directory>: directory containing svg files
OPTIONS:
-h|--help: show help
-c|--compare-to-pdf compare words to pdf and autocorrect
-d|--xml-dir=xmlDir: target directory for the xml output file(s)
-m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s)
-p|--page=pageNumber: page number of the current page. For use with _one_ file only.
-P|--PDF=pdfFile: pdf file - used for word correction
-s|--svg=svgFile: svg web file
-t|--title=title: title of the manuscript to which the current page(s) belong(s)
-x|--xml-target-file=xmlOutputFile: xml target file
:return: exit code (int)
"""
compare2pdf = True
manuscript_file = None
page_number = None
pdfFile = None
svg_file = None
title = None
xml_target_file = None
xml_dir = ".{}xml".format(sep)
try:
opts, args = getopt.getopt(argv, "hcd:m:t:p:s:x:P:", ["help", "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help') or not args:
usage()
return 0
elif opt in ('-c', '--compare-to-pdf'):
compare2pdf = True
elif opt in ('-d', '--xml-dir'):
xml_dir = arg
elif opt in ('-m', '--manuscript-file'):
manuscript_file = arg
elif opt in ('-t', '--title'):
title = arg
elif opt in ('-p', '--page'):
page_number = str(arg)
elif opt in ('-s', '--svg'):
svg_file = arg
elif opt in ('-P', '--PDF'):
pdfFile = arg
elif opt in ('-x', '--xml-target-file'):
xml_target_file = str(arg)
files_to_process = list()
for arg in args:
if isfile(arg):
files_to_process.append(arg)
elif isdir(arg):
files_to_process = files_to_process + list(filter(lambda file: '.svg' in file, listdir(arg)))
else:
print("'{}' does not exist!".format(arg))
return 2
if len(files_to_process) < 1 or args[0].endswith('xml'):
if xml_target_file is None:
xml_target_file = args[0] if len(args) > 0 else None
if xml_target_file is not None and isfile(xml_target_file):
target_file_tree = ET.parse(xml_target_file)
file_name = target_file_tree.getroot().get('source')
title = target_file_tree.getroot().get('title') if title is None else title
page_number = target_file_tree.getroot().get('number') if page_number is None else page_number
if svg_file is None:
if len(target_file_tree.xpath('//svg-image')) > 0:
svg_file = target_file_tree.xpath('.//svg-image/@file-name')[0]\
if len(target_file_tree.xpath('.//svg-image/@file-name')) > 0 else None
else:
svg_file = target_file_tree.xpath('.//svg/@file')[0]\
if len(target_file_tree.xpath('.//svg/@file')) > 0 else None
files_to_process.insert(0, file_name)
if xml_target_file in files_to_process:
files_to_process.remove(xml_target_file)
else:
usage()
return 2
if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)):
print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!")
usage()
return 2
extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, compare2pdf=compare2pdf)
for file in files_to_process:
extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file)
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
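
A minimal usage sketch for the extractor above, assuming the script is run from the svgscripts directory so that extractWordPosition and the datatypes package are importable; the paths and title are hypothetical, and only the constructor and extractAndWriteInformation signatures shown in this diff are relied on:

    from extractWordPosition import Extractor

    # Hypothetical target directory, title and input file.
    extractor = Extractor(xml_dir='xml', title='Mp XV', compare2pdf=False)
    # Returns 0 on success, 1 if the extraction finished with warnings
    # (the warning summary is then kept in extractor.latest_status).
    exit_status = extractor.extractAndWriteInformation('svg/Mp_XV_page75.svg',
                                                       page_number='75',
                                                       record_warnings=True)
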
Index: svgscripts/datatypes/footnotes.py
===================================================================
--- svgscripts/datatypes/footnotes.py (revision 111)
+++ svgscripts/datatypes/footnotes.py (revision 112)
@@ -1,347 +1,358 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract footnotes from a svg file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
import warnings
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from .atypical_writing import AtypicalWriting
from .clarification import Clarification
from .editor_correction import EditorCorrection
from .line_continuation import LineContinuation
from .matrix import Matrix
from .standoff_tag import StandoffTag
from .text import Text
from .transkriptionField import TranskriptionField
from .uncertain_decipherment import UncertainDecipherment
UNITTESTING = False
DEBUG = False
class FootnoteColumns:
"""This class represents footnote columns.
"""
REFERENCE_PATTERN = re.compile('.*(\d+-)*[0-9]+:')
EXTENDED_REFERENCE_PATTERN = re.compile('.*(\d+(-|/))*[0-9]+:')
REFERENCE_GROUP = re.compile('(.*\D)((\d+-)*[0-9]+:)')
EXCEPTION = re.compile('((\d+/)+[0-9]+:)')
def __init__(self, nsmap, nodes, bottom_values, style_dict, debug=False, skip_after=-1.0):
self.bottom_values = bottom_values
self.footnote_columns = []
self.footnote_keys = {}
self.index = 0
self.nodes = nodes
self.nsmap = nsmap
self.skip_after = skip_after
self.style_dict = style_dict
self.debug = debug
self._init_columns()
def _init_columns(self):
"""Initialize footnote column positions
by creating lists in self.footnote_columns and adding the positions as keys
to self.footnote_keys, with the corresponding indices of self.footnote_columns as their values.
"""
first_line_fn_nodes = sorted([ item for item in self.nodes\
if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == round(self.bottom_values[0], 1)\
and Matrix(transform_matrix_string=item.get('transform')).getX() > self.skip_after],\
key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX())
+ if self.debug:
+ print(self.bottom_values[0])
current_nodes = []
for node in first_line_fn_nodes:
matrix = Matrix(transform_matrix_string=node.get('transform'))
if len(node.getchildren()) > 0:
for tspan in node.findall('tspan', self.nsmap):
x = matrix.add2X(float(tspan.get('x')))
current_nodes.append({ 'x': x, 'text': tspan.text })
elif node.text is not None:
x = matrix.getX()
current_nodes.append({ 'x': x, 'text': node.text })
if re.match(self.EXTENDED_REFERENCE_PATTERN,\
''.join([ item.get('text') for item in current_nodes])):
current_nodes = self._remove_unused_texts(current_nodes)
self.footnote_columns.append([])
self.footnote_keys.update({ round(current_nodes[0].get('x')): len(self.footnote_columns)-1 })
current_nodes = []
if len(self.footnote_keys) == 0:
raise Exception(f'ERROR: there are no footnote_keys')
def _remove_unused_texts(self, nodes):
"""Remove tspan that contain text that is not a line reference.
"""
threshold = 100
node_text = ''.join([ item.get('text') for item in nodes])
match = re.match(self.REFERENCE_GROUP, node_text)
if match is not None and match.group(1) is not None\
and not re.match(self.EXCEPTION, node_text):
unused_text = ''
index = 0
for item in nodes:
unused_text += item.get('text')
if match.group(1).startswith(unused_text):
index += 1
else:
break
if len(nodes) > index+1:
counter = 0
has_gap = False
for item in nodes[index:]:
if len(nodes) > index+counter+1\
and nodes[index+counter+1].get('x')-nodes[index+counter].get('x') > threshold:
index += counter+1
has_gap = True
break
counter += 1
if has_gap:
return nodes[index+1:]
return nodes[index:]
return nodes
def append(self, footnote):
"""Append footnote to a column
"""
self.footnote_columns[self.index].append(footnote)
@classmethod
def create_cls(cls, style_dict=None, page=None, transkription_field=None, svg_tree=None, svg_file=None, marginals_on_extra_page=False, skip_after=-1.0):
"""Returns all footnotes as a list of Text.
"""
if page is not None and page.source is not None and svg_file is None:
svg_file = page.source\
if page.marginals_source is None\
else page.marginals_source
if transkription_field is None and svg_file is not None:
multipage_index = -1\
if page is None\
else page.multipage_index
transkription_field = TranskriptionField(svg_file, multipage_index=multipage_index)
if svg_tree is None and svg_file is not None:
svg_tree = ET.parse(svg_file)
if style_dict is None and page is not None:
style_dict = StandoffTag.create_relevant_style_dictionary(page)
if page is not None and page.marginals_source is not None:
marginals_on_extra_page = True
svg_tree = ET.parse(page.marginals_source)
nodes_in_footnote_area = cls.EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field, marginals_on_extra_page=marginals_on_extra_page)
bottom_values = cls.GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area)
if len(bottom_values) == 0:
return None
else:
- return cls(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, style_dict, skip_after=skip_after)
+ fc = None
+ tries = 10 if len(bottom_values) > 3 else len(bottom_values)
+ while fc is None and tries > 0:
+ try:
+ fc = cls(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, style_dict, skip_after=skip_after)
+ except:
+ print(bottom_values)
+ bottom_values = bottom_values[1:]
+ tries = tries-1
+ return fc
def extract_footnotes(self, contains_string='', contains_strings=None) -> list:
"""Returns all footnotes as a list of Text.
"""
left_value = -1
for bottom_value in self.bottom_values:
nodes_on_line = sorted([ item for item in self.nodes\
if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == bottom_value\
and Matrix(transform_matrix_string=item.get('transform')).getX() > self.skip_after\
],\
key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
footnote = None
matrix = None
for node in nodes_on_line:
matrix = Matrix(transform_matrix_string=node.get('transform'))
footnote, left_value = self._process_content_and_markup(node, footnote, matrix)
if footnote is not None:
self.append(footnote)
footnotes = self.toList()
if contains_strings is not None:
footnotes = [ footnote for footnote in footnotes if True in [ contains_string in footnote.content for contains_string in contains_strings] ]
if contains_string != '':
footnotes = [ footnote for footnote in footnotes if contains_string in footnote.content ]
return footnotes
def get_index(self, left_value) -> int:
"""Return index of column for left value.
"""
index = -1
if round(left_value) in self.footnote_keys.keys():
index = self.footnote_keys[round(left_value)]
else:
for key, value in self.footnote_keys.items():
if abs(key - round(left_value)) < 2:
index = value
break
return index
def register_index(self, left_value):
"""Register index for next column to be used.
"""
index = self.get_index(left_value)
if index > -1:
self.index = index
else:
error_value = round(left_value)
msg = f'Left value not part of columns: {error_value} -> {self.footnote_keys}'
raise Exception(msg)
def toList(self):
"""Return footnotes as a list of Text.
"""
footnotes = []
for footnote_list in self.footnote_columns:
for footnote in footnote_list:
if re.match(self.REFERENCE_PATTERN, footnote.content):
footnotes.append(footnote)
elif len(footnotes) > 0:
footnotes[-1].join(footnote)
else:
print([ footnote.content for footnote in self.footnote_columns[1]])
print(self.footnote_keys)
raise Exception(f'List of footnotes empty and footnote "{footnote.content}" does not match {self.REFERENCE_PATTERN.pattern}!')
return footnotes
def _process_content_and_markup(self, node, footnote, matrix):
"""Process content and markup of node.
[:return:] (footnote: Text, left_value: float)
"""
startIndex = 0
next_text = node.text
left_value = matrix.getX()
items = [ item for item in node.findall('tspan', self.nsmap)]
if len(items) > 0:
next_text = ''.join([ item.text for item in items])
left_value = matrix.add2X(float(items[0].get('x')))
elif bool(node.get('x')):
left_value = matrix.add2X(float(node.get('x')))
if footnote != None and\
((re.match(r'.*[0-9]+:', next_text)\
and re.match(r'.*[0-9]+:', footnote.content)\
and not re.match(r'.*\d-', footnote.content))\
or (self.get_index(left_value) > -1\
and self.get_index(left_value) != self.index)):
if DEBUG and re.match(r'.*[0-9]+:', next_text)\
and not re.match(r'.*[0-9]+:', footnote.content):
print(footnote, next_text)
self.append(footnote)
footnote = None
if len(items) > 0:
for item in items:
footnote, left_value = self._process_content_and_markup(item, footnote, matrix)
else:
if footnote is None:
footnote = Text(content=next_text)
try:
self.register_index(left_value)
except Exception:
print(self.footnote_columns)
raise Exception(f'{footnote}')
else:
startIndex = footnote.append(next_text)
if bool(node.get('class')):
standoff_markups = StandoffTag.create_cls(startIndex, len(footnote.content)-1, node.get('class'), style_dict=self.style_dict)
if len(standoff_markups) > 0:
if len(footnote.standoff_markups) > 0:
standoff_markups = footnote.standoff_markups[-1].join_list(standoff_markups)
if len(standoff_markups) > 0:
footnote.standoff_markups += standoff_markups
return footnote, left_value
@staticmethod
def EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field=None, marginals_on_extra_page=False) ->list:
"""Return a list of nodes that are in footnote area.
"""
if transkription_field is None and svg_tree is not None:
transkription_field = TranskriptionField(svg_tree.docinfo.URL)
nodes_in_footnote_area = [ item for item in filter(lambda node: Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, transkription_field,\
marginals_on_extra_page=marginals_on_extra_page),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
for node in nodes_in_footnote_area:
if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, marginals_on_extra_page=marginals_on_extra_page):
for child in node.getchildren():
if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, x=float(child.get('x')), marginals_on_extra_page=marginals_on_extra_page):
node.remove(child)
return nodes_in_footnote_area
@staticmethod
def GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area) ->list:
"""Return sorted list of unique bottom values.
"""
return sorted([ bottom_value for bottom_value in set(round(Matrix(transform_matrix_string=item.get('transform')).getY(),1) for item in nodes_in_footnote_area) ])
def extract_footnotes_as_strings(transkription_field=None, svg_tree=None, svg_file=None, contains_string='', marginals_extra=False):
"""Returns all footnotes as a list of strings.
"""
if transkription_field is None and svg_file is not None:
transkription_field = TranskriptionField(svg_file)
if svg_tree is None and svg_file is not None:
svg_tree = ET.parse(svg_file)
footnotes = []
nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ])
for bottom_value in bottom_values:
nodes_on_line = [ item for item in nodes_in_footnote_area if Matrix(transform_matrix_string=item.get('transform')).getY() == bottom_value ]
nodes_on_line = sorted(nodes_on_line, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
footnote_string = ''
for node in nodes_on_line:
if len(node.getchildren()) == 0:
if footnote_string != '' and re.match(r'.*[0-9]+:', node.text):
footnotes.append(footnote_string)
footnote_string = node.text
else:
footnote_string += node.text
else:
next_string = ''.join([ item.text for item in node.findall('tspan', svg_tree.getroot().nsmap)])
if footnote_string != '' and re.match(r'.*[0-9]+:', next_string):
footnotes.append(footnote_string)
footnote_string = next_string
else:
footnote_string += next_string
footnotes.append(footnote_string)
if contains_string != '':
footnotes = [ footnote_string for footnote_string in footnotes if contains_string in footnote_string ]
return footnotes
def extract_footnotes(page, transkription_field=None, svg_tree=None, svg_file=None, contains_string='', contains_strings=None, skip_after=-1.0) ->list:
"""Returns all footnotes as a list of Text.
"""
marginals_on_extra_page = False
if page.marginals_source is not None:
marginals_on_extra_page = True
svg_tree = ET.parse(page.marginals_source)
if transkription_field is None:
transkription_field = TranskriptionField(page.source)
footnote_columns = FootnoteColumns.create_cls(page=page, transkription_field=transkription_field,\
svg_tree=svg_tree, svg_file=svg_file, marginals_on_extra_page=marginals_on_extra_page, skip_after=skip_after)
if footnote_columns is None:
return []
return footnote_columns.extract_footnotes(contains_string=contains_string, contains_strings=contains_strings)
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
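
The retry loop added to FootnoteColumns.create_cls above only changes how the column grid is initialised; the older string-based helper is untouched and remains the quickest way to inspect a page's footnote area. A minimal sketch, assuming the svgscripts directory is on the import path (the svg path is hypothetical; only extract_footnotes_as_strings as defined above is used):

    from datatypes.footnotes import extract_footnotes_as_strings

    # Hypothetical svg; the transkription field is derived from the file itself.
    for footnote in extract_footnotes_as_strings(svg_file='svg/Mp_XV_page75.svg'):
        print(footnote)
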
Index: svgscripts/datatypes/faksimile.py
===================================================================
--- svgscripts/datatypes/faksimile.py (revision 111)
+++ svgscripts/datatypes/faksimile.py (revision 112)
@@ -1,205 +1,205 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a faksimile page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import re
from lxml import etree as ET
from os import path
from os.path import isdir, isfile, sep, basename
from svgpathtools.parser import parse_path
from .faksimile_image import FaksimileImage
from .matrix import Matrix
from .text_field import TextField
from .word_position import WordPosition
class FaksimilePage:
"""
This class represents a faksimile page.
Args:
xml_target_file (str): name of the xml file to which page info will be written.
xml_source_file (str): name of the xml file that will be instantiated.
"""
XML_TAG = 'faksimile-page'
def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None):
xml_file = xml_source_file if xml_source_file is not None else xml_target_file
self.title = title
self.page_number = page_number
self.xml_file = xml_file
if xml_file is not None and isfile(xml_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(xml_file, parser)
self.title = self.page_tree.getroot().get('title')
self.page_number = self.page_tree.getroot().get('page-number')
self.width = float(self.page_tree.getroot().get('width')) if bool(self.page_tree.getroot().get('width')) else 0.0
self.height = float(self.page_tree.getroot().get('height')) if bool(self.page_tree.getroot().get('height')) else 0.0
else:
self.page_tree = ET.ElementTree(ET.Element(self.XML_TAG))
if title is not None:
self.page_tree.getroot().set('title', title)
if page_number is not None:
self.page_tree.getroot().set('page-number', str(page_number))
if xml_target_file is not None:
self.remove_tags_from_page_tree([WordPosition.FAKSIMILE])
if svg_source_file is not None:
self.page_tree.getroot().set('svg-source-file', svg_source_file)
if faksimile_image is not None:
faksimile_image.attach_object_to_tree(self.page_tree)
if text_field is not None:
text_field.attach_object_to_tree(self.page_tree)
self.svg_source_file = self.page_tree.getroot().get('svg-source-file')
self.faksimile_image = FaksimileImage(node=self.page_tree.getroot().find('.//' + FaksimileImage.XML_TAG))\
if len(self.page_tree.getroot().findall('.//' + FaksimileImage.XML_TAG)) > 0 else None
self.text_field = TextField(node=self.page_tree.getroot().find('.//' + TextField.XML_TAG))\
if len(self.page_tree.getroot().findall('.//' + TextField.XML_TAG)) > 0 else None
self.word_positions = [ WordPosition(node=node) for node in self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE) ]\
if len(self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE)) > 0 else []
def append_word_position(self, word_position):
"""Appends word_position to word_positions and attaches it to page_tree.
"""
self.word_positions.append(word_position)
word_position.attach_object_to_tree(self.page_tree)
@classmethod
def get_faksimile_pages(cls, svg_file, page_number='') -> list:
"""Creates and returns text fields contained in a svg_file as a list.
"""
svg_tree = ET.parse(svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
return cls.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces, page_number=page_number)
@staticmethod
def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='') -> list:
"""Creates and returns text fields contained in a svg_tree as a list.
"""
THRESHOLD_X = 10
if namespaces is None:
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
source_file_name = svg_tree.docinfo.URL
image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap), source_file_name)
xml_dir = '.{}xml'.format(sep)
faksimile_pages = list()
title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name))
if re.match(r'.*-\d+[a-z]$', title_string):
title_string = re.sub(r'-\d+[a-z]$', '', title_string)
title = title_string.replace('-', ' ')
rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap)\
if rect.get('id', svg_tree.getroot().nsmap).startswith(title_string)\
and rect.get('id', svg_tree.getroot().nsmap).endswith(str(page_number)) ]
for text_field_rect in rect_list:
tf_x = float(text_field_rect.get('x', svg_tree.getroot().nsmap)) - image.x
tf_y = float(text_field_rect.get('y', svg_tree.getroot().nsmap)) - image.y
tf_width = float(text_field_rect.get('width', svg_tree.getroot().nsmap))
tf_height = float(text_field_rect.get('height', svg_tree.getroot().nsmap))
tf_matrix = Matrix(transform_matrix_string=text_field_rect.get('transform'))\
if bool(text_field_rect.get('transform'))\
else None
id = text_field_rect.get('id', svg_tree.getroot().nsmap)
target_file_name = xml_dir + sep + id + '.xml' if isdir(xml_dir) else id + '.xml'
page_number = re.sub(r'.*[,_]', '', id)
if page_number.startswith('0'):
page_number = page_number.lstrip('0')
text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y, matrix=tf_matrix)
faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\
title=title, page_number=page_number, faksimile_image=image, text_field=text_field)
x_min = text_field.xmin + image.x
y_min = text_field.ymin + image.y
#rect_titles = svg_tree.getroot().xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}"]/ns:title'.format(\
# x_min, text_field.xmax + image.x - THRESHOLD_X, y_min, text_field.ymax + image.y, text_field.id), namespaces=namespaces)
rect_titles = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\
y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces)
rect_titles += get_paths_inside_rect(svg_tree, '//ns:path/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\
y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces)
for rect_title in rect_titles:
rect = rect_title.getparent()
x, y, height, width = 0.0, 0.0, 0.0, 0.0
if rect.tag.endswith('path'):
path = parse_path(rect.get('d'))
x, xmax, y, ymax = path.bbox()
width = xmax - x
height = ymax - y
else:
x = float(rect.get('x', svg_tree.getroot().nsmap))
y = float(rect.get('y', svg_tree.getroot().nsmap))
height = float(rect.get('height', svg_tree.getroot().nsmap))
width = width=float(rect.get('width', svg_tree.getroot().nsmap))
matrix = None
if bool(rect.get('transform')):
matrix = Matrix(transform_matrix_string=rect.get('transform'))
- text = re.sub(r'(\s(?=[-;:.,?!’–])|(?<=[-;:.,?!’–])\s)', '', rect_title.text)
+ text = re.sub(r'(\s(?=[-;:.,…?!’–])|(?<=[-;:.,…?!’–])\s)', '', rect_title.text)
faksimile_page.append_word_position(\
WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=text, height=height,\
width=width, x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE))
faksimile_pages.append(faksimile_page)
return faksimile_pages
def remove_tags_from_page_tree(self, list_of_tags_to_remove):
"""Removes the tags specified in the list from the target tree.
"""
for xpath2remove in list_of_tags_to_remove:
for node in self.page_tree.xpath('//' + xpath2remove):
node.getparent().remove(node)
def get_paths_inside_rect(svg_tree, xpath, x_min, x_max, y_min, y_max, not_id, namespaces={}):
"""Returns a list of all paths selected by xpath that are inside x_min, x_max, y_min, y_max and do not have id == not_id.
"""
paths = []
if len(namespaces) == 0:
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
for path_node in svg_tree.xpath(xpath, namespaces=namespaces):
append_node = path_node
if not path_node.tag.endswith('path') and not path_node.tag.endswith('rect'):
path_node = path_node.getparent()
x, xmax, y, ymax = -1, -1, -1, -1
init_xy = False
if path_node.tag.endswith('rect'):
x = float(path_node.get('x')) if bool(path_node.get('x')) else -1
y = float(path_node.get('y')) if bool(path_node.get('y')) else -1
xmax = x + float(path_node.get('width')) if bool(path_node.get('width')) else -1
ymax = y + float(path_node.get('height')) if bool(path_node.get('height')) else -1
init_xy = True
elif path_node.tag.endswith('path') and bool(path_node.get('d')) and path_node.get('d') != 0:
path = parse_path(path_node.get('d'))
x, xmax, y, ymax = path.bbox()
init_xy = True
if init_xy:
if bool(path_node.get('transform')):
matrix = Matrix(transform_matrix_string=path_node.get('transform'))
x, xmax, y, ymax = matrix.get_new_x(x=x, y=y), matrix.get_new_x(x=xmax, y=ymax),\
matrix.get_new_y(x=x, y=y), matrix.get_new_y(x=xmax, y=ymax)
width = xmax - x
height = ymax - y
if x > x_min and x < x_max\
and y > y_min and y < y_max\
and path_node.get('id') != not_id:
paths.append(append_node)
return paths
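# Illustrative sketch (comment only): collecting the titles of all rects that
# lie inside a text field with the bounding-box filter above. The file name,
# coordinates and id are hypothetical assumptions.
#
#   from lxml import etree as ET
#   svg_tree = ET.parse('faksimile_example.svg')
#   namespaces = { k if k is not None else 'ns': v
#                  for k, v in svg_tree.getroot().nsmap.items() }
#   rect_titles = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title',
#                  x_min=0.0, x_max=500.0, y_min=0.0, y_max=700.0,
#                  not_id='B-1,1', namespaces=namespaces)
#   # each result is a <title> node; its getparent() is the surrounding rect or path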
Index: svgscripts/datatypes/transkription_position.py
===================================================================
--- svgscripts/datatypes/transkription_position.py (revision 111)
+++ svgscripts/datatypes/transkription_position.py (revision 112)
@@ -1,203 +1,203 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a transkription word position.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
import sys
from .debug_message import DebugMessage
from .image import SVGImage
from .positional_word_part import PositionalWordPart
from .word_position import WordPosition
from .matrix import Matrix
sys.path.append('py2ttl')
from class_spec import SemanticClass
class TranskriptionPosition(WordPosition):
"""
This class represents the position of a word on the transkription as it is displayed by a svg image.
@label position of a word on the topological transkription
Args:
id (int): word id
matrix (datatypes.Matrix): matrix containing information about transformation.
height (float): height of word
width (float): width of word
x (float): x position of word
y (float): y position of word
positional_word_parts a list of (datatypes.positional_word_part) PositionalWordPart
debug_message a (datatypes.debug_message) DebugMessage
"""
ADD2X = 0.15
ADD2TOP = 1.0
ADD2BOTTOM = 0.2
HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
XML_TAG = WordPosition.TRANSKRIPTION
def __init__(self, id=0, node=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, positional_word_parts=None, debug_message=None):
super(TranskriptionPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION)
self.positional_word_parts = positional_word_parts if positional_word_parts is not None else []
self.debug_message = debug_message
self.deleted = False
self._deletion_paths = []
self.has_box = None
self.style = None
self.svg_image = None
if node is not None:
self.debug_message = DebugMessage(node=node.xpath('.//' + DebugMessage.XML_TAG)[0])\
if len(node.xpath('.//' + DebugMessage.XML_TAG)) > 0 else None
self.positional_word_parts = [ PositionalWordPart(node=pwp_node) for pwp_node in node.xpath('.//' + PositionalWordPart.XML_TAG) ]
self.attachable_objects += self.positional_word_parts
if self.debug_message is not None:
self.attachable_objects.append(self.debug_message)
def get_text(self):
"""Returns the concatenated text of all positional_word_parts.
"""
return ''.join([pwp.text for pwp in self.positional_word_parts])
def is_mergebale_with(self, other) -> bool:
"""Return whether self and other have same writing_process_id or style.
"""
if self.writing_process_id == other.writing_process_id:
return True
if (self.writing_process_id == -1 or other.writing_process_id == -1)\
and (len(self.positional_word_parts) > 0 and len(other.positional_word_parts) > 0):
return self.positional_word_parts[0].style_class == other.positional_word_parts[0].style_class
return False
def split(self, split_position, second_split=-1) ->list:
"""Split a transkription_position in two at split_position.
:return: a list of the new transkription_positions
"""
transkription_positions = []
left_pwp = [ pwp for pwp in self.positional_word_parts if pwp.left + pwp.width < split_position ]
transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(left_pwp, transkription_position_id=self.id)
if second_split == -1:
right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp ]
next_id = int(self.id) + 1
transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id))
else:
middle_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp.left + pwp.width < second_split ]
next_id = int(self.id) + 1
transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(middle_pwp, transkription_position_id=str(next_id))
right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp not in middle_pwp ]
next_id = int(self.id) + 1
transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id))
return transkription_positions
def update_positional_word_parts(self, positional_word_parts):
"""Update positional_word_parts.
"""
if len(self.positional_word_parts) > 0 and self.positional_word_parts in self.attachable_objects:
self.attachable_objects.remove(self.positional_word_parts)
self.positional_word_parts = positional_word_parts
self.attachable_objects += self.positional_word_parts
@staticmethod
def CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=None, debug_msg_string=None, transkription_position_id=0):
"""Creates a list of TranskriptionPosition from a list of (datatypes.positional_word_part) PositionalWordPart.
[:return:] a list of (datatypes.transkription_position) TranskriptionPosition
"""
TOPCORRECTION = 1
debug_message = DebugMessage(message=debug_msg_string)\
if debug_msg_string is not None else debug_message
transkription_positions = []
if len(positional_word_parts) < 1:
return []
matrix = positional_word_parts[0].transform
index = 0
matrices_differ = False
style_class = positional_word_parts[0].style_class
styles_differ = False
while index < len(positional_word_parts) and not matrices_differ and not styles_differ:
if Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix, positional_word_parts[index].transform):
matrices_differ = True
elif style_class != positional_word_parts[index].style_class:
styles_differ = True
else:
index += 1
if (matrices_differ or styles_differ) and index < len(positional_word_parts):
debug_msg_string = 'matrices differ' if matrices_differ else 'styles differ'
transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
positional_word_parts[index:], debug_msg_string=debug_msg_string, transkription_position_id=int(transkription_position_id)+1)
positional_word_parts = positional_word_parts[:index]
heighest_pwp = sorted(positional_word_parts, key=lambda pwp: pwp.height, reverse=True)[0]
toppest_pwp = sorted(positional_word_parts, key=lambda pwp: pwp.top)[0]
height = heighest_pwp.height + 2*TOPCORRECTION
if heighest_pwp != toppest_pwp:
height += abs(heighest_pwp.top-toppest_pwp.top)
x = positional_word_parts[0].left - TranskriptionPosition.ADD2X
width = positional_word_parts[-1].left - x\
+ positional_word_parts[-1].width + TranskriptionPosition.ADD2X
if matrix is not None and matrix.isRotationMatrix():
x = positional_word_parts[0].left - matrix.matrix[Matrix.XINDEX]\
if positional_word_parts[0].left - matrix.matrix[Matrix.XINDEX] > 0\
else 0.0
y = toppest_pwp.top - TOPCORRECTION\
if matrix is None or not matrix.isRotationMatrix()\
- else height*-1
+ else (height-TOPCORRECTION)*-1
for pwp_index, pwp in enumerate(positional_word_parts):
pwp.id = pwp_index
transkription_positions.insert(0, TranskriptionPosition(id=transkription_position_id, height=height, width=width, x=x, y=y, matrix=matrix,\
positional_word_parts=positional_word_parts, debug_message=debug_message))
return transkription_positions
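# Illustrative sketch (comment only): the grouping idea behind
# CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS, re-implemented on plain tuples.
# Word parts are collected into one position until the transform matrix or the
# style class changes; the remainder is handled recursively. The tuples are
# hypothetical stand-ins for PositionalWordPart objects.
#
#   def group_parts(parts):
#       """parts: list of (text, matrix_key, style_class) tuples."""
#       if len(parts) == 0:
#           return []
#       matrix_key, style_class = parts[0][1], parts[0][2]
#       index = 0
#       while index < len(parts)\
#               and parts[index][1] == matrix_key\
#               and parts[index][2] == style_class:
#           index += 1
#       return [ parts[:index] ] + group_parts(parts[index:])
#
#   group_parts([('W', 'm0', 'st5'), ('o', 'm0', 'st5'), ('rt', 'm0', 'st6')])
#   # -> [[('W', 'm0', 'st5'), ('o', 'm0', 'st5')], [('rt', 'm0', 'st6')]]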
@staticmethod
def CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=None, debug_msg_string=None, transkription_field=None, svg_path_tree=None, namespaces=None):
"""Creates a list of TranskriptionPosition from word_part_objs (i.e. a list of dictionaries
with the keys: text, x, y, matrix, class).
[:return:] a list of (datatypes.transkription_position) TranskriptionPosition
"""
positional_word_parts = []
debug_message = DebugMessage(message=debug_msg_string)\
if debug_msg_string is not None else None
if page.svg_file is not None and isfile(page.svg_file):
svg_path_tree = ET.parse(page.svg_file) if svg_path_tree is None else svg_path_tree
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }\
if namespaces is None else namespaces
xmin = 0.0
ymin = 0.0
if transkription_field is not None:
xmin = transkription_field.xmin
ymin = transkription_field.ymin
for part_obj in word_part_objs:
positional_word_parts += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(\
part_obj, svg_path_tree, namespaces, page, start_id=len(positional_word_parts),\
xmin=xmin, ymin=ymin)
else:
positional_word_parts = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
if len(positional_word_parts) > 0:
return TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=debug_message)
else:
return [ TranskriptionPosition(matrix=matrix, debug_message=debug_message) ]
Index: svgscripts/datatypes/style.py
===================================================================
--- svgscripts/datatypes/style.py (revision 111)
+++ svgscripts/datatypes/style.py (revision 112)
@@ -1,206 +1,209 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the style of a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
from lxml import etree as ET
import re
import sys
from .color import Color
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Style(SemanticClass):
"""
This class represents the style of a word.
Args:
manuscript: a ArchivalManuscriptUnity
"""
NIETSCHES_FONTS = { 'german': 'deutsche Schreibschrift', 'latin': 'lateinische Schreibschrift' }
COLOR_KEYS = [ 'black', 'red', 'blue', 'green', 'grey' ]
RELEVANT_STYLE_KEYS = [ 'font-family', 'fill', 'stroke' ]
ADDITIONAL_STYLE_KEYS = [ 'font-size' ]
PERCENTS = [ '80%', '70%' ]
WRITING_INSTRUMENTS = { (COLOR_KEYS[0], False): 'schwarze Tinte',\
(COLOR_KEYS[0], True): 'Bleistift',\
(COLOR_KEYS[4], True): 'Bleistift',\
(COLOR_KEYS[4], False): 'Bleistift',\
(COLOR_KEYS[1], False): 'braune Tinte',\
(COLOR_KEYS[1], True): 'Rotstift',\
(COLOR_KEYS[2], False): 'violette Tinte',\
(COLOR_KEYS[2], True): 'Blaustift',\
(COLOR_KEYS[3], False): '„Tinte der letzten Korrektur“',\
(COLOR_KEYS[3], True): '„Tinte der letzten Korrektur“'}
- def __init__(self, manuscript=None, writing_process_id=-1, extended_styles=False, deletion_color=None):
+ def __init__(self, manuscript=None, writing_process_id=-1, extended_styles=False, deletion_color=None, underline=False):
self.color = Color.create_cls(manuscript=manuscript)
self.css_styles = []
self.css_string = None
self.deletion_color = deletion_color
self.is_german = True
self.font = self.NIETSCHES_FONTS['german']
self.font_family = 'Weidemann-Book'
self.font_size = ''
self.manuscript = manuscript
self.relevant_key_map = {}
relevant_style_keys = self.RELEVANT_STYLE_KEYS + self.ADDITIONAL_STYLE_KEYS\
if extended_styles else self.RELEVANT_STYLE_KEYS
for key in relevant_style_keys:
if not key.startswith('font'):
self.relevant_key_map.update({key: self.set_color})
elif key == 'font-family':
self.relevant_key_map.update({key: self.set_font})
elif key == 'font-size':
self.relevant_key_map.update({key: self.set_size})
+ self.underline = underline
self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, False)]
self.writing_process_id = writing_process_id
def create_a_copy_wo_writing_process_id(self):
new_self = copy.deepcopy(self)
new_self.writing_process_id = -1
return new_self
def create_a_copy(self, reduce_writing_process_id=False):
writing_process_id = self.writing_process_id\
if not reduce_writing_process_id\
else self.writing_process_id-1
copy = Style(manuscript=self.manuscript, writing_process_id=writing_process_id)
copy.color = self.color
copy.font_family = self.font_family
copy.process_style_classes()
if copy.manuscript is not None:
copy.manuscript.update_styles(copy)
return copy
def create_css_styles(self):
"""Create css styles.
"""
if self.deletion_color is not None:
self.css_styles.append('text-decoration:line-through;')
self.css_styles.append(f'text-decoration-color:{self.deletion_color.hex_color};')
self.css_styles.append(f'-webkit-text-decoration-color:{self.deletion_color.hex_color};')
+ if self.underline:
+ self.css_styles.append('text-decoration:underline;')
if self.font_family.endswith('Bold'):
self.css_styles.append(f'font-weight:bold;')
#if self.font_size != '':
# self.css_styles.append(f'font-size:{self.font_size};')
if self.writing_process_id > 0:
self.css_styles.append(f'font-size:{self.PERCENTS[self.writing_process_id-1]};')
self.css_styles.append(f'color:{self.color.hex_color};')
self.css_string = ''.join(self.css_styles)
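# Illustrative sketch (comment only): css_string assembly for the new
# "underline" flag introduced in this revision. The expected output assumes the
# default color created by Color.create_cls().
#
#   style = Style(underline=True)
#   style.create_css_styles()
#   # style.css_string == 'text-decoration:underline;color:<default hex>;'
#
# Note that if both deletion_color and underline are set, two competing
# 'text-decoration' declarations are emitted; in plain CSS the later one
# (underline) wins unless the values are combined into a single declaration.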
@classmethod
def create_cls(cls, page, style_string, manuscript=None, create_css=False, deletion_color=None, writing_process_id=-1):
"""Creates a Style from a style_string.
:return: (datatypes.style) Style
"""
style = cls(manuscript=manuscript, extended_styles=create_css, deletion_color=deletion_color, writing_process_id=writing_process_id)
style_dict = { key: key_dict for key, key_dict in page.style_dict.items()\
if any(relevant_key in key_dict.keys() for relevant_key in style.relevant_key_map.keys()) }
if style_string is not None:
for style_key in style_string.split(' '):
if style_key in style_dict.keys():
dictionary = style_dict[style_key]
for key, set_function in [ (key, func) for key, func in style.relevant_key_map.items() if key in dictionary.keys() ]:
if callable(set_function):
set_function(dictionary[key])
style.process_style_classes()
if create_css:
style.create_css_styles()
return style
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
properties = {}
properties.update(cls.create_semantic_property_dictionary('font', str, cardinality=1,\
name='styleHasFont', label='style has font', comment='Connects a style with the kind of font Nietzsche used in writing.'))
properties.update(cls.create_semantic_property_dictionary('writing_instrument', str, cardinality=1,\
name='styleHasWritingInstrument', label='style has writing instrument', comment='Connects a style with the description of a writing instrument.'))
properties.update(cls.create_semantic_property_dictionary('color', Color, cardinality=1,\
name='styleHasColor', label='style has color', comment='Connects a style with a color.'))
#properties.update(cls.create_semantic_property_dictionary('css_styles', str,\
properties.update(cls.create_semantic_property_dictionary('css_string', str,\
subPropertyOf=cls.STOFF_STYLE_HAS_CSS_URL_STRING,\
name='styleHasCSS', label='style has css', comment='Connects a style with CSS style.'))
dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
return cls.return_dictionary_after_updating_super_classes(dictionary)
def process_style_classes(self):
"""Infere writing instrument from font-family and color.
"""
if self.font_family.startswith('NewsGothic'):
self.is_german = False
self.font = self.NIETSCHES_FONTS['latin']
if self.color.name in self.COLOR_KEYS:
self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, self.font_family.endswith('Bold'))]
def set_color(self, hex_color: str):
if hex_color != 'none':
self.color = Color.create_cls(hex_color=hex_color, manuscript=self.manuscript)
def set_font(self, font_family: str):
self.font_family = font_family
def set_size(self, font_size: str):
self.font_size = font_size
@classmethod
def remove_irrelevant_style_keys(cls, style_string, page, extended_styles=False) -> str:
"""Return a style_string without irrelevant style keys.
"""
relevant_style_keys = cls.RELEVANT_STYLE_KEYS + cls.ADDITIONAL_STYLE_KEYS\
if extended_styles else cls.RELEVANT_STYLE_KEYS
return ' '.join(sorted( style_key for style_key in style_string.split(' ')\
if len(\
[ key for key in page.style_dict[style_key].keys()\
if key in relevant_style_keys ]\
) > 0 ))
def __eq__(self, other):
"""Returns true if self is qualitatively identical to other.
Reason: For qualities, the idea of numerical identity is silly.
"""
if other is None:
return False
return self.color == other.color\
and self.font_family == other.font_family\
and self.writing_process_id == other.writing_process_id\
and self.css_styles == other.css_styles\
and self.font_size == other.font_size
def __hash__(self):
"""Return a hash value for self.
"""
return hash((self.color.__hash__(), self.font_family, self.writing_process_id))
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py (revision 111)
+++ svgscripts/datatypes/word.py (revision 112)
@@ -1,919 +1,921 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
import inspect
from lxml import etree as ET
from operator import attrgetter
import re
import string
import sys
import warnings
from .box import Box
from .editor_comment import EditorComment
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .style import Style
from .word_deletion_path import WordDeletionPath
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)
def execute_function_on_parts(word_parts, func_name):
"""Execute function on parts and add those parts instead of original word to word_parts.
:return: new word_parts, output from func
"""
copy_parts = word_parts[:]
for word in word_parts:
output = eval('word.{0}()'.format(func_name))
if len(word.word_parts) > 0:
for part_word in word.word_parts:
copy_parts.insert(copy_parts.index(word), part_word)
copy_parts.remove(word)
word.word_parts = []
return copy_parts, output
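# Illustrative sketch (comment only): typical call site, as used further below
# for the partition functions. func_name must name a no-argument method of Word.
#
#   word_parts, _ = execute_function_on_parts(word_parts, 'partition_according_to_deletion')
#
# Design note: getattr(word, func_name)() would achieve the same dispatch
# without eval().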
def update_transkription_position_ids(word):
"""Update transkription_position' ids according to index.
"""
word_part_ids = [ wp.id for wp in word.word_parts ]
if len(word_part_ids) != len(set(word_part_ids)):
for id, wp in enumerate(word.word_parts):
wp.id = id
for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))):
transkription_position.id = index
transkription_position.has_box = None
transkription_position.deleted = False
class Word(SimpleWord):
"""
This class represents a word.
"""
COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ]
APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' }
DATA = 'debug-data'
RDFS_SUBCLASSOF_LIST = ['https://www.e-editiones.ch/ontology/text#HandwrittenText']
XML_TAG = 'word'
XML_EARLIER_VERSION = 'earlier-version'
XML_OVERWRITES = 'overwrites'
XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\
'isDeletionOfWord': 'deletesEarlierPart',\
'isExtensionOfWord': 'extendsEarlierVersion',\
'isTransformationOfWord': 'transformsEarlierPart' }
def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None):
super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
faksimile_positions=faksimile_positions)
self.corrections = []
self.clean_edited_text = None
self.deleted = deleted
self.deletion_paths = []
self.deletion_paths_near_word = []
self.debug_container = {}
self.debug_msg = None
self.earlier_version = earlier_version
self.edited_text = None
#self.editor_comment = None
self.editor_comments = []
self.isClarificationOfWord = None
self.isDeletionOfWord = None
self.isExtensionOfWord = None
self.isTransformationOfWord = None
if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0:
self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ])
self.overwrites_word = None
self.process_flags = []
self.styles = styles\
if styles is not None\
else []
+ self.undeleted_from_deletion_paths = []
self.verified = None
self.writing_process_id = writing_process_id
self.writing_processes = []
self.word_insertion_mark = None
self.word_box = None
self.word_parts = word_parts if word_parts is not None else []
self.word_part_objs = word_part_objs if word_part_objs is not None else []
def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
"""Add a word deletion path to word.
"""
if len(self.word_parts) > 0:
for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
elif self.deleted:
index = 0
while len(self.deletion_paths) == 0 and index < len(self.transkription_positions):
include_pwps = (len(self.transkription_positions[index].positional_word_parts) > 0
and abs(self.transkription_positions[index].left-self.transkription_positions[index].positional_word_parts[0].left) < 10)
word_path = Path.create_path_from_transkription_position(self.transkription_positions[index],\
tr_xmin=tr_xmin, tr_ymin=tr_ymin, include_pwps=include_pwps)
self.deletion_paths += [ deletion_path for deletion_path in deletion_paths\
if not Path.is_path_contained(self.deletion_paths, deletion_path)\
and deletion_path.do_paths_intersect(word_path) ]
index += 1
def attach_word_to_tree(self, target_tree):
"""Attaches word to tree target_tree.
"""
word_node = super(Word,self).attach_word_to_tree(target_tree)
if self.deleted is not None:
word_node.set('deleted', str(self.deleted).lower())
if self.verified is not None:
word_node.set('verified', str(self.verified).lower())
if self.edited_text is not None:
word_node.set('edited-text', self.edited_text)
#if self.editor_comment is not None:
# self.editor_comment.attach_object_to_tree(word_node)
for editor_comment in self.editor_comments:
editor_comment.attach_object_to_tree(word_node)
if self.writing_process_id > -1:
word_node.set('writing-process-id', str(self.writing_process_id))
if len(self.process_flags) > 0:
word_node.set('process-flags', ' '.join(self.process_flags))
for index, word_part in enumerate(self.word_parts):
word_part.id = index
word_part.attach_word_to_tree(word_node)
if self.earlier_version is not None:
earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
self.earlier_version.attach_word_to_tree(earlier_node)
if self.overwrites_word is not None\
and len(self.overwrites_word.transkription_positions) > 0:
overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES)
self.overwrites_word.attach_word_to_tree(overwrite_node)
if self.word_box is not None:
self.word_box.attach_object_to_tree(word_node)
if len(self.corrections) > 0:
word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ])))
for deletion_id, deletion_path in enumerate(self.deletion_paths):
deletion_path.id = deletion_id
deletion_path.tag = WordDeletionPath.XML_TAG
deletion_path.attach_object_to_tree(word_node)
for key in self.XML_CORRECTION_DICT.keys():
if self.__dict__[key] is not None:
word_node.set(self.XML_CORRECTION_DICT[key], 'true')
return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
"""Returns true if transkription_positions belong to different WritingProcesses.
"""
if len(self.word_parts) > 0 and include_parts:
return len(set(word.writing_process_id for word in self.word_parts)) > 1
return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1
def set_parent_word_writing_process_id(self):
"""Set writing_process_id for parent word.
"""
ids = set(word.transkription_positions[0].style for word in self.word_parts\
if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None)
if len(ids) > 1:
self.writing_process_id = max([style.writing_process_id for style in ids])
if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\
for word in self.word_parts\
if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\
> 1:
self.writing_process_id += 1
@classmethod
def create_cls(cls, word_node):
"""Creates a word from a (lxml.Element) node.
[:return:] Word
"""
cls = super(Word,cls).create_cls(word_node)
cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
cls.split_strings = None
cls.join_string = word_node.get('join')
if bool(word_node.get('split')):
cls.split_strings = word_node.get('split').split(' ')
if ''.join(cls.split_strings) != cls.text:
error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
format(word_node.getroottree().docinfo.URL, str(cls.id))\
+ 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
+ 'Text attribute: "{0}".\n'.format(cls.text)
raise Exception(error_msg)
cls.verified = word_node.get('verified') == 'true'\
if bool(word_node.get('verified')) else None
cls.deleted = word_node.get('deleted') == 'true'\
if bool(word_node.get('deleted')) else None
cls.edited_text = word_node.get('edited-text')
if cls.edited_text is not None:
cls.clean_edited_text = cls._create_clean_text(cls.edited_text)
cls.editor_comments = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ]
- """
- cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\
- if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None
- """
cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ]
if bool(word_node.get('corrections')):
for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
if index < len(cls.word_parts):
cls.corrections.append(cls.word_parts[index])
cls.earlier_version = None
if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0:
cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0]
for key_value in cls.XML_CORRECTION_DICT.values():
if word_node.get(key_value) == 'true':
cls.__dict__[key_value] = True
if cls.earlier_version is not None:
for word_part in cls.word_parts:
for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]:
if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\
and len(cls.word_parts) <= len(cls.earlier_version.word_parts):
try:
word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id]
except Exception:
msg = f'{cls.id} {cls.text}: {word_part.id}'
raise Exception(msg)
for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]:
if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
word_part.__dict__[key] = cls.earlier_version
for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]:
if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
word_part.__dict__[key] = cls
cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\
if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\
else None
cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\
if len(word_node.xpath('./' + Box.XML_TAG)) > 0\
else None
cls.deletion_paths = [ Path(node=node) for node in word_node.xpath(f'./{WordDeletionPath.XML_TAG}') ]
+ if bool(word_node.get('undeleted')):
+ cls.undeleted_from_deletion_paths = cls.deletion_paths
cls.process_flags = word_node.get('process-flags').split(' ')\
if bool(word_node.get('process-flags'))\
else []
return cls
@classmethod
def join_words(cls, list_of_words, add_white_space_between_words=False):
"""Creates a word from a list of words.
[:return:] Word
"""
if len(list_of_words) > 1:
deleted = True in [ word.deleted for word in list_of_words ]\
and len(set([ word.deleted for word in list_of_words ])) == 1
line_number = list_of_words[0].line_number\
if len(set([ word.line_number for word in list_of_words ])) == 1\
else -1
faksimile_positions = []
for word in list_of_words:
if len(word.word_parts) > 0:
faksimile_positions += word.faksimile_positions
index = list_of_words.index(word)
list_of_words.remove(word)
for part_word in reversed(word.word_parts):
list_of_words.insert(index, part_word)
new_word_text = ''.join([word.text for word in list_of_words])\
if not add_white_space_between_words\
else ' '.join([word.text for word in list_of_words])
new_word = cls(id=list_of_words[0].id, text=new_word_text, faksimile_positions=faksimile_positions,\
line_number=line_number, deleted=deleted, word_parts=list_of_words)
if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]:
change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0]
new_word.edited_text = new_word.text.replace(change_text, change_text[:-1])
for id, word in enumerate(new_word.word_parts): word.id = id
return new_word
if len(list_of_words) > 0:
return list_of_words[0]
else:
return None
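# Illustrative sketch (comment only): joining a word that was hyphenated across
# two lines. Texts, ids and line numbers are hypothetical.
#
#   w1 = Word(id=3, text='Auf-', line_number=2)
#   w2 = Word(id=4, text='gabe', line_number=3)
#   joined = Word.join_words([w1, w2])
#   # joined.text == 'Auf-gabe', joined.edited_text == 'Aufgabe',
#   # joined.line_number == -1 because the parts come from different lines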
def create_earlier_version(self, root_word=None, id=0):
"""Create an earlier version of word.
"""
if root_word is None:
root_word = self
root_word.set_parent_word_writing_process_id()
word_parts = []
non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\
if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ]
non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts)
if non_single_punctuation_word_parts_length > 0\
and len([ word_part for word_part in non_single_punctuation_word_parts\
if word_part.deleted ])\
== non_single_punctuation_word_parts_length:
self.deleted = True
for word_part in non_single_punctuation_word_parts: word_part.deleted = False
for id, word_part in enumerate(self.word_parts):
earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id)
if word_part.deleted:
word_part.isDeletionOfWord = earlierWordPart
word_parts.append(earlierWordPart)
if word_part not in self.corrections:
self.corrections.append(word_part)
elif word_part.overwrites_word is not None\
and ((len(word_part.transkription_positions) > 0\
and word_part.overwrites_word.transkription_positions[0].style is not None\
and word_part.transkription_positions[0].style is not None\
and word_part.transkription_positions[0].style\
!= word_part.overwrites_word.transkription_positions[0].style)
or word_part.word_box.earlier_version):
word_part.overwrites_word.id = word_part.id
word_parts.append(word_part.overwrites_word)
word_part.isTransformationOfWord = word_part.overwrites_word
#print(f'transform: {self.text}')
if word_part not in self.corrections:
self.corrections.append(word_part)
elif root_word.writing_process_id > -1\
and (len(word_part.transkription_positions) > 0\
and word_part.transkription_positions[0].style is not None\
and word_part.transkription_positions[0].style.writing_process_id\
== root_word.writing_process_id):
word_part.extendsEarlierVersion = True
#print('extends')
if word_part not in self.corrections:
self.corrections.append(word_part)
else:
if word_part.deleted:
word_part.isDeletionOfWord = earlierWordPart
word_parts.append(earlierWordPart)
if word_part not in self.corrections:
self.corrections.append(word_part)
else:
#print(f'default: {self.text}')
word_parts.append(earlierWordPart)
text = ''.join([ word.text for word in word_parts ])\
if len(word_parts) > 0\
else self.text
if len(word_parts) == 1:
self.transkription_positions += word_parts[0].transkription_positions
self.faksimile_positions += word_parts[0].faksimile_positions
word_parts = []
new_transkription_positions = copy.deepcopy(self.transkription_positions)
if len(self.transkription_positions) > 0\
and self.transkription_positions[0].style is not None:
writing_process_id = self.transkription_positions[0].style.writing_process_id
for new_tp in new_transkription_positions:
new_tp.style.writing_process_id = writing_process_id
return Word(id=id, text=text, transkription_positions=new_transkription_positions,\
faksimile_positions=self.faksimile_positions, line_number=self.line_number,\
word_parts=word_parts)
def create_correction_history(self, page=None, box_style=None):
"""Create correction history.
"""
if self.word_box is not None:
manuscript = self.transkription_positions[0].style.manuscript\
if len(self.transkription_positions) > 0\
and self.transkription_positions[0].style is not None\
else None
style = Style()
if box_style is not None:
style = box_style
if page is not None:
style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript)
for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]:
style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions)
for transkription_position in transkription_positions:
transkription_position.style = style
self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\
line_number=self.line_number)
for word_part in self.word_parts:
word_part.create_correction_history(page=page, box_style=box_style)
if len(self.word_parts) > 0:
earlier_version = self.create_earlier_version()
extending_words = self._get_parts_with_property_key('extendsEarlierVersion')
if len(extending_words) > 0:
for word in extending_words:
word.isExtensionOfWord = earlier_version
if self.has_mixed_status('deleted', include_parts=True):
self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ])
if len(self.corrections) > 0:
self.earlier_version = earlier_version
@staticmethod
def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
"""Creates a word from a (lxml.Element) node or word_part_objs.
[:return:] Word
"""
if word_node is not None: # init word from xml node
id = int(word_node.get('id'))
line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
text = word_node.get('text')
deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
if len(word_node.findall('.//' + Word.DATA)) > 0\
else [ item.attrib for item in word_node.findall('.//part')]
return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
WIDTH = 5
TOPCORRECTION = 2.0
FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
height = height
x = round(float(word_part_objs[0]['x']), 3)
if(page is not None and bool(page.style_dict)):
HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
if endSign is not None and '%' in endSign:
lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
if bool(page.style_dict[key].get('font-size'))]
lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
elif endSign is not None and '%' in endSign:
endX = float(endX) + WIDTH
bottom = round(float(word_part_objs[0]['y']), 3)
y = round(bottom - height + TOPCORRECTION, 3)
width = round(float(endX) - x, 3)
transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
text = ''.join([ dict['text'] for dict in word_part_objs])
line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
word.debug_msg = debug_msg
return word
else:
error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
raise Exception('Error: {}'.format(error_msg))
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = super(Word,cls).get_semantic_dictionary()
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\
cardinality=1, cardinality_restriction='minCardinality',\
name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\
name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\
name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\
comment='Word has been deleted by the author using a deletion path.'))
+ dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('undeleted_from_deletion_paths', WordDeletionPath,\
+ name='wordIsUndeletedFromPath', label='word has been undeleted',\
+ comment='Word has been undeleted by the author using dots.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comments', EditorComment,\
name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\
name='wordHasEarlierVersion', label='word has an earlier version', comment='There is an earlier version of this word.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\
name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('clean_edited_text', str,\
name='hasCleanEditedText', label='word has an edited text without punctuation',\
comment='Word has a text without punctuation that is edited automatically by removing deleted parts or hyphens.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\
name='isClarificationOfWord', label='word is a clarification of word',\
comment='The author has used this part of the word in order to clarify the appearance of that word.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\
name='isDeletionOfWord', label='word is a deletion of word',\
comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\
name='isExtensionOfWord', label='word is an extension of word',\
comment='The author has used this part of a word in order to extend an earlier version of this word.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\
name='isTransformationOfWord', label='word is a transformation of word',\
comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\
name='overwritesWord', label='word overwrites word',\
comment='The author has used this word in order to overwrite that word.'))
# This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING,
# cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class.
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\
name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\
subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING))
super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\
name='isCorrectionOfWord', label='word is a correction of word',\
comment='The author has used this word in order to correct that word.')
for key in cls.XML_CORRECTION_DICT.keys():
correction_dict = dictionary[cls.PROPERTIES_KEY].get(key)
correction_dict.update(super_property_dictionary)
dictionary[cls.PROPERTIES_KEY].update({key: correction_dict})
return cls.return_dictionary_after_updating_super_classes(dictionary)
def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
"""Returns true if transkription_positions have mixed status concerning the property_key in their __dict__.
"""
if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions):
return False
if len(self.word_parts) > 0 and include_parts:
if concerns_word:
if False in set(property_key in word.__dict__.keys() for word in self.word_parts):
return False
return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1
else:
return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\
if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1
return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1
def init_word(self, page):
"""Initialize word with objects from page.
"""
super(Word,self).init_word(page)
if self.writing_process_id > -1:
self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ]
writing_processes = self.writing_processes
for word_part in self.word_parts:
word_part.init_word(page)
self.lines += word_part.lines
self.writing_processes += word_part.writing_processes
self.lines = [ line for line in set(self.lines) ]
self.writing_processes = [ wp for wp in set(self.writing_processes)]
if self.overwrites_word is not None:
self.overwrites_word.init_word(page)
if self.earlier_version is not None:
if self.earlier_version.writing_process_id == -1:
self.earlier_version.writing_process_id = self.writing_process_id-1
if self.earlier_version.line_number == -1:
self.earlier_version.line_number = self.line_number
self.earlier_version.init_word(page)
self.deletion_paths = [ page.get_word_deletion_path(path) for path in self.deletion_paths if path.path is not None ]
def join(self, other_word, append_at_end_of_new_word=True, add_white_space_between_words=False):
"""Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions.
"""
if append_at_end_of_new_word:
self.text = self.text + other_word.text\
if not add_white_space_between_words\
else self.text + ' ' + other_word.text
for position in other_word.transkription_positions:
position.id = str(len(self.transkription_positions))
self.transkription_positions.append(position)
for position in other_word.faksimile_positions:
position.id = str(len(self.faksimile_positions))
self.faksimile_positions.append(position)
else:
self.text = other_word.text + self.text
index = 0
for position in other_word.transkription_positions:
self.transkription_positions.insert(index, position)
index += 1
while index < len(self.transkription_positions):
self.transkription_positions[index].id = str(index)
index += 1
index = 0
for position in other_word.faksimile_positions:
self.faksimile_positions.insert(index, position)
index += 1
while index < len(self.faksimile_positions):
self.faksimile_positions[index].id = str(index)
index += 1
self.simplify_transkription_positions()
def partition_according_to_deletion(self):
"""Partition a word according to its transkription_positions' deletion status
->split word and add partial words as its parts.
"""
if self.has_mixed_status('deleted'):
transkription_positions = []
last_status = None
for transkription_position in self.transkription_positions:
if transkription_position.deleted != last_status\
and len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
for tp in transkription_positions:
newWord.deletion_paths += tp._deletion_paths
self.word_parts.append(newWord)
transkription_positions = []
transkription_positions.append(transkription_position)
last_status = transkription_position.deleted
if len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
for tp in transkription_positions:
newWord.deletion_paths += tp._deletion_paths
self.word_parts.append(newWord)
self.transkription_positions = []
self.line_number = -1
self.deleted = False
elif len(self.word_parts) > 0:
self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
elif not self.deleted\
and len(self.transkription_positions) > 0\
and self.transkription_positions[0].deleted:
self.deleted = True
for tp in self.transkription_positions:
self.deletion_paths += tp._deletion_paths
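# Illustrative sketch (comment only): a word whose transkription positions have
# mixed deletion status is split into two partial words. Coordinates are
# hypothetical; only the 'deleted' flags matter here.
#
#   tp_a = TranskriptionPosition(id=0, x=10.0, width=8.0); tp_a.deleted = True
#   tp_b = TranskriptionPosition(id=1, x=18.0, width=6.0); tp_b.deleted = False
#   word = Word(id=0, text='ab', transkription_positions=[tp_a, tp_b])
#   word.partition_according_to_deletion()
#   # -> word.word_parts contains two parts; the first one has deleted == True,
#   #    while the parent word ends up with deleted == False and no own positions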
def partition_according_to_writing_process_id(self):
"""Partition a word according to its transkription_positions' writing_process_ids
->split word and add partial words as its parts.
"""
if self.belongs_to_multiple_writing_processes():
last_writing_process_id = -1
transkription_positions = []
for transkription_position in self.transkription_positions:
if transkription_position.writing_process_id != last_writing_process_id\
and len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
self.word_parts.append(newWord)
transkription_positions = []
transkription_positions.append(transkription_position)
last_writing_process_id = transkription_position.writing_process_id
if len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
self.word_parts.append(newWord)
self.transkription_positions = []
elif len(self.word_parts) > 0:
self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
if self.belongs_to_multiple_writing_processes(include_parts=True):
self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0]
elif len(self.transkription_positions) > 0:
self.writing_process_id = self.transkription_positions[0].writing_process_id
def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False):
"""Determines whether word is over a word box.
"""
word_over_box = None
if len(self.word_parts) > 0:
for word in self.word_parts:
current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None))
if current_word is not None and current_word.word_box is not None:
word_over_box = current_word
else:
new_tp_dict = {}
for index, transkription_position in enumerate(self.transkription_positions):
if previous_word_has_box and index == 0:
if len(transkription_position.positional_word_parts) > 0:
transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2
#print(f'{self.text}: {transkription_position.positional_word_parts[0].left}')
else:
transkription_position.left += 1
word_path = Path.create_path_from_transkription_position(transkription_position,\
tr_xmin=tr_xmin, tr_ymin=tr_ymin)
containing_boxes = [ box_path for box_path in box_paths\
if word_path.is_partially_contained_by(box_path)\
or box_path.do_paths_intersect(word_path) ]
if len(containing_boxes) > 0:
if previous_word_has_box:
print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}')
self._set_box_to_transkription_position(containing_boxes[0], word_path,\
transkription_position, new_tp_dict, tr_xmin)
box_paths.remove(containing_boxes[0])
for replace_tp in new_tp_dict.keys():
for tp in new_tp_dict.get(replace_tp):
self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
self.transkription_positions.remove(replace_tp)
word_over_box = self._get_partial_word_over_box()
update_transkription_position_ids(self)
return word_over_box
def set_word_insertion_mark(self, word_insertion_mark):
"""Sets word_insertion_mark
"""
self.word_insertion_mark = word_insertion_mark
def set_writing_process_id_to_transkription_positions(self, page):
"""Determines the writing process id of the transkription_positions.
"""
for transkription_position in self.transkription_positions:
if len(transkription_position.positional_word_parts) > 0:
for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
if font_key in page.fontsizekey2stage_mapping.keys():
transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
def simplify_transkription_positions(self):
"""Merge transkription_positions if possible.
"""
index = len(self.transkription_positions)-1
while index > 0\
and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
current_tp = self.transkription_positions[index]
index -= 1
previous_tp = self.transkription_positions[index]
if previous_tp.is_mergebale_with(current_tp):
positional_word_parts = previous_tp.positional_word_parts
positional_word_parts += current_tp.positional_word_parts
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
if len(transkription_positions) == 1:
transkription_positions[0].writing_process_id = previous_tp.writing_process_id\
if previous_tp.writing_process_id != -1\
else current_tp.writing_process_id
self.transkription_positions.pop(index+1)
self.transkription_positions[index] = transkription_positions[0]
#print(self.text, len(self.transkription_positions))
def split(self, split_string, start_id=0):
"""Splits the word and returns an 3-tuple of new words.
"""
previousString, currentString, nextString = self.text.partition(split_string)
currentWord = None
previousWord = None
nextWord = None
previousIndex = 0
current_id = start_id
all_positional_word_parts = []
for position in self.transkription_positions:
all_positional_word_parts += position.positional_word_parts
if len(all_positional_word_parts) == 0:
warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
if len(previousString) > 0:
previous_pwps = []
while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
previous_pwps.append(all_positional_word_parts[previousIndex])
previousIndex += 1
if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
else:
previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
previousWord.faksimile_positions = self.faksimile_positions
current_id += 1
all_positional_word_parts = all_positional_word_parts[previousIndex:]
if len(nextString) > 0:
tmp_pwps = []
index = 0
while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
tmp_pwps.append(all_positional_word_parts[index])
index += 1
if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
else:
next_pwps = all_positional_word_parts[index:]
next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
next_text = ''.join([ pwp.text for pwp in next_pwps ])
nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
nextWord.faksimile_positions = self.faksimile_positions
all_positional_word_parts = all_positional_word_parts[:index]
current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
currentWord.faksimile_positions = self.faksimile_positions
return previousWord, currentWord, nextWord
def split_according_to_status(self, status, splits_are_parts=False):
"""Split a word according to its transkription_positions' text.
:return: a list of new word.Word
"""
new_words = []
if self.has_mixed_status(status):
last_status = None
transkription_positions = []
for transkription_position in self.transkription_positions:
if transkription_position.__dict__[status] != last_status\
and len(transkription_positions) > 0:
new_words.append(\
self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
transkription_positions = []
transkription_positions.append(transkription_position)
last_status = transkription_position.__dict__[status]
if len(transkription_positions) > 0:
new_words.append(\
self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
if splits_are_parts:
self.word_parts += new_words
if len(self.word_parts) > 0:
self.transkription_positions = []
return new_words
def undo_partitioning(self):
"""Undo partitioning.
"""
if len(self.word_parts) > 0:
for word_part in self.word_parts:
word_part.undo_partitioning()
if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]):
self.transkription_positions += word_part.transkription_positions
self.earlier_version = None
self.edited_text = None
self.word_box = None
self.word_parts = []
self.corrections = []
self.earlier_versions = []
self.box_paths = []
def _create_new_word(self, transkription_positions, status, new_id=0):
"""Create a new word from self and transkription_positions.
"""
newWord = Word(id=new_id, transkription_positions=transkription_positions)
for key in self.COPY_PROPERTY_KEY:
if key != status and key in self.__dict__.keys():
newWord.__dict__[key] = self.__dict__[key]
if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys():
newWord.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(transkription_positions[0].__dict__[status])
else:
newWord.__dict__[status] = transkription_positions[0].__dict__[status]
return newWord
def _get_parts_with_property_key(self, property_key):
"""Return a list of word_parts with property == property_key.
"""
word_parts = []
for word_part in self.word_parts:
if property_key in word_part.__dict__.keys():
word_parts.append(word_part)
else:
word_parts += word_part._get_parts_with_property_key(property_key)
return word_parts
def _get_partial_word_over_box(self):
"""Partition a word according to its transkription_positions' has_box
->split word and add partial words as its parts.
:return: word over box or self
"""
word_over_box = None
if self.has_mixed_status('has_box'):
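            # positions with and without a box are mixed -> group them into word parts along the box boundaries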
transkription_positions = []
last_word_box = None
for transkription_position in self.transkription_positions:
if transkription_position.has_box != last_word_box\
and len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
self.word_parts.append(newWord)
if last_word_box is not None:
word_over_box = newWord
word_over_box.word_box = last_word_box
transkription_positions = []
transkription_positions.append(transkription_position)
last_word_box = transkription_position.has_box
if len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
self.word_parts.append(newWord)
if last_word_box is not None:
word_over_box = newWord
word_over_box.word_box = last_word_box
self.transkription_positions = []
elif len(self.word_parts) > 0:
#self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box')
for word_part in self.word_parts:
if word_over_box is None:
word_over_box = word_part._get_partial_word_over_box()
else:
break
elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
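            # exactly one transkription_position lies over a box -> the word itself is the word over the box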
word_over_box = self
word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
return word_over_box
def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
"""Set box_path to transkription_position that is contained by box_path.
Create new transkription_positions by splitting old ones if necessaryand add them to new_transkription_positions_dictionary.
"""
if box_path.contains_path(word_path):
transkription_position.has_box = box_path
elif box_path.contains_start_of_path(word_path):
split_position = box_path.path.bbox()[1] - tr_xmin
new_tps = transkription_position.split(split_position)
if len(new_tps) == 2:
new_tps[0].has_box = box_path
new_transkription_positions_dictionary.update({ transkription_position: new_tps })
else:
transkription_position.has_box = box_path
elif box_path.contains_end_of_path(word_path):
split_position = box_path.path.bbox()[0] - tr_xmin
new_tps = transkription_position.split(split_position)
if len(new_tps) == 2:
new_tps[1].has_box = box_path
new_transkription_positions_dictionary.update({ transkription_position: new_tps })
else:
transkription_position.has_box = box_path
        else: # box_path in the middle of word_path
split_position1 = box_path.path.bbox()[0] - tr_xmin
split_position2 = box_path.path.bbox()[1] - tr_xmin
new_tps = transkription_position.split(split_position1, split_position2)
if len(new_tps) >= 2:
new_tps[1].has_box = box_path
new_transkription_positions_dictionary.update({ transkription_position: new_tps })
else:
transkription_position.has_box = box_path
def do_paths_intersect_saveMode(mypath1, mypath2):
"""Returns true if paths intersect, false if not or if there was an exception.
"""
try:
return mypath1.path.intersect(mypath2.path, justonemode=True)\
or mypath1.is_partially_contained_by(mypath2)
except AssertionError:
return False
Index: tests_svgscripts/test_extractWordPosition.py
===================================================================
--- tests_svgscripts/test_extractWordPosition.py (revision 111)
+++ tests_svgscripts/test_extractWordPosition.py (revision 112)
@@ -1,236 +1,264 @@
import unittest
import os
from os import sep, path
from os.path import isfile, isdir, dirname
import re
import shutil
import tempfile
import lxml.etree as ET
from lxml.etree import XMLSyntaxError
import sys
sys.path.append('svgscripts')
import extractWordPosition
from myxmlwriter import write_pretty
from datatypes.transkriptionField import TranskriptionField
from datatypes.matrix import Matrix
from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION
from datatypes.page import Page
from datatypes.positional_word_part import PositionalWordPart
from datatypes.pdf import PDFText
from datatypes.word import Word
from datatypes.lineNumber import LineNumber
from datatypes.word_insertion_mark import WordInsertionMark
def test_write(xml_element_tree=None, file_name=None):
    write_pretty(xml_element_tree=xml_element_tree, file_name=file_name, script_name='test', file_type=FILE_TYPE_SVG_WORD_POSITION)
class TestExtractor(unittest.TestCase):
def setUp(self):
extractWordPosition.Extractor.UNITTESTING = True
DATADIR = dirname(__file__) + sep + 'test_data'
self.test_file_find_word = DATADIR + sep + 'test_find_word.xml'
self.test_dir = tempfile.mkdtemp()
self.title = 'ABC 111'
self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)'
self.test_file = DATADIR + sep + 'Mp_XIV_1_mytest_421.svg'
self.test_empty_file = DATADIR + sep + 'my_empty_test.svg'
self.test_source = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml'
self.xml420 = DATADIR + sep + 'Mp_XIV_1_page420.xml'
self.pdf420 = DATADIR + sep + 'Mp_XIV_1_online_420.pdf'
self.pdf_file = DATADIR + sep + 'W_I_8_page125.pdf'
self.faulty_xml = DATADIR + sep + 'W_I_8_faulty_page125.xml'
self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
self.testA = DATADIR + sep + 'testA.xml'
self.multipage = DATADIR + sep + 'multipage_small_above.svg'
def test_extract_information(self):
extractor = extractWordPosition.Extractor()
page = extractor.extract_information(self.multipage, multipage_index=0)
self.assertEqual(len(page.words), 59)
self.assertEqual(page.multipage_index, 0)
page = extractor.extract_information(self.multipage, multipage_index=1)
self.assertEqual(page.multipage_index, 1)
self.assertTrue(len(page.words) > 59)
extractor = extractWordPosition.Extractor()
source_page = Page('xml/Mp_XV_page78v.xml')
extractor = extractWordPosition.Extractor()
transkription_field = TranskriptionField(source_page.source)
svg_tree = ET.parse(source_page.source)
text_items = extractor.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)
self.assertTrue('matrix(1 0 0 1 115.6299 719.3535)' in [ item.get('transform') for item in text_items ])
page = extractor.extract_information(source_page.source, svg_file=source_page.svg_file)
self.assertTrue(page.svg_image.text_field is not None)
##:map :w:!python3 -m unittest tests_svgscripts.test_extractWordPosition.TestExtractor.test_improved_extract_word_position
@unittest.skip('test with local file')
def test_improved_extract_word_position(self):
extractor = extractWordPosition.Extractor()
source_page = Page('xml/Mp_XV_page85v.xml')
print(len(source_page.words))
source_page.words = []
extractor = extractWordPosition.Extractor()
transkription_field = TranskriptionField(source_page.source)
svg_tree = ET.parse(source_page.source)
extractor.improved_extract_word_position(svg_tree, source_page, transkription_field=transkription_field)
print(len(source_page.words))
#for word in source_page.words: print(word.id, word.text)
+ ##:map :w:!python3 -m unittest tests_svgscripts.test_extractWordPosition.TestExtractor.test_process_pwps_break_points
+ def test_process_pwps_break_points(self):
+ extractor = extractWordPosition.Extractor()
+ source_page = Page('xml/Mp_XV_page86r.xml')
+ word = [ word for word in source_page.words if word.text == 'Sorgen' and word.line_number == 2 ][0]
+ pwps = word.transkription_positions[0].positional_word_parts
+ wim = [ wim for wim in source_page.word_insertion_marks if wim.id == '5' ][0]
+ svg_path_tree = ET.parse(source_page.svg_file)
+ namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
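+        # simulate a word insertion mark between 'Sorgen' and 'los' by appending a Sonderzeichen-styled pwp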
+ pwps.append(PositionalWordPart(text=wim.mark_type, symbol_id=wim.symbol_id, x=wim.left, y=wim.top, height=wim.height, width=wim.width,\
+ style_class=source_page.sonderzeichen_list[0]))
+ word = [ word for word in source_page.words if word.text == 'los' and word.line_number == 2 ][0]
+ pwps += word.transkription_positions[0].positional_word_parts
+ source_page.words = []
+ break_points = extractor._get_pwps_break_points(source_page, pwps)
+ extractor._process_pwps_break_points(break_points, source_page, 0, pwps)
+ print([ word.text for word in source_page.words])
+ """
+ source_page.words = []
+ extractor = extractWordPosition.Extractor()
+ transkription_field = TranskriptionField(source_page.source)
+ svg_tree = ET.parse(source_page.source)
+ extractor.improved_extract_word_position(svg_tree, source_page, transkription_field=transkription_field)
+ print(len(source_page.words))
+ #for word in source_page.words: print(word.id, word.text)
+ """
+
def test_update_title(self):
extractor = extractWordPosition.Extractor(xml_dir=self.test_dir)
extractor.update_title_and_manuscript('test')
self.assertEqual(extractor.title, 'test')
self.assertEqual(extractor.manuscript_file, '{}/test.xml'.format(self.test_dir))
self.assertEqual(isfile('{}/test.xml'.format(self.test_dir)), True)
def test_get_page_number(self):
extractor = extractWordPosition.Extractor()
self.assertEqual(extractor.get_page_number(self.test_file, page_number='1'), '001')
self.assertEqual(extractor.get_page_number(self.test_file), '421')
def test_get_file_name(self):
extractor = extractWordPosition.Extractor()
self.assertEqual(extractor.get_file_name(self.test_file), 'xml/Mp_XIV_1_mytest_421.xml')
extractor = extractWordPosition.Extractor(title=self.title)
self.assertEqual(extractor.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_')))
extractorA = extractWordPosition.Extractor(title=self.title)
extractorB = extractWordPosition.Extractor(manuscript_file=extractorA.manuscript_file)
self.assertEqual(extractorB.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_')))
def test_get_style(self):
extractor = extractWordPosition.Extractor()
svg_tree = ET.parse(self.test_file)
sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
self.assertEqual(sonderzeichen_list, [ 'st21', 'st23'])
self.assertEqual(style_dict.get('st11').get('font-family'), 'Frutiger-Europeen')
self.assertEqual(style_dict.get('st5').get('stroke'), '#CED5CE')
def test_get_word_from_part_obj(self):
extractor = extractWordPosition.Extractor()
mylist = [{'text': 'a', 'class': 'asdf' }, {'text': 'b', 'endX': 0 }, {'text': 'c'}]
self.assertEqual(extractor.get_word_from_part_obj(mylist), 'abc')
def test_get_break_points(self):
extractor = extractWordPosition.Extractor()
page = Page(self.pdf_xml)
page.source = self.pdf_xml_source
matrix = Matrix('matrix(1 0 0 1 543.8164 173.9126)')
matrixB = Matrix('matrix(1 0 0 1 573.6758 173.9126)')
matrixC = Matrix('matrix(1 0 0 1 575.9873 173.9126)')
mylist = [{'text': 'es', 'class': 'st5 st6', 'x': matrix.add2X(23.968), 'y': matrix.getY() },\
{'text': 'A', 'class': 'st9 st10', 'x': matrixB.getX(), 'y': matrixB.getY() },\
{'text': 'sich', 'class': "st5 st6", 'x': matrixC.getX(), 'y': matrixC.getY()}]
break_points = extractor._get_break_points(page, mylist)
self.assertTrue(len(break_points) > 0)
+
def test_get_pwps_break_points(self):
extractor = extractWordPosition.Extractor()
page = Page(self.pdf_xml)
page.svg_file = "./svg/W_I_8_page125_web.svg"
page.source = self.pdf_xml_source
svg_path_tree = ET.parse(page.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
matrix = Matrix('matrix(1 0 0 1 543.8164 173.9126)')
matrixB = Matrix('matrix(1 0 0 1 573.6758 173.9126)')
matrixC = Matrix('matrix(1 0 0 1 575.9873 173.9126)')
mylist = [{'text': 'es', 'class': 'st5 st6', 'x': matrix.add2X(23.968), 'y': matrix.getY() },\
{'text': 'A', 'class': 'st9 st10', 'x': matrixB.getX(), 'y': matrixB.getY() },\
{'text': 'sich', 'class': "st5 st6", 'x': matrixC.getX(), 'y': matrixC.getY()}]
pwps = []
for word_part_obj in mylist:
pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces, page=page)
self.assertTrue(len(page.sonderzeichen_list) > 0)
break_points = extractor._get_pwps_break_points(page, pwps)
self.assertTrue(len(break_points) > 0)
def test_get_text_items(self):
svg_tree = ET.parse(self.test_file)
extractor = extractWordPosition.Extractor()
mytest_items = [ x for x in extractor.get_text_items(svg_tree.getroot()) ]
self.assertEqual(len(mytest_items), 300)
self.assertEqual(mytest_items[0].get('transform'), 'matrix(1 0 0 1 386.8218 57.1914)')
tf = TranskriptionField(self.test_file)
mytest_itemsTF = [ x for x in extractor.get_text_items(svg_tree.getroot(), transkription_field=tf) ]
self.assertEqual(mytest_itemsTF[0].get('transform'), 'matrix(1 0 0 1 204.8618 91.7134)')
def test_init_tree_and_target_file(self):
target_file = self.testA
page = PageCreator(target_file, title=self.title)
tree = page.page_tree
self.assertEqual(tree.getroot().get('title'), self.title)
self.assertEqual(tree.getroot().findall('./style'), [])
test_write(xml_element_tree=tree, file_name=target_file)
page = PageCreator(target_file)
tree = page.page_tree
self.assertEqual(tree.getroot().get('title'), self.title)
self.assertEqual(tree.getroot().findall('./style'), [])
isfile(target_file) and os.remove(target_file)
def test_add_style(self):
extractor = extractWordPosition.Extractor()
svg_tree = ET.parse(self.test_file)
sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
target_file = self.testA
page = PageCreator(target_file,title=self.title)
page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
test_write(xml_element_tree=page.page_tree, file_name=target_file)
fromTarget_xml_tree = ET.parse(target_file)
self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title)
self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23")
self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen')
page = PageCreator(target_file)
page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
test_write(xml_element_tree=page.page_tree, file_name=target_file)
fromTarget_xml_tree = ET.parse(target_file)
self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title)
self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23")
self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen')
isfile(target_file) and os.remove(target_file)
def test_add_word(self):
extractor = extractWordPosition.Extractor()
svg_tree = ET.parse(self.test_file)
mylist = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }]
matrix = Matrix(self.matrix_string)
for dict in mylist:
dict['class'] = 'st22'
dict['x'] = matrix.add2X(0)
dict['y'] = matrix.getY()
target_file = self.test_dir + sep + 'asdfasdf.xml'
page = PageCreator(target_file)
sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 1)
mylist[1]['text'] = 'A'
mylist[1]['class'] = 'st21'
mylist[1]['x'] = matrix.add2X(1)
self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 2)
page.update_and_attach_words2tree()
self.assertEqual(page.page_tree.getroot().xpath('//word[@id="1"]')[0].get('text'), 'a')
self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]')[0].get('text'), 'c')
self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('left'), '183.506')
self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('height'), '8.25')
def test_extractor(self):
extractor = extractWordPosition.Extractor()
self.assertEqual(extractor.title, None)
self.assertEqual(extractor.manuscript_file, None)
self.assertEqual(extractor.xml_dir, 'xml/')
self.assertEqual(extractor.manuscript_tree, None)
def test_write_title_to_manuscript_file(self):
extractor = extractWordPosition.Extractor(xml_dir=self.test_dir, title=self.title)
self.assertEqual(isfile(extractor.manuscript_file), True)
extractor = extractWordPosition.Extractor(manuscript_file=extractor.manuscript_file)
self.assertEqual(extractor.title, self.title)
def tearDown(self):
isdir(self.test_dir) and shutil.rmtree(self.test_dir)
isfile('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) and os.remove('{}/{}.xml'.format('xml', self.title.replace(' ', '_')))
if __name__ == "__main__":
unittest.main()
Index: tests_svgscripts/test_faksimile.py
===================================================================
--- tests_svgscripts/test_faksimile.py (revision 111)
+++ tests_svgscripts/test_faksimile.py (revision 112)
@@ -1,90 +1,92 @@
import unittest
from os import sep, path
from os.path import isdir, dirname, basename
import lxml.etree as ET
import sys
import sys
sys.path.append('svgscripts')
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.faksimile_image import FaksimileImage
from datatypes.text_field import TextField
class TestFaksimilePage(unittest.TestCase):
def setUp(self):
DATADIR = dirname(__file__) + sep + 'test_data'
if not isdir(DATADIR):
DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
self.svg_file = DATADIR + sep + 'W-II-1,49et50.svg'
self.svg_testmatrix = DATADIR + sep + 'TESTMATRIX_1.svg'
self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
self.faksimile_rotate90 = self.faksimile_dir + sep + 'Mp-XV-2c,4.svg'
def test_init(self):
image = FaksimileImage(file_name='test.jpg', height=10, width=10)
text_field = TextField(width=10, height=10, x=10, y=10)
faksimile = FaksimilePage(title='test', page_number=1, faksimile_image=image, text_field=text_field)
self.assertEqual(faksimile.page_tree.getroot().get('title'), 'test')
self.assertEqual(faksimile.page_tree.getroot().get('page-number'), '1')
self.assertEqual(faksimile.faksimile_image.width, 10)
self.assertEqual(faksimile.text_field.width, 10)
def test_GET_TEXTFIELDS(self):
+        """ requires a local file
+        svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Mp_XV/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/Mp-XV-2d,16et17.svg')
+        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
+        """
svg_tree = ET.parse(self.svg_file)
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 2)
text_field = pages[0].text_field
self.assertEqual(text_field.width, 663.333)
result_dir = '.{}xml{}'.format(sep, sep) if isdir('xml') else ''
self.assertEqual(pages[0].xml_file, result_dir + 'W-II-1_49.xml')
self.assertEqual(pages[0].title, 'W II 1')
self.assertEqual(pages[0].page_number, '49')
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, page_number='49')
self.assertEqual(len(pages), 1)
svg_tree = ET.parse(self.svg_testmatrix)
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 1)
self.assertEqual(len(pages[0].word_positions), 1)
self.assertEqual(pages[0].word_positions[0].transform.toCSSTransformString(), 'rotate(45deg)')
svg_tree = ET.parse(self.faksimile_file)
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 2)
textfield_id = pages[1].title.replace(' ', '-') + '_' + pages[1].page_number
#print([ position.id for position in pages[0].word_positions])
self.assertEqual(textfield_id not in [ position.id for position in pages[0].word_positions ], True)
self.assertEqual('path1237' in [ position.id for position in pages[0].word_positions ], True)
self.assertEqual('Vorgangs' in [ position.text for position in pages[0].word_positions ], False)
svg_tree = ET.parse(self.faksimile_file)
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 2)
self.assertEqual(pages[0].page_number, '5')
"""
svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Eric/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/W-II-1,141et142.svg')
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 1)
"""
svg_tree = ET.parse(self.faksimile_rotate90)
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 1)
self.assertEqual(len(pages[0].word_positions), len(svg_tree.xpath('//ns:rect/ns:title', namespaces=namespaces)))
def test_get_paths_inside_rect(self):
svg_tree = ET.parse(self.faksimile_file)
paths = get_paths_inside_rect(svg_tree, '//ns:path', 360, 786, 92, 765, 'N-VII-1_5')
self.assertEqual(len(paths), 1)
svg_tree = ET.parse(self.svg_testmatrix)
paths = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', 0, 2038.72, 0, 974.08002, 'TESTMATRIX_1')
self.assertEqual(len(paths), 1)
svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Mp_XIV/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/Mp-XIV-1,419a.svg')
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
paths = get_paths_inside_rect(svg_tree, '//ns:rect', 52, 800, 58, 900, 'Mp-XIV-1_419a', namespaces=namespaces)
self.assertEqual(len([ path for path in paths if 'seinen' in path.xpath('./ns:title/text()', namespaces=namespaces)]), 1)
if __name__ == "__main__":
unittest.main()
Index: tests_svgscripts/test_footnotes.py
===================================================================
--- tests_svgscripts/test_footnotes.py (revision 111)
+++ tests_svgscripts/test_footnotes.py (revision 112)
@@ -1,69 +1,71 @@
import unittest
from os import sep, path, remove
from os.path import isdir, isfile, dirname
import shutil
import sys
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
import datatypes.footnotes
from datatypes.footnotes import FootnoteColumns, extract_footnotes, extract_footnotes_as_strings, UNITTESTING, DEBUG
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
class TestExtractFootnotes(unittest.TestCase):
def setUp(self):
datatypes.footnotes.UNITTESTING = True
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.test_footnote = DATADIR + sep + 'W_I_8_neu_125-01.svg'
self.test_footnote_verso = DATADIR + sep + 'N_VII_1_xp5_4_page5.svg'
self.test_footnote_recto = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg'
self.test_footnote_multi = DATADIR + sep + 'N_VII_1_xp5_4_page13.svg'
self.test_footnote_multi_xml = DATADIR + sep + 'N_VII_1_page013.xml'
self.test_categorize_footnote = DATADIR + sep + 'N_VII_1_page006.xml'
def test_extract_footnotes(self):
footnotes = extract_footnotes_as_strings(svg_file=self.test_footnote_multi, contains_string='Anschlußzeichen')
self.assertEqual(len(footnotes), 4)
page = Page(self.test_footnote_multi_xml)
footnotes = extract_footnotes(page, svg_file=self.test_footnote_multi, contains_string='Anschlußzeichen')
self.assertEqual(len(footnotes), 4)
footnotes = extract_footnotes(page, svg_file=self.test_footnote_multi)
self.assertEqual(len(footnotes), 7)
"""
- page = Page('xml/Mp_XIV_page418.xml')
- footnotes = extract_footnotes(page, skip_after=183)
+ page = Page('xml/Mp_XV_page86r.xml')
+ footnotes = extract_footnotes(page)
print(footnotes)
"""
def test_columns(self):
svg_tree = ET.parse(self.test_footnote_multi)
transkription_field = TranskriptionField(self.test_footnote_multi)
nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ])
footnote_columns = FootnoteColumns(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, None)
self.assertEqual(len(footnote_columns.footnote_columns), 2)
footnote_columns.register_index(184)
footnote_columns.append('asdf')
self.assertEqual(len(footnote_columns.footnote_columns[0]), 1)
with self.assertRaises(Exception):
FootnoteColumns(svg_tree.getroot().nsmap, [], bottom_values, None)
"""
+ page = Page('xml/Mp_XV_page86r.xml')
local_file = page.source#'/home/knister0/ownCloud/myNietzscheDE/KGW-IX_12/Bd_12_XIV-XVI_Druck_als_SVG//07.svg'
svg_tree = ET.parse(local_file)
transkription_field = TranskriptionField(local_file)
nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ])
- footnote_columns = FootnoteColumns(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, None)
+ print(len(bottom_values[1:]))
+ footnote_columns = FootnoteColumns(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values[1:], None, debug=True)
"""
if __name__ == "__main__":
unittest.main()