Index: Friedrich-Nietzsche-late-work-ontology.ttl
===================================================================
--- Friedrich-Nietzsche-late-work-ontology.ttl (revision 92)
+++ Friedrich-Nietzsche-late-work-ontology.ttl (revision 93)
@@ -1,24 +1,26 @@
@prefix dct: <http://purl.org/dc/terms/> .
@prefix homotypic: .
+@prefix stoff: .
+@prefix text: <http://www.e-editiones.ch/ontology/text#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix tln: .
a owl:Ontology;
dct:license ;
dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en;
dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsches late work."""@en;
dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en;
dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en;
dct:publisher "Basel University, Switzerland"@en.
tln:inheritOverwritesWord a owl:ObjectProperty ;
rdfs:subPropertyOf tln:overwritesWord;
rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ;
rdfs:comment "The author has used this word in order to overwrite that word."@en ;
rdfs:isDefinedBy ;
owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ).
Index: svgscripts/datatypes/word_deletion_path.py
===================================================================
--- svgscripts/datatypes/word_deletion_path.py (revision 0)
+++ svgscripts/datatypes/word_deletion_path.py (revision 93)
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This class can be used to represent word deletion paths.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+from lxml import etree as ET
+from os.path import isfile
+from svgpathtools.parser import parse_path
+from svgpathtools.path import Line
+from svgpathtools.path import Path as SVGPath
+import sys
+
+from .path import Path
+from .style import Style
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
class WordDeletionPath(Path):
    """
    This class represents word deletion paths.

    Args:
        path (svgpathtools.path.Path) svg path representation.
        style (datatypes.style.Style)
    """
    XML_TAG = 'word-deletion-path'
    BOX_TAG = 'box-path'

    def __init__(self, super_path, style):
        # Adopt id and svg path from the given datatypes.path.Path instance.
        super().__init__(id=super_path.id, path=super_path.path)
        self.style = style

    @classmethod
    def create_cls(cls, node, page):
        """Create and return a cls.
        """
        base_path = Path(node=node)
        deletion_style = Style.create_cls(page, base_path.style_class)
        return cls(base_path, deletion_style)

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = super().get_semantic_dictionary()
        style_property = cls.create_semantic_property_dictionary('style', Style)
        dictionary[cls.PROPERTIES_KEY].update(style_property)
        return cls.return_dictionary_after_updating_super_classes(dictionary)
Index: svgscripts/datatypes/super_page.py
===================================================================
--- svgscripts/datatypes/super_page.py (revision 92)
+++ svgscripts/datatypes/super_page.py (revision 93)
@@ -1,290 +1,290 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a super page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename, dirname
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import sys
import warnings
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .mark_foreign_hands import MarkForeignHands
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .writing_process import WritingProcess
class SuperPage:
    """
    This super class represents a page.

    Args:
        xml_file (str): name of the xml file to be instantiated.
        title/page_number/orientation/page_type: default property values used
            when the xml file does not already provide them.
        should_xml_file_exist (bool): if True, raise FileNotFoundError when
            xml_file does not exist.
    """
    FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
    FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
    PAGE_RECTO = 'recto'
    PAGE_VERSO = 'verso'
    STATUS_MERGED_OK = 'faksimile merged'
    STATUS_POSTMERGED_OK = 'words processed'
    UNITTESTING = False
    XML_TAG = 'page'

    def __init__(self, xml_file, title=None, page_number='', orientation='North', page_type=PAGE_VERSO, should_xml_file_exist=False):
        # Maps property key -> (xpath, default value, class used to instantiate it).
        self.properties_dictionary = {
            'faksimile_image': (FaksimileImage.XML_TAG, None, FaksimileImage),
            'faksimile_svgFile': ('data-source/@file', None, str),
            'number': ('page/@number', str(page_number), str),
            'orientation': ('page/@orientation', orientation, str),
            'page_type': ('page/@pageType', page_type, str),
            'pdfFile': ('pdf/@file', None, str),
            'source': ('page/@source', None, str),
            'svg_file': ('svg/@file', None, str),
            'svg_image': (SVGImage.XML_TAG, None, SVGImage),
            'text_field': (FaksimileImage.XML_TAG + '/' + TextField.XML_TAG, None, TextField),
            'title': ('page/@title', title, str),
        }
        self.online_properties = []
        self.line_numbers = []
        self.lines = []
        self.mark_foreign_hands = []
        self.page_tree = None
        self.sonderzeichen_list = []
        self.style_dict = {}
        self.text_connection_marks = []
        self.word_deletion_paths = []
        self.word_insertion_marks = []
        self.words = []
        self.writing_processes = []
        self.xml_file = xml_file
        if not self.is_page_source_xml_file():
            # BUG FIX: original used the bare name FILE_TYPE_SVG_WORD_POSITION,
            # which raised NameError instead of the intended Exception.
            msg = f'ERROR: xml_source_file {self.xml_file} is not of type "{self.FILE_TYPE_SVG_WORD_POSITION}"'
            raise Exception(msg)
        self._init_tree(should_xml_file_exist=should_xml_file_exist)

    def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
        """Adds a list of classes that are sonderzeichen and a style dictionary to page.
        """
        # None-sentinels instead of shared mutable default arguments.
        self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else []
        self.letterspacing_list = letterspacing_list if letterspacing_list is not None else []
        self.style_dict = style_dict if style_dict is not None else {}
        if style_node is not None:
            # Read style classes back from an existing <style> node.
            self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') }
            self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\
                    if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ]
            self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\
                    if bool(item.get('letterspacing-list')) ]
        elif bool(self.style_dict):
            # Serialize the provided style information to the page tree.
            style_node = ET.SubElement(self.page_tree.getroot(), 'style')
            if len(self.sonderzeichen_list) > 0:
                style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
            if len(self.letterspacing_list) > 0:
                style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
            for key in self.style_dict.keys():
                self.style_dict[key]['name'] = key
                ET.SubElement(style_node, 'class', attrib=self.style_dict[key])
        fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value }
        fontsizes = sorted(fontsize_dict.values(), reverse=True)
        # create a mapping between fontsizes and word stages
        self.fontsizekey2stage_mapping = {}
        for fontsize_key, value in fontsize_dict.items():
            if value >= fontsizes[0]-1:
                self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION })
            elif value <= fontsizes[-1]+1:
                self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION })
            else:
                self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION })

    def get_biggest_fontSize4styles(self, style_set=None):
        """Returns biggest font size from style_dict for a set of style class names.

        [:returns:] (float) biggest font size OR 1 if style_dict is empty
        """
        style_set = style_set if style_set is not None else set()
        if bool(self.style_dict):
            sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True)
            return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1
        else:
            return 1

    def get_line_number(self, y):
        """Returns line number id for element at y.

        [:return:] (int) line number id or -1
        """
        if len(self.line_numbers) > 0:
            # A line matches when y lies within [top, bottom].
            result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ]
            return result_list[0] if len(result_list) > 0 else -1
        else:
            return -1

    def init_all_properties(self, overwrite=False):
        """Initialize all properties.
        """
        for property_key in self.properties_dictionary.keys():
            if property_key not in self.online_properties:
                self.init_property(property_key, overwrite=overwrite)

    def init_property(self, property_key, value=None, overwrite=False):
        """Initialize a single property.

        Args:
            property_key: key of property in self.__dict__
            value: new value to set to property
            overwrite: whether or not to update values from xml_file (default: read only)
        """
        if value is None:
            # Read the value from the xml tree (or fall back to the default).
            if property_key not in self.online_properties:
                xpath, value, cls = self.properties_dictionary.get(property_key)
                if len(self.page_tree.xpath('//' + xpath)) > 0:
                    value = self.page_tree.xpath('//' + xpath)[0]
                if value is not None:
                    if cls.__module__ == 'builtins':
                        self.update_tree(value, xpath)
                        self.__dict__.update({property_key: cls(value)})
                    else:
                        # Instantiate project datatypes from the xml node when needed.
                        value = cls(node=value)\
                                if type(value) != cls\
                                else value
                        self.__dict__.update({property_key: value})
                        self.__dict__.get(property_key).attach_object_to_tree(self.page_tree)
                else:
                    self.__dict__.update({property_key: value})
                self.online_properties.append(property_key)
        elif overwrite or property_key not in self.online_properties:
            # Explicit value provided: set it and mirror it into the tree.
            xpath, default_value, cls = self.properties_dictionary.get(property_key)
            if cls.__module__ == 'builtins':
                self.__dict__.update({property_key: cls(value)})
                self.update_tree(value, xpath)
            else:
                self.__dict__.update({property_key: value})
                self.__dict__.get(property_key).attach_object_to_tree(self.page_tree)
            self.online_properties.append(property_key)

    def is_locked(self):
        """Return true if page is locked.
        """
        return len(self.page_tree.xpath('//metadata/lock')) > 0

    def is_page_source_xml_file(self, source_tree=None):
        """Return true if xml_file is of type FILE_TYPE_SVG_WORD_POSITION.

        A non-existing file is accepted (returns True) so new files can be created.
        """
        if not isfile(self.xml_file):
            return True
        if source_tree is None:
            source_tree = ET.parse(self.xml_file)
        return source_tree.getroot().find('metadata/type').text == self.FILE_TYPE_SVG_WORD_POSITION

    def lock(self, reference_file, message=''):
        """Lock tree such that ids of words etc. correspond to ids
        in reference_file, optionally add a message that will be shown.
        """
        if not self.is_locked():
            metadata = self.page_tree.xpath('./metadata')[0]\
                    if len(self.page_tree.xpath('./metadata')) > 0\
                    else ET.SubElement(self.page_tree.getroot(), 'metadata')
            lock = ET.SubElement(metadata, 'lock')
            ET.SubElement(lock, 'reference-file').text = reference_file
            if message != '':
                ET.SubElement(lock, 'message').text = message

    def unlock(self):
        """Unlock tree: remove the lock node created by lock().
        """
        if self.is_locked():
            lock = self.page_tree.xpath('//metadata/lock')[0]
            lock.getparent().remove(lock)

    def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=None):
        """Update word ids and attach them to page.page_tree.
        """
        include_special_words_of_type = include_special_words_of_type if include_special_words_of_type is not None else []
        if not self.is_locked():
            update_function_on_word = [ update_function_on_word ]\
                    if type(update_function_on_word) != list\
                    else update_function_on_word
            # Remove all word / special-word nodes before re-attaching them.
            for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
                node.getparent().remove(node)
            for index, word in enumerate(self.words):
                word.id = index
                for func in update_function_on_word:
                    if callable(func):
                        func(word)
                word.attach_word_to_tree(self.page_tree)
            for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
                mark_foreign_hands.id = index
                if MarkForeignHands in include_special_words_of_type:
                    for func in update_function_on_word:
                        # BUG FIX: original tested callable(update_function_on_word)
                        # (always False for the list), so func was never applied.
                        if callable(func):
                            func(mark_foreign_hands)
                mark_foreign_hands.attach_word_to_tree(self.page_tree)
            for index, text_connection_mark in enumerate(self.text_connection_marks):
                text_connection_mark.id = index
                if TextConnectionMark in include_special_words_of_type:
                    for func in update_function_on_word:
                        # BUG FIX: same callable() target error as above.
                        if callable(func):
                            func(text_connection_mark)
                text_connection_mark.attach_word_to_tree(self.page_tree)
        else:
            print('locked')

    def update_property_dictionary(self, property_key, default_value):
        """Update properties_dictionary.
        """
        content = self.properties_dictionary.get(property_key)
        if content is not None:
            self.properties_dictionary.update({property_key: (content[0], default_value, content[2])})
        else:
            msg = f'ERROR: properties_dictionary does not contain a key {property_key}!'
            raise Exception(msg)

    def update_tree(self, value, xpath):
        """Update tree: set the attribute addressed by xpath to value.
        """
        node_name = dirname(xpath)
        node = self.page_tree.xpath('//' + node_name)[0]\
                if len(self.page_tree.xpath('//' + node_name)) > 0\
                else ET.SubElement(self.page_tree.getroot(), node_name)
        node.set(basename(xpath).replace('@', ''), str(value))

    def _init_tree(self, should_xml_file_exist=False):
        """Initialize page_tree from xml_file if it exists.

        Raises:
            FileNotFoundError: if should_xml_file_exist and xml_file is missing.
        """
        if isfile(self.xml_file):
            parser = ET.XMLParser(remove_blank_text=True)
            self.page_tree = ET.parse(self.xml_file, parser)
        elif not should_xml_file_exist:
            self.page_tree = ET.ElementTree(ET.Element('page'))
            self.page_tree.docinfo.URL = self.xml_file
        else:
            msg = f'ERROR: xml_source_file {self.xml_file} does not exist!'
            raise FileNotFoundError(msg)
Index: svgscripts/datatypes/path.py
===================================================================
--- svgscripts/datatypes/path.py (revision 92)
+++ svgscripts/datatypes/path.py (revision 93)
@@ -1,197 +1,197 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent all svg path types.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from svgpathtools.parser import parse_path
from svgpathtools.path import Line
from svgpathtools.path import Path as SVGPath
import sys
from .attachable_object import AttachableObject
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Path(AttachableObject,SemanticClass):
    """
    This super class represents all types of svg paths.

    Args:
        node (lxml.etree.Element) node, containing information
        path (svgpathtools.path.Path) svg path representation.
    """
    XML_TAG = 'path'
    WORD_DELETION_PATH_TAG = 'word-deletion-path'
    BOX_TAG = 'box-path'

    def __init__(self, id=0, node=None, path=None, parent_path=None, d_string=None, style_class='', tag=XML_TAG):
        # These key lists drive the generic attribute serialization in
        # attach_object_to_tree.
        self.intKeys = [ 'id' ]
        self.stringKeys = [ 'style_class' ]
        self.floatKeys = []
        self.start_line_number = -1
        self.parent_path = parent_path
        if node is not None:
            # Initialization from an XML node takes precedence over keyword args.
            self.id = int(node.get('id')) if bool(node.get('id')) else 0
            self.path = parse_path(node.get('d')) if bool(node.get('d')) else None
            self.d_attribute = node.get('d')
            self.style_class = node.get('style-class')
            self.tag = node.tag
        else:
            self.tag = tag
            self.id = id
            self.path = path
            if self.path is None\
               and d_string is not None\
               and d_string != '':
                self.path = parse_path(d_string)
            self.d_attribute = self.path.d() if self.path is not None else ''
            self.style_class = style_class

    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        # Reuse an existing node with this id or create a new one.
        obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \
                if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \
                else ET.SubElement(target_tree, self.tag)
        for key in self.floatKeys:
            if self.__dict__[key] is not None:
                obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3)))
        for key in self.intKeys + self.stringKeys:
            if self.__dict__[key] is not None:
                obj_node.set(key.replace('_','-'), str(self.__dict__[key]))
        if self.path is not None:
            obj_node.set('d', self.path.d())

    @classmethod
    def create_cls(cls, id=0, path=None, style_class='', page=None, tag=XML_TAG):
        """Create and return a cls.
        """
        if path is not None\
           and path.start.imag <= path.end.imag\
           and page is not None\
           and style_class != ''\
           and len(path._segments) == 1\
           and type(path._segments[0]) == Line\
           and style_class in page.style_dict.keys()\
           and 'stroke-width' in page.style_dict[style_class].keys():
            # If path is a Line and its style_class specifies a stroke-width, correct path
            stroke_width_correction = float(page.style_dict[style_class]['stroke-width'])/2
            xmin = path.start.real
            xmax = path.end.real
            ymin = path.start.imag-stroke_width_correction
            ymax = path.end.imag+stroke_width_correction
            # Expand the line to a closed rectangle covering the stroke width.
            # NOTE(review): complex(f'{x}+{y}j') raises ValueError for negative y
            # — presumably SVG coordinates here are non-negative; confirm.
            path = SVGPath(Line(start=(complex(f'{xmin}+{ymin}j')), end=(complex(f'{xmax}+{ymin}j'))),\
                    Line(start=(complex(f'{xmax}+{ymin}j')), end=(complex(f'{xmax}+{ymax}j'))),\
                    Line(start=(complex(f'{xmax}+{ymax}j')), end=(complex(f'{xmin}+{ymax}j'))),\
                    Line(start=(complex(f'{xmin}+{ymax}j')), end=(complex(f'{xmin}+{ymin}j'))))
        return cls(id=id, path=path, style_class=style_class, tag=tag)

    def contains_path(self, other_path):
        """Returns true if other_path is contained in this path.
        """
        this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox()
        other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox()
        return other_xmin >= this_xmin and other_xmax <= this_xmax\
               and other_ymin >= this_ymin and other_ymax <= this_ymax

    def contains_start_of_path(self, other_path):
        """Returns true if start of other_path is contained in this path.
        """
        this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox()
        other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox()
        return other_xmin >= this_xmin and other_xmin < this_xmax\
               and other_ymin >= this_ymin and other_ymax <= this_ymax

    def contains_end_of_path(self, other_path):
        """Returns true if end of other_path is contained in this path.
        """
        this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox()
        other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox()
        return other_xmax >= this_xmin and other_xmax < this_xmax\
               and other_ymin >= this_ymin and other_ymax <= this_ymax

    @classmethod
    def create_path_from_transkription_position(cls, transkription_position, tr_xmin=0.0, tr_ymin=0.0):
        """Create a .path.Path from a .transkription_position.TranskriptionPosition.
        """
        if len(transkription_position.positional_word_parts) > 0:
            first_pwp = transkription_position.positional_word_parts[0]
            # Idiomatic [-1] indexing instead of [len(...)-1].
            last_pwp = transkription_position.positional_word_parts[-1]
            xmin = tr_xmin + first_pwp.left
            xmax = tr_xmin + last_pwp.left + last_pwp.width
            # min/max instead of sorting just to read one end of the list.
            ymin = tr_ymin + min(pwp.top for pwp in transkription_position.positional_word_parts)
            ymax = tr_ymin + max(pwp.bottom for pwp in transkription_position.positional_word_parts)
        else:
            xmin = tr_xmin + transkription_position.left
            xmax = xmin + transkription_position.width
            ymin = tr_ymin + transkription_position.top
            ymax = ymin + transkription_position.height
        word_path = parse_path('M {}, {} L {}, {} L {}, {} L {}, {} z'.format(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax))
        return cls(path=word_path)

    def do_paths_intersect(self, other_path):
        """Returns true if paths intersect, false if not or if there was an exception.
        """
        try:
            return self.path.intersect(other_path.path, justonemode=True)
        except AssertionError:
            return False

    def get_median_y(self, tr_ymin=0.0):
        """Return the median of ymin + ymax.
        """
        return (self.path.bbox()[2] + self.path.bbox()[3])/2 - tr_ymin

    def get_x(self, tr_xmin=0.0):
        """Return xmin.
        """
        return self.path.bbox()[0] - tr_xmin

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        properties = {'d_attribute': { 'class': str, 'cardinality': 0,\
                'name': 'hasDAttribute', 'label': 'svg path has d attribute',\
                'comment': 'The d attribute defines a path to be drawn.'}}
        #properties.update(cls.create_semantic_property_dictionary('style_class', str))
        dictionary.update({cls.CLASS_KEY: class_dict})
        dictionary.update({cls.PROPERTIES_KEY: properties})
        return cls.return_dictionary_after_updating_super_classes(dictionary)

    def is_partially_contained_by(self, other_path):
        """Returns true if other_path contains this path partially.
        """
        return other_path.contains_start_of_path(self) or other_path.contains_end_of_path(self)
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py (revision 92)
+++ svgscripts/datatypes/word.py (revision 93)
@@ -1,800 +1,828 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
import inspect
from lxml import etree as ET
from operator import attrgetter
import re
import string
import sys
import warnings
from .box import Box
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .style import Style
+from .word_deletion_path import WordDeletionPath
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)
def execute_function_on_parts(word_parts, func_name):
    """Execute function on parts and add those parts instead of original word to word_parts.

    Args:
        word_parts: list of words to process.
        func_name (str): name of the zero-argument method to call on each word.

    :return: new word_parts, output from the last call of the function
    """
    copy_parts = word_parts[:]
    # BUG FIX: output was unbound when word_parts is empty (NameError on return).
    output = None
    for word in word_parts:
        # getattr instead of eval: same call, without arbitrary-code execution.
        output = getattr(word, func_name)()
        if len(word.word_parts) > 0:
            # Replace the word by its parts at the same position.
            for part_word in word.word_parts:
                copy_parts.insert(copy_parts.index(word), part_word)
            copy_parts.remove(word)
            word.word_parts = []
    return copy_parts, output
def update_transkription_position_ids(word):
    """Update transkription_position' ids according to index.
    """
    # Renumber the word parts only when their ids are not already unique.
    part_ids = [ part.id for part in word.word_parts ]
    if len(set(part_ids)) != len(part_ids):
        for new_id, part in enumerate(word.word_parts):
            part.id = new_id
    # Assign ids in left-to-right order and reset box/deletion state.
    ordered_positions = sorted(word.transkription_positions, key=lambda tp: tp.left)
    for position_index, position in enumerate(ordered_positions):
        position.id = position_index
        position.has_box = None
        position.deleted = False
class Word(SimpleWord):
"""
This class represents a word.
"""
COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ]
APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' }
DATA = 'debug-data'
+ RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText']
XML_TAG = 'word'
XML_EARLIER_VERSION = 'earlier-version'
XML_OVERWRITES = 'overwrites'
XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\
'isDeletionOfWord': 'deletesEarlierPart',\
'isExtensionOfWord': 'extendsEarlierVersion',\
'isTransformationOfWord': 'transformsEarlierPart' }
    def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None):
        # Delegate id, text, line number and positions to SimpleWord.
        super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
                faksimile_positions=faksimile_positions)
        self.corrections = []
        self.deleted = deleted
        # WordDeletionPath objects intersecting this word (filled by add_deletion_paths).
        self.deletion_paths = []
        self.debug_container = {}
        self.debug_msg = None
        self.earlier_version = earlier_version
        self.edited_text = None
        # Correction relations to an earlier version; names mirror XML_CORRECTION_DICT keys.
        self.isClarificationOfWord = None
        self.isDeletionOfWord = None
        self.isExtensionOfWord = None
        self.isTransformationOfWord = None
        # Derive the text from the transkription positions when no text was supplied.
        if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0:
            self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ])
        self.overwrites_word = None
        self.styles = styles\
                if styles is not None\
                else []
        self.verified = None
        self.writing_process_id = writing_process_id
        self.writing_processes = []
        self.word_insertion_mark = None
        self.word_box = None
        # None-guards avoid mutable default arguments.
        self.word_parts = word_parts if word_parts is not None else []
        self.word_part_objs = word_part_objs if word_part_objs is not None else []
    def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
        """Add a word deletion path to word.

        Recurses into word parts when present; otherwise, for a deleted word
        with transkription positions, keeps the subset of deletion_paths that
        intersect this word's own path.
        """
        if len(self.word_parts) > 0:
            for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        elif self.deleted and len(self.transkription_positions) > 0:
            # NOTE(review): only the first transkription position is used to build
            # the word path — confirm this is intended for multi-position words.
            word_path = Path.create_path_from_transkription_position(self.transkription_positions[0],\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            # do_paths_intersect_saveMode: module-level helper (defined elsewhere in this file).
            self.deletion_paths = [ deletion_path for deletion_path in deletion_paths\
                    if do_paths_intersect_saveMode(deletion_path, word_path) ]
    def attach_word_to_tree(self, target_tree):
        """Attaches word to tree target_tree.

        Serializes this word together with its parts, earlier version,
        overwritten word, box and correction flags as XML below target_tree.

        :return: (lxml.etree.Element) the created word node
        """
        word_node = super(Word,self).attach_word_to_tree(target_tree)
        # Tri-state attributes are only written when not None.
        if self.deleted is not None:
            word_node.set('deleted', str(self.deleted).lower())
        if self.verified is not None:
            word_node.set('verified', str(self.verified).lower())
        if self.edited_text is not None:
            word_node.set('edited-text', self.edited_text)
        if self.writing_process_id > -1:
            word_node.set('writing-process-id', str(self.writing_process_id))
        # Word parts are renumbered sequentially before serialization.
        for index, word_part in enumerate(self.word_parts):
            word_part.id = index
            word_part.attach_word_to_tree(word_node)
        if self.earlier_version is not None:
            earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
            self.earlier_version.attach_word_to_tree(earlier_node)
        # Only serialize an overwritten word that has transkription positions.
        if self.overwrites_word is not None\
           and len(self.overwrites_word.transkription_positions) > 0:
            overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES)
            self.overwrites_word.attach_word_to_tree(overwrite_node)
        if self.word_box is not None:
            self.word_box.attach_object_to_tree(word_node)
        # Corrections are stored as a space-separated set of part ids.
        if len(self.corrections) > 0:
            word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ])))
        # Correction flags are written as attribute="true" using the mapped names.
        for key in self.XML_CORRECTION_DICT.keys():
            if self.__dict__[key] is not None:
                word_node.set(self.XML_CORRECTION_DICT[key], 'true')
        return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
"""Returns true if transkription_positions belong to different WritingProcesses.
"""
if len(self.word_parts) > 0 and include_parts:
return len(set(word.writing_process_id for word in self.word_parts)) > 1
return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1
    def set_parent_word_writing_process_id(self):
        """Set writing_process_id for parent word.

        Derives the id from the styles of the first transkription position of
        each word part; bumps it by one if the parts' styles still differ when
        their writing process ids are ignored.
        """
        ids = set(word.transkription_positions[0].style for word in self.word_parts\
                if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None)
        if len(ids) > 1:
            self.writing_process_id = max([style.writing_process_id for style in ids])
            # Styles differing beyond the writing process id indicate a later stage.
            if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\
                    for word in self.word_parts\
                    if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\
               > 1:
                self.writing_process_id += 1
    @classmethod
    def create_cls(cls, word_node):
        """Creates a word from a (lxml.Element) node.

        [:return:] Word
        """
        # NOTE(review): `cls` is rebound here to the INSTANCE returned by the
        # super factory; every following `cls.<attr>` targets that instance.
        cls = super(Word,cls).create_cls(word_node)
        cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
        cls.split_strings = None
        cls.join_string = word_node.get('join')
        if bool(word_node.get('split')):
            cls.split_strings = word_node.get('split').split(' ')
            # The split attribute must reassemble exactly to the word's text.
            if ''.join(cls.split_strings) != cls.text:
                error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
                        format(word_node.getroottree().docinfo.URL, str(cls.id))\
                        + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
                        + 'Text attribute: "{0}".\n'.format(cls.text)
                raise Exception(error_msg)
        # Tri-state attributes: True/False when present in XML, None when absent.
        cls.verified = word_node.get('verified') == 'true'\
                if bool(word_node.get('verified')) else None
        cls.deleted = word_node.get('deleted') == 'true'\
                if bool(word_node.get('deleted')) else None
        cls.edited_text = word_node.get('edited-text')
        cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ]
        # The corrections attribute references word parts by index.
        if bool(word_node.get('corrections')):
            for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
                if index < len(cls.word_parts):
                    cls.corrections.append(cls.word_parts[index])
        cls.earlier_version = None
        if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0:
            cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0]
        for key_value in cls.XML_CORRECTION_DICT.values():
            if word_node.get(key_value) == 'true':
                cls.__dict__[key_value] = True
        # Re-link correction flags on the parts to the corresponding earlier
        # version objects; the suffix of the mapped name decides the target
        # (Part -> earlier word part, EarlierVersion -> earlier word, Word -> this word).
        if cls.earlier_version is not None:
            for word_part in cls.word_parts:
                for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]:
                    if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\
                       and len(cls.word_parts) <= len(cls.earlier_version.word_parts):
                        try:
                            word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id]
                        except Exception:
                            msg = f'{cls.id} {cls.text}: {word_part.id}'
                            raise Exception(msg)
                for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]:
                    if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                        word_part.__dict__[key] = cls.earlier_version
                for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]:
                    if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                        word_part.__dict__[key] = cls
        cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\
                if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\
                else None
        cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\
                if len(word_node.xpath('./' + Box.XML_TAG)) > 0\
                else None
        return cls
def create_earlier_version(self, root_word=None, id=0):
    """Create an earlier version of word.

    Recursively creates earlier versions of all word_parts, records on each
    part how it relates to the earlier version (deletion, transformation,
    extension) and returns a new Word that represents the earlier state.

    :param root_word: top-level word of the recursion (defaults to self)
    :param id: id of the returned Word (see NOTE on shadowing below)
    :return: word.Word
    """
    if root_word is None:
        root_word = self
        root_word.set_parent_word_writing_process_id()
    word_parts = []
    # parts that consist of a single punctuation sign are ignored when
    # deciding whether the whole word counts as deleted
    non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\
            if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ]
    non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts)
    if non_single_punctuation_word_parts_length > 0\
            and len([ word_part for word_part in non_single_punctuation_word_parts\
            if word_part.deleted ])\
            == non_single_punctuation_word_parts_length:
        # all relevant parts are deleted -> mark the whole word as deleted
        # and clear the flag on the individual parts
        self.deleted = True
        for word_part in non_single_punctuation_word_parts: word_part.deleted = False
    # NOTE(review): the loop variable shadows the parameter 'id'; the final
    # 'return Word(id=id, ...)' therefore uses the last loop index whenever
    # self.word_parts is non-empty — confirm this is intended.
    for id, word_part in enumerate(self.word_parts):
        earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id)
        if word_part.deleted:
            # deleted part: the earlier version contains the undeleted part
            word_part.isDeletionOfWord = earlierWordPart
            word_parts.append(earlierWordPart)
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        elif word_part.overwrites_word is not None\
                and (len(word_part.transkription_positions) > 0\
                and word_part.overwrites_word.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style\
                != word_part.overwrites_word.transkription_positions[0].style):
            # part overwrites another word in a different style: the earlier
            # version contains the overwritten word instead
            word_part.overwrites_word.id = word_part.id
            word_parts.append(word_part.overwrites_word)
            word_part.isTransformationOfWord = word_part.overwrites_word
            #print(f'transform: {self.text}')
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        elif root_word.writing_process_id > -1\
                and (len(word_part.transkription_positions) > 0\
                and word_part.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style.writing_process_id\
                == root_word.writing_process_id):
            # part was written in the same writing process as the root word:
            # it extends the earlier version and is not part of it
            word_part.extendsEarlierVersion = True
            #print('extends')
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        else:
            # NOTE(review): word_part.deleted is always False here (the first
            # branch of this chain already handles deleted parts), so the
            # following inner 'if' appears unreachable — confirm.
            if word_part.deleted:
                word_part.isDeletionOfWord = earlierWordPart
                word_parts.append(earlierWordPart)
                if word_part not in self.corrections:
                    self.corrections.append(word_part)
            else:
                #print(f'default: {self.text}')
                word_parts.append(earlierWordPart)
    text = ''.join([ word.text for word in word_parts ])\
            if len(word_parts) > 0\
            else self.text
    if len(word_parts) == 1:
        # a single remaining part collapses into the word itself
        self.transkription_positions += word_parts[0].transkription_positions
        self.faksimile_positions += word_parts[0].faksimile_positions
        word_parts = []
    new_transkription_positions = copy.deepcopy(self.transkription_positions)
    if len(self.transkription_positions) > 0\
            and self.transkription_positions[0].style is not None:
        # propagate the writing process id of the first position's style to
        # all copied positions
        writing_process_id = self.transkription_positions[0].style.writing_process_id
        for new_tp in new_transkription_positions:
            new_tp.style.writing_process_id = writing_process_id
    return Word(id=id, text=text, transkription_positions=new_transkription_positions,\
            faksimile_positions=self.faksimile_positions, line_number=self.line_number,\
            word_parts=word_parts)
def create_correction_history(self, page=None, box_style=None):
    """Create correction history.

    If the word has a word_box, the overwritten (earlier) text of the box is
    reconstructed as self.overwrites_word; if the word has word_parts, an
    earlier version of the whole word is created and linked.

    :param page: page used to resolve the box's text style (optional)
    :param box_style: style to use for the overwritten word (optional)
    """
    if self.word_box is not None:
        manuscript = self.transkription_positions[0].style.manuscript\
                if len(self.transkription_positions) > 0\
                and self.transkription_positions[0].style is not None\
                else None
        style = Style()
        if box_style is not None:
            style = box_style
        # NOTE(review): when both box_style and page are given, the style
        # derived from page wins — confirm this precedence is intended.
        if page is not None:
            style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript)
            # map the box's font size keys to a writing process/stage
            for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]:
                style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
        transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions)
        for transkription_position in transkription_positions:
            transkription_position.style = style
        self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\
                line_number=self.line_number)
    for word_part in self.word_parts:
        word_part.create_correction_history(page=page, box_style=box_style)
    if len(self.word_parts) > 0:
        earlier_version = self.create_earlier_version()
        extending_words = self._get_parts_with_property_key('extendsEarlierVersion')
        if len(extending_words) > 0:
            for word in extending_words:
                word.isExtensionOfWord = earlier_version
        if self.has_mixed_status('deleted', include_parts=True):
            # edited text: the word with its deleted parts removed
            self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ])
        if len(self.corrections) > 0:
            self.earlier_version = earlier_version
@staticmethod
def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
    """Creates a word from a (lxml.Element) node or word_part_objs.

    :param word_node: lxml.Element node of a word (takes precedence over word_part_objs)
    :param page: page used to look up styles and line numbers (optional)
    :param word_part_objs: list of dict-like part objects extracted from a svg file
        (note: mutable default is never mutated here, only rebound)
    :param id: word id (ignored when word_node is given)
    :param height: fallback word height (recomputed when page styles exist)
    :param endX: x coordinate of the end of the word
    :param endSign: sign that ended the word in the svg source (optional)
    :param matrix: transformation matrix for the word position (optional)
    :param line_number: fallback line number
    :param debug_msg: debug message attached to the created word
    [:return:] Word
    :raise: Exception if neither word_node nor word_part_objs is provided
    """
    if word_node is not None: # init word from xml node
        id = int(word_node.get('id'))
        line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
        text = word_node.get('text')
        # simplified: '== "true"' is False for both None and other strings
        deleted = word_node.get('deleted') == 'true'
        transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
        faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
        word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
                if len(word_node.findall('.//' + Word.DATA)) > 0\
                else [ item.attrib for item in word_node.findall('.//part')]
        return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
                faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
    elif len(word_part_objs) > 0: # init word from word_part_objs that have been extracted from svg file
        WIDTH = 5
        TOPCORRECTION = 2.0
        FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
        # (removed a no-op 'height = height' self-assignment here)
        x = round(float(word_part_objs[0]['x']), 3)
        if(page is not None and bool(page.style_dict)):
            HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
            style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
            biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
            height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
            TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
            if endSign is not None and '%' in endSign:
                # widen the word by the width of the last character's font
                lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
                        for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
                        if bool(page.style_dict[key].get('font-size'))]
                lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
                endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
        elif endSign is not None and '%' in endSign:
            # no page style info available: approximate with a constant width
            endX = float(endX) + WIDTH
        bottom = round(float(word_part_objs[0]['y']), 3)
        y = round(bottom - height + TOPCORRECTION, 3)
        width = round(float(endX) - x, 3)
        transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
        text = ''.join([ dict['text'] for dict in word_part_objs])
        line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
        word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
        word.debug_msg = debug_msg
        return word
    else:
        error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
        raise Exception('Error: {}'.format(error_msg))
@classmethod
def get_semantic_dictionary(cls):
    """ Creates and returns a semantic dictionary as specified by SemanticClass.
    """
    dictionary = super(Word,cls).get_semantic_dictionary()
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\
            cardinality=1, cardinality_restriction='minCardinality',\
            name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\
            name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.'))
    # NOTE(review): the following lines still carry unresolved patch markers
    # ('-' removed / '+' added) from the diff this file was extracted from;
    # the hunk replaces the 'deleted' property by 'deletion_paths'. Resolve
    # the markers before executing this file.
    - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deleted', bool,\
    - name='isWordDeleted', label='has word been deleted', comment='Word has been deleted by the author.'))
    + dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\
    + name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\
    + comment='Word has been deleted by the author using a deletion path.'))
    + #dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deleted', bool,\
    + # name='isWordDeleted', label='has word been deleted', comment='Word has been deleted by the author.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\
            name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\
            name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\
            name='isClarificationOfWord', label='word is a clarification of word',\
            comment='The author has used this part of the word in order to clarify the appearance of that word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\
            name='isDeletionOfWord', label='word is a deletion of word',\
            comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\
            name='isExtensionOfWord', label='word is a extension of word',\
            comment='The author has used this part of a word in order to extend an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\
            name='isTransformationOfWord', label='word is a transformation of word',\
            comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\
            name='overwritesWord', label='word overwrites word',\
            comment='The author has used this word in order to overwrite that word.'))
    # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING,
    # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class.
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\
            name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\
            subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING))
    super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\
            name='isCorrectionOfWord', label='word is a correction of word',\
            comment='The author has used this word in order to correct that word.')
    # every correction-relation property becomes a subproperty of isCorrectionOfWord
    for key in cls.XML_CORRECTION_DICT.keys():
        correction_dict = dictionary[cls.PROPERTIES_KEY].get(key)
        correction_dict.update(super_property_dictionary)
        dictionary[cls.PROPERTIES_KEY].update({key: correction_dict})
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
    """Returns true if transkription_positions have mixed status concerning
    the property_key in their __dict__ (i.e. more than one distinct value).
    When include_parts is set and the word has word_parts, the parts (or, if
    concerns_word is False, their first transkription_positions) are checked
    instead.
    """
    if any(property_key not in tp.__dict__ for tp in self.transkription_positions):
        # a position without the key can never yield a mixed status
        return False
    if include_parts and len(self.word_parts) > 0:
        if not concerns_word:
            # compare the key on each part's first transkription position
            part_values = { part.transkription_positions[0].__dict__[property_key]
                            for part in self.word_parts
                            if len(part.transkription_positions) > 0
                            and property_key in part.transkription_positions[0].__dict__ }
            return len(part_values) > 1
        if any(property_key not in part.__dict__ for part in self.word_parts):
            return False
        return len({ part.__dict__[property_key] for part in self.word_parts }) > 1
    return len({ tp.__dict__[property_key] for tp in self.transkription_positions }) > 1
def init_word(self, page):
    """Initialize word with objects from page.

    Resolves writing processes by id, recursively initializes word_parts
    (collecting their lines and writing processes, de-duplicated), and
    initializes overwrites_word and earlier_version if present.

    :param page: page object providing writing_processes and line objects
    """
    super(Word,self).init_word(page)
    if self.writing_process_id > -1:
        self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ]
    # FIX: removed an unused local alias 'writing_processes = self.writing_processes'
    for word_part in self.word_parts:
        word_part.init_word(page)
        self.lines += word_part.lines
        self.writing_processes += word_part.writing_processes
    # de-duplicate collected lines and writing processes
    self.lines = [ line for line in set(self.lines) ]
    self.writing_processes = [ wp for wp in set(self.writing_processes) ]
    if self.overwrites_word is not None:
        self.overwrites_word.init_word(page)
    if self.earlier_version is not None:
        # fall back to plausible defaults before initializing recursively
        if self.earlier_version.writing_process_id == -1:
            self.earlier_version.writing_process_id = self.writing_process_id-1
        if self.earlier_version.line_number == -1:
            self.earlier_version.line_number = self.line_number
        self.earlier_version.init_word(page)
def join(self, other_word, append_at_end_of_new_word=True):
    """Joins other_word with this word by changing the text of current word
    and adding other_word.transkription_positions (appended or prepended).
    """
    incoming = other_word.transkription_positions
    if append_at_end_of_new_word:
        self.text += other_word.text
        for tp in incoming:
            # id reflects the insertion index
            tp.id = str(len(self.transkription_positions))
            self.transkription_positions.append(tp)
    else:
        self.text = other_word.text + self.text
        # prepend all incoming positions in order
        self.transkription_positions[0:0] = incoming
        # renumber only the positions AFTER the inserted block
        # (the inserted positions keep their original ids)
        for pos in range(len(incoming), len(self.transkription_positions)):
            self.transkription_positions[pos].id = str(pos)
    self.simplify_transkription_positions()
def partition_according_to_deletion(self):
    """Partition a word according to its transkription_positions' deletion
    status ->split word and add partial words as its parts.
    """
    if self.has_mixed_status('deleted'):
        # group consecutive positions with the same deletion status into
        # new partial words
        run = []
        run_status = None
        for tp in self.transkription_positions:
            if run and tp.deleted != run_status:
                self.word_parts.append(Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=run, deleted=run_status, writing_process_id=self.writing_process_id))
                run = []
            run.append(tp)
            run_status = tp.deleted
        if run:
            self.word_parts.append(Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=run, deleted=run_status, writing_process_id=self.writing_process_id))
        # the container word no longer carries positions of its own
        self.transkription_positions = []
        self.line_number = -1
        self.deleted = False
    elif len(self.word_parts) > 0:
        self.word_parts, _ = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
    elif not self.deleted\
            and len(self.transkription_positions) > 0\
            and self.transkription_positions[0].deleted:
        # uniform deletion status: lift it from the positions to the word
        self.deleted = True
def partition_according_to_writing_process_id(self):
    """Partition a word according to its transkription_positions'
    writing_process_ids ->split word and add partial words as its parts.
    """
    # NOTE(review): indentation reconstructed from a whitespace-stripped
    # diff; in particular the attachment of the final if/elif (word-level
    # writing_process_id update) should be verified against the repository.
    if self.belongs_to_multiple_writing_processes():
        last_writing_process_id = -1
        transkription_positions = []
        # group consecutive positions with the same writing_process_id
        for transkription_position in self.transkription_positions:
            if transkription_position.writing_process_id != last_writing_process_id\
                    and len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
                self.word_parts.append(newWord)
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_writing_process_id = transkription_position.writing_process_id
        if len(transkription_positions) > 0:
            # close the trailing group
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
            self.word_parts.append(newWord)
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
    if self.belongs_to_multiple_writing_processes(include_parts=True):
        # word (or its parts) spans several writing processes: use the latest
        self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0]
    elif len(self.transkription_positions) > 0:
        self.writing_process_id = self.transkription_positions[0].writing_process_id
def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False):
    """Determines whether word is over a word box.

    :param box_paths: list of candidate box paths; a matched box is REMOVED from it
    :param tr_xmin: x offset of the transkription field
    :param tr_ymin: y offset of the transkription field
    :param previous_word_has_box: whether the preceding word was matched to a box
    :return: the (partial) word that lies over a box, or None
    """
    word_over_box = None
    if len(self.word_parts) > 0:
        # delegate to the parts; remember the last part that received a box
        for word in self.word_parts:
            current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None))
            if current_word is not None and current_word.word_box is not None:
                word_over_box = current_word
    else:
        new_tp_dict = {}
        for index, transkription_position in enumerate(self.transkription_positions):
            if previous_word_has_box and index == 0:
                # nudge the start of this word to the right so it does not
                # overlap the previous word's box
                if len(transkription_position.positional_word_parts) > 0:
                    transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2
                    #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}')
                else:
                    transkription_position.left += 1
            word_path = Path.create_path_from_transkription_position(transkription_position,\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            containing_boxes = [ box_path for box_path in box_paths\
                    if word_path.is_partially_contained_by(box_path)\
                    or box_path.do_paths_intersect(word_path) ]
            if len(containing_boxes) > 0:
                if previous_word_has_box:
                    print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}')
                # mark/split this position and consume the matched box
                self._set_box_to_transkription_position(containing_boxes[0], word_path,\
                        transkription_position, new_tp_dict, tr_xmin)
                box_paths.remove(containing_boxes[0])
        # replace split transkription positions by their replacements
        for replace_tp in new_tp_dict.keys():
            for tp in new_tp_dict.get(replace_tp):
                self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
            self.transkription_positions.remove(replace_tp)
        word_over_box = self._get_partial_word_over_box()
        update_transkription_position_ids(self)
    return word_over_box
def set_word_insertion_mark(self, word_insertion_mark):
    """Store the given word insertion mark on this word."""
    self.word_insertion_mark = word_insertion_mark
def set_writing_process_id_to_transkription_positions(self, page):
    """Determines the writing process id of the transkription_positions
    by mapping the font size keys of their first positional word part
    through page.fontsizekey2stage_mapping.
    """
    mapping = page.fontsizekey2stage_mapping
    for tp in self.transkription_positions:
        pwps = tp.positional_word_parts
        if not pwps:
            continue
        for font_key in pwps[0].style_class.split(' '):
            if font_key in mapping:
                tp.writing_process_id = mapping.get(font_key)
def simplify_transkription_positions(self):
    """Merge transkription_positions if possible.

    Walks the positions from the end to the front and merges each pair of
    adjacent, mergeable positions into one position rebuilt from their
    combined positional_word_parts. Requires every position to expose
    'positional_word_parts'; otherwise nothing is merged.
    """
    index = len(self.transkription_positions)-1
    while index > 0\
            and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
        current_tp = self.transkription_positions[index]
        index -= 1
        previous_tp = self.transkription_positions[index]
        if previous_tp.is_mergebale_with(current_tp):
            # combine both positions' parts and rebuild the position(s)
            positional_word_parts = previous_tp.positional_word_parts
            positional_word_parts += current_tp.positional_word_parts
            transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                    positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
            if len(transkription_positions) == 1:
                # keep a defined writing_process_id (prefer previous_tp's)
                transkription_positions[0].writing_process_id = previous_tp.writing_process_id\
                        if previous_tp.writing_process_id != -1\
                        else current_tp.writing_process_id
                # replace the pair by the single merged position
                self.transkription_positions.pop(index+1)
                self.transkription_positions[index] = transkription_positions[0]
    #print(self.text, len(self.transkription_positions))
def split(self, split_string, start_id=0):
    """Splits the word and returns an 3-tuple of new words.

    The word's text is partitioned at split_string; the positional word
    parts of all transkription_positions are distributed over the text
    before / at / after the split.

    :param split_string: text at which the word is split
    :param start_id: id of the first resulting word
    :return: (previousWord or None, currentWord, nextWord or None)
    """
    previousString, currentString, nextString = self.text.partition(split_string)
    currentWord = None
    previousWord = None
    nextWord = None
    previousIndex = 0
    current_id = start_id
    all_positional_word_parts = []
    for position in self.transkription_positions:
        all_positional_word_parts += position.positional_word_parts
    if len(all_positional_word_parts) == 0:
        warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
    if len(previousString) > 0:
        # consume positional word parts until they spell previousString
        previous_pwps = []
        while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            previous_pwps.append(all_positional_word_parts[previousIndex])
            previousIndex += 1
        if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
        else:
            previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
            previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
            previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
            current_id += 1
            all_positional_word_parts = all_positional_word_parts[previousIndex:]
    if len(nextString) > 0:
        # consume parts for currentString; the rest belongs to nextString
        tmp_pwps = []
        index = 0
        while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            tmp_pwps.append(all_positional_word_parts[index])
            index += 1
        if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
        else:
            next_pwps = all_positional_word_parts[index:]
            next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
            next_text = ''.join([ pwp.text for pwp in next_pwps ])
            nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
            all_positional_word_parts = all_positional_word_parts[:index]
    current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
    current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
    currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
    return previousWord, currentWord, nextWord
def split_according_to_status(self, status, splits_are_parts=False):
    """Split a word according to its transkription_positions' text.

    Consecutive transkription_positions sharing the same value for 'status'
    are grouped into new words (via self._create_new_word).

    :param status: key looked up in each transkription_position's __dict__
    :param splits_are_parts: if True, the new words become self.word_parts
    :return: a list of new word.Word
    """
    # NOTE(review): indentation reconstructed from a whitespace-stripped
    # diff; verify the attachment of the two trailing 'if' statements.
    new_words = []
    if self.has_mixed_status(status):
        last_status = None
        transkription_positions = []
        for transkription_position in self.transkription_positions:
            if transkription_position.__dict__[status] != last_status\
                    and len(transkription_positions) > 0:
                new_words.append(\
                        self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_status = transkription_position.__dict__[status]
        if len(transkription_positions) > 0:
            # close the trailing group
            new_words.append(\
                    self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
        if splits_are_parts:
            self.word_parts += new_words
        if len(self.word_parts) > 0:
            self.transkription_positions = []
    return new_words
def undo_partitioning(self):
    """Undo partitioning: pull the word_parts' transkription_positions back
    into this word and reset all partition-related state.
    """
    if not self.word_parts:
        return
    for part in self.word_parts:
        part.undo_partitioning()
        # re-collect positions until this word's text is fully covered
        covered = ''.join(tp.get_text() for tp in self.transkription_positions)
        if self.text != covered:
            self.transkription_positions += part.transkription_positions
    # reset everything the partitioning produced
    self.earlier_version = None
    self.edited_text = None
    self.word_box = None
    self.word_parts = []
    self.corrections = []
    self.earlier_versions = []
    self.box_paths = []
def _create_new_word(self, transkription_positions, status, new_id=0):
    """Create a new word from self and transkription_positions.

    Copies every COPY_PROPERTY_KEY attribute (except 'status') from self,
    then transfers the status value of the first transkription position —
    either appended to a target list (per APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS)
    or set as a plain attribute.
    """
    new_word = Word(id=new_id, transkription_positions=transkription_positions)
    for key in self.COPY_PROPERTY_KEY:
        if key != status and key in self.__dict__:
            new_word.__dict__[key] = self.__dict__[key]
    status_value = transkription_positions[0].__dict__[status]
    if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS:
        target_key = self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]
        new_word.__dict__[target_key].append(status_value)
    else:
        new_word.__dict__[status] = status_value
    return new_word
def _get_parts_with_property_key(self, property_key):
"""Return a list of word_parts with property == property_key.
"""
word_parts = []
for word_part in self.word_parts:
if property_key in word_part.__dict__.keys():
word_parts.append(word_part)
else:
word_parts += word_part._get_parts_with_property_key(property_key)
return word_parts
def _get_partial_word_over_box(self):
    """Partition a word according to its transkription_positions' has_box
    ->split word and add partial words as its parts.
    :return: word over box or self
    """
    word_over_box = None
    if self.has_mixed_status('has_box'):
        transkription_positions = []
        last_word_box = None
        # group consecutive positions that share the same has_box value
        for transkription_position in self.transkription_positions:
            if transkription_position.has_box != last_word_box\
                    and len(transkription_positions) > 0:
                # close the current run as a new partial word
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_word_box is not None:
                    word_over_box = newWord
                    word_over_box.word_box = last_word_box
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_word_box = transkription_position.has_box
        if len(transkription_positions) > 0:
            # close the trailing run
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
            self.word_parts.append(newWord)
            if last_word_box is not None:
                word_over_box = newWord
                word_over_box.word_box = last_word_box
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box')
        # only the FIRST part found over a box is returned
        for word_part in self.word_parts:
            if word_over_box is None:
                word_over_box = word_part._get_partial_word_over_box()
            else:
                break
    elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
        # exactly one position has a box: the word itself is over the box
        word_over_box = self
        word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
    return word_over_box
def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
    """Set box_path to transkription_position that is contained by box_path.

    Create new transkription_positions by splitting old ones if necessary and
    add them to new_transkription_positions_dictionary.

    :param box_path: box (Path) overlapping the word
    :param word_path: path representing transkription_position
    :param transkription_position: position to mark with the box / split
    :param new_transkription_positions_dictionary: maps an old position to its replacements
    :param tr_xmin: x offset of the transkription field
    """
    if box_path.contains_path(word_path):
        # box covers the whole position
        transkription_position.has_box = box_path
    elif box_path.contains_start_of_path(word_path):
        # box covers the beginning: split there and mark the first half
        # (bbox()[1] — presumably the box's xmax, per svgpathtools bbox order — TODO confirm)
        split_position = box_path.path.bbox()[1] - tr_xmin
        new_tps = transkription_position.split(split_position)
        if len(new_tps) == 2:
            new_tps[0].has_box = box_path
            new_transkription_positions_dictionary.update({ transkription_position: new_tps })
        else:
            # split failed: mark the whole position
            transkription_position.has_box = box_path
    elif box_path.contains_end_of_path(word_path):
        # box covers the end: split there and mark the second half
        split_position = box_path.path.bbox()[0] - tr_xmin
        new_tps = transkription_position.split(split_position)
        if len(new_tps) == 2:
            new_tps[1].has_box = box_path
            new_transkription_positions_dictionary.update({ transkription_position: new_tps })
        else:
            transkription_position.has_box = box_path
    else: # box_path in the middle of word_path
        # split twice and mark the middle segment
        split_position1 = box_path.path.bbox()[0] - tr_xmin
        split_position2 = box_path.path.bbox()[1] - tr_xmin
        new_tps = transkription_position.split(split_position1, split_position2)
        if len(new_tps) >= 2:
            new_tps[1].has_box = box_path
            new_transkription_positions_dictionary.update({ transkription_position: new_tps })
        else:
            transkription_position.has_box = box_path
+
def do_paths_intersect_saveMode(mypath1, mypath2):
    """Returns true if paths intersect, false if not or if there was an
    exception (svgpathtools may raise AssertionError on degenerate paths).
    """
    try:
        crosses = mypath1.path.intersect(mypath2.path, justonemode=True)
        return crosses or mypath1.is_partially_contained_by(mypath2)
    except AssertionError:
        # treat a failed intersection test as "no intersection"
        return False
+
Index: svgscripts/datatypes/page.py
===================================================================
--- svgscripts/datatypes/page.py (revision 92)
+++ svgscripts/datatypes/page.py (revision 93)
@@ -1,280 +1,296 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import sys
import warnings
from .box import Box
from .color import Color
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .super_page import SuperPage
from .style import Style
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
+from .word_deletion_path import WordDeletionPath
from .word_insertion_mark import WordInsertionMark
sys.path.append('py2ttl')
from class_spec import SemanticClass
FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT
STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK
STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK
class Page(SemanticClass,SuperPage):
"""
This class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
xml_target_file (str): name of the xml file to which page info will be written.
"""
UNITTESTING = False
- def __init__(self, xml_source_file, faksimile_image=None, faksimile_svgFile=None):
+ def __init__(self, xml_source_file, faksimile_image=None, faksimile_svgFile=None, add_deletion_paths_to_words=True):
super(Page,self).__init__(xml_source_file)
self.update_property_dictionary('faksimile_image', faksimile_image)
self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile)
self.init_all_properties()
self.add_style(style_node=self.page_tree.getroot().find('.//style'))
self.init_node_objects()
+ if add_deletion_paths_to_words:
+ self.add_deletion_paths_to_words()
+
+ def add_deletion_paths_to_words(self):
+ """Add deletion paths to words.
+ """
+ if (self.svg_file is not None and isfile(self.svg_file))\
+ or (self.source is not None and isfile(self.source)):
+ svg_file = self.svg_file if self.svg_file is not None else self.source
+ transkription_field = TranskriptionField(svg_file)
+ words = [ word for word in self.words if word.deleted or True in [ part.deleted for part in word.word_parts ]]
+ for word in words:
+ word.add_deletion_paths(self.word_deletion_paths, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
"""Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
[optional: instantiation depends on the fulfilment of a status_contains
and/or on the selection of some words by a word_selection_function].
"""
source_tree = ET.parse(xml_file)
if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION:
page = cls(xml_file)
if word_selection_function is None or len(word_selection_function(page.words)) > 0:
return [ page ]
else:
return []
elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
pages = []
xpath = '//page/@output'
if status_contains != '' and status_not_contain != '':
xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
elif status_contains != '':
xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
elif status_not_contain != '':
xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
for xml_source_file in source_tree.xpath(xpath):
if isfile(xml_source_file):
pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
return pages
else:
return []
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = { 'number': { 'class': str, 'cardinality': 1},\
'faksimile_image': { 'class': FaksimileImage, 'cardinality': 1},\
'orientation': { 'class': str, 'cardinality': 1},\
'svg_image': { 'class': SVGImage, 'cardinality': 1}}
properties.update(cls.create_semantic_property_dictionary('text_field', TextField,\
cardinality=1, name='pageIsOnTextField', label='page is on text field',\
comment='Relates a page to the text field on a faksimile image.'))
for key in [ 'lines', 'words', 'writing_processes', 'word_deletion_paths', 'word_insertion_marks']:
properties.update(cls.create_semantic_property_dictionary(key, list))
dictionary.update({cls.CLASS_KEY: class_dict})
dictionary.update({cls.PROPERTIES_KEY: properties})
return cls.return_dictionary_after_updating_super_classes(dictionary)
def init_node_objects(self):
"""Initialize all node objects.
"""
self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ]
self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ]
self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ]
self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
- self.word_deletion_paths = [ Path(node=node) for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG) ]
+ self.word_deletion_paths = [ WordDeletionPath.create_cls(node, self) for node in self.page_tree.xpath('//' + WordDeletionPath.XML_TAG) ]
if self.faksimile_image is not None and self.text_field is not None:
for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks:
simple_word.init_word(self)
for wim in self.word_insertion_marks:
if wim.line_number > -1:
wim.line = [ line for line in self.lines if line.id == wim.line_number ][0]
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
"""Update word ids and attach them to page.page_tree.
"""
if not self.is_locked():
update_function_on_word = [ update_function_on_word ]\
if type(update_function_on_word) != list\
else update_function_on_word
for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
node.getparent().remove(node)
for index, word in enumerate(self.words):
word.id = index
for func in update_function_on_word:
if callable(func):
func(word)
word.attach_word_to_tree(self.page_tree)
for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
mark_foreign_hands.id = index
if MarkForeignHands in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(mark_foreign_hands)
mark_foreign_hands.attach_word_to_tree(self.page_tree)
for index, text_connection_mark in enumerate(self.text_connection_marks):
text_connection_mark.id = index
if TextConnectionMark in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(text_connection_mark)
text_connection_mark.attach_word_to_tree(self.page_tree)
else:
print('locked')
def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
"""Update the data source of page.
"""
if faksimile_svgFile is not None:
self.faksimile_svgFile = faksimile_svgFile
data_node = self.page_tree.xpath('.//data-source')[0]\
if len(self.page_tree.xpath('.//data-source')) > 0\
else ET.SubElement(self.page_tree.getroot(), 'data-source')
data_node.set('file', self.faksimile_svgFile)
if xml_correction_file is not None:
data_node.set('xml-corrected-words', xml_correction_file)
def update_line_number_area(self, transkription_field, svg_tree=None):
"""Determines the width of the area where the line numbers are written in the page.source file.
"""
THRESHOLD = 0.4
if svg_tree is None:
svg_tree = ET.parse(self.source)
if len(self.line_numbers) > 1:
line_number = self.line_numbers[9]\
if transkription_field.is_page_verso() and len(self.line_numbers) > 8\
else self.line_numbers[1]
ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
and LineNumber.IS_A_LINE_NUMBER(item)\
and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ]
if len(ln_nodes) > 0:
matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
if transkription_field.is_page_verso():
transkription_field.add_line_number_area_width(matrix.getX())
elif self.svg_file is not None and isfile(self.svg_file):
svg_path_tree = ET.parse(self.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
svg_x = matrix.getX()
svg_y = self.line_numbers[1].bottom + transkription_field.ymin
use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
.format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
if len(use_nodes) > 0:
symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
if len(d_strings) > 0 and d_strings[0] != '':
path = parse_path(d_strings[0])
xmin, xmax, ymin, ymax = path.bbox()
width = xmax - xmin
transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
"""Adds a source to page and attaches it to page_tree.
"""
if transkription_field is None:
if self.source is None or not isfile(self.source):
raise FileNotFoundError('Page does not have a source!')
transkription_field = TranskriptionField(self.source)
self.page_type = Page.PAGE_VERSO\
if transkription_field.is_page_verso()\
else Page.PAGE_RECTO
self.page_tree.getroot().set('pageType', self.page_type)
def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False):
"""Update styles of words and add them to their transkription_positions.
Args:
add_to_parents: Add styles also to word (and if not None to manuscript).
partition_according_to_styles: Partition word if its transkription_positions have different styles.
"""
style_dictionary = {}
if words is None:
words = self.words
for word in words:
if len(word.word_parts) > 0:
self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\
add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles)
for transkription_position in word.transkription_positions:
if len(transkription_position.positional_word_parts) > 0:
style_class = transkription_position.positional_word_parts[0].style_class
writing_process_id = -1
for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]:
writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
- style_class_key = (Style.remove_irrelevant_style_keys(style_class, self), writing_process_id)
+ style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id)
if create_css:
if style_dictionary.get((style_class_key, word.deleted)) is None:
+ color = word.deletion_paths[0].style.color\
+ if len(word.deletion_paths) > 0 else None
style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\
- create_css=create_css, deleted=word.deleted )
+ create_css=create_css, deletion_color=color )
transkription_position.style = style_dictionary[(style_class_key, word.deleted)]
#print(style_dictionary[(style_class_key, word.deleted)])
else:
if style_dictionary.get(style_class_key) is None:
style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css)
style_dictionary[style_class_key].writing_process_id = style_class_key[1]
transkription_position.style = style_dictionary[style_class_key]
if add_to_parents and transkription_position.style not in word.styles:
word.styles.append(transkription_position.style)
if partition_according_to_styles:
word.split_according_to_status('style', splits_are_parts=True)
if manuscript is not None\
and add_to_parents:
manuscript.update_styles(*style_dictionary.values())
Index: svgscripts/datatypes/style.py
===================================================================
--- svgscripts/datatypes/style.py (revision 92)
+++ svgscripts/datatypes/style.py (revision 93)
@@ -1,193 +1,202 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the style of a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
from lxml import etree as ET
import re
import sys
from .color import Color
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Style(SemanticClass):
"""
This class represents the style of a word.
Args:
manuscript: a ArchivalManuscriptUnity
"""
NIETSCHES_FONTS = { 'german': 'deutsche Schreibschrift', 'latin': 'lateinische Schreibschrift' }
COLOR_KEYS = [ 'black', 'red', 'blue', 'green', 'grey' ]
RELEVANT_STYLE_KEYS = [ 'font-family', 'fill', 'stroke' ]
ADDITIONAL_STYLE_KEYS = [ 'font-size' ]
WRITING_INSTRUMENTS = { (COLOR_KEYS[0], False): 'schwarze Tinte',\
(COLOR_KEYS[0], True): 'Bleistift',\
(COLOR_KEYS[4], True): 'Bleistift',\
+ (COLOR_KEYS[4], False): 'Bleistift',\
(COLOR_KEYS[1], False): 'braune Tinte',\
(COLOR_KEYS[1], True): 'Rotstift',\
(COLOR_KEYS[2], False): 'violette Tinte',\
(COLOR_KEYS[2], True): 'Blaustift',\
- (COLOR_KEYS[3], False): '„Tinte der letzten Korrektur“'}
+ (COLOR_KEYS[3], False): '„Tinte der letzten Korrektur“',\
+ (COLOR_KEYS[3], True): '„Tinte der letzten Korrektur“'}
- def __init__(self, manuscript=None, writing_process_id=-1, extended_styles=False, deleted=False):
+ def __init__(self, manuscript=None, writing_process_id=-1, extended_styles=False, deletion_color=None):
self.color = Color.create_cls(manuscript=manuscript)
self.css_styles = []
- self.deleted = deleted
+ self.css_string = None
+ self.deletion_color = deletion_color
self.is_german = True
self.font = self.NIETSCHES_FONTS['german']
self.font_family = 'Weidemann-Book'
- self.font_size = None
+ self.font_size = ''
self.manuscript = manuscript
self.relevant_key_map = {}
relevant_style_keys = self.RELEVANT_STYLE_KEYS + self.ADDITIONAL_STYLE_KEYS\
if extended_styles else self.RELEVANT_STYLE_KEYS
for key in relevant_style_keys:
if not key.startswith('font'):
self.relevant_key_map.update({key: self.set_color})
elif key == 'font-family':
self.relevant_key_map.update({key: self.set_font})
elif key == 'font-size':
self.relevant_key_map.update({key: self.set_size})
self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, False)]
self.writing_process_id = writing_process_id
def create_a_copy_wo_writing_process_id(self):
new_self = copy.deepcopy(self)
new_self.writing_process_id = -1
return new_self
def create_a_copy(self, reduce_writing_process_id=False):
writing_process_id = self.writing_process_id\
if not reduce_writing_process_id\
else self.writing_process_id-1
copy = Style(manuscript=self.manuscript, writing_process_id=writing_process_id)
copy.color = self.color
copy.font_family = self.font_family
copy.process_style_classes()
if copy.manuscript is not None:
copy.manuscript.update_styles(copy)
return copy
def create_css_styles(self):
"""Create css styles.
"""
- if self.deleted:
- self.css_styles.append('text-decoration: line-through;')
+ if self.deletion_color is not None:
+ self.css_styles.append('text-decoration:line-through;')
+ self.css_styles.append(f'text-decoration-color:{self.deletion_color.hex_color};')
+ self.css_styles.append(f'-webkit-text-decoration-color:{self.deletion_color.hex_color};')
if self.font_family.endswith('Bold'):
- self.css_styles.append(f'font-weight: bold;')
- if self.font_size is not None:
- self.css_styles.append(f'font-size: {self.font_size};')
- self.css_styles.append(f'color: {self.color.hex_color};')
+ self.css_styles.append(f'font-weight:bold;')
+ if self.font_size != '':
+ self.css_styles.append(f'font-size:{self.font_size};')
+ self.css_styles.append(f'color:{self.color.hex_color};')
+ self.css_string = ''.join(self.css_styles)
@classmethod
- def create_cls(cls, page, style_string, manuscript=None, create_css=False, deleted=False):
+ def create_cls(cls, page, style_string, manuscript=None, create_css=False, deletion_color=None):
"""Creates a Style from a style_string.
:return: (datatypes.style) Style
"""
- style = cls(manuscript=manuscript, extended_styles=create_css, deleted=deleted)
+ style = cls(manuscript=manuscript, extended_styles=create_css, deletion_color=deletion_color)
style_dict = { key: key_dict for key, key_dict in page.style_dict.items()\
if any(relevant_key in key_dict.keys() for relevant_key in style.relevant_key_map.keys()) }
for style_key in style_string.split(' '):
if style_key in style_dict.keys():
dictionary = style_dict[style_key]
for key, set_function in [ (key, func) for key, func in style.relevant_key_map.items() if key in dictionary.keys() ]:
if callable(set_function):
set_function(dictionary[key])
style.process_style_classes()
if create_css:
style.create_css_styles()
return style
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
properties = {}
properties.update(cls.create_semantic_property_dictionary('font', str, cardinality=1,\
name='styleHasFont', label='style has font', comment='Connects a style with the kind of font Nietzsche used in writing.'))
properties.update(cls.create_semantic_property_dictionary('writing_instrument', str, cardinality=1,\
name='styleHasWritingInstrument', label='style has writing instrument', comment='Connects a style with the description of a writing instrument.'))
properties.update(cls.create_semantic_property_dictionary('color', Color, cardinality=1,\
name='styleHasColor', label='style has color', comment='Connects a style with a color.'))
- properties.update(cls.create_semantic_property_dictionary('css_styles', str, cardinality=1,\
- subPropertyOf=cls.STOFF_STYLE_HAS_CSS_URL_STRING,cardinality_restriction='minCardinality',\
+ #properties.update(cls.create_semantic_property_dictionary('css_styles', str,\
+ properties.update(cls.create_semantic_property_dictionary('css_string', str,\
+ subPropertyOf=cls.STOFF_STYLE_HAS_CSS_URL_STRING,\
name='styleHasCSS', label='style has css', comment='Connects a style with CSS style.'))
dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
return cls.return_dictionary_after_updating_super_classes(dictionary)
def process_style_classes(self):
"""Infer writing instrument from font-family and color.
"""
if self.font_family.startswith('NewsGothic'):
self.is_german = False
self.font = self.NIETSCHES_FONTS['latin']
if self.color.name in self.COLOR_KEYS:
self.writing_instrument = self.WRITING_INSTRUMENTS[(self.color.name, self.font_family.endswith('Bold'))]
def set_color(self, hex_color: str):
- self.color = Color.create_cls(hex_color=hex_color, manuscript=self.manuscript)
+ if hex_color != 'none':
+ self.color = Color.create_cls(hex_color=hex_color, manuscript=self.manuscript)
def set_font(self, font_family: str):
self.font_family = font_family
def set_size(self, font_size: str):
self.font_size = font_size
@classmethod
def remove_irrelevant_style_keys(cls, style_string, page, extended_styles=False) -> str:
"""Return a style_string without irrelevant style keys.
"""
relevant_style_keys = cls.RELEVANT_STYLE_KEYS + cls.ADDITIONAL_STYLE_KEYS\
if extended_styles else cls.RELEVANT_STYLE_KEYS
return ' '.join(sorted( style_key for style_key in style_string.split(' ')\
if len(\
[ key for key in page.style_dict[style_key].keys()\
if key in relevant_style_keys ]\
) > 0 ))
def __eq__(self, other):
"""Returns true if self is qualitatively identical to other.
Reason: For qualities, the idea of numerical identity is silly.
"""
if other is None:
return False
return self.color == other.color\
and self.font_family == other.font_family\
and self.writing_process_id == other.writing_process_id\
- and self.css_styles == other.css_styles
+ and self.css_styles == other.css_styles\
+ and self.font_size == other.font_size
def __hash__(self):
"""Return a hash value for self.
"""
return hash((self.color.__hash__, self.font_family, self.writing_process_id))
Index: svgscripts/process_words_post_merging.py
===================================================================
--- svgscripts/process_words_post_merging.py (revision 92)
+++ svgscripts/process_words_post_merging.py (revision 93)
@@ -1,481 +1,473 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path as PathlibPath
from progress.bar import Bar
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.box import Box
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
-from datatypes.word import Word, update_transkription_position_ids
+from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids
from util import back_up
from process_files import update_svgposfile_status
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
DEBUG_WORD = None
MERGED_DIR = 'merged'
def categorize_paths(page, transkription_field=None):
"""Categorize all paths that are part of the transkription field.
:return: a dictionary containing a list for each category of path.
"""
if page.source is not None and isfile(page.source):
MAX_HEIGHT_LINES = 1
max_line = sorted(\
[line_number.bottom-line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0],\
reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17
tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
paths, attributes = svg_to_paths.svg2paths(page.source)
allpaths_on_tf = []
allpaths_outside_tf = []
attributes_outside_tf = []
if transkription_field is None:
transkription_field = TranskriptionField(page.source)
for index, path in enumerate(paths):
attribute = attributes[index]
if len(path) > 0\
and path != transkription_field.path\
and path.bbox()[0] >= tr_xmin\
and path.bbox()[1] <= transkription_field.xmax:
allpaths_on_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
elif len(path) > 0\
and path != transkription_field.path:
allpaths_outside_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
#print(index, allpaths_outside_tf[len(allpaths_outside_tf)-1].path, path)
attributes_outside_tf.append(attribute)
path_dict = { 'text_area_deletion_paths': [],\
'deletion_or_underline_paths': [],\
'box_paths': [],\
'dots_paths': [],\
'word_connector_paths': [],\
'uncategorized_paths': [] }
for mypath in allpaths_on_tf:
xmin, xmax, ymin, ymax = mypath.path.bbox()
start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin)
if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1:
path_dict.get('dots_paths').append(mypath)
elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
path_dict.get('box_paths').append(mypath)
elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
path_dict.get('word_connector_paths').append(mypath)
elif abs(ymax-ymin) < MAX_HEIGHT_LINES:
mypath.start_line_number = start_line_number
path_dict.get('deletion_or_underline_paths').append(mypath)
elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin):
# Check for "ladder", i.e. a path with 3 segments (seg0 is horizontal on line x, seg1 moves to line x+1, seg2 is horizontal on line x+1)
if start_line_number + 1 == page.get_line_number(mypath.path.end.imag-tr_ymin)\
and len(mypath.path._segments) == 3\
and abs(mypath.path._segments[0].bbox()[3]-mypath.path._segments[0].bbox()[2]) < MAX_HEIGHT_LINES\
and abs(mypath.path._segments[2].bbox()[3]-mypath.path._segments[2].bbox()[2]) < MAX_HEIGHT_LINES:
for index in 0, 2:
new_path = Path(parent_path=mypath, path=SVGPath(mypath.path._segments[index]))
new_path.start_line_number = page.get_line_number(new_path.path.start.imag-tr_ymin)
path_dict.get('deletion_or_underline_paths').append(new_path)
else:
path_dict.get('text_area_deletion_paths').append(mypath)
else:
path_dict.get('uncategorized_paths').append(mypath)
underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin)
path_dict.update({'underline_path': underline_path})
path_dict['uncategorized_paths'] += process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\
paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)
return path_dict
elif not UNITTESTING:
error_msg = 'Svg source file {} does not exist!'.format(page.source)\
if page.source is not None else 'Page does not contain a source file!'
raise FileNotFoundError(error_msg)
return {}
def copy_page_to_merged_directory(page, manuscript_file=None):
"""Copy page to directory that contains the first version of all svg_pos_files that have been
merged with the faksimile position data. MERGED_DIR is a subfolder of svg_pos_files-directory.
"""
svg_pos_file = PathlibPath(page.page_tree.docinfo.URL)
target_dir = svg_pos_file.parent / MERGED_DIR
if not target_dir.is_dir():
target_dir.mkdir()
target_pos_file = target_dir / svg_pos_file.name
save_page(page, str(svg_pos_file), target_svg_pos_file=str(target_pos_file), status=STATUS_MERGED_OK, manuscript_file=manuscript_file)
-def do_paths_intersect_saveMode(mypath1, mypath2):
- """Returns true if paths intersect, false if not or if there was an exception.
- """
- try:
- return mypath1.path.intersect(mypath2.path, justonemode=True)\
- or mypath1.is_partially_contained_by(mypath2)
- except AssertionError:
- return False
def find_special_words(page, transkription_field=None):
"""Find special words, remove them from words, process their content.
"""
if page.source is None or not isfile(page.source):
raise FileNotFoundError('Page does not have a source!')
if transkription_field is None:
transkription_field = TranskriptionField(page.source)
special_char_list = MarkForeignHands.get_special_char_list()
special_char_list += TextConnectionMark.get_special_char_list()
single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ]
if not UNITTESTING:
bar = Bar('find special words', max=len(single_char_words))
for word in single_char_words:
not bool(UNITTESTING) and bar.next()
if word.text == MarkForeignHands.CLASS_MARK:
id = len(page.mark_foreign_hands)
page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id))
page.words.remove(word)
elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\
or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\
and any(style in page.sonderzeichen_list for style\
in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))):
id = len(page.text_connection_marks)
page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id))
page.words.remove(word)
not bool(UNITTESTING) and bar.finish()
svg_tree = ET.parse(page.source)
page.update_page_type(transkription_field=transkription_field)
page.update_line_number_area(transkription_field, svg_tree=svg_tree)
italic_classes = [ key for key in page.style_dict\
if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ]
if len(page.mark_foreign_hands) > 0:
MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\
SonderzeichenList=page.sonderzeichen_list)
if len(page.text_connection_marks) > 0:
TextConnectionMark.find_content_in_footnotes(page.text_connection_marks, transkription_field, svg_tree,\
title=page.title, page_number=page.number)
def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
"""Marks all words that intersect with deletion paths as deleted
and adds these paths to word_deletion_paths.
[:return:] list of .path.Path that might be word_underline_paths
"""
if not UNITTESTING:
bar = Bar('mark words that intersect with deletion paths', max=len(page.words))
for word in page.words:
not bool(UNITTESTING) and bar.next()
word = mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
for part_word in word.word_parts:
part_word = mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
word.partition_according_to_deletion()
not bool(UNITTESTING) and bar.finish()
# return those paths in deletion_paths that are not in page.word_deletion_paths
return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ]
def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Marks word if it intersects with deletion paths as deleted
    and adds these paths to word_deletion_paths.

    :param word: the word to check
    :param page: Page owning word_deletion_paths and the page_tree
    :param deletion_paths: candidate deletion paths
    :param tr_xmin: x offset of the transkription field
    :param tr_ymin: y offset of the transkription field
    [:return:] word
    """
    word.deleted = False
    for transkription_position in word.transkription_positions:
        word_path = Path.create_path_from_transkription_position(transkription_position,\
                tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        intersecting_paths = []
        for candidate_path in deletion_paths:
            if do_paths_intersect_saveMode(candidate_path, word_path):
                intersecting_paths.append(candidate_path)
        if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number:
            # debug hook: collect the deletion paths starting on the debugged line
            relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ]
            #print(word.line_number, word_path.path.bbox(), [ path.path.bbox() for path in relevant_paths])
        if len(intersecting_paths) > 0:
            #print(f'{word.line_number}: {word.id}, {word.text}: {intersecting_paths}')
            transkription_position.deleted = True
            for deletion_path in intersecting_paths:
                # prefer the parent path when the intersecting path is only a fragment
                if deletion_path.parent_path is not None:
                    deletion_path = deletion_path.parent_path
                if deletion_path not in page.word_deletion_paths:
                    deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                    deletion_path.attach_object_to_tree(page.page_tree)
                    page.word_deletion_paths.append(deletion_path)
    return word
def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None):
    """Process words after merging with faksimile word positions.

    Either a page or a svg_pos_file must be given; when only a file is
    given the page is instantiated from it.

    :param svg_pos_file: xml file with svg word positions (optional if page is given)
    :param new_words: if given, replaces page.words (sorted by id) and removes the old word nodes
    :param page: Page to process (optional if svg_pos_file is given)
    :param manuscript_file: xml file of the archival manuscript unity (optional)
    :param target_svg_pos_file: output file; save_page falls back to svg_pos_file when None
    :raise Exception: when neither page nor svg_pos_file is provided
    :raise FileNotFoundError: when the page's source svg file does not exist
    """
    if page is None and svg_pos_file is None:
        raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!')
    if page is None:
        page = Page(svg_pos_file)
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file))
    if svg_pos_file is None:
        # fall back to the file the page's tree was parsed from
        svg_pos_file = page.page_tree.docinfo.URL
    if new_words is not None:
        # replace the word list and drop the stale word nodes from the tree
        page.words = sorted(new_words, key=attrgetter('id'))
        for word_node in page.page_tree.xpath('.//word'):
            word_node.getparent().remove(word_node)
    manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\
            if manuscript_file is not None\
            else None
    # keep a copy of the merged state before further processing
    copy_page_to_merged_directory(page, manuscript_file=manuscript_file)
    transkription_field = TranskriptionField(page.source)
    update_faksimile_line_positions(page)
    find_special_words(page, transkription_field=transkription_field)
    #update_writing_process_ids(page)
    page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
    #TODO: find_hyphenated_words(page)
    categorize_paths(page, transkription_field=transkription_field)
    save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=STATUS_POSTMERGED_OK, manuscript_file=manuscript_file)
def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list:
    """Process word boxes: partition words according to word boxes.

    :param page: Page whose words should be partitioned
    :param box_paths: candidate box paths found on the page
    :param transkription_field: TranskriptionField of the page's source svg
    :param paths: pre-parsed Path objects (parsed from page.source when None)
    :param attributes: svg attributes parallel to paths (parsed when None)
    :param max_line: maximal height for a margin path to count as a line marker
    [:return:] a list of paths that are not boxes
    """
    MAX_HEIGHT_LINES = 1    # NOTE(review): appears unused in this function — confirm before removing
    not_boxes = []
    if not UNITTESTING:
        bar = Bar('process word boxes', max=len(page.words))
    svg_tree = ET.parse(page.source)
    # map the default (None) namespace to 'ns' so it can be used in xpath queries
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    allpaths_on_margin_field = []
    if paths is None or attributes is None:
        paths = []
        raw_paths, attributes = svg_to_paths.svg2paths(page.source)
        for index, raw_path in enumerate(raw_paths):
            paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page))
    for index, mypath in enumerate(paths):
        path = mypath.path
        xmin, xmax, ymin, ymax = path.bbox()
        attribute = attributes[index]
        # collect short paths lying on the margin field: left of the
        # transkription field on verso pages, right of it on recto pages
        # (bbox() returns (xmin, xmax, ymin, ymax))
        if len(path) > 0\
           and path != transkription_field.path\
           and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
           or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
           and abs(ymax-ymin) < max_line:
            allpaths_on_margin_field.append(mypath)#Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
    # group the box paths by the line they belong to
    box_line_number_dict = {}
    for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
        line_number = page.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin))
        if line_number > 0:
            if line_number not in box_line_number_dict.keys():
                box_line_number_dict.update({ line_number: [ box_path ]})
            else:
                box_line_number_dict.get(line_number).append(box_path)
    boxes = []
    for line_number in box_line_number_dict.keys():
        box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
        margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
                if page.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\
                key=lambda path: path.get_x())
        # NOTE(review): the matching threshold differs for even/odd line
        # numbers — presumably layout-related; confirm the intent.
        threshold = 3 if line_number % 2 == 0 else 1.5
        if len(margin_boxes_on_line) > 0:
            for box_path in box_paths_on_line:
                #print(line_number, box_path.path.d(), len(margin_boxes_on_line))
                box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
                        transkription_field=transkription_field, namespaces=namespaces, threshold=threshold)
                if box is not None:
                    boxes.append(box)
        else:
            # lines without margin boxes cannot yield boxes
            not_boxes += box_paths_on_line
    if len(boxes) > 0:
        for word in page.words:
            word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
            word.create_correction_history(page)
            if not bool(UNITTESTING):
                bar.next()
            elif word.earlier_version is not None:
                # debug output while unit testing
                #print(f'{word.text} -> {word.earlier_version.text}')
                if word.earlier_version.earlier_version is not None:
                    print(f'{word.earlier_version.earlier_version.text}')
    not bool(UNITTESTING) and bar.finish()
    return not_boxes
def reset_page(page):
    """Reset all words that have word_parts in order to run the script a second time.

    If a first merged version of the file exists, that version is loaded
    instead of undoing the partitioning in place.

    :param page: Page to reset (modified in place in the fallback branch)
    [:return:] the page to continue with — a freshly loaded first-merge
        version if one exists, otherwise the given (possibly modified) page.
        NOTE: callers must use the return value to pick up the reloaded
        page; rebinding the local name alone never reached the caller.
    """
    svg_pos_file = PathlibPath(page.page_tree.docinfo.URL)
    first_merge_version = svg_pos_file.parent / MERGED_DIR / svg_pos_file.name
    if first_merge_version.exists():
        page = Page(str(first_merge_version))
    else:
        # undo partitioning for words with parts or an earlier version
        word_with_wordparts = [ word for word in page.words if len(word.word_parts) > 0 ]
        word_with_wordparts += [ word for word in page.words if word.earlier_version is not None ]
        page_changed = False
        if len(word_with_wordparts) > 0:
            for word in word_with_wordparts:
                word.undo_partitioning()
                update_transkription_position_ids(word)
            page_changed = True
        # re-derive line numbers for words that lost them
        no_line_numbers = [ word for word in page.words if word.line_number == -1 ]
        if len(no_line_numbers) > 0:
            for word in no_line_numbers:
                if len(word.transkription_positions) > 0:
                    word.line_number = page.get_line_number((word.transkription_positions[0].top+word.transkription_positions[0].bottom)/2)
                else:
                    msg = f'Word {word.id} {word.text} has no transkription_position!'
                    warnings.warn(msg)
            page_changed = True
        if page_changed:
            page.update_and_attach_words2tree()
    return page
def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None):
    """Save page to target_file and update status of file.

    :param page: Page to serialize
    :param svg_pos_file: xml file the page was read from (receives the status update)
    :param target_svg_pos_file: output file; defaults to svg_pos_file
    :param status: if given, written as the new status of svg_pos_file
    :param manuscript_file: manuscript xml file whose status is updated as well
    """
    page.update_and_attach_words2tree()
    if not UNITTESTING:
        # all file writes and status updates are skipped under unit tests
        if target_svg_pos_file is None:
            target_svg_pos_file = svg_pos_file
        if status is not None:
            update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status)
        write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def update_faksimile_line_positions(page):
    """Update faksimile_positions of the lines

    Derives each line's faksimile_inner_top/bottom from the faksimile
    positions of the words on that line, then repairs lines whose bottom
    is missing or inconsistent by interpolating from neighbouring lines.

    :param page: Page whose line_numbers are updated and re-attached to the tree
    """
    num_lines = len(page.line_numbers)
    ymin = page.text_field.ymin\
            if page.text_field is not None\
            else 0.0
    for line_number in page.line_numbers:
        # only lines that have at least one word with a faksimile position
        if len([ word.faksimile_positions[0] for word in page.words\
                if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) > 0:
            line_number.faksimile_inner_top = min([ word.faksimile_positions[0].top for word in page.words\
                    if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ])
            line_number.faksimile_inner_bottom = max([ word.faksimile_positions[0].bottom for word in page.words\
                    if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ])
            if line_number.id % 2 == 0:
                # even (= main) lines also get outer positions relative to the text field
                line_number.faksimile_outer_top = line_number.faksimile_inner_top - ymin
                line_number.faksimile_outer_bottom = line_number.faksimile_inner_bottom - ymin
    for index, line_number in enumerate(page.line_numbers):
        if line_number.faksimile_inner_bottom == 0.0\
                or line_number.faksimile_inner_bottom < line_number.faksimile_inner_top:
            if index == 0 and num_lines > 1:
                # NOTE(review): uses the next line's .top (transkription
                # coordinate) rather than .faksimile_inner_top — confirm
                # this mixing of coordinate systems is intended.
                line_number.faksimile_inner_bottom = page.line_numbers[index+1].top
            elif index == num_lines-1 and page.text_field is not None:
                line_number.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3)
            elif index > 0 and index < num_lines-1:
                # pick whichever neighbour bound is further down
                line_number.faksimile_inner_bottom = page.line_numbers[index+1].faksimile_inner_top\
                        if page.line_numbers[index+1].faksimile_inner_top > page.line_numbers[index-1].faksimile_inner_bottom\
                        else page.line_numbers[index-1].faksimile_inner_bottom
        line_number.attach_object_to_tree(page.page_tree)
def update_writing_process_ids(page):
    """Update the writing_process_ids of the words and split accordingly.

    :param page: Page whose words are updated in place
    """
    for current_word in page.words:
        # propagate the writing process id down to the transkription
        # positions first, then split the word wherever the id changes
        current_word.set_writing_process_id_to_transkription_positions(page)
        current_word.partition_according_to_writing_process_id()
def usage():
    """prints information on how to use the script
    """
    # main's docstring doubles as the command line help text
    print(main.__doc__)
def main(argv):
    """This program can be used to process words after they have been merged with faksimile data.

    svgscripts/process_words_post_merging.py [OPTIONS] <file>

        <file> a xml file about a manuscript, containing information about its pages.
               a xml file about a page, containing information about svg word positions.

        OPTIONS:
        -h|--help                        show help
        -i|--include-missing-line-number run script on files that contain words without line numbers
        -r|--rerun                       rerun script on a svg_pos_file that has already been processed

    :return: exit code (int)
    """
    status_not_contain = STATUS_POSTMERGED_OK
    include_missing_line_number = False
    try:
        opts, args = getopt.getopt(argv, "hir", ["help", "include-missing-line-number", "rerun" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-i', '--include-missing-line-number'):
            include_missing_line_number = True
        elif opt in ('-r', '--rerun'):
            # empty filter: process pages regardless of their current status
            status_not_contain = ''
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if isfile(file_a):
        manuscript_file = file_a\
                if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
                else None
        counter = 0
        for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain):
            reset_page(page)
            no_line_numbers = [ word for word in page.words if word.line_number == -1 ]
            if not include_missing_line_number and len(no_line_numbers) > 0:
                # idiom fix: explicit guards instead of `not UNITTESTING and print(...)`
                if not UNITTESTING:
                    print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!')
                    for word in no_line_numbers:
                        print(f'Word {word.id}: {word.text}')
            else:
                back_up(page, page.xml_file)
                if not UNITTESTING:
                    print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
                post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file)
                counter += 1
        if not UNITTESTING:
            print(Style.RESET_ALL + f'[{counter} pages processed]')
    else:
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    return exit_status
# script entry point: pass command line arguments (without the program name)
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: tests_svgscripts/test_data/N_VII_1_page006.xml
===================================================================
--- tests_svgscripts/test_data/N_VII_1_page006.xml (revision 92)
+++ tests_svgscripts/test_data/N_VII_1_page006.xml (revision 93)
@@ -1,1276 +1,1276 @@
svgWordPosition2019-08-02 15:17:372019-08-02 15:17:372019-08-02 15:30:592019-08-02 15:30:59
- 2020-05-12 10:14:00
+ 2020-05-13 17:06:34
Index: tests_svgscripts/test_style.py
===================================================================
--- tests_svgscripts/test_style.py (revision 92)
+++ tests_svgscripts/test_style.py (revision 93)
@@ -1,92 +1,96 @@
import unittest
from os import sep, path
from os.path import dirname, basename, isfile, isdir
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.color import Color
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.page import Page
from datatypes.style import Style
class TestStyle(unittest.TestCase):
    """Unit tests for datatypes.style.Style."""

    def setUp(self):
        # resolve the test data directory relative to this file
        DATADIR = dirname(__file__) + sep + 'test_data'
        if not isdir(DATADIR):
            DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
        self.test_file = DATADIR + sep + 'test.xml'
        self.test_svg_file = DATADIR + sep + 'test421.svg'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
        self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.test_page = DATADIR + sep + 'N_VII_1_page001.xml'
        self.test_manuscript = DATADIR + sep + 'N_VII_1.xml'

    def test_create_cls(self):
        # styles are created from space-separated svg style class strings
        page = Page(self.test_page)
        style_string = "st11 st10 st5"
        style = Style.create_cls(page, style_string)
        self.assertEqual(style.font_family, 'Weidemann-Book')
        self.assertEqual(style.color.hex_color, "#DADADA")
        self.assertEqual(style.writing_instrument, 'schwarze Tinte')
        style_string = "st11 st10"
        style = Style.create_cls(page, style_string)
        self.assertEqual(style.font_family, 'Weidemann-Book')
        self.assertEqual(style.color.name, "black")
        self.assertEqual(style.writing_instrument, 'schwarze Tinte')
        style_string = "st11 st3"
        style = Style.create_cls(page, style_string, create_css=True)
        self.assertEqual(style.font_family, 'Weidemann-Book')
        self.assertEqual(style.font_size, '9px')
        # a style class unknown to the page should fall back to black
        style_string = "st18"
        page = Page(self.test_page)
        style = Style.create_cls(page, style_string)
        self.assertEqual(style.color.name, 'black')

    def test_remove_irrelevant_style_keys(self):
        page = Page(self.test_page)
        style_string = "st11 st10 st9 st5 st0"
        self.assertEqual(Style.remove_irrelevant_style_keys(style_string, page), "st11 st5 st9")

    def test_process_style_classes(self):
        # color and font_family determine writing instrument and script kind
        style = Style()
        style.color = Color.create_cls(hex_color='#009CDE')
        style.process_style_classes()
        self.assertEqual(style.writing_instrument, 'violette Tinte')
        self.assertEqual(style.font, 'deutsche Schreibschrift')
        style.font_family = "NewsGothicBT-Bold"
        style.process_style_classes()
        self.assertEqual(style.writing_instrument, 'Blaustift')
        self.assertEqual(style.font, 'lateinische Schreibschrift')
        style = Style()
        style.font_family = "NewsGothicBT-Bold"
        style.process_style_classes()
        #print(style.css_styles)

    def test_get_semantic_dictionary(self):
        # smoke test only
        dictionary = Style.get_semantic_dictionary()
        #print(dictionary)

    def test_copy(self):
        manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript)
        page = Page(self.test_page)
        page.words = [ page.words[0] ]
        page.update_styles(manuscript=manuscript, add_to_parents=True)
        self.assertEqual(len(manuscript.styles), 1)
        styleA = page.words[0].transkription_positions[0].style
        styleB = styleA.create_a_copy()
        self.assertEqual(styleA == styleB, True)
        # reducing the writing process id must yield a different style
        styleB = styleA.create_a_copy(reduce_writing_process_id=True)
        self.assertEqual(styleA != styleB, True)

    def test_eq(self):
        # equality is based on the style class string contents
        page = Page(self.test_page)
        style_string = "st11 st10 st5"
        styleA = Style.create_cls(page, style_string)
        styleB = Style.create_cls(page, style_string)
        self.assertEqual(styleA == styleB, True)
        style_string = "st11 st10"
        styleC = Style.create_cls(page, style_string)
        self.assertEqual(styleA != styleC, True)
if __name__ == "__main__":
unittest.main()
Index: tests_svgscripts/test_word_deletion_path.py
===================================================================
--- tests_svgscripts/test_word_deletion_path.py (revision 0)
+++ tests_svgscripts/test_word_deletion_path.py (revision 93)
@@ -0,0 +1,40 @@
+import unittest
+from os import sep, path
+from os.path import isdir, dirname, basename
+import lxml.etree as ET
+from svgpathtools.parser import parse_path
+from svgpathtools.path import Line
+from svgpathtools.path import Path as SVGPath
+import sys
+import sys
+
+sys.path.append('svgscripts')
+
+from datatypes.page import Page
+from datatypes.word_deletion_path import WordDeletionPath
+from datatypes.transkription_position import TranskriptionPosition
+from datatypes.positional_word_part import PositionalWordPart
+
class TestPath(unittest.TestCase):
    """Unit tests for datatypes.word_deletion_path.WordDeletionPath."""

    def setUp(self):
        # test data: a page xml that already contains word deletion path nodes
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml'

    def test_init(self):
        # smoke test: a WordDeletionPath can be created from an xml node
        page = Page(self.xml_file)
        node = page.page_tree.xpath(WordDeletionPath.XML_TAG)[0]
        path = WordDeletionPath.create_cls(node, page)
        print(path)

    def test_get_semantic_dict(self):
        # placeholder; semantic dictionary output checked manually
        #print(WordDeletionPath.get_semantic_dictionary())
        pass
+
+
+
+
+
+if __name__ == "__main__":
+ unittest.main()
+
Index: tests_svgscripts/test_word.py
===================================================================
--- tests_svgscripts/test_word.py (revision 92)
+++ tests_svgscripts/test_word.py (revision 93)
@@ -1,456 +1,469 @@
import unittest
from os import sep, path
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from process_words_post_merging import reset_page, update_writing_process_ids
from datatypes.box import Box
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.matrix import Matrix
import datatypes.page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.style import Style
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, execute_function_on_parts, update_transkription_position_ids
+from datatypes.word_deletion_path import WordDeletionPath
from datatypes.word_position import WordPosition
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Page:
    """Lightweight test double for datatypes.page.Page.

    Provides only the attributes and methods TestWord needs, with fixed
    return values.
    """
    _NO_LINE_FOUND = -1
    _FIXED_FONT_SIZE = 7

    def __init__(self):
        # the real Page would carry its svg source file here
        self.svg_file = None

    def get_line_number(self, input=0):
        # always report "no line found"
        return Page._NO_LINE_FOUND

    def get_biggest_fontSize4styles(self, style_set={}):
        # style_set is accepted for signature compatibility but ignored,
        # so the mutable default is harmless here
        return Page._FIXED_FONT_SIZE
class TestWord(unittest.TestCase):
TESTCASE = None
def setUp(self):
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.test_file = DATADIR + sep + 'N_VII_1_page009.xml'
+ self.word_deletion_path_file = DATADIR + sep + 'N_VII_1_page138.xml'
self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }]
x = 0
for dict in self.word_part_objs:
dict['class'] = 'st22'
dict['x'] = x
dict['y'] = 11
x += 1
mylist = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true' }
word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)'))
self.transkription_positions = [ word_position ]
self.word_node = ET.Element('word', attrib=mylist)
word_position.attach_object_to_tree(self.word_node)
x = 0
for char in mylist['text']:
ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' })
x += 1
+ def test_add_deletion_paths(self):
+ page = datatypes.page.Page(self.word_deletion_path_file, add_deletion_paths_to_words=False)
+ word = [ word for word in page.words if word.text == 'AufBau'][0]
+ #self.assertTrue(word.deleted)
+ self.assertTrue(len(word.word_parts) > 0)
+ self.assertTrue(word.word_parts[0].deleted)
+ word.add_deletion_paths(page.word_deletion_paths, tr_xmin=28.347656, tr_ymin=49.921875)
+ self.assertTrue(len(word.word_parts[0].deletion_paths) > 0)
+ #print(word.deletion_paths)
+
+
def test_Word_with_word_part_objs(self):
word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
self.assertEqual(word.id, 0)
self.assertEqual(word.transkription_positions[0].bottom, 13)
self.assertEqual(word.transkription_positions[0].height, 10)
self.assertEqual(word.transkription_positions[0].top, 3)
self.assertEqual(word.transkription_positions[0].left, 0)
self.assertEqual(word.transkription_positions[0].width, 10)
self.assertEqual(word.text, 'abc')
def test_Word_with_word_node(self):
word = Word.create_cls(self.word_node)
self.assertEqual(word.id, 0)
self.assertEqual(word.deleted, True)
self.assertEqual(word.transkription_positions[0].bottom, 11)
self.assertEqual(word.transkription_positions[0].height, 10)
self.assertEqual(word.transkription_positions[0].top, 1)
self.assertEqual(word.transkription_positions[0].left, 0)
self.assertEqual(word.transkription_positions[0].width, 10)
self.assertEqual(word.text, 'abc')
self.assertEqual(word.line_number, 2)
self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True)
def test_attach_word_to_tree(self):
newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
empty_tree = ET.ElementTree(ET.Element('page'))
newWord.attach_word_to_tree(empty_tree)
for word_node in empty_tree.getroot().xpath('//word'):
word = Word.CREATE_WORD(word_node=word_node)
self.assertEqual(word.id, 0)
self.assertEqual(word.deleted, False)
self.assertEqual(word.transkription_positions[0].bottom, 13)
self.assertEqual(word.transkription_positions[0].height, 10)
self.assertEqual(word.transkription_positions[0].top, 3)
self.assertEqual(word.transkription_positions[0].left, 0)
self.assertEqual(word.transkription_positions[0].width, 10)
self.assertEqual(word.text, 'abc')
@unittest.skipUnless(TESTCASE is None or TESTCASE == 0, 'Not testing this case')
def test_create_correction_history_case0(self):
# Case 1: whole word over box
box = Box(earlier_text='XYX')
word = Word(text='ASDF', transkription_positions=[TranskriptionPosition()])
word.word_box = box
word.create_correction_history()
self.assertEqual(word.earlier_version is None, True)
self.assertEqual(word.overwrites_word is not None, True)
@unittest.skipUnless(TESTCASE is None or TESTCASE == 1, 'Not testing this case')
def test_create_correction_history_case1(self):
# Case 2: part of word over box
box = Box(earlier_text='XYX')
partA = Word(text='A', transkription_positions=[TranskriptionPosition()])
partA.word_box = box
partB = Word(text='SDF', transkription_positions=[TranskriptionPosition()])
word = Word(text='ASDF', word_parts=[ partA, partB])
word.create_correction_history()
self.assertEqual(word.earlier_version is None, True)
self.assertEqual(word.word_parts[0].overwrites_word is not None, True)
@unittest.skipUnless(TESTCASE is None or TESTCASE == 2, 'Not testing this case')
def test_create_correction_history_case3(self):
# Case 3: part of word over box, word under box is part of earlier version
box = Box(earlier_text='XYX')
tp0 = TranskriptionPosition()
tp0.style = Style(writing_process_id=0)
tp1 = TranskriptionPosition()
tp1.style = Style(writing_process_id=1)
partA = Word(id=0, text='Test', transkription_positions=[ tp0])
partB = Word(id=1, text='er', transkription_positions=[ tp1])
partB.word_box = box
word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] )
word.create_correction_history(box_style=tp0.style)
self.assertEqual(word.text, 'Tester')
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.earlier_version.text, 'TestXYX')
self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])
@unittest.skipUnless(TESTCASE is None or TESTCASE == 3, 'Not testing this case')
def test_create_correction_history_case4(self):
# Case 4: part of word is deleted
partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()])
partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()])
word = Word(text='ASDF', word_parts=[ partA, partB])
word.create_correction_history()
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True)
self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
self.assertEqual(word.edited_text, 'SDF')
@unittest.skipUnless(TESTCASE is None or TESTCASE == 4, 'Not testing this case')
def test_create_correction_history_case5(self):
tp0 = TranskriptionPosition()
tp0.style = Style(writing_process_id=0)
tp1 = TranskriptionPosition()
tp1.style = Style(writing_process_id=1)
partA = Word(id=0, text='Test', transkription_positions=[ tp0])
partB = Word(id=1, text='er', transkription_positions=[ tp1])
word = Word(text='Tester', word_parts=[ partA, partB ] )
word.create_correction_history()
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.word_parts[1].extendsEarlierVersion, True)
self.assertEqual(word.word_parts[1].isExtensionOfWord, word.earlier_version)
#@unittest.skipUnless(TESTCASE is None or TESTCASE == 5, 'Not testing this case')
#@unittest.skip('case tested, relies on a local xml file')
def test_create_correction_history_case_full(self):
page = datatypes.page.Page('xml/N_VII_1_page138.xml')
manuscript = ArchivalManuscriptUnity()
reset_page(page)
update_writing_process_ids(page)
word = [ word for word in page.words if word.text == 'Verschiedenes' and word.line_number == 4 ][0]
wordAufBau = [ word for word in page.words if word.text == 'AufBau' ][0]
#page.words = [ word ]
page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
word.word_parts[0].transkription_positions[0].has_box = Box(earlier_text='v')
self.assertEqual(len(word.word_parts), 2)
word_over_box = word._get_partial_word_over_box()
update_transkription_position_ids(word)
word.create_correction_history(page)
self.assertEqual(word.writing_process_id, 1)
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.earlier_version.text, 'verschiedenes')
#print(word.earlier_version.id, [ (w.id, w.text) for w in word.earlier_version.word_parts ])
empty_tree = ET.ElementTree(ET.Element('page'))
word_node = word.attach_word_to_tree(empty_tree)
#print(ET.dump(word_node))
"""
self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])
self.assertEqual(word.word_parts[1].overwrites_word is not None, True)
"""
word = wordAufBau
page.words = [ word ]
page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
word.word_parts[0].deleted = True
word.word_parts[1].transkription_positions[0].has_box = Box(earlier_text='b')
self.assertEqual(len(word.word_parts), 3)
word_over_box = word._get_partial_word_over_box()
self.assertEqual(len(word.word_parts), 3)
update_transkription_position_ids(word)
word.create_correction_history(page)
self.assertEqual(word.writing_process_id, 2)
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.text, 'AufBau')
self.assertEqual(word.edited_text, 'Bau')
self.assertEqual(word.earlier_version.text, 'Aufbau')
self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])
self.assertEqual(word.word_parts[1].overwrites_word is not None, True)
empty_tree = ET.ElementTree(ET.Element('page'))
word_node = word.attach_word_to_tree(empty_tree)
#print(ET.dump(word_node))
newWord = Word.create_cls(word_node)
#@unittest.skip('')
def test_earlier_version(self):
partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()])
partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()])
word = Word(text='ASDF', word_parts=[ partA, partB])
earlier_version = word.create_earlier_version()
self.assertEqual(earlier_version is not None, True)
self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True)
self.assertEqual(word.word_parts[0].isDeletionOfWord, earlier_version.word_parts[0])
def test_undo_partitioning(self):
tps = []
for i, xy in enumerate([ 3, 4, 5 ]):
tps.append(TranskriptionPosition(id=i, x=xy, y=xy, height=10, width=10))
partA = Word(id=0, text='Auf', writing_process_id=1, deleted=True, transkription_positions=[ tps[0]])
partB = Word(id=1, text='B', writing_process_id=2, transkription_positions=[tps[1]])
partC = Word(id=2, text='au', writing_process_id=1,transkription_positions=[tps[2]])
word = Word(text='Aufbau', writing_process_id=2, word_parts=[ partA, partB, partC ] )
word.undo_partitioning()
self.assertEqual(len(word.transkription_positions), len(tps))
self.assertEqual(len(word.word_parts), 0)
"""
page = datatypes.page.Page('xml/N_VII_1_page138.xml')
word = page.words[77]
word.undo_partitioning()
self.assertEqual(len(word.word_parts), 0)
self.assertEqual(len(word.transkription_positions), 3)
update_transkription_position_ids(word)
empty_tree = ET.ElementTree(ET.Element('page'))
word_node = word.attach_word_to_tree(empty_tree)
print(ET.dump(word_node))
"""
def test_split(self):
page = Page()
pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs)
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
previousWord, currentWord, nextWord = word.split('b')
self.assertEqual(previousWord.id, 0)
self.assertEqual(previousWord.text, 'a')
self.assertEqual(currentWord.id, 1)
self.assertEqual(nextWord.id, 2)
word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
previousWord, currentWord, nextWord = word.split('bc')
self.assertEqual(previousWord.id, 0)
self.assertEqual(previousWord.text, 'a')
self.assertEqual(currentWord.id, 1)
word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
previousWord, currentWord, nextWord = word.split('ab', start_id=10)
self.assertEqual(currentWord.id, 10)
self.assertEqual(currentWord.text, 'ab')
self.assertEqual(currentWord.transkription_positions[0].width, 2.1)
self.assertEqual(nextWord.id, 11)
self.assertEqual(nextWord.transkription_positions[0].width, 5.2)
word_part_objs=[{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\
{'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\
{'text': 'x', 'class':'st22', 'x': 10, 'y': 0}]
pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
with self.assertWarns(Warning):
previousWord, currentWord, nextWord = word.split('Insofer')
word_part_objs=[{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}]
pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
with self.assertWarns(Warning):
previousWord, currentWord, nextWord = word.split('Insofern')
def test_join(self):
word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}])
word.join(other_word)
self.assertEqual(word.text, 'abc.')
other_word = Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}])
word.join(other_word, append_at_end_of_new_word=False)
self.assertEqual(word.text, '.abc.')
"""
tree = ET.ElementTree(ET.Element('page'))
word.attach_word_to_tree(tree)
print(ET.dump(tree.getroot()))
"""
def test_get_semanticAndDataDict(self):
dictionary = Word.get_semantic_dictionary()
#print(dictionary)
info_dict = dictionary['properties'].get('isDeletionOfWord')
self.assertEqual(SemanticClass.SUPER_PROPERTY in info_dict.keys(), True)
super_info_dict = info_dict[SemanticClass.SUPER_PROPERTY]
#print(info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME))
def test_simplify_transkription_positions(self):
node_string = """ """
nodeA = ET.fromstring(node_string)
node_string = """
"""
nodeB = ET.fromstring(node_string)
word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ])
self.assertEqual(len(word.transkription_positions), 2)
word.simplify_transkription_positions()
self.assertEqual(len(word.transkription_positions), 1)
word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ])
word.transkription_positions[1].writing_process_id = -1
word.simplify_transkription_positions()
self.assertEqual(len(word.transkription_positions), 1)
self.assertEqual(word.transkription_positions[0].writing_process_id, 0)
"""
tree = ET.ElementTree(ET.Element('page'))
word.attach_word_to_tree(tree)
print(ET.dump(tree.getroot()))
"""
def test_partition(self):
    """Partitioning by writing process id yields homogeneous word parts."""
    page = datatypes.page.Page(self.test_file)
    target = page.words[67]
    self.assertEqual(target.belongs_to_multiple_writing_processes(), True)
    target.partition_according_to_writing_process_id()
    self.assertEqual(len(target.word_parts), 3)
    # after partitioning, only the parts (not the word itself) span processes
    self.assertEqual(target.belongs_to_multiple_writing_processes(), False)
    self.assertEqual(target.belongs_to_multiple_writing_processes(include_parts=True), True)
    # a partitioned word survives an XML round trip with its parts intact
    tree = ET.ElementTree(ET.Element('page'))
    node = target.attach_word_to_tree(tree)
    restored = Word.create_cls(node)
    self.assertEqual(len(restored.word_parts), 3)
def test_partition_deletion(self):
    """Partitioning by deletion status yields parts with a uniform status."""
    page = datatypes.page.Page(self.test_file)
    target = page.words[67]
    # mark exactly the positions of writing process 1 as deleted
    for position in target.transkription_positions:
        position.deleted = (position.writing_process_id == 1)
    self.assertEqual(target.has_mixed_status('deleted'), True)
    target.partition_according_to_deletion()
    self.assertEqual(len(target.word_parts), 3)
    self.assertEqual(target.has_mixed_status('deleted'), False)
    self.assertEqual(target.has_mixed_status('deleted', include_parts=True), True)
    # partitioning by writing process first, then by deletion, refines the split
    page = datatypes.page.Page(self.test_file)
    target = page.words[67]
    target.partition_according_to_writing_process_id()
    target.word_parts[1].transkription_positions[1].deleted = True
    target.partition_according_to_deletion()
    self.assertEqual(len(target.word_parts), 4)
    # mixed status is also detected on a word assembled directly from parts
    deleted_part = Word(text='A', deleted=True)
    kept_part = Word(text='SDF', deleted=False)
    composite = Word(text='ASDF', word_parts=[ deleted_part, kept_part ])
    self.assertEqual(composite.has_mixed_status('deleted', include_parts=True), True)
def test_execute_function_on_parts(self):
    """execute_function_on_parts replaces each word by its partitioned parts.

    Fixes: `assertEqual(len(x) == 4, True)` → `assertEqual(len(x), 4)` so a
    failure reports the actual length; the unused second return value is
    bound to `_` instead of `none`.
    """
    page = datatypes.page.Page(self.test_file)
    word_parts = [ page.words[67], page.words[68] ]
    word_parts, _ = execute_function_on_parts(word_parts, 'partition_according_to_writing_process_id')
    # two words that each split into parts yield four words in total
    self.assertEqual(len(word_parts), 4)
def test_process_word_boxes(self):
    """Each word over a box must report the box it overlaps via process_boxes.

    Fixes: dropped the unused `enumerate` counter `word_id`, and replaced
    `assertEqual(x is not None, True)` / `assertEqual(cond, True)` with
    `assertIsNotNone` / `assertTrue` for clearer failure output.
    """
    page = datatypes.page.Page(self.pdf_xml)
    page.source = self.pdf_xml_source
    page.update_styles(partition_according_to_styles=True)
    tr = TranskriptionField(page.source)
    box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
            'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
            'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
            'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
            'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
    box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ]
    # word indices on the page that are expected to lie over one of the boxes
    indices = [30, 277, 288, 297, 321]
    for index in indices:
        word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin)
        self.assertIsNotNone(word_over_box)
        # the box may be attached to the word itself or to one of its parts
        self.assertTrue(word_over_box == page.words[index] or word_over_box in page.words[index].word_parts)
def test_process_word_several_boxesOn1LIne(self):
    """Words partitioned by writing process still find their boxes.

    (Method name typo 'LIne' kept — renaming would change the test id.)
    Fixes: removed the unused `empty_tree` ElementTree and the unused
    `enumerate` counter; `assertEqual(x is not None, True)` → `assertIsNotNone`.
    """
    page = datatypes.page.Page(self.pdf_xml)
    page.source = self.pdf_xml_source
    for word in page.words:
        word.set_writing_process_id_to_transkription_positions(page)
        word.partition_according_to_writing_process_id()
    tr = TranskriptionField(page.source)
    box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
            'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
            'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
            'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
            'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
    box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ]
    # word indices on the page that are expected to lie over one of the boxes
    indices = [30, 277, 288, 297, 321]
    for index in indices:
        word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin)
        self.assertIsNotNone(word_over_box)
def test_split_according_to_status(self):
    """Splitting a word by a mixed-status property yields consecutive new words.

    Fixes: `assertEqual(len(x) > 1, True)` → `assertGreater` (reports the
    actual length on failure); dropped the unused rebinding of `new_words`
    on the splits_are_parts call.
    """
    page = datatypes.page.Page(self.test_file)
    word = page.words[67]
    # give the positions of writing process 1 a different text than the rest
    for transkription_position in word.transkription_positions:
        transkription_position.text = 'asdf'\
                if transkription_position.writing_process_id == 1\
                else word.text
    self.assertEqual(word.has_mixed_status('text'), True)
    new_words = word.split_according_to_status('text')
    self.assertGreater(len(new_words), 1)
    # the first split keeps id and deletion state; followers get consecutive ids
    self.assertEqual(new_words[0].id, word.id)
    self.assertEqual(new_words[0].deleted, word.deleted)
    self.assertEqual(new_words[1].id, word.id+1)
    # with splits_are_parts=True the splits become word_parts of the word itself
    manuscript = ArchivalManuscriptUnity()
    page = datatypes.page.Page(self.test_file)
    word = page.words[67]
    page.words = [ word ]
    page.update_styles(manuscript=manuscript)
    word.split_according_to_status('style', splits_are_parts=True)
    self.assertEqual(len(word.word_parts), 3)
def test__create_new_word(self):
    """_create_new_word copies the designated properties and the given style."""
    manuscript = ArchivalManuscriptUnity()
    page = datatypes.page.Page(self.test_file)
    source_word = page.words[67]
    page.words = [ source_word ]
    page.update_styles(manuscript=manuscript)
    created = source_word._create_new_word([ source_word.transkription_positions[0] ], 'style')
    # every property listed in COPY_PROPERTY_KEY must be carried over
    for property_key in Word.COPY_PROPERTY_KEY:
        self.assertEqual(created.__dict__[property_key], source_word.__dict__[property_key])
    self.assertEqual(len(created.styles), 1)
def test__get_partial_word_over_box(self):
    """Positions that carry a box are split off into their own word parts."""
    boxed_word = Word(text='test', transkription_positions=[ TranskriptionPosition(id=0), TranskriptionPosition(id=1) ])
    boxed_word.transkription_positions[0].has_box = Box(earlier_text='asdf')
    boxed_word._get_partial_word_over_box()
    self.assertEqual(len(boxed_word.word_parts), 2)
    # the same holds when the boxed position sits inside an existing part
    first_part = Word(id=0, text='A', transkription_positions=[TranskriptionPosition()])
    second_part = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition(), TranskriptionPosition(id=1)])
    second_part.transkription_positions[0].has_box = Box(earlier_text='asdf')
    composite = Word(text='ASDF', word_parts=[ first_part, second_part ])
    composite._get_partial_word_over_box()
    self.assertEqual(len(composite.word_parts), 2)
if __name__ == "__main__":
unittest.main()
Index: tests_py2ttl/test_data/mapping_dict.xml
===================================================================
--- tests_py2ttl/test_data/mapping_dict.xml (revision 92)
+++ tests_py2ttl/test_data/mapping_dict.xml (revision 93)
@@ -1,277 +1,283 @@
tlnhttp://www.nie.org/ontology/nietzsche#./tln-ontology_autogenerated.ttlhttp://www.nie.org/ontology/nietzsche#ArchivalManuscriptUnityhttp://www.nie.org/ontology/nietzsche#hasTitlehttp://www.nie.org/ontology/nietzsche#hasManuscriptTypehttp://www.nie.org/ontology/nietzsche#hasStyleshttp://www.nie.org/ontology/nietzsche#hasPageshttp://www.nie.org/ontology/nietzsche#Pathhttp://www.nie.org/ontology/nietzsche#hasDAttribute
- http://www.nie.org/ontology/nietzsche#hasStyleClasshttp://www.nie.org/ontology/nietzsche#Boxhttp://www.nie.org/ontology/nietzsche#hasDAttribute
- http://www.nie.org/ontology/nietzsche#hasStyleClasshttp://www.nie.org/ontology/nietzsche#hasEarlierTexthttp://www.nie.org/ontology/nietzsche#Colorhttp://www.nie.org/ontology/nietzsche#colorHasNamehttp://www.nie.org/ontology/nietzsche#hasHexadecimalValuehttp://www.nie.org/ontology/nietzsche#Imagehttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasFileNamehttp://www.nie.org/ontology/nietzsche#FaksimileImagehttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasFileNamehttp://www.nie.org/ontology/nietzsche#hasUrlhttp://www.nie.org/ontology/nietzsche#hasTextFieldhttp://www.nie.org/ontology/nietzsche#PositionalObjecthttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasLefthttp://www.nie.org/ontology/nietzsche#hasTophttp://www.nie.org/ontology/nietzsche#hasBottomhttp://www.nie.org/ontology/nietzsche#hasTransformhttp://www.nie.org/ontology/nietzsche#WordPositionhttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasLefthttp://www.nie.org/ontology/nietzsche#hasTophttp://www.nie.org/ontology/nietzsche#hasBottomhttp://www.nie.org/ontology/nietzsche#hasTransformhttp://www.nie.org/ontology/nietzsche#FaksimilePositionhttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasLefthttp://www.nie.org/ontology/nietzsche#hasTophttp://www.nie.org/ontology/nietzsche#hasBottomhttp://www.nie.org/ontology/nietzsche#hasTransformhttp://www.nie.org/ontology/nietzsche#isOnFaksimileImagehttp://www.nie.org/ontology/nietzsche#isOnTextFieldhttp://www.nie.org/ontology/nietzsche#Linehttp://www.nie.org/ontology/nietzsche#lineHasNumberhttp://www.nie.org/ontology/nietzsche#lineHasBottomValueOnTranskriptionhttp:
//www.nie.org/ontology/nietzsche#lineHasTopValueOnTranskriptionhttp://www.nie.org/ontology/nietzsche#lineHasInnerBottomValueOnFaksimilehttp://www.nie.org/ontology/nietzsche#lineHasInnerTopValueOnFaksimilehttp://www.nie.org/ontology/nietzsche#lineHasOuterBottomValueOnFaksimilehttp://www.nie.org/ontology/nietzsche#lineHasOuterTopValueOnFaksimilehttp://www.nie.org/ontology/nietzsche#isMainLinehttp://www.nie.org/ontology/nietzsche#SimpleWordhttp://www.nie.org/ontology/nietzsche#hasTexthttp://www.nie.org/ontology/nietzsche#wordBelongsToLinehttp://www.nie.org/ontology/nietzsche#hasTranskriptionPositionhttp://www.nie.org/ontology/nietzsche#hasFaksimilePositionhttp://www.nie.org/ontology/nietzsche#SpecialWordhttp://www.nie.org/ontology/nietzsche#hasTexthttp://www.nie.org/ontology/nietzsche#wordBelongsToLinehttp://www.nie.org/ontology/nietzsche#hasTranskriptionPositionhttp://www.nie.org/ontology/nietzsche#hasFaksimilePositionhttp://www.nie.org/ontology/nietzsche#MarkForeignHandshttp://www.nie.org/ontology/nietzsche#hasTexthttp://www.nie.org/ontology/nietzsche#textOfForeignHandshttp://www.nie.org/ontology/nietzsche#penOfForeignHandshttp://www.nie.org/ontology/nietzsche#wordBelongsToLinehttp://www.nie.org/ontology/nietzsche#hasTranskriptionPositionhttp://www.nie.org/ontology/nietzsche#hasFaksimilePositionhttp://www.nie.org/ontology/nietzsche#Pagehttp://www.nie.org/ontology/nietzsche#hasNumberhttp://www.nie.org/ontology/nietzsche#hasOrientationhttp://www.nie.org/ontology/nietzsche#hasLineshttp://www.nie.org/ontology/nietzsche#hasWordshttp://www.nie.org/ontology/nietzsche#hasWritingProcesseshttp://www.nie.org/ontology/nietzsche#hasWordDeletionPathshttp://www.nie.org/ontology/nietzsche#hasWordInsertionMarkshttp://www.nie.org/ontology/nietzsche#hasFaksimileImagehttp://www.nie.org/ontology/nietzsche#hasSvgImagehttp://www.nie.org/ontology/nietzsche#pageIsOnTextFieldhttp://www.nie.org/ontology/nietzsche#Referencehttp://www.nie.org/ontology/nietzsche#firstLineOfReferencehttp://www.nie
.org/ontology/nietzsche#lastLineOfReferencehttp://www.nie.org/ontology/nietzsche#IsUncertainhttp://www.nie.org/ontology/nietzsche#hasTitlehttp://www.nie.org/ontology/nietzsche#hasPageNumberhttp://www.nie.org/ontology/nietzsche#SVGImagehttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasFileNamehttp://www.nie.org/ontology/nietzsche#StandoffTaghttp://www.nie.org/ontology/nietzsche#standoffTagHasMarkuphttp://www.nie.org/ontology/nietzsche#standoffTagHasStartIndexhttp://www.nie.org/ontology/nietzsche#standoffTagHasEndIndexhttp://www.nie.org/ontology/nietzsche#Texthttp://www.nie.org/ontology/nietzsche#textHasContenthttp://www.nie.org/ontology/nietzsche#textHasMarkuphttp://www.nie.org/ontology/nietzsche#TextConnectionMarkhttp://www.nie.org/ontology/nietzsche#hasTexthttp://www.nie.org/ontology/nietzsche#wordBelongsToLinehttp://www.nie.org/ontology/nietzsche#hasTranskriptionPositionhttp://www.nie.org/ontology/nietzsche#hasFaksimilePositionhttp://www.nie.org/ontology/nietzsche#textConnectionMarkHasTextSourcehttp://www.nie.org/ontology/nietzsche#TextFieldhttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasLefthttp://www.nie.org/ontology/nietzsche#hasTophttp://www.nie.org/ontology/nietzsche#hasBottomhttp://www.nie.org/ontology/nietzsche#hasTransformhttp://www.nie.org/ontology/nietzsche#TranskriptionPositionhttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasLefthttp://www.nie.org/ontology/nietzsche#hasTophttp://www.nie.org/ontology/nietzsche#hasBottomhttp://www.nie.org/ontology/nietzsche#hasTransformhttp://www.nie.org/ontology/nietzsche#isOnSvgImagehttp://www.nie.org/ontology/nietzsche#Wordhttp://www.nie.org/ontology/nietzsche#hasText
- http://www.nie.org/ontology/nietzsche#isWordDeletedhttp://www.nie.org/ontology/nietzsche#hasEditedTexthttp://www.nie.org/ontology/nietzsche#wordHasWordPartshttp://www.nie.org/ontology/nietzsche#wordBelongsToLinehttp://www.nie.org/ontology/nietzsche#hasTranskriptionPositionhttp://www.nie.org/ontology/nietzsche#hasFaksimilePositionhttp://www.nie.org/ontology/nietzsche#wordHasStylehttp://www.nie.org/ontology/nietzsche#overwritesWordhttp://www.nie.org/ontology/nietzsche#isTransformationOfWordhttp://www.nie.org/ontology/nietzsche#isExtensionOfWordhttp://www.nie.org/ontology/nietzsche#isDeletionOfWordhttp://www.nie.org/ontology/nietzsche#isClarificationOfWordhttp://www.nie.org/ontology/nietzsche#wordHasEarlierVersionhttp://www.nie.org/ontology/nietzsche#wordHasCorrection
+ http://www.nie.org/ontology/nietzsche#wordIsDeletedByPath
+
+ http://www.nie.org/ontology/nietzsche#WordDeletionPath
+
+ http://www.nie.org/ontology/nietzsche#hasDAttribute
+
+
+ http://www.nie.org/ontology/nietzsche#WordInsertionMarkhttp://www.nie.org/ontology/nietzsche#hasHeighthttp://www.nie.org/ontology/nietzsche#hasWidthhttp://www.nie.org/ontology/nietzsche#hasLefthttp://www.nie.org/ontology/nietzsche#hasTophttp://www.nie.org/ontology/nietzsche#hasBottomhttp://www.nie.org/ontology/nietzsche#hasTransformhttp://www.nie.org/ontology/nietzsche#hasMarkTypehttp://www.nie.org/ontology/nietzsche#hasSymbolIdhttp://www.nie.org/ontology/nietzsche#hasNextWordhttp://www.nie.org/ontology/nietzsche#hasPreviousWordhttp://www.nie.org/ontology/nietzsche#wordInsertionMarkBelongsToLinehttp://www.nie.org/ontology/nietzsche#WritingProcesshttp://www.nie.org/ontology/nietzsche#hasVersionhttp://www.nie.org/ontology/nietzsche#hasDescriptionxml-dictionary
- 2020-02-21 14:31:03
+ 2020-05-13 11:04:44