Index: svgscripts/datatypes/image.py
===================================================================
--- svgscripts/datatypes/image.py (revision 100)
+++ svgscripts/datatypes/image.py (revision 101)
@@ -1,116 +1,137 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent all image types.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
import sys
from .attachable_object import AttachableObject
+from .matrix import Matrix
from .text_field import TextField
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Image(AttachableObject,SemanticClass):
"""
This super class represents all types of images.
Args:
file_name (str): name of the image file.
node (lxml.etree.Element) node, containing information
URL (str): URL of image file.
height (float): height of image
width (float): width of image
text_field (.text_field.TextField) text_field on image representation
"""
stringKeys = [ 'file_name', 'URL', 'local_path' ]
floatKeys = [ 'height', 'width' ]
XML_TAG = 'image'
- def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
+ def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, matrix=None, text_field=None, tag=XML_TAG):
self.text_field = text_field
self.tag = tag
if node is not None:
self.file_name = node.get('file-name')
self.local_path = node.get('local-path')
self.URL = node.get('URL')
self.height = float(node.get('height'))
self.width = float(node.get('width'))
+ self.transform = Matrix(node.get('transform')) if bool(node.get('transform')) and 'matrix(' in node.get('transform') else None
if len(node.findall(TextField.XML_TAG)) > 0:
self.text_field = TextField(node=node.find(TextField.XML_TAG))
else:
self.file_name = file_name
self.local_path = local_path
self.URL = URL
self.height = height
self.width = width
+ self.transform = matrix
def attach_object_to_tree(self, target_tree):
"""Attach object to tree.
"""
obj_node = target_tree.getroot().find('.//' + self.tag) \
if(len(target_tree.getroot().findall('.//' + self.tag)) > 0) \
else ET.SubElement(target_tree.getroot(), self.tag)
for key in self.floatKeys:
if self.__dict__[key] is not None:
obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3)))
for key in self.stringKeys:
if self.__dict__[key] is not None:
obj_node.set(key.replace('_','-'), self.__dict__[key])
+ if self.transform is not None and self.transform.isRotationMatrix():
+ obj_node.set('transform', self.transform.toString())
if self.text_field is not None:
self.text_field.attach_object_to_tree(obj_node)
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = {}
for floatKey in Image.floatKeys:
properties.update(cls.create_semantic_property_dictionary(floatKey, float, cardinality=1))
properties.update(cls.create_semantic_property_dictionary('file_name', str, cardinality=1))
- #properties.update(cls.create_semantic_property_dictionary('URL', str))
+ properties.update(cls.create_semantic_property_dictionary('transform', str))
+ properties.update(cls.create_semantic_property_dictionary('URL', str, cardinality=1))
dictionary.update({'class': class_dict})
dictionary.update({'properties': properties})
return dictionary
class SVGImage(Image):
"""This class represents a svg image.
"""
XML_TAG = 'svg-image'
def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
if node is not None and node.tag != self.XML_TAG:
file_name = node.get('file')
height = float(node.get('height')) if bool(node.get('height')) else 0.0
width = float(node.get('width')) if bool(node.get('width')) else 0.0
node = None
super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\
height=height, width=width, text_field=text_field, tag=self.XML_TAG)
+ def decontextualize_file_name(self, update_url=None):
+ """Decontextualize file name.
+ """
+ self.file_name = self.file_name.replace('./', '')
+ if update_url is not None:
+ self.URL = update_url + self.file_name
+
+ @classmethod
+ def get_semantic_dictionary(cls):
+ """ Creates and returns a semantic dictionary as specified by SemanticClass.
+ """
+ dictionary = super(SVGImage,cls).get_semantic_dictionary()
+ dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('text_field', TextField))
+ return cls.return_dictionary_after_updating_super_classes(dictionary)
+
Index: svgscripts/datatypes/text_field.py
===================================================================
--- svgscripts/datatypes/text_field.py (revision 100)
+++ svgscripts/datatypes/text_field.py (revision 101)
@@ -1,52 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class represents a text field on a faksimile image.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import sys
import re
from os import path, sep
import lxml.etree as ET
from .positional_object import PositionalObject
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__version__ = "0.0.1"
class TextField(PositionalObject):
"""
This class represents the text field of a faksimile image.
Args:
id (str): id from svg file.
width (float)
height (float)
x (float)
y (float)
"""
XML_TAG = 'text-field'
- def __init__(self, id=0, node=None, width=0.0, height=0.0, x=0.0, y=0.0):
- super(TextField, self).__init__(node=node, id=id, width=width, height=height, x=x, y=y, tag=self.XML_TAG)
+ def __init__(self, id=0, node=None, width=0.0, height=0.0, x=0.0, y=0.0, matrix=None):
+ super(TextField, self).__init__(node=node, id=id, width=width, height=height, x=x, y=y, matrix=matrix, tag=self.XML_TAG)
self.xmin = self.left
self.xmax = self.left + self.width
self.ymin = self.top
self.ymax = self.top + self.height
Index: svgscripts/datatypes/faksimile.py
===================================================================
--- svgscripts/datatypes/faksimile.py (revision 100)
+++ svgscripts/datatypes/faksimile.py (revision 101)
@@ -1,199 +1,204 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a faksimile page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import re
from lxml import etree as ET
from os import path
from os.path import isdir, isfile, sep, basename
from svgpathtools.parser import parse_path
from .faksimile_image import FaksimileImage
from .matrix import Matrix
from .text_field import TextField
from .word_position import WordPosition
class FaksimilePage:
"""
This class represents a faksimile page.
Args:
xml_target_file (str): name of the xml file to which page info will be written.
xml_source_file (str): name of the xml file that will be instantiated.
"""
XML_TAG = 'faksimile-page'
def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None):
    """Initialize a faksimile page either from an existing xml file or as a fresh tree.

    :param xml_source_file: xml file to read page data from (takes precedence)
    :param xml_target_file: xml file the page data will be written to
    :param title: title of the faksimile page (overridden by the file's 'title' attribute when reading)
    :param page_number: page number (overridden by the file's 'page-number' attribute when reading)
    :param svg_source_file: svg file the page was extracted from
    :param faksimile_image: (faksimile_image.FaksimileImage) image to attach to the tree
    :param text_field: (text_field.TextField) text field to attach to the tree
    """
    # prefer the source file; fall back to the target file for bookkeeping
    xml_file = xml_source_file if xml_source_file is not None else xml_target_file
    self.title = title
    self.page_number = page_number
    self.xml_file = xml_file
    if xml_file is not None and isfile(xml_file):
        # read an existing page: file content wins over constructor arguments
        parser = ET.XMLParser(remove_blank_text=True)
        self.page_tree = ET.parse(xml_file, parser)
        self.title = self.page_tree.getroot().get('title')
        self.page_number = self.page_tree.getroot().get('page-number')
        self.width = float(self.page_tree.getroot().get('width')) if bool(self.page_tree.getroot().get('width')) else 0.0
        self.height = float(self.page_tree.getroot().get('height')) if bool(self.page_tree.getroot().get('height')) else 0.0
    else:
        # start a new tree with the constructor arguments
        self.page_tree = ET.ElementTree(ET.Element(self.XML_TAG))
        if title is not None:
            self.page_tree.getroot().set('title', title)
        if page_number is not None:
            self.page_tree.getroot().set('page-number', str(page_number))
    # NOTE(review): indentation of the following guards reconstructed as siblings
    # from a flattened source -- confirm against the repository.
    if xml_target_file is not None:
        # stale word positions would be re-created below / by the caller
        self.remove_tags_from_page_tree([WordPosition.FAKSIMILE])
    if svg_source_file is not None:
        self.page_tree.getroot().set('svg-source-file', svg_source_file)
    if faksimile_image is not None:
        faksimile_image.attach_object_to_tree(self.page_tree)
    if text_field is not None:
        text_field.attach_object_to_tree(self.page_tree)
    # re-read the (possibly just attached) children so attributes reflect the tree
    self.svg_source_file = self.page_tree.getroot().get('svg-source-file')
    self.faksimile_image = FaksimileImage(node=self.page_tree.getroot().find('.//' + FaksimileImage.XML_TAG))\
        if len(self.page_tree.getroot().findall('.//' + FaksimileImage.XML_TAG)) > 0 else None
    self.text_field = TextField(node=self.page_tree.getroot().find('.//' + TextField.XML_TAG))\
        if len(self.page_tree.getroot().findall('.//' + TextField.XML_TAG)) > 0 else None
    self.word_positions = [ WordPosition(node=node) for node in self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE) ]\
        if len(self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE)) > 0 else []
def append_word_position(self, word_position):
    """Appends word_position to word_positions and attaches it to page_tree.

    :param word_position: (word_position.WordPosition) position to register
    """
    self.word_positions.append(word_position)
    word_position.attach_object_to_tree(self.page_tree)
@classmethod
def get_faksimile_pages(cls, svg_file, page_number='') -> list:
    """Parse svg_file and return the faksimile pages it contains as a list.

    :param svg_file: path of the svg file to parse
    :param page_number: optional page number filter, forwarded to GET_FAKSIMILEPAGES
    """
    parsed_tree = ET.parse(svg_file)
    # map the anonymous default namespace to the prefix 'ns' for xpath queries
    prefix2uri = {}
    for prefix, uri in parsed_tree.getroot().nsmap.items():
        prefix2uri['ns' if prefix is None else prefix] = uri
    return cls.GET_FAKSIMILEPAGES(parsed_tree, namespaces=prefix2uri, page_number=page_number)
@staticmethod
def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='') -> list:
"""Creates and returns text fields contained in a svg_tree as a list.
"""
THRESHOLD_X = 10
if namespaces is None:
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
source_file_name = svg_tree.docinfo.URL
image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap), source_file_name)
xml_dir = '.{}xml'.format(sep)
faksimile_pages = list()
title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name))
+ if re.match(r'.*-\d+[a-z]$', title_string):
+ title_string = re.sub(r'-\d+[a-z]$', '', title_string)
title = title_string.replace('-', ' ')
rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap)\
if rect.get('id', svg_tree.getroot().nsmap).startswith(title_string)\
and rect.get('id', svg_tree.getroot().nsmap).endswith(str(page_number)) ]
for text_field_rect in rect_list:
tf_x = float(text_field_rect.get('x', svg_tree.getroot().nsmap)) - image.x
tf_y = float(text_field_rect.get('y', svg_tree.getroot().nsmap)) - image.y
tf_width = float(text_field_rect.get('width', svg_tree.getroot().nsmap))
tf_height = float(text_field_rect.get('height', svg_tree.getroot().nsmap))
+ tf_matrix = Matrix(transform_matrix_string=text_field_rect.get('transform'))\
+ if bool(text_field_rect.get('transform'))\
+ else None
id = text_field_rect.get('id', svg_tree.getroot().nsmap)
target_file_name = xml_dir + sep + id + '.xml' if isdir(xml_dir) else id + '.xml'
page_number = re.sub(r'.*[,_]', '', id)
if page_number.startswith('0'):
page_number = page_number.lstrip('0')
- text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y)
+ text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y, matrix=tf_matrix)
faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\
title=title, page_number=page_number, faksimile_image=image, text_field=text_field)
x_min = text_field.xmin + image.x
y_min = text_field.ymin + image.y
#rect_titles = svg_tree.getroot().xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}"]/ns:title'.format(\
# x_min, text_field.xmax + image.x - THRESHOLD_X, y_min, text_field.ymax + image.y, text_field.id), namespaces=namespaces)
rect_titles = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\
y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces)
rect_titles += get_paths_inside_rect(svg_tree, '//ns:path/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\
y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces)
for rect_title in rect_titles:
rect = rect_title.getparent()
x, y, height, width = 0.0, 0.0, 0.0, 0.0
if rect.tag.endswith('path'):
path = parse_path(rect.get('d'))
x, xmax, y, ymax = path.bbox()
width = xmax - x
height = ymax - y
else:
x = float(rect.get('x', svg_tree.getroot().nsmap))
y = float(rect.get('y', svg_tree.getroot().nsmap))
height = float(rect.get('height', svg_tree.getroot().nsmap))
width = width=float(rect.get('width', svg_tree.getroot().nsmap))
matrix = None
if bool(rect.get('transform')):
matrix = Matrix(transform_matrix_string=rect.get('transform'))
faksimile_page.append_word_position(\
WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=rect_title.text, height=height,\
width=width, x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE))
faksimile_pages.append(faksimile_page)
return faksimile_pages
def remove_tags_from_page_tree(self, list_of_tags_to_remove):
    """Delete every node matching one of the given tag xpaths from self.page_tree.

    :param list_of_tags_to_remove: list of tag names / xpath fragments to purge
    """
    for tag_xpath in list_of_tags_to_remove:
        obsolete_nodes = self.page_tree.xpath('//' + tag_xpath)
        for obsolete_node in obsolete_nodes:
            obsolete_node.getparent().remove(obsolete_node)
def get_paths_inside_rect(svg_tree, xpath, x_min, x_max, y_min, y_max, not_id, namespaces=None):
    """Return all nodes selected by xpath whose bounding box origin lies inside
    the rectangle (x_min, x_max, y_min, y_max) and whose id differs from not_id.

    :param svg_tree: (lxml.etree.ElementTree) parsed svg document
    :param xpath: xpath selecting rect/path nodes or their <title> children
    :param not_id: id of the node to exclude (usually the text field itself)
    :param namespaces: optional prefix->uri mapping; derived from svg_tree when missing
    :return: list of the originally selected nodes
    """
    paths = []
    # default changed from a shared mutable dict ({}) to None -- behavior identical
    if namespaces is None or len(namespaces) == 0:
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    for path_node in svg_tree.xpath(xpath, namespaces=namespaces):
        # when a <title> child was selected, test its parent rect/path but
        # keep appending the originally selected node
        append_node = path_node
        if not path_node.tag.endswith('path') and not path_node.tag.endswith('rect'):
            path_node = path_node.getparent()
        x, xmax, y, ymax = -1, -1, -1, -1
        init_xy = False
        if path_node.tag.endswith('rect'):
            x = float(path_node.get('x')) if bool(path_node.get('x')) else -1
            y = float(path_node.get('y')) if bool(path_node.get('y')) else -1
            xmax = x + float(path_node.get('width')) if bool(path_node.get('width')) else -1
            ymax = y + float(path_node.get('height')) if bool(path_node.get('height')) else -1
            init_xy = True
        elif path_node.tag.endswith('path') and bool(path_node.get('d')):
            # bool() already rejects an empty 'd'; the former "!= 0" comparison of a
            # string against an int was always True and has been dropped
            path = parse_path(path_node.get('d'))
            x, xmax, y, ymax = path.bbox()
            init_xy = True
        if init_xy:
            if bool(path_node.get('transform')):
                matrix = Matrix(transform_matrix_string=path_node.get('transform'))
                # BUGFIX: transform all four coordinates from the ORIGINAL values;
                # the previous code overwrote x/xmax first and then fed the already
                # transformed x into get_new_y
                new_x = matrix.get_new_x(x=x, y=y)
                new_xmax = matrix.get_new_x(x=xmax, y=ymax)
                new_y = matrix.get_new_y(x=x, y=y)
                new_ymax = matrix.get_new_y(x=xmax, y=ymax)
                x, xmax, y, ymax = new_x, new_xmax, new_y, new_ymax
            if x > x_min and x < x_max\
            and y > y_min and y < y_max\
            and path_node.get('id') != not_id:
                paths.append(append_node)
    return paths
Index: svgscripts/datatypes/faksimile_image.py
===================================================================
--- svgscripts/datatypes/faksimile_image.py (revision 100)
+++ svgscripts/datatypes/faksimile_image.py (revision 101)
@@ -1,104 +1,108 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent faksimile images.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import fnmatch
from lxml import etree as ET
import os
from os.path import basename, dirname, isfile, realpath, sep
import sys
from .image import Image
+from .matrix import Matrix
from .text_field import TextField
sys.path.append('svgscripts')
from local_config import FAKSIMILE_LOCATION
class FaksimileImage(Image):
"""
This class represents a faksimile image.
Args:
file_name (str): name of the image file.
node (lxml.etree.Element) node, containing information
URL (str): URL of image file.
height (float): height of image
width (float): width of image
x (float): x
y (float): y
"""
XML_TAG = 'faksimile-image'
#OLD_NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/'
NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/'
- def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, text_field=None):
+ def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, text_field=None):
super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL, local_path=local_path,\
- height=height, width=width, text_field=text_field, tag=self.XML_TAG)
+ height=height, width=width, matrix=matrix, text_field=text_field, tag=self.XML_TAG)
self.x = x
self.y = y
def get_image_joined_with_text_field(self, text_field):
    """Return a new FaksimileImage that copies self and carries text_field.

    :param text_field: (text_field.TextField) text field of the new image
    :return: FaksimileImage

    BUGFIX: the copy now also propagates the transformation matrix
    (self.transform, set by Image.__init__ in this revision); it was
    previously dropped when joining with a text field.
    """
    return FaksimileImage(file_name=self.file_name, local_path=self.local_path, URL=self.URL, height=self.height,\
            width=self.width, x=self.x, y=self.y, matrix=self.transform, text_field=text_field)
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = super(FaksimileImage,cls).get_semantic_dictionary()
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('text_field', TextField))
- dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('URL', str, cardinality=1))
+ #dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('URL', str, cardinality=1))
return cls.return_dictionary_after_updating_super_classes(dictionary)
@staticmethod
def CREATE_IMAGE(image_node, source_file=None):
"""Instantiates a FaksimileImage from a (lxml.etree.Element) image_node.
"""
namespaces = image_node.nsmap
if len(namespaces) == 0:
namespaces = { 'xlink': '' }
local_path = image_node.get('{%s}href' % namespaces['xlink'])
file_name = basename(local_path)
if file_name != local_path and source_file is not None:
local_path = realpath(dirname(source_file)) + sep + local_path
local_path = realpath(local_path)
if not isfile(local_path):
local_path = None
for path, dirs, files in os.walk(os.path.abspath(FAKSIMILE_LOCATION)):
for filename in fnmatch.filter(files, file_name):
local_path = os.path.join(path, filename)
break
URL = FaksimileImage.NIETZSCHE_SOURCES_URL + file_name.replace('.jpg','')
height = float(image_node.get('height')) if bool(image_node.get('height')) else 0.0
width = float(image_node.get('width')) if bool(image_node.get('width')) else 0.0
x = float(image_node.get('x')) if bool(image_node.get('x')) else 0.0
y = float(image_node.get('y')) if bool(image_node.get('y')) else 0.0
- return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y)
+ matrix = Matrix(transform_matrix_string=image_node.get('transform'))\
+ if bool(image_node.get('transform'))\
+ else None
+ return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y, matrix=matrix)
Index: svgscripts/datatypes/transkriptionField.py
===================================================================
--- svgscripts/datatypes/transkriptionField.py (revision 100)
+++ svgscripts/datatypes/transkriptionField.py (revision 101)
@@ -1,202 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to transform a svg file according to the dimension of its transkription field.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__version__ = "0.0.1"
import sys
from os.path import exists
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET
from xml.parsers.expat import ExpatError
from .matrix import Matrix
+from .text_field import TextField
MAX_SMALLER_PATH_WIDTH = 50.0
MAX_SMALLER_PATH_HEIGHT = 50.0
MAX_DIFF_DOC_SELF_WIDTH = 100.0
MAX_DIFF_DOC_SELF_HEIGHT = 100.0
MIN_AREA = 2500.0
class TranskriptionField:
"""
A class containing the dimensions of the transkription field.
Args:
filename (str): name of the svg file
"""
def __init__(self, filename, multipage_index=-1):
    """Parse the svg file and derive the transkription field's bounding box.

    :param filename: path of the svg file
    :param multipage_index: index (0 or 1) of the field on a multi-page sheet;
                            -1 selects the single largest field
    :raises ExpatError: when the svg file is empty
    :raises Exception: when the svg root has no 'viewBox' attribute
    """
    self.width = 0.0
    self.height = 0.0
    self.xmin = 0.0
    self.xmax = 0.0
    self.ymin = 0.0
    self.ymin_without_title = 0.0
    self.ymax = 0.0
    self.documentWidth = 0.0
    self.documentHeight = 0.0
    self.path = None                       # svgpathtools path outlining the field
    self.second_field = None               # second TranskriptionField on multi-page sheets
    self.filename = filename
    self.line_number_area_width = 0.0
    try:
        paths, attributes, self.svg_attributes = svg_to_paths.svg2paths(filename, return_svg_attributes=True)
    except ExpatError:
        # svg2paths raises ExpatError on an empty file; re-raise with a clearer message
        raise ExpatError('File {} is empty!'.format(filename))
    if len(self.svg_attributes) > 0 and bool(self.svg_attributes.get('viewBox')):
        viewBox = (self.svg_attributes['viewBox'].split())
    else:
        raise Exception('File "{}" does not have an attribute "viewBox"'.format(filename))
    self.documentWidth = float(viewBox[2])
    self.documentHeight = float(viewBox[3])
    if self.is_shrunk():
        # the svg was already cropped to the field: the viewBox IS the field
        self.xmin = float(viewBox[0])
        self.ymin = float(viewBox[1])
        self.width = self.documentWidth
        self.height = self.documentHeight
    else:
        # find the field as the largest enclosing path
        sorted_paths = self.sort_according_to_area_desc(paths, attributes)
        if multipage_index < 0 and len(sorted_paths) > 0:
            self.path = sorted_paths[0]
        elif len(sorted_paths) > 1:
            # multi-page: order the two largest fields left-to-right and pick by index
            self.path = sorted(sorted_paths[:2], key=lambda path: path.bbox()[0])[multipage_index]
            if multipage_index == 0:
                self.second_field = TranskriptionField(filename, multipage_index=1)
        if self.path is not None:
            self.xmin, self.xmax, self.ymin, self.ymax = self.path.bbox()
            self.width = self.xmax - self.xmin
            self.height = self.ymax - self.ymin
            # 10pt of headroom above the field for a possible title line
            self.ymin_without_title = self.ymin - 10
-
def add_line_number_area_width(self, end_positionX_of_line_number_area):
    """Store the width of the line number area on self.line_number_area_width.

    On a verso page the line number area lies to the left of the field,
    otherwise to its right.

    :param end_positionX_of_line_number_area: x position where the area ends
    """
    on_verso = self.is_page_verso()
    self.line_number_area_width = (self.xmin - end_positionX_of_line_number_area) if on_verso\
            else (end_positionX_of_line_number_area - self.xmax)
+
+ def convert_to_text_field(self) ->TextField:
+ """Convert to TextField.
+ """
+ return TextField(width=self.width, height=self.height, x=self.xmin, y=self.ymin)
def is_page_verso(self) -> bool:
    """Return True if the area right of the transkription field is smaller
    than the area left of it.
    """
    right_margin = self.documentWidth - self.xmax
    left_margin = self.xmin
    return right_margin < left_margin
def is_shrunk(self) -> bool:
    """Return True if the svg viewBox no longer starts at the origin,
    i.e. both viewBox offsets are non-zero (svg already cropped to the field).
    """
    if len(self.svg_attributes) == 0 or not bool(self.svg_attributes.get('viewBox')):
        return False
    x_offset, y_offset = self.svg_attributes['viewBox'].split()[:2]
    return float(x_offset) != 0 and float(y_offset) != 0
def get_svg_attributes(self, attrib_key):
    """Return the svg attribute stored under attrib_key, or None when the
    attribute dictionary is missing/empty or the value is falsy.
    """
    attributes = self.svg_attributes
    if attributes is None or len(attributes) == 0:
        return None
    value = attributes.get(attrib_key)
    return value if bool(value) else None
def shrink_svg_to_transkription_field(self, target_filename=None):
    """ Changes the viewBox of the svg graphics to the size of the transkription field.
    If a target_filename is specified, the changes are saved to a new file,
    otherwise they are saved to the input file.

    Args:
        target_filename (str): name of the target svg file
    Returns:
        0 on success, 1 when the file was already shrunk, 2 when the svg has no viewBox
    """
    # re-register the document's namespaces so ElementTree serializes without
    # generated 'ns0:' prefixes
    if bool(self.svg_attributes.get('xmlns')):
        ET.register_namespace('', self.svg_attributes['xmlns'])
    if bool(self.svg_attributes.get('xmlns:xlink')):
        ET.register_namespace('xlink', self.svg_attributes['xmlns:xlink'])
    et = ET.parse(self.filename)
    root = et.getroot()
    if bool(root.attrib.get('viewBox')):
        if(not self.is_shrunk()):
            # crop the viewBox to the field's bounding box
            root.attrib['viewBox'] = '{} {} {} {}'.format(self.xmin, self.ymin, self.width, self.height)
            # keep the physical size attributes consistent with the new viewBox
            if bool(root.attrib.get('width')):
                root.attrib['width'] = '{}pt'.format(self.width)
            if bool(root.attrib.get('height')):
                root.attrib['height'] = '{}pt'.format(self.height)
            if not bool(target_filename):
                target_filename = self.filename   # overwrite the input file
            et.write(target_filename)
            return 0
        else:
            #print('File {} already transformed!'.format(self.filename))
            return 1
    else:
        print('ERROR: file {} does not contain a svg/@viewBox!'.format(self.filename)) #TODO: throw error
        return 2
def transkription_field_found(self) -> bool:
    """ Returns whether transkription field was found in __init__.

    All four bounds and both dimensions must be strictly positive.
    """
    # CLEANUP: a stray, misplaced docstring fragment ("Return a list of paths
    # sorted according to volume, descending.") that preceded this method was a
    # no-op statement and has been removed; sort_according_to_area_desc carries
    # its own docstring.
    return self.width > 0.0 and self.height > 0.0 and self.xmin > 0.0 and self.xmax > 0.0 and self.ymin > 0.0 and self.ymax > 0.0
def getWidth(self):
    """Returns documentWidth
    """
    # width of the whole svg document (viewBox[2]), not of the transkription field
    return self.documentWidth
def getHeight(self):
    """Returns documentHeight.

    NOTE(review): the previous docstring claimed "documentHeight if not
    is_shrunk, else height", but the code unconditionally returns
    documentHeight -- confirm which behavior is intended.
    """
    return self.documentHeight
def get_path_area(self, path, attribute_dict, removal_dict=None) -> float:
    """Return area of path.bbox.

    Returns 0.0 for open/discontinuous paths and for paths nearly as large as
    the whole document (within MAX_DIFF_DOC_SELF_WIDTH/HEIGHT) -- those cannot
    be the transkription field.

    :param path: svgpathtools path
    :param attribute_dict: svg attributes of the path (checked for 'transform')
    :param removal_dict: currently unused -- kept for interface compatibility
    """
    try:
        if not bool(path)\
        or not path.iscontinuous()\
        or not path.isclosed():
            return 0.0
        xmin, xmax, ymin, ymax = path.bbox()
        width = xmax - xmin
        height = ymax - ymin
        if 'transform' in attribute_dict.keys():
            # NOTE(review): get_transformed_positions is unpacked into
            # (xmin, ymax, width, height) and xmax/ymin are then re-read via
            # get_new_x/get_new_y without arguments -- this relies on Matrix
            # caching the last transformation; confirm against matrix.py.
            matrix = Matrix(attribute_dict['transform'])
            xmin, ymax, width, height = matrix.get_transformed_positions(xmin, ymin, width, height)
            xmax = matrix.get_new_x()
            ymin = matrix.get_new_y()
            width = xmax - xmin
            height = ymax - ymin
        if self.documentWidth - width <= MAX_DIFF_DOC_SELF_WIDTH:
            return 0.0
        if self.documentHeight - height <= MAX_DIFF_DOC_SELF_HEIGHT:
            return 0.0
        return width * height
    except AssertionError:
        # svgpathtools may assert on degenerate paths; treat them as area 0
        return 0.0
def sort_according_to_area_desc(self, paths, attributes, removal_dict=None) ->list:
    """Return paths sorted by the area of their bbox (descending), dropping
    paths whose area does not exceed MAX_SMALLER_PATH_HEIGHT*documentWidth/4.

    :param paths: list of svgpathtools paths
    :param attributes: parallel list of attribute dicts
    :param removal_dict: forwarded to get_path_area (currently unused there)

    PERF: each area is now computed once per path; the old code called
    get_path_area in the filter AND once per comparison while sorting
    (and inconsistently forwarded removal_dict only when sorting -- harmless
    since get_path_area ignores it, but now it is forwarded uniformly).
    """
    threshold = MAX_SMALLER_PATH_HEIGHT * self.documentWidth / 4
    scored = [ (path, self.get_path_area(path, attributes[index], removal_dict=removal_dict))\
            for index, path in enumerate(paths) ]
    kept = [ (path, area) for path, area in scored if area > threshold ]
    return [ path for path, area in sorted(kept, key=lambda scored_path: scored_path[1], reverse=True) ]
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py (revision 100)
+++ svgscripts/datatypes/word.py (revision 101)
@@ -1,863 +1,871 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
import inspect
from lxml import etree as ET
from operator import attrgetter
import re
import string
import sys
import warnings
from .box import Box
from .editor_comment import EditorComment
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .style import Style
from .word_deletion_path import WordDeletionPath
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)
def execute_function_on_parts(word_parts, func_name):
    """Execute function on parts and add those parts instead of original word to word_parts.

    Calls the zero-argument method named func_name on every word; a word that
    produced word_parts is replaced in the returned list by those parts, in place.

    :param word_parts: list of words (each provides the method and a word_parts list)
    :param func_name: name of the method to call on each word
    :return: new word_parts, output from the LAST call (None for an empty input)

    FIXES: the dynamic call now uses getattr instead of eval (same result for any
    valid method name, no arbitrary-code-execution hazard), and 'output' is
    initialized so an empty input no longer raises NameError on return.
    """
    copy_parts = word_parts[:]
    output = None
    for word in word_parts:
        output = getattr(word, func_name)()
        if len(word.word_parts) > 0:
            # splice the parts in at the word's position, then drop the word
            for part_word in word.word_parts:
                copy_parts.insert(copy_parts.index(word), part_word)
            copy_parts.remove(word)
            word.word_parts = []
    return copy_parts, output
def update_transkription_position_ids(word):
    """Renumber the word's transkription_position ids by their left position.

    When word.word_parts carries duplicate ids, the parts are renumbered by
    list position first. Every transkription position also gets has_box
    cleared and deleted reset to False.
    """
    part_ids = [ part.id for part in word.word_parts ]
    if len(set(part_ids)) != len(part_ids):
        # duplicate ids found: renumber parts by their list position
        for new_id, part in enumerate(word.word_parts):
            part.id = new_id
    positions_left_to_right = sorted(word.transkription_positions, key=lambda tp: tp.left)
    for new_id, position in enumerate(positions_left_to_right):
        position.id = new_id
        position.has_box = None
        position.deleted = False
class Word(SimpleWord):
    """
    This class represents a word.
    """
    # Attributes copied onto new words created from transkription positions
    # (see _create_new_word).
    COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ]
    # Maps a transkription-position property key to the list attribute it is
    # appended to on new words (instead of a plain attribute assignment).
    APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' }
    # XML tag of the container holding the raw word part attribute dicts.
    DATA = 'debug-data'
    RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText']
    XML_TAG = 'word'
    XML_EARLIER_VERSION = 'earlier-version'
    XML_OVERWRITES = 'overwrites'
    # Maps correction instance-attribute names to the XML attribute names
    # used when (de)serializing words.
    XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\
            'isDeletionOfWord': 'deletesEarlierPart',\
            'isExtensionOfWord': 'extendsEarlierVersion',\
            'isTransformationOfWord': 'transformsEarlierPart' }
    def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None):
        """Initialize a word.

        id: word id on its page; text: the word's text; line_number: line the
        word belongs to (-1 = unknown); deleted: whether the author deleted it;
        transkription_positions / faksimile_positions: positions on the two
        representations; word_part_objs: raw attribute dicts extracted from svg;
        word_parts: partial words this word consists of; writing_process_id:
        genetic stage (-1 = unknown); earlier_version: an earlier Word version;
        box_paths: accepted for caller compatibility (not stored here);
        styles: styles characterizing the word's appearance.
        """
        super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
                faksimile_positions=faksimile_positions)
        self.corrections = []
        self.deleted = deleted
        self.deletion_paths = []
        self.debug_container = {}
        self.debug_msg = None
        self.earlier_version = earlier_version
        self.edited_text = None
        self.editor_comment = None
        # Correction relations; set by create_cls / create_earlier_version.
        self.isClarificationOfWord = None
        self.isDeletionOfWord = None
        self.isExtensionOfWord = None
        self.isTransformationOfWord = None
        # Derive missing text from the transkription positions.
        # NOTE(review): the guard only counts text of TranskriptionPosition
        # instances, but the join below concatenates get_text() of ALL
        # positions — confirm the list never mixes types here.
        if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0:
            self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ])
        self.overwrites_word = None
        self.styles = styles\
                if styles is not None\
                else []
        self.verified = None
        self.writing_process_id = writing_process_id
        self.writing_processes = []
        self.word_insertion_mark = None
        self.word_box = None
        self.word_parts = word_parts if word_parts is not None else []
        self.word_part_objs = word_part_objs if word_part_objs is not None else []
def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
"""Add a word deletion path to word.
"""
if len(self.word_parts) > 0:
for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
elif self.deleted and len(self.transkription_positions) > 0:
word_path = Path.create_path_from_transkription_position(self.transkription_positions[0],\
tr_xmin=tr_xmin, tr_ymin=tr_ymin)
self.deletion_paths = [ deletion_path for deletion_path in deletion_paths\
if do_paths_intersect_saveMode(deletion_path, word_path) ]
    def attach_word_to_tree(self, target_tree):
        """Attaches word to tree target_tree.

        Serializes flags, edited text, editor comment, writing process id,
        word parts, earlier version, overwritten word, box and correction
        attributes onto the word node created by the super class.
        :return: the created word node (lxml.etree.Element)
        """
        word_node = super(Word,self).attach_word_to_tree(target_tree)
        if self.deleted is not None:
            word_node.set('deleted', str(self.deleted).lower())
        if self.verified is not None:
            word_node.set('verified', str(self.verified).lower())
        if self.edited_text is not None:
            word_node.set('edited-text', self.edited_text)
        if self.editor_comment is not None:
            self.editor_comment.attach_object_to_tree(word_node)
        if self.writing_process_id > -1:
            word_node.set('writing-process-id', str(self.writing_process_id))
        for index, word_part in enumerate(self.word_parts):
            # Renumber parts so ids reflect serialization order.
            word_part.id = index
            word_part.attach_word_to_tree(word_node)
        if self.earlier_version is not None:
            earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
            self.earlier_version.attach_word_to_tree(earlier_node)
        # Overwritten words without transkription positions are skipped.
        if self.overwrites_word is not None\
                and len(self.overwrites_word.transkription_positions) > 0:
            overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES)
            self.overwrites_word.attach_word_to_tree(overwrite_node)
        if self.word_box is not None:
            self.word_box.attach_object_to_tree(word_node)
        if len(self.corrections) > 0:
            word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ])))
        # Correction relations are stored as boolean XML attributes.
        for key in self.XML_CORRECTION_DICT.keys():
            if self.__dict__[key] is not None:
                word_node.set(self.XML_CORRECTION_DICT[key], 'true')
        return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
"""Returns true if transkription_positions belong to different WritingProcesses.
"""
if len(self.word_parts) > 0 and include_parts:
return len(set(word.writing_process_id for word in self.word_parts)) > 1
return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1
    def set_parent_word_writing_process_id(self):
        """Set writing_process_id for parent word.

        Uses the highest writing_process_id among the styles of the parts'
        first transkription positions; if the parts' styles also differ when
        the writing process is ignored, the id is incremented once more.
        """
        # 'ids' actually collects the distinct Style objects of the parts.
        ids = set(word.transkription_positions[0].style for word in self.word_parts\
                if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None)
        if len(ids) > 1:
            self.writing_process_id = max([style.writing_process_id for style in ids])
            # Styles that still differ without their writing process
            # indicate an additional revision step.
            if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\
                    for word in self.word_parts\
                    if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\
                    > 1:
                self.writing_process_id += 1
    @classmethod
    def create_cls(cls, word_node):
        """Creates a word from a (lxml.Element) node.
        [:return:] Word
        """
        # NOTE(review): 'cls' is rebound to the INSTANCE returned by the
        # super class factory; every 'cls.x' below is an instance attribute.
        cls = super(Word,cls).create_cls(word_node)
        cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
        cls.split_strings = None
        cls.join_string = word_node.get('join')
        if bool(word_node.get('split')):
            cls.split_strings = word_node.get('split').split(' ')
            # The split attribute must reproduce the word's text exactly.
            if ''.join(cls.split_strings) != cls.text:
                error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
                        format(word_node.getroottree().docinfo.URL, str(cls.id))\
                        + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
                        + 'Text attribute: "{0}".\n'.format(cls.text)
                raise Exception(error_msg)
        # Tri-state flags: True/False when the attribute exists, else None.
        cls.verified = word_node.get('verified') == 'true'\
                if bool(word_node.get('verified')) else None
        cls.deleted = word_node.get('deleted') == 'true'\
                if bool(word_node.get('deleted')) else None
        cls.edited_text = word_node.get('edited-text')
        cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\
                if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None
        cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ]
        # 'corrections' stores indices into word_parts.
        if bool(word_node.get('corrections')):
            for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
                if index < len(cls.word_parts):
                    cls.corrections.append(cls.word_parts[index])
        cls.earlier_version = None
        if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0:
            cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0]
        for key_value in cls.XML_CORRECTION_DICT.values():
            if word_node.get(key_value) == 'true':
                cls.__dict__[key_value] = True
        if cls.earlier_version is not None:
            # Resolve the boolean correction markers on the parts to actual
            # references into the earlier version / this word.
            for word_part in cls.word_parts:
                for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]:
                    if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\
                            and len(cls.word_parts) <= len(cls.earlier_version.word_parts):
                        try:
                            word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id]
                        except Exception:
                            msg = f'{cls.id} {cls.text}: {word_part.id}'
                            raise Exception(msg)
                for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]:
                    if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                        word_part.__dict__[key] = cls.earlier_version
                for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]:
                    if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                        word_part.__dict__[key] = cls
        cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\
                if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\
                else None
        cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\
                if len(word_node.xpath('./' + Box.XML_TAG)) > 0\
                else None
        return cls
@classmethod
- def join_words(cls, list_of_words):
+ def join_words(cls, list_of_words, add_white_space_between_words=False):
"""Creates a word from a list of words.
[:return:] Word
"""
if len(list_of_words) > 1:
deleted = True in [ word.deleted for word in list_of_words ]\
and len(set([ word.deleted for word in list_of_words ])) == 1
line_number = list_of_words[0].line_number\
if len(set([ word.line_number for word in list_of_words ])) == 1\
else -1
for word in list_of_words:
if len(word.word_parts) > 0:
index = list_of_words.index(word)
list_of_words.remove(word)
for part_word in reversed(word.word_parts):
list_of_words.insert(index, part_word)
- new_word = cls(id=list_of_words[0].id, text=''.join([word.text for word in list_of_words]),\
+ new_word_text = ''.join([word.text for word in list_of_words])\
+ if not add_white_space_between_words\
+ else ' '.join([word.text for word in list_of_words])
+ new_word = cls(id=list_of_words[0].id, text=new_word_text,\
line_number=line_number, deleted=deleted, word_parts=list_of_words)
if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]:
change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0]
new_word.edited_text = new_word.text.replace(change_text, change_text[:-1])
for id, word in enumerate(new_word.word_parts): word.id = id
return new_word
if len(list_of_words) > 0:
return list_of_words[0]
else:
return None
    def create_earlier_version(self, root_word=None, id=0):
        """Create an earlier version of word.

        Recursively builds earlier versions of the word parts, records
        correction relations (deletion/transformation/extension) on the
        parts, and returns a new Word carrying the reconstructed text and
        deep-copied transkription positions.
        [:return:] Word
        """
        if root_word is None:
            root_word = self
            root_word.set_parent_word_writing_process_id()
        word_parts = []
        # Single punctuation marks do not count towards deletion status.
        non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\
                if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ]
        non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts)
        # If every non-punctuation part is deleted, the whole word counts
        # as deleted and the parts' flags are moved up to the word.
        if non_single_punctuation_word_parts_length > 0\
                and len([ word_part for word_part in non_single_punctuation_word_parts\
                    if word_part.deleted ])\
                == non_single_punctuation_word_parts_length:
            self.deleted = True
            for word_part in non_single_punctuation_word_parts: word_part.deleted = False
        for id, word_part in enumerate(self.word_parts):
            earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id)
            if word_part.deleted:
                word_part.isDeletionOfWord = earlierWordPart
                word_parts.append(earlierWordPart)
                if word_part not in self.corrections:
                    self.corrections.append(word_part)
            # A style change (or an explicit earlier-version box) marks the
            # part as a transformation of the overwritten word.
            # NOTE(review): word_part.word_box may be None when
            # overwrites_word is set but no transkription positions exist —
            # confirm the 'or word_part.word_box.earlier_version' operand
            # cannot be reached with word_box is None.
            elif word_part.overwrites_word is not None\
                    and ((len(word_part.transkription_positions) > 0\
                    and word_part.overwrites_word.transkription_positions[0].style is not None\
                    and word_part.transkription_positions[0].style is not None\
                    and word_part.transkription_positions[0].style\
                    != word_part.overwrites_word.transkription_positions[0].style)
                    or word_part.word_box.earlier_version):
                word_part.overwrites_word.id = word_part.id
                word_parts.append(word_part.overwrites_word)
                word_part.isTransformationOfWord = word_part.overwrites_word
                #print(f'transform: {self.text}')
                if word_part not in self.corrections:
                    self.corrections.append(word_part)
            # Parts written in the root word's own stage extend the
            # earlier version.
            elif root_word.writing_process_id > -1\
                    and (len(word_part.transkription_positions) > 0\
                    and word_part.transkription_positions[0].style is not None\
                    and word_part.transkription_positions[0].style.writing_process_id\
                    == root_word.writing_process_id):
                word_part.extendsEarlierVersion = True
                #print('extends')
                if word_part not in self.corrections:
                    self.corrections.append(word_part)
            else:
                # NOTE(review): this inner 'if word_part.deleted' branch is
                # unreachable — deleted parts are handled by the first
                # branch above; confirm and simplify.
                if word_part.deleted:
                    word_part.isDeletionOfWord = earlierWordPart
                    word_parts.append(earlierWordPart)
                    if word_part not in self.corrections:
                        self.corrections.append(word_part)
                else:
                    #print(f'default: {self.text}')
                    word_parts.append(earlierWordPart)
        text = ''.join([ word.text for word in word_parts ])\
                if len(word_parts) > 0\
                else self.text
        # A single remaining part is merged into the word itself.
        if len(word_parts) == 1:
            self.transkription_positions += word_parts[0].transkription_positions
            self.faksimile_positions += word_parts[0].faksimile_positions
            word_parts = []
        new_transkription_positions = copy.deepcopy(self.transkription_positions)
        if len(self.transkription_positions) > 0\
                and self.transkription_positions[0].style is not None:
            writing_process_id = self.transkription_positions[0].style.writing_process_id
            for new_tp in new_transkription_positions:
                new_tp.style.writing_process_id = writing_process_id
        return Word(id=id, text=text, transkription_positions=new_transkription_positions,\
                faksimile_positions=self.faksimile_positions, line_number=self.line_number,\
                word_parts=word_parts)
    def create_correction_history(self, page=None, box_style=None):
        """Create correction history.

        Builds the overwritten word from this word's box (if any), recurses
        into the word parts, and derives the earlier version, extension
        relations and edited text for partitioned words.
        """
        if self.word_box is not None:
            manuscript = self.transkription_positions[0].style.manuscript\
                    if len(self.transkription_positions) > 0\
                    and self.transkription_positions[0].style is not None\
                    else None
            # Style precedence: page-derived style > box_style > default.
            style = Style()
            if box_style is not None:
                style = box_style
            if page is not None:
                style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript)
                for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]:
                    style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
            transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions)
            for transkription_position in transkription_positions:
                transkription_position.style = style
            self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\
                    line_number=self.line_number)
        for word_part in self.word_parts:
            word_part.create_correction_history(page=page, box_style=box_style)
        if len(self.word_parts) > 0:
            earlier_version = self.create_earlier_version()
            extending_words = self._get_parts_with_property_key('extendsEarlierVersion')
            if len(extending_words) > 0:
                for word in extending_words:
                    word.isExtensionOfWord = earlier_version
            # The edited text drops the deleted parts.
            if self.has_mixed_status('deleted', include_parts=True):
                self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ])
            # Only keep the earlier version when corrections exist.
            if len(self.corrections) > 0:
                self.earlier_version = earlier_version
@staticmethod
def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
"""Creates a word from a (lxml.Element) node or word_part_objs.
[:return:] Word
"""
if word_node is not None: # init word from xml node
id = int(word_node.get('id'))
line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
text = word_node.get('text')
deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
if len(word_node.findall('.//' + Word.DATA)) > 0\
else [ item.attrib for item in word_node.findall('.//part')]
return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
WIDTH = 5
TOPCORRECTION = 2.0
FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
height = height
x = round(float(word_part_objs[0]['x']), 3)
if(page is not None and bool(page.style_dict)):
HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
if endSign is not None and '%' in endSign:
lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
if bool(page.style_dict[key].get('font-size'))]
lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
elif endSign is not None and '%' in endSign:
endX = float(endX) + WIDTH
bottom = round(float(word_part_objs[0]['y']), 3)
y = round(bottom - height + TOPCORRECTION, 3)
width = round(float(endX) - x, 3)
transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
text = ''.join([ dict['text'] for dict in word_part_objs])
line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
word.debug_msg = debug_msg
return word
else:
error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
raise Exception('Error: {}'.format(error_msg))
    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = super(Word,cls).get_semantic_dictionary()
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\
                cardinality=1, cardinality_restriction='minCardinality',\
                name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\
                name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\
                name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\
                comment='Word has been deleted by the author using a deletion path.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comment', EditorComment,\
                name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\
                name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\
                name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\
                name='isClarificationOfWord', label='word is a clarification of word',\
                comment='The author has used this part of the word in order to clarify the appearance of that word.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\
                name='isDeletionOfWord', label='word is a deletion of word',\
                comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\
                name='isExtensionOfWord', label='word is a extension of word',\
                comment='The author has used this part of a word in order to extend an earlier version of this word.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\
                name='isTransformationOfWord', label='word is a transformation of word',\
                comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.'))
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\
                name='overwritesWord', label='word overwrites word',\
                comment='The author has used this word in order to overwrite that word.'))
        # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING,
        # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class.
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\
                name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\
                subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING))
        # Every correction relation is declared a subproperty of
        # isCorrectionOfWord so reasoners can treat them uniformly.
        super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\
                name='isCorrectionOfWord', label='word is a correction of word',\
                comment='The author has used this word in order to correct that word.')
        for key in cls.XML_CORRECTION_DICT.keys():
            correction_dict = dictionary[cls.PROPERTIES_KEY].get(key)
            correction_dict.update(super_property_dictionary)
            dictionary[cls.PROPERTIES_KEY].update({key: correction_dict})
        return cls.return_dictionary_after_updating_super_classes(dictionary)
def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
"""Returns true if transkription_positions have mixed status concerning the property_key in their __dict__.
"""
if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions):
return False
if len(self.word_parts) > 0 and include_parts:
if concerns_word:
if False in set(property_key in word.__dict__.keys() for word in self.word_parts):
return False
return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1
else:
return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\
if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1
return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1
def init_word(self, page):
"""Initialize word with objects from page.
"""
super(Word,self).init_word(page)
if self.writing_process_id > -1:
self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ]
writing_processes = self.writing_processes
for word_part in self.word_parts:
word_part.init_word(page)
self.lines += word_part.lines
self.writing_processes += word_part.writing_processes
self.lines = [ line for line in set(self.lines) ]
self.writing_processes = [ wp for wp in set(self.writing_processes)]
if self.overwrites_word is not None:
self.overwrites_word.init_word(page)
if self.earlier_version is not None:
if self.earlier_version.writing_process_id == -1:
self.earlier_version.writing_process_id = self.writing_process_id-1
if self.earlier_version.line_number == -1:
self.earlier_version.line_number = self.line_number
self.earlier_version.init_word(page)
- def join(self, other_word, append_at_end_of_new_word=True):
+ def join(self, other_word, append_at_end_of_new_word=True, add_white_space_between_words=False):
"""Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions.
"""
if append_at_end_of_new_word:
- self.text = self.text + other_word.text
+ self.text = self.text + other_word.text\
+ if not add_white_space_between_words\
+ else self.text + ' ' + other_word.text
for position in other_word.transkription_positions:
position.id = str(len(self.transkription_positions))
self.transkription_positions.append(position)
else:
self.text = other_word.text + self.text
index = 0
for position in other_word.transkription_positions:
self.transkription_positions.insert(index, position)
index += 1
while index < len(self.transkription_positions):
self.transkription_positions[index].id = str(index)
index += 1
self.simplify_transkription_positions()
    def partition_according_to_deletion(self):
        """Partition a word according to its transkription_positions' deletion status
        ->split word and add partial words as its parts.
        """
        if self.has_mixed_status('deleted'):
            transkription_positions = []
            last_status = None
            # Group consecutive positions with equal deletion status into
            # one partial word each.
            for transkription_position in self.transkription_positions:
                if transkription_position.deleted != last_status\
                        and len(transkription_positions) > 0:
                    newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                            transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
                    self.word_parts.append(newWord)
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_status = transkription_position.deleted
            # Flush the final group.
            if len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
            # The parent word now only aggregates its parts.
            self.transkription_positions = []
            self.line_number = -1
            self.deleted = False
        elif len(self.word_parts) > 0:
            self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
        elif not self.deleted\
                and len(self.transkription_positions) > 0\
                and self.transkription_positions[0].deleted:
            # Uniformly deleted positions mark the whole word as deleted.
            self.deleted = True
    def partition_according_to_writing_process_id(self):
        """Partition a word according to its transkription_positions' writing_process_ids
        ->split word and add partial words as its parts.
        """
        if self.belongs_to_multiple_writing_processes():
            last_writing_process_id = -1
            transkription_positions = []
            # Group consecutive positions with equal writing_process_id into
            # one partial word each.
            for transkription_position in self.transkription_positions:
                if transkription_position.writing_process_id != last_writing_process_id\
                        and len(transkription_positions) > 0:
                    newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                            transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
                    self.word_parts.append(newWord)
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_writing_process_id = transkription_position.writing_process_id
            # Flush the final group.
            if len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
                self.word_parts.append(newWord)
            self.transkription_positions = []
        elif len(self.word_parts) > 0:
            self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
            # The parent inherits the latest stage of its parts.
            if self.belongs_to_multiple_writing_processes(include_parts=True):
                self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0]
        elif len(self.transkription_positions) > 0:
            self.writing_process_id = self.transkription_positions[0].writing_process_id
    def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False):
        """Determines whether word is over a word box.

        Matched boxes are removed from box_paths and attached to the
        corresponding transkription positions.
        :return: the (partial) word that is over a box, or None
        """
        word_over_box = None
        if len(self.word_parts) > 0:
            for word in self.word_parts:
                current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None))
                if current_word is not None and current_word.word_box is not None:
                    word_over_box = current_word
        else:
            new_tp_dict = {}
            for index, transkription_position in enumerate(self.transkription_positions):
                # Nudge the first position to the right so it is not matched
                # against the box already claimed by the previous word.
                if previous_word_has_box and index == 0:
                    if len(transkription_position.positional_word_parts) > 0:
                        transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2
                        #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}')
                    else:
                        transkription_position.left += 1
                word_path = Path.create_path_from_transkription_position(transkription_position,\
                        tr_xmin=tr_xmin, tr_ymin=tr_ymin)
                containing_boxes = [ box_path for box_path in box_paths\
                        if word_path.is_partially_contained_by(box_path)\
                        or box_path.do_paths_intersect(word_path) ]
                if len(containing_boxes) > 0:
                    if previous_word_has_box:
                        print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}')
                    # May register replacement positions in new_tp_dict.
                    self._set_box_to_transkription_position(containing_boxes[0], word_path,\
                            transkription_position, new_tp_dict, tr_xmin)
                    # Each box is claimed by at most one position.
                    box_paths.remove(containing_boxes[0])
            # Swap the original positions for their replacements in place.
            for replace_tp in new_tp_dict.keys():
                for tp in new_tp_dict.get(replace_tp):
                    self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
                self.transkription_positions.remove(replace_tp)
            word_over_box = self._get_partial_word_over_box()
        update_transkription_position_ids(self)
        return word_over_box
def set_word_insertion_mark(self, word_insertion_mark):
"""Sets word_insertion_mark
"""
self.word_insertion_mark = word_insertion_mark
def set_writing_process_id_to_transkription_positions(self, page):
"""Determines the writing process id of the transkription_positions.
"""
for transkription_position in self.transkription_positions:
if len(transkription_position.positional_word_parts) > 0:
for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
if font_key in page.fontsizekey2stage_mapping.keys():
transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
    def simplify_transkription_positions(self):
        """Merge transkription_positions if possible.

        Walks the positions from right to left and merges each mergeable
        neighbor pair into a single position rebuilt from their combined
        positional word parts. Requires every position to carry
        positional_word_parts.
        """
        index = len(self.transkription_positions)-1
        while index > 0\
                and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
            current_tp = self.transkription_positions[index]
            index -= 1
            previous_tp = self.transkription_positions[index]
            if previous_tp.is_mergebale_with(current_tp):
                positional_word_parts = previous_tp.positional_word_parts
                positional_word_parts += current_tp.positional_word_parts
                transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                        positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
                # Only replace the pair when the merge yields one position.
                if len(transkription_positions) == 1:
                    # Prefer the earlier position's stage when it is known.
                    transkription_positions[0].writing_process_id = previous_tp.writing_process_id\
                            if previous_tp.writing_process_id != -1\
                            else current_tp.writing_process_id
                    self.transkription_positions.pop(index+1)
                    self.transkription_positions[index] = transkription_positions[0]
        #print(self.text, len(self.transkription_positions))
def split(self, split_string, start_id=0):
"""Splits the word and returns an 3-tuple of new words.
"""
previousString, currentString, nextString = self.text.partition(split_string)
currentWord = None
previousWord = None
nextWord = None
previousIndex = 0
current_id = start_id
all_positional_word_parts = []
for position in self.transkription_positions:
all_positional_word_parts += position.positional_word_parts
if len(all_positional_word_parts) == 0:
warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
if len(previousString) > 0:
previous_pwps = []
while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
previous_pwps.append(all_positional_word_parts[previousIndex])
previousIndex += 1
if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
else:
previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
+ previousWord.faksimile_positions = self.faksimile_positions
current_id += 1
all_positional_word_parts = all_positional_word_parts[previousIndex:]
if len(nextString) > 0:
tmp_pwps = []
index = 0
while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
tmp_pwps.append(all_positional_word_parts[index])
index += 1
if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
else:
next_pwps = all_positional_word_parts[index:]
next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
next_text = ''.join([ pwp.text for pwp in next_pwps ])
nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
+ nextWord.faksimile_positions = self.faksimile_positions
all_positional_word_parts = all_positional_word_parts[:index]
current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
+ currentWord.faksimile_positions = self.faksimile_positions
return previousWord, currentWord, nextWord
    def split_according_to_status(self, status, splits_are_parts=False):
        """Split a word according to its transkription_positions' text.

        Groups consecutive transkription_positions that share the same value
        of the attribute named by ``status`` and creates one new Word per
        group via ``_create_new_word`` — but only when the word actually has
        mixed values for ``status`` (see ``has_mixed_status``).

        :param status: name of the transkription_position attribute to split on
        :param splits_are_parts: if True, append the new words to self.word_parts
            and clear self.transkription_positions (parts then own the positions)
        :return: a list of new word.Word
        """
        new_words = []
        if self.has_mixed_status(status):
            last_status = None
            transkription_positions = []
            for transkription_position in self.transkription_positions:
                # flush the accumulated group whenever the status value changes
                if transkription_position.__dict__[status] != last_status\
                    and len(transkription_positions) > 0:
                    new_words.append(\
                        self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_status = transkription_position.__dict__[status]
            # flush the trailing group
            if len(transkription_positions) > 0:
                new_words.append(\
                    self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
        if splits_are_parts:
            self.word_parts += new_words
            if len(self.word_parts) > 0:
                self.transkription_positions = []
        return new_words
    def undo_partitioning(self):
        """Undo partitioning.

        Recursively dissolves word_parts: each part's transkription_positions
        are folded back into self (only while self's joined position text does
        not yet cover self.text, which avoids duplicating positions), then all
        partition-related state is reset.
        """
        if len(self.word_parts) > 0:
            for word_part in self.word_parts:
                word_part.undo_partitioning()
                # stop re-absorbing once the joined text already matches self.text
                if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]):
                    self.transkription_positions += word_part.transkription_positions
            self.earlier_version = None
            self.edited_text = None
            self.word_box = None
            self.word_parts = []
            self.corrections = []
            self.earlier_versions = []
            self.box_paths = []
def _create_new_word(self, transkription_positions, status, new_id=0):
"""Create a new word from self and transkription_positions.
"""
newWord = Word(id=new_id, transkription_positions=transkription_positions)
for key in self.COPY_PROPERTY_KEY:
if key != status and key in self.__dict__.keys():
newWord.__dict__[key] = self.__dict__[key]
if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys():
newWord.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(transkription_positions[0].__dict__[status])
else:
newWord.__dict__[status] = transkription_positions[0].__dict__[status]
return newWord
def _get_parts_with_property_key(self, property_key):
"""Return a list of word_parts with property == property_key.
"""
word_parts = []
for word_part in self.word_parts:
if property_key in word_part.__dict__.keys():
word_parts.append(word_part)
else:
word_parts += word_part._get_parts_with_property_key(property_key)
return word_parts
    def _get_partial_word_over_box(self):
        """Partition a word according to its transkription_positions' has_box
            ->split word and add partial words as its parts.

        Three cases:
        1. mixed has_box values: split self into word_parts grouped by box,
           remembering the part that lies over a box;
        2. self already has word_parts: recurse into them until one yields
           a word over a box;
        3. exactly one transkription_position carries a box: self itself is
           the word over the box.

        :return: word over box or self
        """
        word_over_box = None
        if self.has_mixed_status('has_box'):
            transkription_positions = []
            last_word_box = None
            for transkription_position in self.transkription_positions:
                # group boundary: the box changed -> emit a new part word
                if transkription_position.has_box != last_word_box\
                    and len(transkription_positions) > 0:
                    newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                            transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                    self.word_parts.append(newWord)
                    if last_word_box is not None:
                        word_over_box = newWord
                        word_over_box.word_box = last_word_box
                    transkription_positions = []
                transkription_positions.append(transkription_position)
                last_word_box = transkription_position.has_box
            # flush the trailing group
            if len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_word_box is not None:
                    word_over_box = newWord
                    word_over_box.word_box = last_word_box
            # the parts now own the positions
            self.transkription_positions = []
        elif len(self.word_parts) > 0:
            #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box')
            for word_part in self.word_parts:
                if word_over_box is None:
                    word_over_box = word_part._get_partial_word_over_box()
                else:
                    break
        elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
            word_over_box = self
            word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
        return word_over_box
    def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
        """Set box_path to transkription_position that is contained by box_path.
        Create new transkription_positions by splitting old ones if necessary and add them to new_transkription_positions_dictionary.

        :param tr_xmin: x offset of the transkription field; subtracted from the
            absolute bbox coordinates to get transkription-relative split positions
        """
        if box_path.contains_path(word_path):
            # whole word lies inside the box -> no split needed
            transkription_position.has_box = box_path
        elif box_path.contains_start_of_path(word_path):
            # box covers the start of the word -> split at the box's right edge
            split_position = box_path.path.bbox()[1] - tr_xmin
            new_tps = transkription_position.split(split_position)
            if len(new_tps) == 2:
                new_tps[0].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                # split produced no usable pair -> fall back to marking the whole position
                transkription_position.has_box = box_path
        elif box_path.contains_end_of_path(word_path):
            # box covers the end of the word -> split at the box's left edge
            split_position = box_path.path.bbox()[0] - tr_xmin
            new_tps = transkription_position.split(split_position)
            if len(new_tps) == 2:
                new_tps[1].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                transkription_position.has_box = box_path
        else: # box_path in the middle of word_path
            split_position1 = box_path.path.bbox()[0] - tr_xmin
            split_position2 = box_path.path.bbox()[1] - tr_xmin
            new_tps = transkription_position.split(split_position1, split_position2)
            if len(new_tps) >= 2:
                # the middle piece lies over the box
                new_tps[1].has_box = box_path
                new_transkription_positions_dictionary.update({ transkription_position: new_tps })
            else:
                transkription_position.has_box = box_path
def do_paths_intersect_saveMode(mypath1, mypath2):
    """Returns true if paths intersect, false if not or if there was an exception.
    """
    try:
        # svgpathtools' intersect may raise AssertionError on degenerate paths
        intersects = mypath1.path.intersect(mypath2.path, justonemode=True)
        return intersects or mypath1.is_partially_contained_by(mypath2)
    except AssertionError:
        return False
Index: svgscripts/extract_line_continuation.py
===================================================================
--- svgscripts/extract_line_continuation.py (revision 100)
+++ svgscripts/extract_line_continuation.py (revision 101)
@@ -1,222 +1,224 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract line continuations.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import warnings
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from datatypes.box import text_node_is_inside_match_box, tspan_node_is_inside_match_box
from datatypes.line import Line
from datatypes.line_continuation import LineContinuation
from datatypes.matrix import Matrix
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.reference import Reference
from datatypes.transkriptionField import TranskriptionField
from util import back_up
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
UNITTESTING = False
DEBUG = False
def extract_line_continuations(page: Page, svg_file=None, warning_message='WARNING'):
    """Extract line continuations.

    Finds arrow glyphs (text ')' in the 'Frutiger-Europeen' font) on the
    marginalia of the transkription field and attaches a LineContinuation
    (with a parsed Reference where one can be read next to the arrow) to the
    line at the arrow's height. Writes the page tree back unless UNITTESTING.
    """
    if svg_file is None:
        if page.source is None or not isfile(page.source):
            raise Exception('Function "extract_line_continuations" needs a page with a valid source or a svg_file!')
        svg_file = page.source
    if not UNITTESTING:
        print(Fore.CYAN + f'Extracting line continuations on {page.title}, {page.number} ...' + Style.RESET_ALL)
    svg_tree = ET.parse(svg_file)
-    transkription_field = TranskriptionField(svg_file)
+    transkription_field = TranskriptionField(svg_file, multipage_index=page.multipage_index)
    page.update_line_number_area(transkription_field, svg_tree=svg_tree)
    for line in page.lines: line.editor_comments = []
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    # style key of the arrow font, if the page uses it at all
    arrow_style_key = [ key for key, value in page.style_dict.items() if value.get('font-family') == 'Frutiger-Europeen'][0]\
            if len([ key for key, value in page.style_dict.items() if value.get('font-family') == 'Frutiger-Europeen']) > 0\
            else None
    if arrow_style_key is not None:
+        if DEBUG:
+            print(arrow_style_key)
        for arrow in _extract_arrow_nodes(svg_tree, arrow_style_key, transkription_field, namespaces):
            # tspans inherit their transform from the parent text node
            matrix = Matrix(transform_matrix_string=arrow.get('transform'))\
                    if not arrow.tag.endswith('tspan')\
                    else Matrix(transform_matrix_string=arrow.getparent().get('transform'))
            line = _get_line_of_arrow(arrow, page, transkription_field)
            if line is not None:
                # first try the "from" reference, then the "to" reference
                reference_counter = 0
                reference = None
                while reference is None and reference_counter < 2:
                    reference = _get_reference(svg_tree, arrow, matrix, transkription_field, namespaces, is_from_reference=(reference_counter==0))
                    reference_counter += 1
                if reference is not None:
                    line.editor_comments.append(LineContinuation(reference=reference, to_reference=(reference_counter>1)))
                else:
                    # no readable reference: record an empty one, direction by arrow side
                    to_reference = (matrix.getX() > transkription_field.xmax)
                    line.editor_comments.append(LineContinuation(reference=Reference(), to_reference=to_reference))
            else:
                y = round(matrix.getY() - transkription_field.ymin, 2)
                warnings.warn(f'{warning_message}: There is no line for {y}')
    for line in page.lines: line.attach_object_to_tree(page.page_tree)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
                script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def _extract_arrow_nodes(svg_tree: ET.ElementTree, arrow_style_key: str, transkription_field=None, namespaces=None) ->list:
    """Extract arrow nodes from svg_tree.

    An arrow node is a text or tspan node of the arrow style class whose
    text is ')' and that sits on the marginalia of the transkription field.
    """
    if transkription_field is None:
        transkription_field = TranskriptionField(svg_tree.docinfo.URL)
    if namespaces is None:
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    xpath_query = '//ns:text[contains(@class, "{0}")]'.format(arrow_style_key)\
            + '|//ns:tspan[contains(@class, "{0}")]'.format(arrow_style_key)
    arrows = []
    for candidate in svg_tree.xpath(xpath_query, namespaces=namespaces):
        if candidate.text == ')' and node_is_on_marginalia(candidate, transkription_field):
            arrows.append(candidate)
    return arrows
def _get_arrow_y(arrow: ET.Element, matrix=None) ->float:
"""Return y of arrow node.
"""
if matrix is None:
matrix = Matrix(transform_matrix_string=arrow.get('transform'))\
if not arrow.tag.endswith('tspan')\
else Matrix(transform_matrix_string=arrow.getparent().get('transform'))
if arrow.tag.endswith('tspan'):
return matrix.add2Y(add_to_y=arrow.get('y'))
else:
return matrix.getY()
def _get_line_of_arrow(arrow: ET.Element, page: Page, transkription_field: TranskriptionField, matrix=None) ->Line:
    """Return Line next to arrow.

    Maps the arrow's y (made relative to the transkription field, nudged up
    by .5) to a line number and returns the page line with that id, or None.
    """
    arrow_y = _get_arrow_y(arrow, matrix=matrix)
    target_line_number = page.get_line_number(round(arrow_y - transkription_field.ymin, 2) -.5)
    for line in page.lines:
        if line.id == target_line_number:
            return line
    return None
def _get_reference(svg_tree: ET.ElementTree, arrow: ET.Element, arrow_matrix: Matrix, transkription_field: TranskriptionField, namespaces: dict, is_from_reference=True) ->Reference:
    """Return reference.

    Builds a horizontal search box (+/-5 around the arrow's y), either from
    the nearer page edge up to the arrow (is_from_reference=True) or from the
    arrow towards the transkription field, joins the text of all text nodes
    found inside it from left to right, and parses the string into a Reference.

    :return: Reference or None if no reference string was found/parsable
    """
    reference = None
    # a tspan's x is relative to the parent's transform matrix
    arrow_left = arrow_matrix.add2X(add_to_x=arrow.get('x'))\
            if arrow.tag.endswith('tspan')\
            else arrow_matrix.getX()
    arrow_y = _get_arrow_y(arrow, matrix=arrow_matrix)
    # default search range: from the nearer page edge up to the arrow
    xmin = 0\
            if arrow_left < transkription_field.xmin\
            else transkription_field.xmax + transkription_field.line_number_area_width
    xmax = arrow_left
    ymin = arrow_y -5
    ymax = arrow_y +5
    if not is_from_reference:
        # "to" reference: search on the far side of the arrow instead
        xmin = xmax
        xmax = transkription_field.xmin - transkription_field.line_number_area_width\
                if arrow_left < transkription_field.xmin\
                else transkription_field.documentWidth + transkription_field.line_number_area_width
    text_nodes_on_arrow_line = sorted([ text_node for text_node in svg_tree.xpath('//ns:text', namespaces=namespaces)\
            if text_node != arrow and text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax) ],\
            key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX())
    reference_string = ''
    for text_node in text_nodes_on_arrow_line:
        reference_string += ''.join([ child.text for child in text_node.getchildren()])\
                if len(text_node.getchildren()) > 0\
                else text_node.text
    if reference_string != '':
        try:
            reference = Reference.create_cls(reference_string=reference_string)
        except Exception:
            # unparsable reference string: print it for diagnosis, return None
            print(reference_string)
    return reference
def node_is_on_marginalia(node: ET.Element, transkription_field: TranskriptionField) ->bool:
    """Return true if node is on marginalia.

    A node is on the marginalia when it lies left of the transkription field
    (between 0 and xmin) or right of it (between xmax and documentWidth),
    within the field's vertical extent.
    """
    tf = transkription_field
    inside_match_box = tspan_node_is_inside_match_box\
            if node.tag.endswith('tspan')\
            else text_node_is_inside_match_box
    return inside_match_box(node, 0, tf.xmin, tf.ymin, tf.ymax)\
            or inside_match_box(node, tf.xmax, tf.documentWidth, tf.ymin, tf.ymax)
def usage():
    """Print information on how to use the script (main's docstring).
    """
    print(main.__doc__)
def main(argv):
    """This program can be used to extract the line continuations.
    svgscripts/extract_line_continuation.py [OPTIONS] a xml file about a manuscript, containing information about its pages.
    a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help show help
    :return: exit code (int)
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    # guard clause instead of trailing else: fail fast on a missing input file
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # NOTE: the previous `manuscript_file` local was computed but never used
    # inside this function and has been removed.
    counter = 0
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
        if not UNITTESTING:
            # keep a backup of each page's xml before modifying it
            back_up(page, page.xml_file)
        extract_line_continuations(page)
        counter += 1
    # plain `if` instead of the `not UNITTESTING and print(...)` expression idiom
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages processed]')
    return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: svgscripts/convert_wordPositions.py
===================================================================
--- svgscripts/convert_wordPositions.py (revision 100)
+++ svgscripts/convert_wordPositions.py (revision 101)
@@ -1,557 +1,690 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import cairosvg
import getopt
import json
from lxml.html import builder as E
from lxml.html import open_in_browser
import lxml
from pathlib import Path as PathLibPath
from os import sep, listdir, mkdir, path, remove
from os.path import exists, isfile, isdir, dirname
import re
import sys
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.writing_process import WritingProcess
from datatypes.word import Word
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
+EXIST_DB = 'http://130.60.24.65:8081/exist/rest/db/ProjectData/Nietzsche/'
+
class Converter:
    """The converter super class.

    Subclasses convert a Page (parsed from a 'svgWordPositions' xml file)
    into some output format; this base implementation prints plain text.
    """
    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        self.page = page
        self.non_testing = non_testing
        self.show_word_insertion_mark = show_word_insertion_mark

    def _get_transkription_positions(self, transkription_positions, stage_version=''):
        """Returns the transkription_positions of the indicated stage_version.

        stage_version may be '' (all positions), a single digit 'N'
        (exactly writing process N), 'N+' (N and later) or 'N-M'
        (inclusive range). Unrecognized patterns yield an empty list.
        """
        if stage_version == '':
            return transkription_positions
        convertable_transkription_positions = []
        if re.match(r'^\d$', stage_version):
            writing_process_id = int(stage_version)
            for transkription_position in transkription_positions:
                if transkription_position.writing_process_id == writing_process_id:
                    convertable_transkription_positions.append(transkription_position)
        elif re.match(r'^\d\+$', stage_version):
            version_range = [ *range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)) ]
            for transkription_position in transkription_positions:
                if transkription_position.writing_process_id in version_range:
                    convertable_transkription_positions.append(transkription_position)
        elif re.match(r'^\d\-\d$', stage_version):
            start_stop = [ int(i) for i in re.split(r'-', stage_version) ]
            version_range = [ *range(start_stop[0], start_stop[1]+1) ]
            for transkription_position in transkription_positions:
                if transkription_position.writing_process_id in version_range:
                    convertable_transkription_positions.append(transkription_position)
        return convertable_transkription_positions

    def _get_words(self, words, highlighted_words=None):
        """Return the words that will be highlighted.
        """
        return highlighted_words if highlighted_words is not None else words

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Prints all words to output_file (or stdout), grouped by line number.

        Even line numbers are printed (zero-filled) at the start of their line.
        :return: exit code (int)
        """
        first_word_of_line = None
        out = sys.stdout
        if output_file is not None:
            out = open(output_file, 'w')
        try:
            for word in self.page.words:
                if first_word_of_line is None or first_word_of_line.line_number != word.line_number:
                    out.write('\n')
                    first_word_of_line = word
                    if word.line_number % 2 == 0:
                        out.write(str(word.line_number).zfill(2) + ' ')
                    else:
                        out.write(' ')
                if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0:
                    if word.text is not None:
                        out.write(word.text + ' ')
        finally:
            # BUG FIX: the previous implementation unconditionally called
            # out.close(), closing sys.stdout when no output_file was given.
            # Only close file handles that were opened here.
            if out is not sys.stdout:
                out.close()
        return 0

    @classmethod
    def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False, key=''):
        """Returns a converter of type converter_type.
        [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None

        Looks up '<converter_type>Converter' among the direct subclasses of
        this class; falls back to the base Converter for unknown types.
        """
        cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() }
        cls_key = converter_type + 'Converter'
        if bool(cls_dict.get(cls_key)):
            converter_cls = cls_dict[cls_key]
            if converter_cls == JSONConverter:
                # JSONConverter has a different constructor signature (key)
                return converter_cls(page, non_testing, key=key)
            return converter_cls(page, non_testing, show_word_insertion_mark)
        else:
            return Converter(page, non_testing, show_word_insertion_mark)
class JSONConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a json file.
    """
+    def __init__(self, page, non_testing=True, key=''):
+        # NOTE(review): `key` is accepted but never stored; add_object2dict
+        # reads self.key in its empty-list branch — verify this is intended.
+        Converter.__init__(self, page, non_testing, False)
+
+    def _add_word_to_list(self, words, word, text, text_field=None, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1):
+        """Add word to list.
+
+        Appends one dict per transkription position of `word` and, recursively,
+        of its word_parts (parts inherit the parent's id and markup). When
+        text_field is given, coordinates are shifted into its coordinate system.
+        """
+        # NOTE(review): `id` shadows the builtin; kept for byte-compatibility.
+        id = word.id\
+                if parent_id == -1\
+                else parent_id
+        edited_text = word.edited_text\
+                if edited_text is None\
+                else edited_text
+        earlier_version = word.earlier_version\
+                if earlier_version is None\
+                else earlier_version
+        overwrites_word = word.overwrites_word\
+                if overwrites_word is None\
+                else overwrites_word
+        line_number = word.line_number
+        for tp in word.transkription_positions:
+            # stable id for each transkription position, nested for parts
+            tp_id = f'w{word.id}:tp{tp.id}'\
+                    if parent_id == -1\
+                    else f'w{parent_id}:w{word.id}:tp{tp.id}'
+            if text_field is not None:
+                word_dict = { 'id': id, 'text': text, 'left': tp.left + text_field.left, 'top': tp.top + text_field.top,\
+                        'width': tp.width, 'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted }
+                if tp.transform is not None:
+                    # shift the transform's translation by the text field offset
+                    matrix = tp.transform.clone_transformation_matrix()
+                    xmin = text_field.left
+                    ymin = text_field.top
+                    matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3)
+                    matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3)
+                    word_dict.update({ 'transform': matrix.toString() })
+                    # left/top become relative to the transformed origin
+                    if tp.left > 0:
+                        word_dict.update({ 'left': round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)})
+                    else:
+                        word_dict.update({ 'left': 0})
+                    word_dict.update({ 'top': round((tp.height-1.5)*-1, 3)})
+            else:
+                word_dict = { 'id': id, 'text': text, 'left': tp.left, 'top': tp.top, 'width': tp.width,\
+                        'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted }
+                if tp.transform is not None:
+                    word_dict.update({ 'transform': tp.transform.toString() })
+            if edited_text is not None:
+                word_dict.update({'edited_text': edited_text})
+            if earlier_version is not None:
+                word_dict.update({'earlier_version': earlier_version.text })
+            if overwrites_word is not None:
+                word_dict.update({'overwrites_word': overwrites_word.text })
+            if parent_id > -1:
+                word_dict.update({'part_text': word.text })
+            words.append(word_dict)
+        for wp in word.word_parts:
+            self._add_word_to_list(words, wp, text, text_field=text_field, edited_text=edited_text,\
+                    earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=word.id)
+
+    def create_json_dict(self) ->dict:
+        """Create and return a json dictionary.
+        """
+        words = []
+        text_field = None
+        if self.page.svg_image is not None:
+            if self.page.svg_image.text_field is None:
+                # derive the text field from the svg file if not yet set
+                text_field = self.page.svg_image.text_field = TranskriptionField(self.page.svg_image.file_name).convert_to_text_field()
+            self.page.svg_image.decontextualize_file_name(update_url=EXIST_DB)
+        for word in self.page.words:
+            self._add_word_to_list(words, word, word.text, text_field=text_field)
+        lines = []
+        for line in self.page.lines: lines.append({ 'id': line.id, 'top': line.top, 'bottom': line.bottom })
+        return { 'title': self.page.title, 'number': self.page.number, 'words': words,\
+                'svg': self.add_object2dict(self.page.svg_image), 'lines': lines }
+
+    def convert(self, output_file=None, stage_version='', highlighted_words=None):
+        """Converts Page to JSON.
+        """
+        if output_file is None:
+            output_file = 'output.json'
+        json_file = open(output_file, "w+")
+        try:
+            json.dump(self.create_json_dict(), json_file)
+        except Exception:
+            raise Exception('Error in json.dump')
+        json_file.close()
+        return 0
+
+    def add_object2dict(self, object_instance):
+        """Add an object to json_dict and generate json data and interfaces.
+
+        [:return:] json dict or object_instance
+
+        Recursively serializes an object via its class's semantic dictionary;
+        builtin values pass through unchanged.
+        """
+        json_dict = {}
+        object_type = type(object_instance)
+        if object_type.__module__ == 'builtins':
+            if object_type != list:
+                return object_instance
+            else:
+                items = []
+                for item in object_instance:
+                    items.append(self.add_object2dict(item))
+                if len(items) > 0:
+                    return items
+                else:
+                    # NOTE(review): self.key is never set in __init__ — this
+                    # branch would raise AttributeError; confirm against callers.
+                    return { self.key: [] }
+        semantic_dictionary = object_type.get_semantic_dictionary()
+        for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]:
+            content = object_instance.__dict__.get(key)
+            if content_type == list\
+                and content is not None\
+                and len(content) > 0\
+                and type(content[0]).__module__ != 'builtins':
+                content_list = []
+                for content_item in content:
+                    content_list.append(self.add_object2dict(content_item))
+                json_dict.update({key: content_list})
+            elif content_type.__module__ == 'builtins':
+                if content is not None:
+                    json_dict.update({key: content})
+            else:
+                if content is not None and type(content) == list:
+                    content_list = []
+                    for content_item in content:
+                        content_list.append(self.add_object2dict(content_item))
+                    json_dict.update({key: content_list})
+                else:
+                    if content is not None:
+                        json_dict.update({key: self.add_object2dict(content)})
+        return json_dict
+
+class oldJSONConverter(Converter):
+    """This class can be used to convert a 'svgWordPositions' xml file to a json file.
+
+    Legacy converter: serializes the page (or one of its attributes, chosen
+    by `key`) to JSON and emits TypeScript interface files for the types seen.
+    """
    # mapping of Python builtins to their TypeScript type names
    PY2TS_DICT = { float: 'number', int: 'number', bool: 'boolean', str: 'string' }
    def __init__(self, page, non_testing=True, key=''):
        Converter.__init__(self, page, non_testing, False)
        self.key = key
        self.interface_output_dir = PathLibPath('ts_interfaces')
        # start with a clean interface output directory
        if not self.interface_output_dir.is_dir():
            self.interface_output_dir.mkdir()
        elif len(list(self.interface_output_dir.glob('*.ts'))) > 0:
            for ts_file in self.interface_output_dir.glob('*.ts'):
                remove(ts_file)
    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to JSON.

        When self.key is set, only page.<key> is serialized; otherwise the
        whole page. Also writes the collected TypeScript imports file.
        """
        if output_file is None:
            output_file = 'output.json'
        class_dict = {}
        if self.key != '':
            object_instance = self.page.__dict__.get(self.key)
            if object_instance is not None:
                json_dict = self.add_object2dict(object_instance, class_dict)
                if type(json_dict) == list:
                    json_dict = { self.key : json_dict }
            else:
                print(f'Page initialized from {self.page.page_tree.docinfo.URL} does not have an object at "{self.key}"!')
                return 2
        else:
            json_dict = self.add_object2dict(self.page, class_dict)
        json_file = open(output_file, "w+")
        try:
            json.dump(json_dict, json_file)
        except Exception:
            raise Exception('Error in json.dump')
        json_file.close()
        self.create_imports(class_dict)
        return 0
    def add_object2dict(self, object_instance, class_dict):
        """Add an object to json_dict and generate json data and interfaces.
        [:return:] json dict or object_instance

        Recursively serializes via the class's semantic dictionary while
        collecting one TypeScript interface per encountered class into class_dict.
        """
        json_dict = {}
        interface_list = []
        object_type = type(object_instance)
        if object_type.__module__ == 'builtins':
            if object_type != list:
                return object_instance
            else:
                items = []
                for item in object_instance:
                    items.append(self.add_object2dict(item, class_dict))
                if len(items) > 0:
                    return { self.key: items }
                else:
                    return { self.key: 'null' }
        semantic_dictionary = object_type.get_semantic_dictionary()
        for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]:
            content = object_instance.__dict__.get(key)
            if content_type == list\
                and content is not None\
                and len(content) > 0\
                and type(content[0]).__module__ != 'builtins':
                # non-empty list of project objects: recurse per item
                content_list = []
                for content_item in content:
                    content_list.append(self.add_object2dict(content_item, class_dict))
                json_dict.update({key: content_list})
                interface_list.append(f'{key}: {type(content[0]).__name__}[];')
            elif content_type.__module__ == 'builtins':
                if content_type != list:
                    # builtin scalar: map to the corresponding TS type
                    ts_type = self.PY2TS_DICT[content_type]\
                            if content_type in self.PY2TS_DICT.keys()\
                            else 'string'
                    interface_list.append(f'{key}: {ts_type};')
                    json_dict.update({key: content})
            else:
                if content is not None and type(content) == list:
                    interface_list.append(f'{key}: {content_type.__name__}[];')
                    content_list = []
                    for content_item in content:
                        content_list.append(self.add_object2dict(content_item, class_dict))
                    json_dict.update({key: content_list})
                else:
                    interface_list.append(f'{key}: {content_type.__name__};')
                    if content is not None:
                        json_dict.update({key: self.add_object2dict(content, class_dict)})
        # write the interface file once per class
        if object_type not in class_dict.keys():
            class_dict.update({object_type: self.create_interface(object_type.__name__, interface_list)})
        return json_dict
    def create_imports(self, class_dict):
        """Create an ts interface from a list of key and content_types.
        [:return:] file_name of interface
        """
        ts_file = PathLibPath('ts_imports.ts')
        file = open(ts_file, "w+")
        file.write(f'//import all interfaces from {self.interface_output_dir} ' + '\n')
        for interface_name, path_name in class_dict.items() :
            file.write('import {' + interface_name.__name__ + '} from \'./' + str(self.interface_output_dir.joinpath(path_name.stem)) + '\';\n')
        file.close()
        return ts_file
    def create_interface(self, class_name, interface_list) -> PathLibPath:
        """Create an ts interface from a list of key and content_types.
        [:return:] file_name of interface
        """
        ts_file = self.interface_output_dir.joinpath(PathLibPath(f'{class_name.lower()}.ts'))
        # import every non-builtin type referenced by the interface fields
        import_list = [ import_class_name for import_class_name in\
                [ import_class_name.split(': ')[1].replace(';','').replace('[]','') for import_class_name in interface_list ]\
                if import_class_name not in set(self.PY2TS_DICT.values()) ]
        file = open(ts_file, "w")
        for import_class_name in set(import_list):
            file.write('import {' + import_class_name + '} from \'./' + import_class_name.lower() + '\';\n')
        file.write(f'export interface {class_name} ' + '{\n')
        for interace_string in interface_list:
            file.write(f'\t' + interace_string + '\n')
        file.write('}')
        file.close()
        return ts_file
class SVGConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text.
    """
    BG_COLOR = 'yellow'   # default highlight color
    OPACITY = '0.2'       # default highlight opacity
    def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY):
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
        self.bg_color = bg_color
        self.opacity = opacity
    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to SVG

        Overlays a highlight rect per transkription position onto the page's
        svg file (group id 'Transkription') and writes the result to output_file.
        """
        title = self.page.title if(self.page.title is not None) else 'Test Page'
        title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
        svg_file = self.page.svg_file
        if svg_file is None and self.page.svg_image is not None:
            svg_file = self.page.svg_image.file_name
        elif svg_file is None:
            msg = f'ERROR: xml_source_file {self.page.docinfo.URL} does neither have a svg_file nor a svg_image!'
            raise Exception(msg)
        transkription_field = TranskriptionField(svg_file)
        # re-register the original namespaces so the output keeps its prefixes
        if bool(transkription_field.get_svg_attributes('xmlns')):
            ET.register_namespace('', transkription_field.get_svg_attributes('xmlns'))
        if bool(transkription_field.get_svg_attributes('xmlns:xlink')):
            ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink'))
        svg_tree = ET.parse(svg_file)
        transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'})
        # alternate yellow/orange per word unless a custom bg color is set;
        # highlighted words always get self.bg_color
        colors = [ 'yellow', 'orange' ] if self.bg_color == self.BG_COLOR else [ self.bg_color ]
        if highlighted_words is not None:
            colors = ['yellow']
        else:
            highlighted_words = []
        color_index = 0
        for word in self.page.words:
            word_id = 'word_' + str(word.id)
            for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
                transkription_position_id = word_id + '_' + str(transkription_position.id)
                color = colors[color_index] if word not in highlighted_words else self.bg_color
                rect_node = ET.SubElement(transkription_node, 'rect',\
                        attrib={'id': transkription_position_id, 'x': str(transkription_position.left + transkription_field.xmin),\
                        'y': str(transkription_position.top + transkription_field.ymin), 'width': str(transkription_position.width),\
                        'height': str(transkription_position.height), 'fill': color, 'opacity': self.opacity})
                if transkription_position.transform is not None:
                    # shift the transform's translation into page coordinates
                    # and make x/y relative to the transformed origin
                    matrix = transkription_position.transform.clone_transformation_matrix()
                    matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3)
                    matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3)
                    rect_node.set('transform', matrix.toString())
                    rect_node.set('x', str(round(transkription_position.left - transkription_position.transform.matrix[Matrix.XINDEX], 3)))
                    rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3)))
                # tooltip: show the word text on hover
                ET.SubElement(rect_node, 'title').text = word.text
                color_index = (color_index + 1) % len(colors)
        if output_file is not None:
            svg_tree.write(output_file)
        return 0
class HTMLConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file.

    Words are rendered as absolutely positioned, color-highlighted <a> elements
    laid over the page's svg image, so the transcription can be checked visually.
    """
    CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; }
        .highlight1 { background-color: pink; opacity: 0.2; }
        .highlight2 { background-color: red; opacity: 0.2; }
        .foreign { background-color: blue; opacity: 0.4; }
        .overwritten { background-color: green; opacity: 0.4; }
        .word-insertion-mark { background-color: orange; opacity: 0.2; }
        .deleted { background-color: grey; opacity: 0.2; }
        """

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to HTML.

        :param output_file: file name the HTML is written to (None -> no file output).
        :param stage_version: writing process version used to filter transkription positions.
        :param highlighted_words: words to be marked with css class 'highlight2'.
        :return: exit code (int)
        """
        title = self.page.title if(self.page.title is not None) else 'Test Page'
        title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
        if stage_version != '':
            title = title + ', Schreibstufe: ' + stage_version
        if self.page.svg_image is not None:
            width = self.page.svg_image.width
            height = self.page.svg_image.height
            svg_file = self.page.svg_image.file_name
        elif self.page.svg_file is not None:
            svg_file = self.page.svg_file
            transkription_field = TranskriptionField(svg_file)
            width = transkription_field.getWidth()
            height = transkription_field.getHeight()
        style_content = ' position: relative; width: {}px; height: {}px; background-image: url("{}"); background-size: {}px {}px '\
                .format(width, height, path.abspath(svg_file), width, height)
        style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS)
        head = E.HEAD(E.TITLE(title), E.META(charset='UTF-8'), style)
        transkription = E.DIV(id="transkription")
        counter = 0
        for word in self.page.words:
            highlight_class = 'highlight' + str(counter)\
                    if not word.deleted else 'deleted'
            if highlighted_words is not None\
               and word in highlighted_words:
                highlight_class = 'highlight2'
            earlier_text = '' if word.earlier_version is None else word.earlier_version.text
            if earlier_text == '' and len(word.word_parts) > 0:
                # use `part` here: the original comprehension shadowed `word`
                earlier_versions = [ part for part in word.word_parts if part.earlier_version is not None ]
                earlier_text = earlier_versions[0].text if len(earlier_versions) > 0 else ''
            if earlier_text != '':
                word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(word.id), str(word.line_number), earlier_text, word.text)
            else:
                word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text)
            if word.edited_text is not None:
                word_title += f'\n>{word.edited_text}'
            for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
                self._append2transkription(transkription, highlight_class, word_title, transkription_position)
            if word.overwrites_word is not None:
                overwritten_title = f'{word.text} overwrites {word.overwrites_word.text}'
                for overwritten_transkription_position in word.overwrites_word.transkription_positions:
                    self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
            for part_word in word.word_parts:
                highlight_class = 'highlight' + str(counter)\
                        if not part_word.deleted else 'deleted'
                for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version):
                    self._append2transkription(transkription, highlight_class, word_title, part_transkription_position)
                if part_word.overwrites_word is not None:
                    overwritten_title = f'{word.text} overwrites {part_word.overwrites_word.text}'
                    for overwritten_transkription_position in part_word.overwrites_word.transkription_positions:
                        self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
            counter = (counter + 1) % 2
        word_insertion_mark_class = 'word-insertion-mark'
        counter = 0
        for mark_foreign_hands in self.page.mark_foreign_hands:
            highlight_class = 'foreign'
            # BUGFIX (rev 101): use mark_foreign_hands.line_number; the old code
            # read the stale `word` variable left over from the loop above.
            title = 'id: {}/line: {}\n{} {}'.format(str(mark_foreign_hands.id), str(mark_foreign_hands.line_number),\
                    mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen)
            for transkription_position in mark_foreign_hands.transkription_positions:
                self._append2transkription(transkription, highlight_class, title, transkription_position)
        if self.show_word_insertion_mark:
            for word_insertion_mark in self.page.word_insertion_marks:
                wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
                style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                        word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
                link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content)
                transkription.append(link)
        html = E.HTML(head, E.BODY(transkription))
        if self.non_testing:
            open_in_browser(html)
        if output_file is not None:
            # context manager closes the file; the old trailing `f.closed` was a no-op
            with open(output_file, 'wb') as f:
                f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))
        return 0

    def _append2transkription(self, transkription, highlight_class, title, transkription_position):
        """Append one absolutely positioned <a> element to the transkription-div.
        """
        style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                transkription_position.top, transkription_position.left, transkription_position.width, transkription_position.height)
        if transkription_position.transform is not None:
            style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString())
            transform_origin_x = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1\
                    if (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1 < 0 else 0
            style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height)
        link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content)
        transkription.append(link)
def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR):
    """Creates a pdf file highlighting some words.

    A temporary svg file is written next to the pdf, converted with cairosvg
    and removed afterwards.

    :param xml_source_file: word position xml file used if page is None.
    :param page: Page instance to convert.
    :param highlighted_words: words to highlight.
    :param pdf_file_name: name of the output pdf ('.pdf' appended when missing).
    :param bg_color: background color passed on to the SVGConverter.
    """
    # Check for the full '.pdf' suffix: endswith('pdf') wrongly accepted names
    # like 'outputpdf', which made the temporary svg name collide with the pdf.
    if not pdf_file_name.endswith('.pdf'):
        pdf_file_name = pdf_file_name + '.pdf'
    # Swap only the final extension (str.replace would hit a '.pdf' mid-name).
    tmp_svg_file = pdf_file_name[:-len('.pdf')] + '.svg'
    create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page, highlighted_words=highlighted_words,\
            svg_file_name=tmp_svg_file, bg_color=bg_color)
    if isfile(tmp_svg_file):
        cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
        remove(tmp_svg_file)
def create_svg_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, svg_file_name='output.svg', bg_color=SVGConverter.BG_COLOR):
    """Creates a svg file highlighting some words.

    :param xml_source_file: word position xml file used if page is None.
    :param page: Page instance to convert (parsed from xml_source_file if None).
    :param highlighted_words: words to highlight.
    :param svg_file_name: name of the output svg ('.svg' appended when missing).
    :param bg_color: background color passed on to the SVGConverter.
    """
    if page is None and xml_source_file is not None:
        page = Page(xml_source_file)
    converter = SVGConverter(page, bg_color=bg_color)
    # Check for the full '.svg' suffix: endswith('svg') wrongly accepted
    # names like 'outputsvg' and left them without an extension.
    if not svg_file_name.endswith('.svg'):
        svg_file_name = svg_file_name + '.svg'
    converter.convert(output_file=svg_file_name, highlighted_words=highlighted_words)
def usage():
    """Print this script's command line help (taken from main's docstring)."""
    print(main.__doc__)
def main(argv):
    """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes.

    svgscripts/convert_wordPositions.py OPTIONS

    OPTIONS:
    -h|--help: show help
    -H|--HTML [default] convert to HTML test file
    -k|--key=key option for json converter:
    only convert object == page.__dict__[key]
    -o|--output=outputFile save output to file outputFile
    -P|--PDF convert to PDF test file
    -S|--SVG convert to SVG test file
    -s|--svg=svgFile: svg web file
    -T|--TEXT convert to TEXT output
    -t|--text=text highlight word
    -w|--word-insertion-mark show word insertion mark on HTML
    -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. }
    -x|--testing execute in test mode, do not write to file or open browser

    :return: exit code (int)
    """
    convert_to_type = None
    key = ''
    non_testing = True
    output_file = None
    page = None
    show_word_insertion_mark = False
    stage_version = ''
    svg_file = None
    text = None
    try:
        opts, args = getopt.getopt(argv, "hk:t:HPSTws:o:v:x", ["help", "key=", "text=", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version=", "testing"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        # also show help when no positional word position files were given
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-v', '--version'):
            if re.match(r'^(\d|\d\+|\d\-\d)$', arg):
                stage_version = arg
            else:
                raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg))
        elif opt in ('-w', '--word-insertion-mark'):
            show_word_insertion_mark = True
        elif opt in ('-P', '--PDF'):
            convert_to_type = 'PDF'
        elif opt in ('-S', '--SVG'):
            convert_to_type = 'SVG'
        elif opt in ('-T', '--TEXT'):
            convert_to_type = 'TEXT'
        elif opt in ('-H', '--HTML'):
            convert_to_type = 'HTML'
        elif opt in ('-x', '--testing'):
            non_testing = False
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-o', '--output'):
            output_file = arg
        elif opt in ('-k', '--key'):
            key = arg
        elif opt in ('-t', '--text'):
            text = arg
            print(arg)
    if len(args) < 1:
        usage()
        return 2
    if convert_to_type is None:
        # derive the target type from the output file's extension, default HTML
        if output_file is not None and len(re.split(r'\.', output_file)) > 1:
            output_file_part_list = re.split(r'\.', output_file)
            convert_to_type = output_file_part_list[len(output_file_part_list)-1].upper()
        else:
            convert_to_type = 'HTML'
    exit_code = 0
    for word_position_file in args:
        if not isfile(word_position_file):
            print("'{}' does not exist!".format(word_position_file))
            return 2
        if convert_to_type == 'PDF':
            if output_file is None:
                output_file = 'output.pdf'
            highlighted_words = None
            if text is not None:
                page = Page(word_position_file)
                highlighted_words = [ word for word in page.words if word.text == text ]
            create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file, highlighted_words=highlighted_words)
        else:
            if svg_file is not None:
                if isfile(svg_file):
                    page = PageCreator(word_position_file, svg_file=svg_file)
                else:
                    # BUGFIX: report the missing svg file; the old message
                    # printed word_position_file, which does exist here.
                    print("'{}' does not exist!".format(svg_file))
                    return 2
            else:
                page = Page(word_position_file)
                if page.svg_file is None:
                    print('Please specify a svg file!')
                    usage()
                    return 2
            highlighted_words = None
            if text is not None:
                highlighted_words = [ word for word in page.words if word.text == text ]
                print([ (word.id, word.text) for word in highlighted_words ])
            converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark, key=key)
            exit_code = converter.convert(output_file=output_file, stage_version=stage_version, highlighted_words=highlighted_words)
    return exit_code
# Script entry point: exit with the converter's return code.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: tests_svgscripts/test_transformSVG2TranskriptionField.py
===================================================================
--- tests_svgscripts/test_transformSVG2TranskriptionField.py (revision 100)
+++ tests_svgscripts/test_transformSVG2TranskriptionField.py (revision 101)
@@ -1,114 +1,118 @@
import unittest
import xml.etree.ElementTree as ET
from xml.parsers.expat import ExpatError
import os
from os import sep, path
from os.path import isfile, isdir, dirname
from svgpathtools import svg_to_paths
import sys
sys.path.append('svgscripts')
from datatypes.transkriptionField import TranskriptionField
def createEmptySVGFile():
    """Create a minimal svg file (root <svg> with one <g> child) if it does not
    exist yet and return its name.

    :return: (str) file name
    """
    file_name = 'empty.svg'
    if not os.path.exists(file_name):
        svg_root = ET.Element('svg')
        ET.SubElement(svg_root, 'g')
        ET.ElementTree(svg_root).write(file_name)
    return file_name
def deleteFile(file_name):
    """Delete file_name if it exists; missing files are ignored.
    """
    # explicit `if` instead of the side-effect-only `bool(...) and os.remove(...)`
    if os.path.exists(file_name):
        os.remove(file_name)
class TestTF(unittest.TestCase):
    """Tests for datatypes.transkriptionField.TranskriptionField."""

    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        self.test_file = DATADIR + sep + 'test_ai.svg'
        self.test_tr = TranskriptionField(self.test_file)
        self.empty_file = createEmptySVGFile()
        self.svg_test_file = 'python_test.svg'
        self.test_empty_file = DATADIR + sep + 'my_empty_test.svg'
        self.verso_svg = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.recto_svg = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg'
        self.multipage = DATADIR + sep + 'multipage_small_above.svg'

    def test_init_tf(self):
        """ Tests init function of transformSVG2TranskriptionField.TranskriptionField.
        """
        self.assertEqual(self.test_tr.transkription_field_found(), True)
        with self.assertRaises(ExpatError):
            TranskriptionField(self.test_empty_file)
        multi_tf_above = TranskriptionField(self.multipage, multipage_index=0)
        self.assertEqual(multi_tf_above.transkription_field_found(), True)
        multi_tf_beneath = TranskriptionField(self.multipage, multipage_index=1)
        self.assertEqual(multi_tf_beneath.transkription_field_found(), True)
        self.assertTrue(multi_tf_beneath.second_field is None)
        self.assertEqual(multi_tf_above.second_field.xmin, multi_tf_beneath.xmin)
        self.assertTrue(multi_tf_above.ymax < multi_tf_beneath.ymin)

    def test_is_shrunk(self):
        self.assertEqual(self.test_tr.is_shrunk(), False)
        self.test_tr.shrink_svg_to_transkription_field(self.svg_test_file)
        test_tr = TranskriptionField(self.svg_test_file)
        self.assertEqual(test_tr.is_shrunk(), True)

    def test_getWidth_Height(self):
        self.assertEqual(self.test_tr.getWidth(), 835.7)
        self.assertEqual(self.test_tr.getHeight(), 1161.7)

    def test_init_with_empty_file(self):
        """Tests init function of transformSVG2TranskriptionField.TranskriptionField with empty svg file.
        """
        with self.assertRaises(Exception):
            TranskriptionField(self.empty_file)

    def test_tf_dimensions(self):
        """Tests dimensions of transformSVG2TranskriptionField.TranskriptionField.
        """
        self.assertEqual(self.test_tr.width, 493.2)
        self.assertEqual(self.test_tr.height, 623.6999999999999)
        self.assertEqual(self.test_tr.xmin, 190.7)
        self.assertEqual(self.test_tr.ymin, 74.9)

    def test_to_text_field(self):
        text_field = self.test_tr.convert_to_text_field()
        self.assertEqual(text_field.width, 493.2)

    def test_tf_shrink_svg_to_transkription_field(self):
        """Tests transformSVG2TranskriptionField.shrink_svg_to_transkription_field.
        """
        self.assertEqual(self.test_tr.shrink_svg_to_transkription_field(self.svg_test_file), 0)
        deleteFile(self.svg_test_file)

    # Renamed: this method previously reused the name above, so the class body
    # silently replaced the first definition and unittest never ran it.
    def test_tf_shrink_svg_to_transkription_field_already_shrunk(self):
        """Tests transformSVG2TranskriptionField.shrink_svg_to_transkription_field with a file that has already been shrunk.
        """
        self.test_tr.shrink_svg_to_transkription_field(self.svg_test_file)
        test_tr = TranskriptionField(self.svg_test_file)
        self.assertEqual(test_tr.shrink_svg_to_transkription_field(), 1)
        test_tr = TranskriptionField(self.svg_test_file)
        self.assertEqual(test_tr.transkription_field_found(), False)
        deleteFile(self.svg_test_file)

    def test_is_page_verso(self):
        tf = TranskriptionField(self.verso_svg)
        self.assertEqual(tf.is_page_verso(), True)
        tf = TranskriptionField(self.recto_svg)
        self.assertEqual(tf.is_page_verso(), False)

    def test_add_line_number_area_width(self):
        tf = TranskriptionField(self.verso_svg)
        tf.add_line_number_area_width(tf.xmin)
        self.assertEqual(tf.line_number_area_width, 0.0)
        tf = TranskriptionField(self.recto_svg)
        tf.add_line_number_area_width(tf.xmax)
        self.assertEqual(tf.line_number_area_width, 0.0)

    def tearDown(self):
        deleteFile(self.empty_file)
# Allow running this test module directly.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_faksimile.py
===================================================================
--- tests_svgscripts/test_faksimile.py (revision 100)
+++ tests_svgscripts/test_faksimile.py (revision 101)
@@ -1,82 +1,90 @@
import unittest
from os import sep, path
from os.path import isdir, dirname, basename
import lxml.etree as ET
import sys
import sys
sys.path.append('svgscripts')
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.faksimile_image import FaksimileImage
from datatypes.text_field import TextField
class TestFaksimilePage(unittest.TestCase):
    """Tests for datatypes.faksimile.FaksimilePage and get_paths_inside_rect."""

    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        if not isdir(DATADIR):
            DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
        self.svg_file = DATADIR + sep + 'W-II-1,49et50.svg'
        self.svg_testmatrix = DATADIR + sep + 'TESTMATRIX_1.svg'
        self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
        self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
        self.faksimile_rotate90 = self.faksimile_dir + sep + 'Mp-XV-2c,4.svg'

    def test_init(self):
        image = FaksimileImage(file_name='test.jpg', height=10, width=10)
        text_field = TextField(width=10, height=10, x=10, y=10)
        faksimile = FaksimilePage(title='test', page_number=1, faksimile_image=image, text_field=text_field)
        self.assertEqual(faksimile.page_tree.getroot().get('title'), 'test')
        self.assertEqual(faksimile.page_tree.getroot().get('page-number'), '1')
        self.assertEqual(faksimile.faksimile_image.width, 10)
        self.assertEqual(faksimile.text_field.width, 10)

    def test_GET_TEXTFIELDS(self):
        svg_tree = ET.parse(self.svg_file)
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 2)
        text_field = pages[0].text_field
        self.assertEqual(text_field.width, 663.333)
        result_dir = '.{}xml{}'.format(sep, sep) if isdir('xml') else ''
        self.assertEqual(pages[0].xml_file, result_dir + 'W-II-1_49.xml')
        self.assertEqual(pages[0].title, 'W II 1')
        self.assertEqual(pages[0].page_number, '49')
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, page_number='49')
        self.assertEqual(len(pages), 1)
        svg_tree = ET.parse(self.svg_testmatrix)
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 1)
        self.assertEqual(len(pages[0].word_positions), 1)
        self.assertEqual(pages[0].word_positions[0].transform.toCSSTransformString(), 'rotate(45deg)')
        svg_tree = ET.parse(self.faksimile_file)
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 2)
        textfield_id = pages[1].title.replace(' ', '-') + '_' + pages[1].page_number
        self.assertEqual(textfield_id not in [ position.id for position in pages[0].word_positions ], True)
        self.assertEqual('path1237' in [ position.id for position in pages[0].word_positions ], True)
        self.assertEqual('Vorgangs' in [ position.text for position in pages[0].word_positions ], False)
        svg_tree = ET.parse(self.faksimile_file)
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 2)
        self.assertEqual(pages[0].page_number, '5')
        # every rect with a title in the rotated faksimile must yield a word position
        svg_tree = ET.parse(self.faksimile_rotate90)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
        self.assertEqual(len(pages), 1)
        self.assertEqual(len(pages[0].word_positions), len(svg_tree.xpath('//ns:rect/ns:title', namespaces=namespaces)))

    def test_get_paths_inside_rect(self):
        svg_tree = ET.parse(self.faksimile_file)
        paths = get_paths_inside_rect(svg_tree, '//ns:path', 360, 786, 92, 765, 'N-VII-1_5')
        self.assertEqual(len(paths), 1)
        svg_tree = ET.parse(self.svg_testmatrix)
        paths = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', 0, 2038.72, 0, 974.08002, 'TESTMATRIX_1')
        self.assertEqual(len(paths), 1)
        # This fixture only exists on the original author's machine; skip it
        # elsewhere instead of failing the suite with a FileNotFoundError.
        local_file = '/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Mp_XIV/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/Mp-XIV-1,419a.svg'
        if path.isfile(local_file):
            svg_tree = ET.parse(local_file)
            namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
            paths = get_paths_inside_rect(svg_tree, '//ns:rect', 52, 800, 58, 900, 'Mp-XIV-1_419a', namespaces=namespaces)
            # `svg_path` instead of `path` to avoid shadowing the os.path import
            self.assertEqual(len([ svg_path for svg_path in paths if 'seinen' in svg_path.xpath('./ns:title/text()', namespaces=namespaces)]), 1)
# Allow running this test module directly.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_word.py
===================================================================
--- tests_svgscripts/test_word.py (revision 100)
+++ tests_svgscripts/test_word.py (revision 101)
@@ -1,481 +1,487 @@
import unittest
from os import sep, path
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from process_words_post_merging import reset_page, update_writing_process_ids
from datatypes.box import Box
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.matrix import Matrix
import datatypes.page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.style import Style
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, execute_function_on_parts, update_transkription_position_ids
from datatypes.word_deletion_path import WordDeletionPath
from datatypes.word_position import WordPosition
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Page:
    """Minimal stand-in for datatypes.page.Page used by the word tests below."""
    def __init__(self):
        # the real Page parses an xml file; the stub has no svg file
        self.svg_file = None

    def get_line_number(self, input=0):
        # real implementation looks the line up; the stub always reports "not found"
        return -1

    def get_biggest_fontSize4styles(self, style_set=None):
        # style_set defaults to None instead of the mutable {} default the
        # original used (shared-state pitfall); it is not read anyway.
        return 7
class TestWord(unittest.TestCase):
TESTCASE = None
def setUp(self):
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.test_file = DATADIR + sep + 'N_VII_1_page009.xml'
self.word_deletion_path_file = DATADIR + sep + 'N_VII_1_page138.xml'
self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }]
x = 0
for dict in self.word_part_objs:
dict['class'] = 'st22'
dict['x'] = x
dict['y'] = 11
x += 1
mylist = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true' }
word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)'))
self.transkription_positions = [ word_position ]
self.word_node = ET.Element('word', attrib=mylist)
word_position.attach_object_to_tree(self.word_node)
x = 0
for char in mylist['text']:
ET.SubElement(self.word_node, 'part', attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22' })
x += 1
def test_add_deletion_paths(self):
page = datatypes.page.Page(self.word_deletion_path_file, add_deletion_paths_to_words=False)
word = [ word for word in page.words if word.text == 'AufBau'][0]
#self.assertTrue(word.deleted)
self.assertTrue(len(word.word_parts) > 0)
self.assertTrue(word.word_parts[0].deleted)
word.add_deletion_paths(page.word_deletion_paths, tr_xmin=28.347656, tr_ymin=49.921875)
self.assertTrue(len(word.word_parts[0].deletion_paths) > 0)
#print(word.deletion_paths)
def test_join_words(self):
words = [ Word(id=4, text='asdf-', line_number=1, deleted=True), Word(id=5, text='bsdf', line_number=2, deleted=False) ]
new_word = Word.join_words(words)
self.assertEqual(new_word.id, 4)
self.assertEqual(new_word.text, 'asdf-bsdf')
self.assertEqual(new_word.edited_text, 'asdfbsdf')
self.assertEqual(new_word.deleted, False)
self.assertEqual(new_word.line_number, -1)
words = [ Word(id=1, word_parts=[Word(id=4, text='asdf-', line_number=1, deleted=True), Word(id=5, text='bsdf', line_number=2, deleted=False)]),\
Word(id=4, text='.', line_number=2, deleted=True), Word(id=5, text='.', line_number=2, deleted=False) ]
new_word = Word.join_words(words)
self.assertEqual(new_word.text, 'asdf-bsdf..')
+ new_word = Word.join_words(words, add_white_space_between_words=True)
+ self.assertEqual(new_word.text, 'asdf- bsdf . .')
def test_Word_with_word_part_objs(self):
word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
self.assertEqual(word.id, 0)
self.assertEqual(word.transkription_positions[0].bottom, 13)
self.assertEqual(word.transkription_positions[0].height, 10)
self.assertEqual(word.transkription_positions[0].top, 3)
self.assertEqual(word.transkription_positions[0].left, 0)
self.assertEqual(word.transkription_positions[0].width, 10)
self.assertEqual(word.text, 'abc')
def test_Word_with_word_node(self):
word = Word.create_cls(self.word_node)
self.assertEqual(word.id, 0)
self.assertEqual(word.deleted, True)
self.assertEqual(word.transkription_positions[0].bottom, 11)
self.assertEqual(word.transkription_positions[0].height, 10)
self.assertEqual(word.transkription_positions[0].top, 1)
self.assertEqual(word.transkription_positions[0].left, 0)
self.assertEqual(word.transkription_positions[0].width, 10)
self.assertEqual(word.text, 'abc')
self.assertEqual(word.line_number, 2)
self.assertEqual(word.transkription_positions[0].transform.isRotationMatrix(), True)
def test_attach_word_to_tree(self):
newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
empty_tree = ET.ElementTree(ET.Element('page'))
newWord.attach_word_to_tree(empty_tree)
for word_node in empty_tree.getroot().xpath('//word'):
word = Word.CREATE_WORD(word_node=word_node)
self.assertEqual(word.id, 0)
self.assertEqual(word.deleted, False)
self.assertEqual(word.transkription_positions[0].bottom, 13)
self.assertEqual(word.transkription_positions[0].height, 10)
self.assertEqual(word.transkription_positions[0].top, 3)
self.assertEqual(word.transkription_positions[0].left, 0)
self.assertEqual(word.transkription_positions[0].width, 10)
self.assertEqual(word.text, 'abc')
@unittest.skipUnless(TESTCASE is None or TESTCASE == 0, 'Not testing this case')
def test_create_correction_history_case0(self):
# Case 1: whole word over box
box = Box(earlier_text='XYX')
word = Word(text='ASDF', transkription_positions=[TranskriptionPosition()])
word.word_box = box
word.create_correction_history()
self.assertEqual(word.earlier_version is None, True)
self.assertEqual(word.overwrites_word is not None, True)
@unittest.skipUnless(TESTCASE is None or TESTCASE == 1, 'Not testing this case')
def test_create_correction_history_case1(self):
# Case 2: part of word over box
box = Box(earlier_text='XYX')
partA = Word(text='A', transkription_positions=[TranskriptionPosition()])
partA.word_box = box
partB = Word(text='SDF', transkription_positions=[TranskriptionPosition()])
word = Word(text='ASDF', word_parts=[ partA, partB])
word.create_correction_history()
self.assertEqual(word.earlier_version is None, True)
self.assertEqual(word.word_parts[0].overwrites_word is not None, True)
@unittest.skipUnless(TESTCASE is None or TESTCASE == 2, 'Not testing this case')
def test_create_correction_history_case3(self):
# Case 3: part of word over box, word under box is part of earlier version
box = Box(earlier_text='XYX')
tp0 = TranskriptionPosition()
tp0.style = Style(writing_process_id=0)
tp1 = TranskriptionPosition()
tp1.style = Style(writing_process_id=1)
partA = Word(id=0, text='Test', transkription_positions=[ tp0])
partB = Word(id=1, text='er', transkription_positions=[ tp1])
partB.word_box = box
word = Word(text='Tester', writing_process_id=1, word_parts=[ partA, partB ] )
word.create_correction_history(box_style=tp0.style)
self.assertEqual(word.text, 'Tester')
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.earlier_version.text, 'TestXYX')
self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])
@unittest.skipUnless(TESTCASE is None or TESTCASE == 3, 'Not testing this case')
def test_create_correction_history_case4(self):
# Case 4: part of word is deleted
partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()])
partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()])
word = Word(text='ASDF', word_parts=[ partA, partB])
word.create_correction_history()
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True)
self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
self.assertEqual(word.edited_text, 'SDF')
@unittest.skipUnless(TESTCASE is None or TESTCASE == 4, 'Not testing this case')
def test_create_correction_history_case5(self):
tp0 = TranskriptionPosition()
tp0.style = Style(writing_process_id=0)
tp1 = TranskriptionPosition()
tp1.style = Style(writing_process_id=1)
partA = Word(id=0, text='Test', transkription_positions=[ tp0])
partB = Word(id=1, text='er', transkription_positions=[ tp1])
word = Word(text='Tester', word_parts=[ partA, partB ] )
word.create_correction_history()
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.word_parts[1].extendsEarlierVersion, True)
self.assertEqual(word.word_parts[1].isExtensionOfWord, word.earlier_version)
#@unittest.skipUnless(TESTCASE is None or TESTCASE == 5, 'Not testing this case')
#@unittest.skip('case tested, relies on a local xml file')
def test_create_correction_history_case_full(self):
page = datatypes.page.Page('xml/N_VII_1_page138.xml')
manuscript = ArchivalManuscriptUnity()
reset_page(page)
update_writing_process_ids(page)
word = [ word for word in page.words if word.text == 'Verschiedenes' and word.line_number == 4 ][0]
wordAufBau = [ word for word in page.words if word.text == 'AufBau' ][0]
#page.words = [ word ]
page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
word.word_parts[0].transkription_positions[0].has_box = Box(earlier_text='v')
self.assertEqual(len(word.word_parts), 2)
word_over_box = word._get_partial_word_over_box()
update_transkription_position_ids(word)
word.create_correction_history(page)
self.assertEqual(word.writing_process_id, 1)
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.earlier_version.text, 'verschiedenes')
#print(word.earlier_version.id, [ (w.id, w.text) for w in word.earlier_version.word_parts ])
empty_tree = ET.ElementTree(ET.Element('page'))
word_node = word.attach_word_to_tree(empty_tree)
#print(ET.dump(word_node))
"""
self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])
self.assertEqual(word.word_parts[1].overwrites_word is not None, True)
"""
word = wordAufBau
page.words = [ word ]
page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
word.word_parts[0].deleted = True
word.word_parts[1].transkription_positions[0].has_box = Box(earlier_text='b')
self.assertEqual(len(word.word_parts), 3)
word_over_box = word._get_partial_word_over_box()
self.assertEqual(len(word.word_parts), 3)
update_transkription_position_ids(word)
word.create_correction_history(page)
self.assertEqual(word.writing_process_id, 2)
self.assertEqual(word.earlier_version is not None, True)
self.assertEqual(word.text, 'AufBau')
self.assertEqual(word.edited_text, 'Bau')
self.assertEqual(word.earlier_version.text, 'Aufbau')
self.assertEqual(word.word_parts[0].isDeletionOfWord, word.earlier_version.word_parts[0])
self.assertEqual(word.word_parts[1].isTransformationOfWord, word.earlier_version.word_parts[1])
self.assertEqual(word.word_parts[1].overwrites_word is not None, True)
empty_tree = ET.ElementTree(ET.Element('page'))
word_node = word.attach_word_to_tree(empty_tree)
#print(ET.dump(word_node))
newWord = Word.create_cls(word_node)
#@unittest.skip('')
def test_earlier_version(self):
partA = Word(id=0, text='A', deleted=True, transkription_positions=[TranskriptionPosition()])
partB = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition()])
word = Word(text='ASDF', word_parts=[ partA, partB])
earlier_version = word.create_earlier_version()
self.assertEqual(earlier_version is not None, True)
self.assertEqual(word.word_parts[0].isDeletionOfWord is not None, True)
self.assertEqual(word.word_parts[0].isDeletionOfWord, earlier_version.word_parts[0])
def test_undo_partitioning(self):
    """undo_partitioning folds the parts' transkription positions back into the word itself."""
    positions = [ TranskriptionPosition(id=i, x=v, y=v, height=10, width=10) for i, v in enumerate([ 3, 4, 5 ]) ]
    part_a = Word(id=0, text='Auf', writing_process_id=1, deleted=True, transkription_positions=[ positions[0] ])
    part_b = Word(id=1, text='B', writing_process_id=2, transkription_positions=[ positions[1] ])
    part_c = Word(id=2, text='au', writing_process_id=1, transkription_positions=[ positions[2] ])
    word = Word(text='Aufbau', writing_process_id=2, word_parts=[ part_a, part_b, part_c ])
    word.undo_partitioning()
    # all three positions are back on the word and the parts are gone
    self.assertEqual(len(word.transkription_positions), len(positions))
    self.assertEqual(len(word.word_parts), 0)
    """
    page = datatypes.page.Page('xml/N_VII_1_page138.xml')
    word = page.words[77]
    word.undo_partitioning()
    self.assertEqual(len(word.word_parts), 0)
    self.assertEqual(len(word.transkription_positions), 3)
    update_transkription_position_ids(word)
    empty_tree = ET.ElementTree(ET.Element('page'))
    word_node = word.attach_word_to_tree(empty_tree)
    print(ET.dump(word_node))
    """
def test_split(self):
    """Word.split yields (previousWord, currentWord, nextWord) with sequential ids.

    Covers: split in the middle, split on a multi-char substring, split at the
    start with a custom start_id, and warnings for ambiguous/missing split text.
    """
    page = Page()
    pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs)
    transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
    word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
    # split on the middle character: ids run previous=0, current=1, next=2
    previousWord, currentWord, nextWord = word.split('b')
    self.assertEqual(previousWord.id, 0)
    self.assertEqual(previousWord.text, 'a')
    self.assertEqual(currentWord.id, 1)
    self.assertEqual(nextWord.id, 2)
    word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
    # split on a two-character tail substring
    previousWord, currentWord, nextWord = word.split('bc')
    self.assertEqual(previousWord.id, 0)
    self.assertEqual(previousWord.text, 'a')
    self.assertEqual(currentWord.id, 1)
    word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
    # split at the start: ids continue from start_id
    previousWord, currentWord, nextWord = word.split('ab', start_id=10)
    self.assertEqual(currentWord.id, 10)
    self.assertEqual(currentWord.text, 'ab')
    self.assertEqual(currentWord.transkription_positions[0].width, 2.1)
    self.assertEqual(nextWord.id, 11)
    self.assertEqual(nextWord.transkription_positions[0].width, 5.2)
    # split text that does not exactly match a word part triggers a warning
    word_part_objs=[{'text': 'x', 'class':'st22', 'x': 0, 'y': 0},\
            {'text': 'Insofern', 'class':'st22', 'x': 1, 'y': 0},\
            {'text': 'x', 'class':'st22', 'x': 10, 'y': 0}]
    pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
    transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
    word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
    with self.assertWarns(Warning):
        previousWord, currentWord, nextWord = word.split('Insofer')
    word_part_objs=[{'text': 'xInsofern', 'class':'st22', 'x': 0, 'y': 0}]
    pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
    transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
    word = Word(text=''.join([pwp.text for pwp in pwps]), transkription_positions=transkription_positions)
    with self.assertWarns(Warning):
        previousWord, currentWord, nextWord = word.split('Insofern')
def test_join(self):
    """Joining appends the other word's text, optionally blank-separated or prepended."""
    def fresh_word():
        return Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
    def period_word():
        return Word.CREATE_WORD(word_part_objs=[{'text': '.', 'class':'st22', 'x': 3, 'y': 11}])
    word = fresh_word()
    word.join(period_word(), add_white_space_between_words=True)
    self.assertEqual(word.text, 'abc .')
    word = fresh_word()
    word.join(period_word())
    self.assertEqual(word.text, 'abc.')
    word.join(period_word(), append_at_end_of_new_word=False)
    self.assertEqual(word.text, '.abc.')
    """
    tree = ET.ElementTree(ET.Element('page'))
    word.attach_word_to_tree(tree)
    print(ET.dump(tree.getroot()))
    """
def test_get_semanticAndDataDict(self):
    """The semantic dictionary exposes a super property entry for 'isDeletionOfWord'."""
    semantic_dict = Word.get_semantic_dictionary()
    deletion_info = semantic_dict['properties'].get('isDeletionOfWord')
    self.assertEqual(SemanticClass.SUPER_PROPERTY in deletion_info.keys(), True)
    # accessing the super property must not raise once the key is present
    super_property_info = deletion_info[SemanticClass.SUPER_PROPERTY]
    #print(super_property_info.get(SemanticClass.PROPERTY_NAME))
def test_simplify_transkription_positions(self):
    """Adjacent transkription positions are merged into a single position.

    NOTE(review): the XML content of both node_string fixtures appears to have
    been stripped from this source (whitespace-only strings would not parse);
    verify against the repository version of this test.
    """
    node_string = """ """
    nodeA = ET.fromstring(node_string)
    node_string = """
"""
    nodeB = ET.fromstring(node_string)
    word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ])
    self.assertEqual(len(word.transkription_positions), 2)
    word.simplify_transkription_positions()
    self.assertEqual(len(word.transkription_positions), 1)
    # a position with writing_process_id == -1 is absorbed by its neighbour
    word = Word(text="Si", transkription_positions=[ TranskriptionPosition(node=nodeA), TranskriptionPosition(node=nodeB) ])
    word.transkription_positions[1].writing_process_id = -1
    word.simplify_transkription_positions()
    self.assertEqual(len(word.transkription_positions), 1)
    self.assertEqual(word.transkription_positions[0].writing_process_id, 0)
    """
    tree = ET.ElementTree(ET.Element('page'))
    word.attach_word_to_tree(tree)
    print(ET.dump(tree.getroot()))
    """
def test_partition(self):
    """Partitioning by writing process id produces parts and survives an xml round trip."""
    page = datatypes.page.Page(self.test_file)
    word = page.words[67]
    self.assertEqual(word.belongs_to_multiple_writing_processes(), True)
    word.partition_according_to_writing_process_id()
    self.assertEqual(len(word.word_parts), 3)
    # after partitioning the word itself is homogeneous, its parts are not
    self.assertEqual(word.belongs_to_multiple_writing_processes(), False)
    self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True)
    tree = ET.ElementTree(ET.Element('page'))
    node = word.attach_word_to_tree(tree)
    reloaded = Word.create_cls(node)
    self.assertEqual(len(reloaded.word_parts), 3)
def test_partition_deletion(self):
    """Partitioning by deletion status resolves mixed-status words into parts."""
    page = datatypes.page.Page(self.test_file)
    word = page.words[67]
    for position in word.transkription_positions:
        position.deleted = position.writing_process_id == 1
    self.assertEqual(word.has_mixed_status('deleted'), True)
    word.partition_according_to_deletion()
    self.assertEqual(len(word.word_parts), 3)
    self.assertEqual(word.has_mixed_status('deleted'), False)
    self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True)
    # deletion partitioning also works on top of a writing-process partition
    page = datatypes.page.Page(self.test_file)
    word = page.words[67]
    word.partition_according_to_writing_process_id()
    word.word_parts[1].transkription_positions[1].deleted = True
    word.partition_according_to_deletion()
    self.assertEqual(len(word.word_parts), 4)
    # mixed status is also detected on freshly constructed parts
    deleted_part = Word(text='A', deleted=True)
    kept_part = Word(text='SDF', deleted=False)
    word = Word(text='ASDF', word_parts=[ deleted_part, kept_part ])
    self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True)
def test_execute_function_on_parts(self):
    """execute_function_on_parts partitions each word and returns the new parts list.

    Improvements: the ignored second return value is bound to `_` instead of a
    misleading `none` name, and the length check asserts the exact value rather
    than comparing a boolean expression to True.
    """
    page = datatypes.page.Page(self.test_file)
    word_parts = [ page.words[67], page.words[68] ]
    word_parts, _ = execute_function_on_parts(word_parts, 'partition_according_to_writing_process_id')
    self.assertEqual(len(word_parts), 4)
def test_process_word_boxes(self):
    """Words at the given indices get a word_over_box from process_boxes.

    The box paths are absolute svg coordinates; process_boxes receives the
    transkription field origin so it can translate them.
    """
    page = datatypes.page.Page(self.pdf_xml)
    page.source = self.pdf_xml_source
    page.update_styles(partition_according_to_styles=True)
    tr = TranskriptionField(page.source)
    # five rectangle outlines ('d' path strings) taken from the source svg
    box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
            'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
            'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
            'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
            'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
    box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ]
    # word indices on the test page known to lie under the boxes above
    indices = [30, 277, 288, 297, 321]
    for word_id, index in enumerate(indices):
        word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin)
        self.assertEqual(word_over_box is not None, True)
        # the box may be attached to the word itself or to one of its parts
        self.assertEqual(word_over_box == page.words[index] or word_over_box in page.words[index].word_parts, True)
        #self.assertEqual(word_over_box in page.words[index].word_parts, True)
def test_process_word_several_boxesOn1LIne(self):
    """Words partitioned by writing process still get a word_over_box per index.

    Improvements: removed the unused `empty_tree` variable and the unused
    `word_id` loop variable. NOTE(review): the method name looks like a typo
    for '...boxes_on_1_line'; kept unchanged to preserve test history.
    """
    page = datatypes.page.Page(self.pdf_xml)
    page.source = self.pdf_xml_source
    for word in page.words:
        word.set_writing_process_id_to_transkription_positions(page)
        word.partition_according_to_writing_process_id()
    tr = TranskriptionField(page.source)
    # five rectangle outlines ('d' path strings) taken from the source svg
    box_path_d = ['M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',\
            'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',\
            'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',\
            'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',\
            'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315']
    box_paths = [ Box(d_string=d_string, earlier_text='test') for d_string in box_path_d ]
    for index in [30, 277, 288, 297, 321]:
        word_over_box = page.words[index].process_boxes(box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin)
        self.assertEqual(word_over_box is not None, True)
def test_split_according_to_status(self):
    """split_according_to_status splits a word whose positions disagree on a property.

    First splits on 'text' (returns new words with consecutive ids), then on
    'style' with splits_are_parts=True (keeps the word and fills word_parts).
    """
    page = datatypes.page.Page(self.test_file)
    word = page.words[67]
    # give positions of writing process 1 a different text to force mixed status
    for transkription_position in word.transkription_positions:
        transkription_position.text = 'asdf'\
                if transkription_position.writing_process_id == 1\
                else word.text
    self.assertEqual(word.has_mixed_status('text'), True)
    new_words = word.split_according_to_status('text')
    #print([word.text for word in new_words ])
    self.assertEqual(len(new_words) > 1, True)
    # the first split inherits id and deletion status, later ids are consecutive
    self.assertEqual(new_words[0].id, word.id)
    self.assertEqual(new_words[0].deleted, word.deleted)
    self.assertEqual(new_words[1].id, word.id+1)
    manuscript = ArchivalManuscriptUnity()
    page = datatypes.page.Page(self.test_file)
    word = page.words[67]
    page.words = [ word ]
    # styles must be computed before splitting on 'style'
    page.update_styles(manuscript=manuscript)
    new_words = word.split_according_to_status('style', splits_are_parts=True)
    self.assertEqual(len(word.word_parts), 3)
def test__create_new_word(self):
    """_create_new_word copies the COPY_PROPERTY_KEY attributes and the relevant style."""
    manuscript = ArchivalManuscriptUnity()
    page = datatypes.page.Page(self.test_file)
    word = page.words[67]
    page.words = [ word ]
    page.update_styles(manuscript=manuscript)
    derived_word = word._create_new_word([ word.transkription_positions[0] ], 'style')
    for property_key in Word.COPY_PROPERTY_KEY:
        self.assertEqual(derived_word.__dict__[property_key], word.__dict__[property_key])
    self.assertEqual(len(derived_word.styles), 1)
def test__get_partial_word_over_box(self):
    """_get_partial_word_over_box splits a word at positions carrying a box."""
    word = Word(text='test', transkription_positions=[ TranskriptionPosition(id=0), TranskriptionPosition(id=1) ])
    word.transkription_positions[0].has_box = Box(earlier_text='asdf')
    word._get_partial_word_over_box()
    self.assertEqual(len(word.word_parts), 2)
    # a box on one position of an existing part does not add extra parts
    first_part = Word(id=0, text='A', transkription_positions=[TranskriptionPosition()])
    second_part = Word(id=1, text='SDF', transkription_positions=[TranskriptionPosition(), TranskriptionPosition(id=1)])
    second_part.transkription_positions[0].has_box = Box(earlier_text='asdf')
    word = Word(text='ASDF', word_parts=[ first_part, second_part ])
    word._get_partial_word_over_box()
    self.assertEqual(len(word.word_parts), 2)
# Run this module's unit tests when executed directly.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_data/faksimile_svg/Mp-XV-2c,4.svg
===================================================================
--- tests_svgscripts/test_data/faksimile_svg/Mp-XV-2c,4.svg (revision 0)
+++ tests_svgscripts/test_data/faksimile_svg/Mp-XV-2c,4.svg (revision 101)
@@ -0,0 +1,2980 @@
+
+
Index: tests_svgscripts/test_convert_wordPositions.py
===================================================================
--- tests_svgscripts/test_convert_wordPositions.py (revision 100)
+++ tests_svgscripts/test_convert_wordPositions.py (revision 101)
@@ -1,67 +1,72 @@
import unittest
from os import sep, path, remove
import lxml.etree as ET
import lxml.html
import sys
sys.path.append('svgscripts')
import convert_wordPositions
from convert_wordPositions import Converter, SVGConverter, HTMLConverter, JSONConverter
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkription_position import TranskriptionPosition
class TestConverter(unittest.TestCase):
    """Tests for the word position converters in convert_wordPositions."""

    def setUp(self):
        data_dir = path.dirname(__file__) + sep + 'test_data'
        self.test_file = data_dir + sep + 'test.xml'
        self.test_svg_file = data_dir + sep + 'test421.svg'
        self.outputfile_txt = 'test.txt'
        self.outputfile_html = 'test.html'
        self.outputfile_svg = 'test.svg'
        self.outputfile_json = 'test.json'

    def test_main(self):
        # conversion without an explicit output file
        self.assertEqual(convert_wordPositions.main(['-x', '-s', self.test_svg_file, self.test_file]), 0)
        # plain text output
        self.assertEqual(convert_wordPositions.main(['-x', '-s', self.test_svg_file, '-o', self.outputfile_txt, self.test_file]), 0)
        self.assertEqual(path.isfile(self.outputfile_txt), True)
        # html output must parse to an <html> root
        self.assertEqual(convert_wordPositions.main(['-x', '-s', self.test_svg_file, '-o', self.outputfile_html, self.test_file]), 0)
        self.assertEqual(path.isfile(self.outputfile_html), True)
        html_tree = lxml.html.parse(self.outputfile_html)
        self.assertEqual(html_tree.getroot().tag, 'html')
        # svg output must parse to a namespaced <svg> root
        self.assertEqual(convert_wordPositions.main(['-x', '-s', self.test_svg_file, '-o', self.outputfile_svg, self.test_file]), 0)
        self.assertEqual(path.isfile(self.outputfile_svg), True)
        svg_tree = ET.parse(self.outputfile_svg)
        self.assertEqual(svg_tree.getroot().tag, '{http://www.w3.org/2000/svg}svg')
        # json output keyed by line number
        self.assertEqual(convert_wordPositions.main(['-x', '-k', 'number', '-o', self.outputfile_json, self.test_file]), 0)

    def test_jsoin_add_object2dict(self):
        # builds a JSON converter for a local page file; its output is not asserted here
        page = Page('xml/Mp_XV_page77r.xml')
        json_converter = convert_wordPositions.JSONConverter(page, non_testing=False)
        #print(json_converter.add_object2dict(page.lines))

    def test_create_converter(self):
        page = PageCreator(self.test_file, svg_file=self.test_svg_file)
        for target_format, expected_class in (('SVG', SVGConverter), ('HTML', HTMLConverter), ('JSON', JSONConverter)):
            converter = Converter.CREATE_CONVERTER(page, False, target_format)
            self.assertEqual(isinstance(converter, expected_class), True)
        fallback_converter = Converter.CREATE_CONVERTER(page, False)
        self.assertEqual(isinstance(fallback_converter, Converter), True)

    def test_get_transkription_positions(self):
        positions = [ TranskriptionPosition() for _ in range(3) ]
        page = PageCreator(self.test_file, svg_file=self.test_svg_file)
        converter = Converter.CREATE_CONVERTER(page, False, 'SVG')
        converter._get_transkription_positions(positions, stage_version='1+')

    def tearDown(self):
        # remove whatever output files a test produced
        for produced_file in (self.outputfile_txt, self.outputfile_html, self.outputfile_svg):
            if path.isfile(produced_file):
                remove(produced_file)
# Run this module's unit tests when executed directly.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_extract_line_continuation.py
===================================================================
--- tests_svgscripts/test_extract_line_continuation.py (revision 100)
+++ tests_svgscripts/test_extract_line_continuation.py (revision 101)
@@ -1,48 +1,52 @@
import unittest
from os import sep, path, remove
from os.path import isfile
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
import extract_line_continuation
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
class TestExtractLineContinuation(unittest.TestCase):
    """Tests for extract_line_continuation (arrow detection and line mapping)."""

    def setUp(self):
        extract_line_continuation.UNITTESTING = True
        data_dir = path.dirname(__file__) + sep + 'test_data'
        self.w_I_8_125_svg = data_dir + sep + 'W_I_8_neu_125-01.svg'
        self.w_I_8_125_xml = data_dir + sep + 'W_I_8_new_page125.xml'

    def test_get_arrow_y(self):
        # the y comes from the transform matrix of the <text> node ...
        arrow_node = ET.Element('text')
        arrow_node.set('transform', 'matrix(1 0 0 1 10 20)')
        self.assertEqual(extract_line_continuation._get_arrow_y(arrow_node), 20.0)
        # ... plus the y offset of a contained <tspan>
        tspan_node = ET.SubElement(arrow_node, 'tspan')
        tspan_node.set('y', '10.0')
        self.assertEqual(extract_line_continuation._get_arrow_y(tspan_node), 30.0)

    def test_get_line_of_arrow(self):
        svg_tree = ET.parse(self.w_I_8_125_svg)
        page = Page(self.w_I_8_125_xml)
        transkription_field = TranskriptionField(self.w_I_8_125_svg)
        arrows = extract_line_continuation._extract_arrow_nodes(svg_tree, 'st7')
        line = extract_line_continuation._get_line_of_arrow(arrows[0], page, transkription_field)
        self.assertEqual(line.id, 15)

    def test_extract_line_continuations(self):
        def continued_lines(current_page):
            return [ line for line in current_page.lines if len(line.editor_comments) > 0 ]
        page = Page(self.w_I_8_125_xml)
        extract_line_continuation.extract_line_continuations(page, svg_file=self.w_I_8_125_svg)
        self.assertEqual(len(continued_lines(page)), 2)
        page = Page('xml/N_VII_1_page029.xml')
        extract_line_continuation.extract_line_continuations(page)
        self.assertEqual(len(continued_lines(page)), 1)
        page = Page('xml/Mp_XV_page75v.xml')
        extract_line_continuation.extract_line_continuations(page)
        self.assertTrue(len(continued_lines(page)) > 0)
# Run this module's unit tests when executed directly.
if __name__ == "__main__":
    unittest.main()
Index: fixes/test_fix_old_data.py
===================================================================
--- fixes/test_fix_old_data.py (revision 100)
+++ fixes/test_fix_old_data.py (revision 101)
@@ -1,72 +1,81 @@
import lxml.etree as ET
from os import sep, path, remove
from os.path import isdir, isfile, dirname, basename
import shutil
import sys
import tempfile
import unittest
import warnings
import fix_old_data
sys.path.append('svgscripts')
from datatypes.faksimile import FaksimilePage
from datatypes.mark_foreign_hands import MarkForeignHands
+from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word
from datatypes.word_position import WordPosition
from process_words_post_merging import MERGED_DIR
class TestFixFaksimile(unittest.TestCase):
    """Tests for the repair helpers in fix_old_data."""

    def setUp(self):
        fix_old_data.UNITTESTING = True
        data_dir = path.dirname(__file__) + sep + 'test_data'
        self.xml_file = data_dir + sep + 'N_VII_1_page138.xml'
        self.fix_transkription_positions = data_dir + sep + 'Mp_XIV_page419a.xml'

    def test_convert_old_matrix(self):
        page = Page(self.xml_file)
        tp = page.words[63].transkription_positions[0]
        matrix, x, y = fix_old_data.convert_old_matrix(tp, 28.346, 49.921)
        #print(matrix.toString(), x, y)

    def test_fix_faksimile(self):
        page = Page(self.xml_file)
        position = page.words[0].faksimile_positions[0]
        original_left = position.left
        original_top = position.top
        self.assertEqual(fix_old_data.fix_faksimile_positions(page), True)
        # positions become absolute: shifted by the text field origin
        self.assertEqual(position.left, original_left + page.text_field.xmin)
        self.assertEqual(position.top, original_top + page.text_field.ymin)

    def test_fix_faksimile_line_position(self):
        page = Page(self.xml_file)
        fix_old_data.fix_faksimile_line_position(page)
        for line_number in page.line_numbers:
            self.assertTrue(line_number.faksimile_inner_top < line_number.faksimile_inner_bottom)

    @unittest.skip('already tested, interactive')
    def test_fix_transkription_positions(self):
        page = Page(self.fix_transkription_positions)
        merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL))
        fix_old_data.sync_words_linewise(merged_page.words, page.words, page.line_numbers)
        self.assertTrue(fix_old_data.fix_transkription_positions(page))

    @unittest.skip('already tested, interactive')
    def test_join_words(self):
        fix_old_data.join_words_interactive(Page(self.fix_transkription_positions))

    @unittest.skip('already tested, local file')
    def test_fix_graphical_svg_file(self):
        fix_old_data.fix_graphical_svg_file(Page('xml/Mp_XIV_page418.xml'))

    @unittest.skip('already tested, local file')
    def test_get_words(self):
        page = Page('xml/Mp_XIV_page418.xml')
        print([ word.text for word in page.words if word.id == 300])
        words = fix_old_data._get_words_from_response('300-310', page.words)
        print(words)
# Run this module's unit tests when executed directly.
if __name__ == "__main__":
    unittest.main()
Index: fixes/fix_old_data.py
===================================================================
--- fixes/fix_old_data.py (revision 100)
+++ fixes/fix_old_data.py (revision 101)
@@ -1,538 +1,567 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
sys.path.append('svgscripts')
from convert_wordPositions import HTMLConverter
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
+from datatypes.matrix import Matrix
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word, update_transkription_position_ids
from join_faksimileAndTranskription import sort_words
from util import back_up, back_up_svg_file, copy_faksimile_svg_file
from process_files import update_svgposfile_status
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
MAX_SVG_XY_THRESHOLD = 10
#TODO: fix all svg graphical files: change xlink:href to href!!!!
def convert_old_matrix(tp, xmin, ymin) ->(Matrix, float, float):
    """Return new matrix, x and y for old transkription_position.

    The cloned transformation matrix is shifted by (xmin, ymin); x is the
    position's left offset relative to the shifted origin (0 when left <= 0),
    y is derived from the position's height.
    """
    shifted_matrix = tp.transform.clone_transformation_matrix()
    shifted_matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3)
    shifted_matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3)
    if tp.left > 0:
        new_x = round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)
    else:
        new_x = 0
    new_y = round((tp.height-1.5)*-1, 3)
    return shifted_matrix, new_x, new_y
+
+
def save_page(page, attach_first=False, backup=False):
    """Write page to xml file

    :param page: Page whose tree is written back to its own xml file
    :param attach_first: attach page's words to the tree before writing
    :param backup: back up the current xml file first
    """
    if backup:
        back_up(page, page.xml_file)
    if attach_first:
        page.update_and_attach_words2tree()
    # record "<this file>:<calling function>" as the modifying script in the metadata
    script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}'
    write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
            script_name=script_name, file_type=FILE_TYPE_SVG_WORD_POSITION)
def page_already_changed(page) -> bool:
    """Return whether page has already been changed by the calling function.

    Inspects the caller's frame, so the result depends on which function calls
    this helper: it matches the modifiedBy entries that save_page records for
    that same caller.
    """
    return len(\
            page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}:{inspect.currentframe().f_back.f_code.co_name}"]')\
            ) > 0
def fix_faksimile_line_position(page, redo=False) -> bool:
    """Create a faksimile line position.

    Skips (returns False) when the page was already changed by the calling
    function, unless redo is True; otherwise recomputes the faksimile line
    positions, saves the page (outside unit tests) and returns True.

    Improvement: removed a stray trailing semicolon after the early return.
    """
    if not redo and page_already_changed(page):
        return False
    update_faksimile_line_positions(page)
    if not UNITTESTING:
        save_page(page)
    return True
def check_faksimile_positions(page, redo=False) -> bool:
    """Check faksimile line position.

    Compares each word's faksimile positions against the rects of the matching
    faksimile svg page and overwrites differing left/top/bottom values.
    NOTE(review): returns None (not a bool) when the page has no data-source
    file — verify callers treat that as "unchanged".
    """
    if len(page.page_tree.xpath('//data-source/@file')) > 0:
        svg_file = page.page_tree.xpath('//data-source/@file')[0]
        svg_tree = ET.parse(svg_file)
        positions_are_equal_counter = 0
        page_changed = False
        for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree):
            # only the faksimile page matching this page's title and number is relevant
            if page.title == faksimile_page.title\
            and page.number == faksimile_page.page_number:
                #print([fp.id for fp in faksimile_page.word_positions ])
                for word in page.words:
                    for fp in word.faksimile_positions:
                        rect_fps = [ rfp for rfp in faksimile_page.word_positions if rfp.id == fp.id ]
                        if len(rect_fps) > 0:
                            rfp = rect_fps[0]
                            if fp.left != rfp.left or fp.top != rfp.top:
                                #print(f'{fp.id}: {fp.left}/{rfp.left} {fp.top}/{rfp.top}')
                                fp.left = rfp.left
                                fp.top = rfp.top
                                # bottom follows from the rect's height
                                fp.bottom = fp.top + rfp.height
                                word.attach_word_to_tree(page.page_tree)
                                page_changed = True
                            else:
                                positions_are_equal_counter += 1
                print(f'{positions_are_equal_counter}/{len(page.words)} are equal')
        if page_changed and not UNITTESTING:
            save_page(page)
        return page_changed
def fix_faksimile_positions(page, redo=False) -> bool:
    """Set faksimile positions to absolute values.
    [:return:] fixed
    """
    if not redo and len(page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}"]')) > 0:
        return False
    offset_x = page.text_field.xmin
    offset_y = page.text_field.ymin
    for word in page.words:
        for position in word.faksimile_positions:
            # shift relative coordinates by the text field origin
            position.left += offset_x
            position.top += offset_y
            position.bottom += offset_y
            word.attach_word_to_tree(page.page_tree)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
                script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return True
def fix_transkription_positions(page, redo=False) -> bool:
    """Fix transkription positions of merged words
    [:return:] fixed

    Interactive: when a merged word's text disagrees with its positions' text,
    the user is asked whether to re-sync that word's line.
    """
    # requires the pre-merge snapshot of this page under MERGED_DIR
    if not isdir(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR)\
    or not isfile(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)):
        return False
    merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL))
    sync_dictionary = sync_words_linewise(merged_page.words, page.words, merged_page.line_numbers)
    words = []
    for source_word in merged_page.words:
        words.append(source_word)
        if bool(sync_dictionary.get(source_word)):
            _sync_transkriptions_with_words(source_word, sync_dictionary)
        if source_word.text != ''.join([ t.get_text() for t in source_word.transkription_positions ]):
            text = ''.join([ t.get_text() for t in source_word.transkription_positions ])
            print(f'{source_word.line_number}: {source_word.text} has transkription_positions with text "{text}".')
            response = input('Change? [Y/n]>')
            if not response.startswith('n'):
                # retry the sync restricted to this word's line, forcing a match
                new_sync_dictionary = sync_words_linewise(merged_page.words, page.words,\
                        [ line for line in merged_page.line_numbers if line.id == source_word.line_number ], force_sync_on_word=source_word)
                if bool(new_sync_dictionary.get(source_word)):
                    _sync_transkriptions_with_words(source_word, new_sync_dictionary)
                else:
                    # NOTE(review): 'sourc_word' typo is in the runtime message; left unchanged here
                    raise Exception(f'Could not find sourc_word {source_word.text} in {new_sync_dictionary}!')
    page.words = words
    page.update_and_attach_words2tree()
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page)
    return True
def fix_graphical_svg_file(page, redo=False) -> bool:
    """Fix glyphs of word for which there is a /changed-word in page.page_tree

    Hides the glyph nodes of deleted words and shifts the x of glyph nodes of
    changed words in the graphical svg file, then writes the file back.
    NOTE(review): annotated -> bool but has no return statement (returns None);
    verify whether callers rely on a truthy result.
    """
    svg_tree = ET.parse(page.svg_file)
    transkription_field = TranskriptionField(page.source)
    # map a default namespace (key None) to the prefix 'ns' so xpath can use it
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    back_up_svg_file(svg_tree, namespaces=namespaces)
    for deleted_word_node in page.page_tree.xpath('//deleted-word'):
        deleted_word = Word.create_cls(deleted_word_node)
        _run_function_on_nodes_for_word(svg_tree, namespaces, deleted_word, transkription_field, _set_node_attribute_to, 'visibility', 'hidden')
    for changed_word_node in page.page_tree.xpath('//changed-word'):
        changed_word = Word.create_cls(changed_word_node)
        try:
            word = [ word for word in page.words if word.id == changed_word.id and word.text == changed_word.text ][0]
            left_difference = word.transkription_positions[0].left - changed_word.transkription_positions[0].left
            _run_function_on_nodes_for_word(svg_tree, namespaces, word, transkription_field, _add_value2attribute, 'x', left_difference)
        except IndexError:
            warnings.warn(f'There is no word for changed_word {changed_word.id}: "{changed_word.text}" in {page.page_tree.docinfo.URL}!')
    copy_faksimile_svg_file(target_file=page.svg_file, faksimile_tree=svg_tree, namespaces=namespaces)
def _add_value2attribute(node, attribute, value):
"""Add left_difference to x of node.
"""
node.set(attribute, str(float(node.get(attribute)) + value))
node.set('changed', 'true')
def _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=0.1) -> list:
    """Return unchanged <use> nodes referencing *symbol_id* near (svg_x, svg_y).

    The search window is widened by 1 at a time (up to MAX_SVG_XY_THRESHOLD)
    until at least one node is found.
    """
    while True:
        candidates = [ candidate for candidate in svg_tree.xpath(\
                f'//ns:use[@xlink:href="#{symbol_id}" and @x > {svg_x-threshold} and @x < {svg_x+threshold} and @y > {svg_y-threshold} and @y < {svg_y+threshold} ]',\
                namespaces=namespaces) if not bool(candidate.get('changed')) ]
        if len(candidates) > 0 or threshold >= MAX_SVG_XY_THRESHOLD:
            return candidates
        threshold += 1
def _run_function_on_nodes_for_word(svg_tree, namespaces, word, transkription_field, function_on_node, attribute, value):
    """Apply *function_on_node* to the first svg node matching each positional word part of *word*."""
    for position in word.transkription_positions:
        for part in position.positional_word_parts:
            # part coordinates are relative to the transkription field
            svg_x = part.left + transkription_field.xmin
            svg_y = part.bottom + transkription_field.ymin
            matching_nodes = _get_nodes_with_symbol_id(svg_tree, namespaces, part.symbol_id, svg_x, svg_y)
            if len(matching_nodes) > 0:
                function_on_node(matching_nodes[0], attribute, value)
def _set_node_attribute_to(node, attribute, value):
"""Set attribute of node to value.
"""
node.set(attribute, str(value))
node.set('changed', 'true')
def _get_words_from_response(response, words) ->list:
"""Return a list of word that correspond to indices
"""
if re.match(r'\d+-\d+', response)\
or re.match(r'\d+\+', response):
index_boundaries = []
if response[-1] == '+':
index_boundaries.append(int(response[:response.index('+')]))
index_boundaries.append(index_boundaries[0]+1)
else:
index_boundaries = [ int(i) for i in response.split('-') ]
index_boundaries_length_diff = len(response.split('-')[0]) - len(response.split('-')[1])
if index_boundaries_length_diff > 0:
index_boundaries[1] = int(response.split('-')[0][0-index_boundaries_length_diff-1] + response.split('-')[1])
indices = [ i for i in range(index_boundaries[0], index_boundaries[1]+1) ]
if index_boundaries[0] > index_boundaries[1]:
indices = [ index_boundaries[0] ]
while indices[-1] > index_boundaries[1]:
indices.append(indices[-1]-1)
else:
indices = [ int(i) for i in response.split(' ') ]
result_words = []
for index in indices:
if len([ word for word in words if word.id == index ]) > 0:
result_words += [ word for word in words if word.id == index ]
return result_words
def _split_word(page, word, split_text):
"""Split word.
"""
index = page.words.index(word)
_, left, right = word.split(split_text)
page.words[index] = left
page.words.insert(index+1, right)
def join_words_interactive(page, redo=False) -> bool:
    """Join words interactively.

    Renders ``page`` as HTML, reads one command from stdin, applies it and
    recurses on a freshly reloaded page; any unrecognized input (e.g. 'q')
    ends the loop and returns True. Saving is skipped while UNITTESTING.
    """
    HTMLConverter(page).convert()
    print('Specify ids of words to join.')
    print('[s=split and join word ("s splittext id")]')
    print('[c=create correction history]')
    print('[d=mark deleted|i=fix ids|u=undelete|l[:value]=change line to value for ids|r=reload|b=restore backup|q=quit]>')
    response = input('>')
    if response.startswith('i'):
        # 'i': save the page (ids are fixed on write) and reload.
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page, attach_first=True)
        return join_words_interactive(Page(page.page_tree.docinfo.URL))
    elif response.startswith('r'):
        # 'r': discard in-memory changes by reloading from disk.
        return join_words_interactive(Page(page.page_tree.docinfo.URL))
    elif response.startswith('b'):
        # 'b': continue on the backup file, if one is known.
        if page.bak_file is not None:
            return join_words_interactive(Page(page.bak_file))
        else:
            print('Could not restore backup file, please restore manually!')
    elif response.startswith('l'):
        # 'l[:value] [ids]': change the line number of the given words.
        words = []
        line_number = -1
        if re.match(r'l:\d+\s\d+', response):
            line_number = int(response.replace('l:', '').split(' ')[0])
            words = _get_words_from_response(re.compile('l:\d+\s').sub('', response), page.words)
        else:
            if not re.match(r'l:\d+$', response):
                new_response_line = input('Specify new line number>')
                if re.match(r'^\d+$', new_response_line):
                    line_number = int(new_response_line)
            else:
                line_number = int(response.replace('l:', ''))
            new_response = input(f'Specify ids of words for which line number should be changed to {line_number}>')
            if re.match(r'\d+', new_response):
                words = _get_words_from_response(new_response, page.words)
        if line_number != -1:
            for word in words: word.line_number = line_number
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        return join_words_interactive(page)
    elif response.startswith('c'):
        # 'c [ids]': create a correction history for the given words.
        if re.match(r'c\w*\s\d+', response):
            words = _get_words_from_response(re.compile('c\w*\s').sub('', response), page.words)
        else:
            new_response = input(f'Specify ids of words to create a correction history. >')
            if re.match(r'\d+', new_response):
                words = _get_words_from_response(new_response, page.words)
        if len(words) > 0:
            for word in words: word.create_correction_history()
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        return join_words_interactive(page)
    elif response.startswith('d') or response.startswith('u'):
        # 'd [ids]' / 'u [ids]': mark the given words deleted / undeleted.
        if re.match(r'[du]\w*\s\d+', response):
            words = _get_words_from_response(re.compile('[du]\w*\s').sub('', response), page.words)
        else:
            deletion_target = 'delete' if response.startswith('d') else 'undelete'
            new_response = input(f'Specify ids of words to {deletion_target}. >')
            if re.match(r'\d+', new_response):
                words = _get_words_from_response(new_response, page.words)
        if len(words) > 0:
            for word in words: word.deleted = response.startswith('d')
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        return join_words_interactive(page)
    elif response.startswith('s'):
        # 's splittext ids': split each of the given words on splittext.
        if re.match(r's\s\w+\s\d+', response):
            words = _get_words_from_response(re.compile('s\s\w+\s').sub('', response), page.words)
            split_text = response.split(' ')[1]
        else:
            split_text = input('Input split text>')
            new_response = input(f'Specify ids of words to split. >')
            if re.match(r'\d+', new_response):
                words = _get_words_from_response(new_response, page.words)
        if len(words) > 0 and split_text != '':
            for word in words: _split_word(page, word, split_text)
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        return join_words_interactive(page)
    elif re.match(r'\d+', response):
        # bare ids: join the corresponding words into one word.
        words = _get_words_from_response(response, page.words)
        if len(words) > 0:
            if len(set([ word.line_number for word in words ])) == 1\
            and len(set([ word.deleted for word in words ])) == 1:
                # same line and deletion state: join in place onto the first word
                new_word = words[0]
                for word2join in words[1:]:
                    page.words.remove(word2join)
                    new_word.join(word2join)
            else:
                # mixed lines/states: build a compound word and reinsert it
                # at the position of the first participating word.
                new_word = Word.join_words(words)
                index = len(page.words)
                if words[0] in page.words:
                    index = page.words.index(words[0])
                elif len([ word for word in page.words if words[0] in word.word_parts ]) > 0:
                    index = page.words.index([ word for word in page.words if words[0] in word.word_parts ][0])
                for word2join in words:
                    if word2join in page.words:
                        page.words.remove(word2join)
                    elif len([ word for word in page.words if word2join in word.word_parts ]) > 0:
                        page.words.remove([ word for word in page.words if word2join in word.word_parts ][0])
                page.words.insert(index, new_word)
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        return join_words_interactive(page)
    return True
def sync_words_linewise(source_words, target_words, lines, force_sync_on_word=None) -> dict:
    """Sync words and create a dictionary with source_words as keys, each
    referring to the list of corresponding target words.
    """
    result_dict = {}
    # reset the processing flag on every word before matching starts
    for word in target_words + source_words:
        word.processed = False
    for line in lines:
        sources_on_line = sorted((word for word in source_words if word.line_number == line.id),
                key=lambda word: word.transkription_positions[0].left)
        targets_on_line = sorted((word for word in target_words if word.line_number == line.id),
                key=lambda word: word.transkription_positions[0].left)
        if len(targets_on_line) == len(sources_on_line):
            _sync_same_length(result_dict, sources_on_line, targets_on_line, force_sync_on_word=force_sync_on_word)
        elif len(sources_on_line) < len(targets_on_line):
            _sync_more_target_words(result_dict, sources_on_line, targets_on_line, force_sync_on_word=force_sync_on_word)
        else:
            print('okey dokey')
    return result_dict
def _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict):
    """Ask the user which of the still unprocessed target words belong to
    ``force_sync_on_word`` and record the selection in ``result_dict``.

    Raises an Exception when every target word on the line has already been
    processed.
    """
    unprocessed_target_words = [ t_word for t_word in target_words_on_line if not t_word.processed ]
    if len(unprocessed_target_words) == 0:
        raise Exception(f'There are no unprocessed target_words for {force_sync_on_word.text} on line {force_sync_on_word.line_number}!')
    print([ (i, t_word.text) for i, t_word in enumerate(unprocessed_target_words)])
    response = input(f'Please specify indices of words to sync {force_sync_on_word.text} with: [default:0-{len(unprocessed_target_words)-1}]>')
    if re.match(r'\d+-\d+', response):
        # an index range like "2-5"
        start, end = (int(part) for part in response.split('-'))
        indices = list(range(start, end+1))
    elif response != '':
        # a space separated list of indices
        indices = [ int(part) for part in response.split(' ') ]
    else:
        # default: every unprocessed target word
        indices = list(range(len(unprocessed_target_words)))
    result_dict.update({ force_sync_on_word: [ unprocessed_target_words[i] for i in indices ] })
def _sync_transkriptions_with_words(word, sync_dictionary):
"""Sync transkription_positions of word with syncronized words.
"""
word.transkription_positions = []
for target_word in sync_dictionary[word]:
word.transkription_positions += target_word.transkription_positions
def _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None):
"""Sync if there are more target words.
"""
current_source_word = None
for target_word in target_words_on_line:
if current_source_word is not None\
and current_source_word.text.startswith(''.join([ w.text for w in result_dict[current_source_word]]) + target_word.text):
result_dict[current_source_word].append(target_word)
target_word.processed = True
if current_source_word.text == ''.join([ w.text for w in result_dict[current_source_word]]):
current_source_word = None
elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ]) > 0:
source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ][0]
target_word.processed = True
source_word.processed = True
result_dict.update({ source_word: [ target_word ] })
elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ]) > 0:
current_source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ][0]
current_source_word.processed = True
target_word.processed = True
result_dict.update({ current_source_word: [ target_word ] })
else:
msg = f'On line {target_word.line_number}: target_word "{target_word.text}" does not have a sibling in {[ s.text for s in source_words_on_line if not s.processed ]}'
warnings.warn(msg)
if force_sync_on_word is not None:
_force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict)
def _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None):
"""Sync same length
"""
for i, word in enumerate(source_words_on_line):
if word.text == target_words_on_line[i].text:
word.processed = True
target_words_on_line[i].processed = True
result_dict.update({ word: [ target_words_on_line[i] ] })
elif len([ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ]) > 0:
target_word = [ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ][0]
word.processed = True
target_word.processed = True
result_dict.update({ word: [ target_word ] })
else:
msg = f'On line {word.line_number}: source_word "{word.text}" does not have a sibling in {[ s.text for s in target_words_on_line]}'
warnings.warn(msg)
if force_sync_on_word is not None:
_force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict)
def usage():
    """Print information on how to use the script (the docstring of main).
    """
    print(main.__doc__)
def main(argv):
    """This program can be used to fix faksimile position ->set them to their absolute value.

    svgscripts/fix_old_data.py [OPTIONS]    a xml file about a manuscript, containing information about its pages.
                                            a xml file about a page, containing information about svg word positions.

    OPTIONS:
    -h|--help                        show help
    -c|--check-faksimile-positions   check whether faksimile positions have been updated
    -j|--join-words                  join words by id interactive
    -l|--faksimile-line-position     create faksimile line positions
    -p|--faksimile-positions         fix old faksimile positions
    -r|--redo                        rerun
    -s|--fix-graphical-svg           fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file.
    -t|--transkription-positions     fix transkription positions

    :return: exit code (int)
    """
    function_list = []
    function_dict = create_function_dictionary(['-c', '--check-faksimile-positions'], check_faksimile_positions)
    function_dict = create_function_dictionary(['-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-t', '--transkription-positions'], fix_transkription_positions, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-s', '--fix-graphical-svg'], fix_graphical_svg_file, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['default', '-j', '--join-words'], join_words_interactive, function_dictionary=function_dict)
    redo = False
    try:
        # BUGFIX: the short option string lacked 'j' and the long option list
        # lacked 'join-words' and misspelled 'transkription--positions', so the
        # advertised -j/--join-words and --transkription-positions options
        # always raised GetoptError.
        opts, args = getopt.getopt(argv, "hcjplrst", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position", "join-words", "redo", "fix-graphical-svg", "transkription-positions" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-r', '--redo'):
            redo = True
        elif opt in function_dict.keys():
            function_list.append(function_dict[opt])
    if len(function_list) == 0:
        # no processing option given: fall back to the default function
        function_list.append(function_dict['default'])
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    xml_file = args[0]
    if isfile(xml_file):
        counters = { f.__name__: 0 for f in function_list }
        for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK):
            for current_function in function_list:
                if not UNITTESTING:
                    print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' + Style.RESET_ALL)
                    back_up(page, page.xml_file)
                counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0
        if not UNITTESTING:
            for function_name, counter in counters.items():
                print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]')
    else:
        raise FileNotFoundError('File {} does not exist!'.format(xml_file))
    return exit_status
# Script entry point: forward the CLI arguments and exit with main's status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: fixes/test_tools.py
===================================================================
--- fixes/test_tools.py (revision 0)
+++ fixes/test_tools.py (revision 101)
@@ -0,0 +1,31 @@
+import lxml.etree as ET
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname, basename
+import shutil
+import sys
+import tempfile
+import unittest
+import warnings
+
+import tools
+
+from datatypes.page import Page
+
+
+class TestUtil(unittest.TestCase):
+ def setUp(self):
+ tools.UNITTESTING = True
+ DATADIR = path.dirname(__file__) + sep + 'test_data'
+ self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml'
+ self.fix_transkription_positions = DATADIR + sep + 'Mp_XIV_page419a.xml'
+
+ def test_convert_old_matrix(self):
+ page = Page(self.xml_file)
+ xmin = 28.346
+ ymin = 49.921
+ tp = page.words[63].transkription_positions[0]
+ matrix, x, y = tools.convert_old_matrix(tp, xmin, ymin)
+ #print(matrix.toString(), x, y)
+
+if __name__ == "__main__":
+ unittest.main()
Index: fixes/tools.py
===================================================================
--- fixes/tools.py (revision 0)
+++ fixes/tools.py (revision 101)
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This code contains some useful tools
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+import sys
+
+sys.path.append('svgscripts')
+from datatypes.matrix import Matrix
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+UNITTESTING = False
+
+def convert_old_matrix(tp, xmin, ymin) ->(Matrix, float, float):
+ """Return new matrix, x and y for old transkription_position.
+ """
+ matrix = tp.transform.clone_transformation_matrix()
+ matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3)
+ matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3)
+ x = round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)\
+ if tp.left > 0\
+ else 0
+ y = round((tp.height-1.5)*-1, 3)
+ return matrix, x, y
+
+
Index: fixes/test_data/03.svg
===================================================================
--- fixes/test_data/03.svg (revision 0)
+++ fixes/test_data/03.svg (revision 101)
@@ -0,0 +1,3508 @@
+
+
+
Index: fixes/test_data/Mp_XIV_page416.xml
===================================================================
--- fixes/test_data/Mp_XIV_page416.xml (revision 0)
+++ fixes/test_data/Mp_XIV_page416.xml (revision 101)
@@ -0,0 +1,4292 @@
+
+
+
+
+
+
+ svgWordPosition
+
+
+ 2020-08-31 15:55:58
+
+ 2020-08-31 15:55:58
+ 2020-10-09 10:26:10
+ 2020-10-09 10:32:35
+ 2020-10-09 17:22:16
+ 2020-10-09 17:28:15
+ 2020-10-09 10:32:49
+ 2020-10-09 17:22:12
+ 2020-10-09 10:33:36
+ 2020-10-09 17:22:12
+
+ Fr Okt 9 17:59:13 CEST 2020
+ hyphenation, boxes, deletion fixed
+ box errors
+ tp of words with apostrophes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ mir
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: fixes/test_interactive_editor.py
===================================================================
--- fixes/test_interactive_editor.py (revision 0)
+++ fixes/test_interactive_editor.py (revision 101)
@@ -0,0 +1,110 @@
+import lxml.etree as ET
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname, basename
+import shutil
+import sys
+import tempfile
+import unittest
+import warnings
+
+import interactive_editor
+
+sys.path.append('svgscripts')
+from datatypes.faksimile import FaksimilePage
+from datatypes.mark_foreign_hands import MarkForeignHands
+from datatypes.page import Page
+from datatypes.path import Path
+from datatypes.positional_word_part import PositionalWordPart
+from datatypes.text_connection_mark import TextConnectionMark
+from datatypes.transkriptionField import TranskriptionField
+from datatypes.word import Word
+from datatypes.word_position import WordPosition
+from process_words_post_merging import MERGED_DIR
+
+
+class TestInteractiveEditor(unittest.TestCase):
+ def setUp(self):
+ interactive_editor.UNITTESTING = True
+ DATADIR = path.dirname(__file__) + sep + 'test_data'
+ self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml'
+ self.fix_transkription_positions = DATADIR + sep + 'Mp_XIV_page419a.xml'
+
+ @unittest.skip('interactive')
+ def test_run(self):
+ page = Page(self.xml_file)
+ interactive_editor.InteractiveShell().run_interactive_editor(page)
+
+ def test_json_dict(self):
+ ro = interactive_editor.ResponseOrganizer()
+ json_dict = ro.create_json_dict(self.xml_file)
+ #print(json_dict)
+
+ def test_handle_json(self):
+ ro = interactive_editor.ResponseOrganizer()
+ json_dict = ro.handle_response({})
+ self.assertEqual(json_dict['actions']['result'], 'ERROR: there was no key "target_file" in json!')
+ json_dict = ro.handle_response({'target_file': self.xml_file})
+ self.assertEqual(json_dict['actions']['result'], 'ERROR: there was no key "date_stamp" in json')
+ json_dict = ro.handle_response({'target_file': self.xml_file, 'date_stamp': path.getmtime(self.xml_file)})
+ self.assertEqual(json_dict['actions']['result'], 'Operation "unknown" failed')
+ page = Page(self.xml_file)
+ json_dict = ro.handle_response({'target_file': self.xml_file, 'date_stamp': path.getmtime(self.xml_file),\
+ 'response_handler': { 'action_name': 'join words'}, 'words': [ { 'id': w.id } for w in page.words[:2] ] })
+ self.assertEqual(json_dict['actions']['result'], 'Operation "join words" succeeded!')
+ #self.assertEqual(json_dict['response'], 'ERROR: there was no key "target_file" in json!')
+
+ def test_update_word(self):
+ page = Page(self.xml_file)
+ word = page.words[0]
+ rh = interactive_editor.SaveChanges()
+ self.assertEqual(rh._update_word(word, { 'id': word.id, 'deleted': False, 'line': 99, 'tp_id': f'w{word.id}:tp0' }), 0)
+ self.assertEqual(word.deleted, False)
+ self.assertEqual(word.line_number, 99)
+ word = page.words[18]
+ self.assertEqual(rh._update_word(word, { 'id': word.id, 'deleted': True, 'line': 99, 'tp_id': f'w{word.id}:w0:tp0' }), 0)
+ self.assertEqual(word.word_parts[0].deleted, True)
+ self.assertEqual(word.word_parts[0].line_number, 99)
+
+ def test_dictcontains_keys(self):
+ a_dict = { 'a': { 'b': { 'c': { 'd': 0 }}}}
+ key_list = [ 'a', 'b', 'c', 'd' ]
+ self.assertTrue(interactive_editor.dict_contains_keys(a_dict, key_list))
+
+ def test_get_requirement(self):
+ rh = interactive_editor.ResponseHandler()
+ json_dict = { 'response_handler': { 'requirements' : [ { 'input': 'asdf', 'name': 'test' } ]}}
+ name, requirement = rh.get_requirement(json_dict)
+ self.assertEqual(name, 'test')
+ self.assertEqual(requirement, 'asdf')
+ self.assertEqual(rh.get_requirement(json_dict, index=1), (None,None))
+
+ def test_split_words_dict(self):
+ rh = interactive_editor.SplitWords(action_name='split words', description='asdf asdf')
+ self.assertTrue(interactive_editor.dict_contains_keys(rh.create_json_dict(), ['requirements']))
+
+ def test_handle_split_text(self):
+ page = Page(self.xml_file)
+ word = page.words[0]
+ json_dict = { 'words': [{ 'id': word.id }], 'response_handler': { 'requirements' : [ { 'input': 'h', 'name': 'split_text' } ]}}
+ rh = interactive_editor.SplitWords(action_name='split words', description='asdf asdf')
+ self.assertEqual(rh.handle_response(page, json_dict), 0)
+ self.assertEqual(page.words[0].text, 'h')
+
+ def test_handle_addbox(self):
+ page = Page(self.xml_file)
+ word = page.words[0]
+ json_dict = { 'words': [{ 'id': word.id }], 'response_handler': { 'requirements' : [ { 'input': 'test', 'name': 'box_text' } ]}}
+ rh = interactive_editor.AddBox(action_name='add box', description='asdf asdf')
+ self.assertEqual(rh.handle_response(page, json_dict), 0)
+ self.assertTrue(page.words[0].overwrites_word is not None)
+ self.assertEqual(page.words[0].overwrites_word.text, 'test')
+ word = page.words[1]
+ json_dict = { 'words': [{ 'id': word.id }], 'response_handler': { 'requirements' : [ { 'input': 'a', 'name': 'box_text' },\
+ {'input': 'e', 'name': 'overwritten_by'}, {'input': True, 'name': 'is_earlier_version'}]}}
+ self.assertEqual(rh.handle_response(page, json_dict), 0)
+ self.assertTrue(page.words[1].earlier_version is not None)
+ self.assertEqual(page.words[1].earlier_version.text, 'fast')
+
+
+if __name__ == "__main__":
+ unittest.main()
Index: fixes/interactive_editor.py
===================================================================
--- fixes/interactive_editor.py (revision 0)
+++ fixes/interactive_editor.py (revision 101)
@@ -0,0 +1,676 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This program can be used to process words after they have been merged with faksimile data.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+from colorama import Fore, Style
+from datetime import datetime
+from deprecated import deprecated
+from functools import cmp_to_key
+import getopt
+import inspect
+import lxml.etree as ET
+import re
+import shutil
+import string
+from svgpathtools import svg2paths2, svg_to_paths
+from svgpathtools.path import Path as SVGPath
+from svgpathtools.path import Line
+import sys
+import tempfile
+from operator import attrgetter
+import os
+from os import listdir, sep, path, setpgrp, devnull
+from os.path import exists, isfile, isdir, dirname, basename
+from progress.bar import Bar
+import warnings
+
+
+from fix_old_data import save_page
+from fix_boxes import attach_box, split_into_parts_and_attach_box
+
+sys.path.append('svgscripts')
+from convert_wordPositions import HTMLConverter, JSONConverter
+from datatypes.box import Box
+from datatypes.faksimile import FaksimilePage
+from datatypes.manuscript import ArchivalManuscriptUnity
+from datatypes.mark_foreign_hands import MarkForeignHands
+from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
+from datatypes.path import Path
+from datatypes.text_connection_mark import TextConnectionMark
+from datatypes.transkriptionField import TranskriptionField
+from datatypes.word import Word, update_transkription_position_ids
+from join_faksimileAndTranskription import sort_words
+from util import back_up, back_up_svg_file, copy_faksimile_svg_file
+from process_files import update_svgposfile_status
+from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
+
+sys.path.append('shared_util')
+from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+from main_util import create_function_dictionary
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+UNITTESTING = False
+MAX_SVG_XY_THRESHOLD = 10
+
+class ResponseHandler:
+ def __init__(self, response_starts_with=None, dialog_string=None, action_name=None, description=None):
+ self.action_name = action_name
+ self.dialog_string = dialog_string
+ self.description = description
+ self.response_starts_with = response_starts_with
+
+ def create_requirement_list(self) ->list:
+ """Create a requirement dictionary.
+ """
+ return []
+
+ def create_json_dict(self)->dict:
+ """Create a json dictionary.
+ """
+ json_dict = { 'action_name': self.action_name, 'description': self.description }
+ requirements = self.create_requirement_list()
+ if len(requirements) > 0:
+ json_dict.update({ 'requirements': requirements })
+ return json_dict
+
+ def get_requirement(self, json_dict: dict, index=0) ->tuple:
+ """Return requirement tuple (name, input).
+ """
+ name = requirement = None
+ if dict_contains_keys(json_dict, ['response_handler','requirements'])\
+ and index < len(json_dict['response_handler']['requirements']):
+ requirement_dict = json_dict['response_handler']['requirements'][index]
+ if dict_contains_keys(requirement_dict, ['name'])\
+ and dict_contains_keys(requirement_dict, ['input']):
+ name = requirement_dict['name']
+ requirement = requirement_dict['input']
+ return name, requirement
+
+ def match(self, response: str) ->bool:
+ """Return whether response matchs with handler.
+ """
+ if self.response_starts_with is not None:
+ return response.startswith(self.response_starts_with)
+ return True
+
+ def print_dialog(self):
+ """Print dialog.
+ """
+ if self.dialog_string is not None:
+ print(f'[{self.dialog_string}]')
+
+ def handle_response(self, page: Page, json_dict: dict) -> int:
+ """Handle response and return exit code.
+ """
+ json_word_ids = [ jw.get('id') for jw in json_dict['words'] ]
+ action_dictionary = { 'words': [ word for word in page.words if word.id in json_word_ids ] }
+ for index, item in enumerate(self.create_requirement_list()):
+ name, requirement = self.get_requirement(json_dict, index=index)
+ action_dictionary.update({name: requirement})
+ return self.run_change(page, action_dictionary)
+
+ def handle_interactive_response(self, page: Page, response: str, shell) -> int:
+ """Handle response and return exit code.
+ """
+ return self.run_change(page, {})
+
+ def run_change(self, page: Page, action_dictionary: dict) -> int:
+ """Run changes on page and return exit code.
+ """
+ exit_code = 0
+ return exit_code
+
class JoinWords(ResponseHandler):
    """Join two or more words into one word."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response interactively and return exit code.
        """
        # Strip a leading command token (non-digits + space) to get the id list.
        # The presence of such a token (e.g. 'w ') requests a white space
        # between the joined words.
        action_dictionary = { 'words' : shell._get_words_from_response(re.compile('^\D+\s').sub('', response), page.words),\
                            'add_white_space_between_words': re.match(r'^\D+\s', response) }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        action_dictionary keys:
            words: list of words to join (required, else exit code 2)
            add_white_space_between_words: truthy -> keep a space between joined texts
        """
        exit_code = 0
        add_white_space_between_words = action_dictionary['add_white_space_between_words']\
                                            if bool(action_dictionary.get('add_white_space_between_words'))\
                                            else False
        words = action_dictionary['words']\
                    if bool(action_dictionary.get('words'))\
                    else []
        if len(words) > 0:
            if len(set([ word.line_number for word in words ])) == 1\
               and len(set([ word.deleted for word in words ])) == 1:
                # All words share line number and deletion status:
                # merge the rest into the first word in place.
                new_word = words[0]
                for word2join in words[1:]:
                    page.words.remove(word2join)
                    new_word.join(word2join, add_white_space_between_words=add_white_space_between_words)
            else:
                # Mixed line numbers / deletion status: build a fresh word and
                # splice it in at the position of the first word (or of the
                # page word that contains it as a word part).
                new_word = Word.join_words(words, add_white_space_between_words=add_white_space_between_words)
                index = len(page.words)
                if words[0] in page.words:
                    index = page.words.index(words[0])
                elif len([ word for word in page.words if words[0] in word.word_parts ]) > 0:
                    index = page.words.index([ word for word in page.words if words[0] in word.word_parts ][0])
                for word2join in words:
                    if word2join in page.words:
                        page.words.remove(word2join)
                    elif len([ word for word in page.words if word2join in word.word_parts ]) > 0:
                        page.words.remove([ word for word in page.words if word2join in word.word_parts ][0])
                page.words.insert(index, new_word)
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                # Re-read the page so subsequent edits see the saved state.
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
+
class SimpleJoinWords(JoinWords):
    """Join words given as a plain list of ids (the default interactive action)."""

    def match(self, response: str) -> bool:
        """Return whether response matches this handler (i.e. starts with a digit).

        BUGFIX: the original returned the re.Match|None object despite the
        bool annotation; coerce to a real bool so the declared interface holds.
        """
        return re.match(r'\d+', response) is not None
+
class SaveChanges(ResponseHandler):
    """Persist line-number / deletion-status edits for individual words."""

    # Maps word attribute name (index 0) to its key in the incoming word_dict (index 1).
    RELEVANT_PROPERTIES = [ ('deleted','deleted'), ('line_number','line') ] # 0 = word, 1 = word_dict

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.
        """
        self.run_change(page, {})
        return shell.run_interactive_editor(page)

    def _update_word(self, word, word_dict) ->int:
        """Update properties of word according to word_dict,
        return exit_code
        """
        exit_code = 0
        for relevant_property in self.RELEVANT_PROPERTIES:
            if len(word.word_parts) > 0:
                # NOTE(review): 'tp_id' appears to be of the form '<id>:w<part index>:<...>'
                # when it addresses a word part — confirm against the JSON producer.
                if len(word_dict['tp_id'].split(':')) == 3:
                    wp_index = int(word_dict['tp_id'].split(':')[1].replace('w',''))
                    word.word_parts[wp_index].__dict__[relevant_property[0]] = word_dict[relevant_property[1]]
                else:
                    # A word with parts but a non-part tp_id cannot be updated safely.
                    return 2
            else:
                word.__dict__[relevant_property[0]] = word_dict[relevant_property[1]]
        return exit_code

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code.
        """
        json_word_ids = [ jw.get('id') for jw in json_dict['words'] ]
        for word in page.words:
            if word.id in json_word_ids:
                word_dict = [ jw for jw in json_dict['words'] if jw.get('id') == word.id ][0]
                if self._update_word(word, word_dict) > 0:
                    return 2
        return self.run_change(page, {})

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Saves the page to disk; the actual property updates happen before
        this is called. action_dictionary is unused here.
        """
        exit_code = 0
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, attach_first=True)
            page = Page(page.page_tree.docinfo.URL)
        return exit_code
+
class Reload(ResponseHandler):
    """Discard the in-memory state and re-read the page from its xml file."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Re-load the page from disk and hand it back to the interactive editor."""
        reloaded_page = Page(page.page_tree.docinfo.URL)
        return shell.run_interactive_editor(reloaded_page)
+
class RestoreBackup(ResponseHandler):
    """Replace the current page with its backup file, if one exists."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Open the page's backup file in the editor; fail with exit code 2 if there is none."""
        if page.bak_file is None:
            print('Could not restore backup file, please restore manually!')
            return 2
        return shell.run_interactive_editor(Page(page.bak_file))
+
class ChangeLine2Value(ResponseHandler):
    """Change the line number of selected words (response 'l[:value] [+ ids]')."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.

        Accepted forms:
            'l:<line> <ids>' -> line number and word ids in one response
            'l:<line>'       -> prompt for the word ids
            'l...'           -> prompt for line number, then for word ids
        """
        words = []
        line_number = -1
        if re.match(r'l:\d+\s\d+', response):
            # Line number and word ids supplied in a single response.
            line_number = int(response.replace('l:', '').split(' ')[0])
            words = shell._get_words_from_response(re.compile(r'l:\d+\s').sub('', response), page.words)
        else:
            if not re.match(r'l:\d+$', response):
                new_response_line = input('Specify new line number>')
                if re.match(r'^\d+$', new_response_line):
                    line_number = int(new_response_line)
            else:
                line_number = int(response.replace('l:', ''))
            new_response = input(f'Specify ids of words for which line number should be changed to {line_number}>')
            if re.match(r'\d+', new_response):
                # BUGFIX: was 'shell_get_words_from_response' (missing '.'),
                # which raised a NameError at runtime.
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words, 'line_number' : line_number }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Expects 'line_number' (int) and 'words' (list) in action_dictionary;
        returns 2 when no valid line number was provided.
        """
        exit_code = 0
        line_number = action_dictionary['line_number']\
                          if bool(action_dictionary.get('line_number'))\
                          else -1
        words = action_dictionary['words']\
                    if bool(action_dictionary.get('words'))\
                    else []
        if line_number != -1:
            for word in words: word.line_number = line_number
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
+
class CreateCorrectionHistory(ResponseHandler):
    """Create a correction history for selected words (response 'c [+ ids]')."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.
        """
        words = []  # BUGFIX: was unbound when the user entered no valid ids
        if re.match(r'c\w*\s\d+', response):
            words = shell._get_words_from_response(re.compile(r'c\w*\s').sub('', response), page.words)
        else:
            new_response = input(f'Specify ids of words to create a correction history. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Calls create_correction_history() on every word in
        action_dictionary['words']; returns 2 when the list is empty.
        """
        exit_code = 0
        words = action_dictionary['words']\
                    if bool(action_dictionary.get('words'))\
                    else []
        if len(words) > 0:
            for word in words: word.create_correction_history()
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
+
class DeleteCorrectionHistory(ResponseHandler):
    """Delete the correction history of selected words (response 'D [+ ids]')."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response interactively and return exit code.
        """
        words = []  # BUGFIX: was unbound when the user entered no valid ids
        if re.match(r'D\w*\s\d+', response):
            words = shell._get_words_from_response(re.compile(r'D\w*\s').sub('', response), page.words)
        else:
            new_response = input(f'Specify ids of words to delete their correction history. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words' : words }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Clears earlier_version and corrections for every word in
        action_dictionary['words']; returns 2 when the list is empty.
        """
        exit_code = 0
        words = action_dictionary['words']\
                    if bool(action_dictionary.get('words'))\
                    else []
        if len(words) > 0:
            for word in words:
                print(word.text)
                word.earlier_version = None
                word.corrections = []
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
+
class ChangeDeletionStatus(ResponseHandler):
    """Mark words as deleted ('d [+ ids]') or undeleted ('u [+ ids]')."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.
        """
        words = []  # BUGFIX: was unbound when the user entered no valid ids
        if re.match(r'[du]\w*\s\d+', response):
            words = shell._get_words_from_response(re.compile(r'[du]\w*\s').sub('', response), page.words)
        else:
            deletion_target = 'delete' if response.startswith('d') else 'undelete'
            new_response = input(f'Specify ids of words to {deletion_target}. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        # The first letter of the response decides the new status.
        action_dictionary = { 'words': words, 'deleted': response.startswith('d') }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Sets word.deleted on every word in action_dictionary['words'] to
        action_dictionary['deleted']; returns 2 when the list is empty.
        """
        exit_code = 0
        words = action_dictionary['words']\
                    if bool(action_dictionary.get('words'))\
                    else []
        word_should_be_deleted = bool(action_dictionary.get('deleted'))
        if len(words) > 0:
            for word in words: word.deleted = word_should_be_deleted
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
+
class SplitWords(ResponseHandler):
    """Split selected words at a given split text (response 's <split text> <ids>')."""

    def _split_word(self, page, word, split_text):
        """Split word at split_text and replace it on the page by the two resulting parts."""
        index = page.words.index(word)
        _, left, right = word.split(split_text)
        page.words[index] = left
        page.words.insert(index+1, right)

    def create_requirement_list(self) ->list:
        """Create a requirement dictionary.
        """
        return [{ 'name': 'split_text', 'type': 'string', 'input': None }]

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.
        """
        words = []  # BUGFIX: was unbound when the user entered no valid ids
        if re.match(r's\s\w+\s\d+', response):
            words = shell._get_words_from_response(re.compile(r's\s\w+\s').sub('', response), page.words)
            split_text = response.split(' ')[1]
        else:
            split_text = input('Input split text>')
            new_response = input(f'Specify ids of words to split. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words, 'split_text': split_text }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Splits every word in action_dictionary['words'] at
        action_dictionary['split_text']; returns 2 when either is missing.
        """
        exit_code = 0
        words = action_dictionary['words']\
                    if bool(action_dictionary.get('words'))\
                    else []
        split_text = action_dictionary['split_text']\
                         if bool(action_dictionary.get('split_text'))\
                         else ''
        if len(words) > 0 and split_text != '':
            for word in words: self._split_word(page, word, split_text)
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
+
class AddBox(ResponseHandler):
    """Attach a box (overwritten text) to selected words."""

    def create_requirement_list(self) ->list:
        """Create a requirement dictionary.
        """
        return [{ 'name': 'box_text', 'type': 'string', 'input': None },\
                { 'name': 'overwritten_by', 'type': 'string', 'input': None },\
                { 'name': 'is_earlier_version', 'type': 'boolean', 'input': False }]

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        action_dictionary keys:
            words: words to attach a box to (required)
            box_text: the text that was overwritten (required)
            overwritten_by: text of the part that overwrote box_text; if present
                the word is split so the box lands on that part
            is_earlier_version: whether box_text is an earlier version
        """
        exit_code = 0
        words = action_dictionary['words']\
                    if bool(action_dictionary.get('words'))\
                    else []
        missing_text = action_dictionary.get('box_text')
        is_earlier_version = action_dictionary.get('is_earlier_version')
        overwritten_by = action_dictionary.get('overwritten_by')
        if len(words) > 0 and missing_text is not None:
            for word in words:
                if overwritten_by is not None:
                    split_into_parts_and_attach_box(word, 0, missing_text, is_earlier_version, overwritten_by)
                else:
                    attach_box(word, 0, missing_text, False)
                # NOTE(review): grouping reconstructed from the mangled diff —
                # the history/cleanup is assumed to run for both branches; confirm.
                word.create_correction_history()
                if len(word.corrections) > 0:
                    for wp in word.word_parts:
                        wp.overwrites_word = None
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, attach_first=True)
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
+
class ResponseOrganizer:
    """Organize non-interactive (JSON/HTTP) responses and dispatch them to handlers."""

    # Key under which the result message of the last operation is reported.
    RESULT = 'result'

    def __init__(self):
        self.response_handler_dictionary = {}
        self._add_response_handler(JoinWords(action_name='join words', description='join words'))
        self._add_response_handler(SplitWords(action_name='split words', description='split word according to split text'))
        self._add_response_handler(CreateCorrectionHistory(action_name='create correction history', description='creates a correction history for selected words'))
        self._add_response_handler(DeleteCorrectionHistory(action_name='delete correction history', description='deletes the correction history of selected words'))
        self._add_response_handler(AddBox(action_name='add box', description='add box with overwritten text'))
        self._add_response_handler(SaveChanges(action_name='save changes', description='save change to line number/deletion status for word(s)' ))

    def _add_response_handler(self, response_handler: ResponseHandler):
        """Add response_handler to response_handler_dictionary.
        """
        # Handlers are looked up by their action_name.
        self.response_handler_dictionary.update({response_handler.action_name: response_handler})

    def create_json_dict(self, xml_file: str, last_operation_result=None) ->dict:
        """Return a json dict of page with information about action.
        """
        page = Page(xml_file)
        replace_ligatures(page)
        converter = JSONConverter(page)
        json_dict = converter.create_json_dict()
        # 'date_stamp' lets handle_response detect concurrent file changes.
        action_dict = { 'target_file': xml_file,\
                        'date_stamp': os.path.getmtime(xml_file) }
        if last_operation_result is not None:
            action_dict.update({self.RESULT: last_operation_result })
        response_handlers = []
        for response_handler in self.response_handler_dictionary.values():
            response_handlers.append(response_handler.create_json_dict())
        action_dict.update({ 'response_handlers': response_handlers })
        json_dict.update({ 'actions': action_dict})
        return json_dict

    def handle_response(self, json_dict: dict) ->dict:
        """Handle response in json_dict and return new data json_dict.
        """
        if bool(json_dict.get('target_file')):
            target_file = json_dict['target_file']
            if bool(json_dict.get('date_stamp')):
                current_stamp = os.path.getmtime(target_file)
                # Proceed only if the file has not been modified on disk since
                # the json was produced (optimistic concurrency check).
                if current_stamp <= json_dict['date_stamp']:
                    exit_code = 2
                    operation = 'unknown'
                    if bool(json_dict.get('response_handler'))\
                       and bool(self.response_handler_dictionary.get(json_dict['response_handler']['action_name'])):
                        operation = json_dict['response_handler']['action_name']
                        response_handler = self.response_handler_dictionary[operation]
                        exit_code = response_handler.handle_response(Page(target_file), json_dict)
                    message = f'Operation "{operation}" succeeded!' if exit_code == 0 else f'Operation "{operation}" failed'
                    return self.create_json_dict(target_file, last_operation_result=message)
                else:
                    return self.create_json_dict(target_file,\
                            last_operation_result=f'FAIL: file {target_file} was changed between operations!')
            else:
                return self.create_json_dict(target_file,\
                        last_operation_result='ERROR: there was no key "date_stamp" in json')
        else:
            return { 'actions': { self.RESULT: 'ERROR: there was no key "target_file" in json!' }}
+
class InteractiveShell:
    """Console front end: presents the dialog and dispatches responses to handlers."""

    def __init__(self):
        # Order matters: the first handler whose match() accepts the response
        # handles it; the bare ResponseHandler at the end is the fallback.
        self.response_handlers = []
        self.response_handlers.append(SimpleJoinWords(dialog_string='specify ids of words to join [default]'))
        self.response_handlers.append(RestoreBackup(response_starts_with='b', dialog_string='b=restore backup'))
        self.response_handlers.append(CreateCorrectionHistory(response_starts_with='c', dialog_string='c=create correction history [+ ids]'))
        self.response_handlers.append(DeleteCorrectionHistory(response_starts_with='D', dialog_string='D=delete correction history [+ ids]'))
        self.response_handlers.append(ChangeDeletionStatus(response_starts_with='d', dialog_string='d=mark deleted [+ ids]'))
        self.response_handlers.append(SaveChanges(response_starts_with='i', dialog_string='i=fix ids' ))
        self.response_handlers.append(ChangeLine2Value(response_starts_with='l', dialog_string='l[:value]=change line to value for ids' ))
        self.response_handlers.append(Reload(response_starts_with='r', dialog_string='r=reload xml file'))
        self.response_handlers.append(SplitWords(response_starts_with='s', dialog_string='s=split and join word ("s splittext id")'))
        self.response_handlers.append(ChangeDeletionStatus(response_starts_with='u', dialog_string='u=undelete [+ ids]'))
        self.response_handlers.append(JoinWords(response_starts_with='w', dialog_string='w=join words with whitespace between them [+ ids]'))
        self.response_handlers.append(ResponseHandler())

    def _get_words_from_response(self, response, words) ->list:
        """Return a list of words that correspond to the ids in response.

        Accepted forms:
            '4 7 9' -> explicit ids
            '4-9'   -> inclusive range (descending ranges are enumerated downwards)
            '12-5'  -> shorthand range: the end borrows the start's leading
                       digits, i.e. 12-15; '170-5' means 170-175
            '4+'    -> the word and its successor
        Ids that match no word are silently skipped.
        """
        if re.match(r'\d+-\d+', response)\
        or re.match(r'\d+\+', response):
            index_boundaries = []
            if response[-1] == '+':
                # 'N+' selects word N and the following word.
                index_boundaries.append(int(response[:response.index('+')]))
                index_boundaries.append(index_boundaries[0]+1)
            else:
                index_boundaries = [ int(i) for i in response.split('-') ]
                index_boundaries_length_diff = len(response.split('-')[0]) - len(response.split('-')[1])
                if index_boundaries_length_diff > 0:
                    # Shorthand like '170-5': prepend the start's surplus leading
                    # digits to the end value (-> 175).
                    # BUGFIX: the original took only a single character
                    # (start[-diff-1]), which turned '170-5' into 15 instead of 175;
                    # borrow the full prefix instead.
                    index_boundaries[1] = int(response.split('-')[0][:index_boundaries_length_diff] + response.split('-')[1])
            indices = [ i for i in range(index_boundaries[0], index_boundaries[1]+1) ]
            if index_boundaries[0] > index_boundaries[1]:
                # Descending range: enumerate downwards instead.
                indices = [ index_boundaries[0] ]
                while indices[-1] > index_boundaries[1]:
                    indices.append(indices[-1]-1)
        else:
            indices = [ int(i) for i in response.split(' ') ]
        result_words = []
        for index in indices:
            if len([ word for word in words if word.id == index ]) > 0:
                result_words += [ word for word in words if word.id == index ]
        return result_words

    def run_interactive_editor(self, page) -> int:
        """Run interactive shell.
        """
        replace_ligatures(page)
        HTMLConverter(page).convert()
        for response_handler in self.response_handlers: response_handler.print_dialog()
        response = input('>')
        for response_handler in self.response_handlers:
            if response_handler.match(response):
                return response_handler.handle_interactive_response(page, response, self)
+
def replace_ligatures(page):
    """Replace ligatures in all word texts of page.

    Replaces the unicode ligature characters ﬁ (U+FB01) and ﬂ (U+FB02)
    with their two-letter ascii equivalents. (The original source was
    mangled by encoding loss into no-op replacements like 'fi'->'fi';
    the explicit unicode escapes below restore the intended behavior.)
    """
    for word in page.words:
        if '\ufb01' in word.text:
            word.text = word.text.replace('\ufb01', 'fi')
        if '\ufb02' in word.text:
            word.text = word.text.replace('\ufb02', 'fl')
+
def dict_contains_keys(a_dict, key_list)->bool:
    """Return whether dict a_dict contains the nested key path given by key_list.

    An empty key_list trivially matches. Descends one key per recursion step.
    BUGFIX: the original raised AttributeError when the path descended into a
    non-dict value (e.g. {'a': 1} with ['a', 'b']); now returns False instead.
    """
    if len(key_list) == 0:
        return True
    if isinstance(a_dict, dict) and key_list[0] in a_dict:
        return dict_contains_keys(a_dict[key_list[0]], key_list[1:])
    return False
+
def usage():
    """prints information on how to use the script
    """
    # The usage text lives in main's docstring.
    print(main.__doc__)
+
def main(argv):
    """This program can be used to edit the words of xml word position files interactively.

    fixes/interactive_editor.py [OPTIONS] <xml_file>

    <xml_file>  a xml file about a manuscript, containing information about its pages,
                or a xml file about a page, containing information about svg word positions.

    OPTIONS:
        -h|--help show help

    :return: exit code (int)
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    xml_file = args[0]
    if isfile(xml_file):
        counter = 0
        shell = InteractiveShell()
        for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK):
            if not UNITTESTING:
                print(Fore.CYAN + f'Processing {page.title}, {page.number} with interactive editor ...' + Style.RESET_ALL)
                # NOTE(review): back_up assumed to be skipped in unit tests
                # (grouped under the UNITTESTING guard) — confirm indentation
                # against the repository.
                back_up(page, page.xml_file)
            # Count pages whose interactive session finished successfully.
            counter += 1 if shell.run_interactive_editor(page) == 0 else 0
        if not UNITTESTING:
            print(Style.RESET_ALL + f'[{counter} pages changed by interactive shell]')
    else:
        raise FileNotFoundError('File {} does not exist!'.format(xml_file))
    return exit_status

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: fixes/test_fix_boxes.py
===================================================================
--- fixes/test_fix_boxes.py (revision 100)
+++ fixes/test_fix_boxes.py (revision 101)
@@ -1,68 +1,68 @@
import lxml.etree as ET
from os import sep, path, remove
from os.path import isdir, isfile, dirname, basename
import shutil
import sys
import tempfile
import unittest
import warnings
import fix_boxes
sys.path.append('svgscripts')
from datatypes.faksimile import FaksimilePage
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.box import Box
from datatypes.page import Page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word
from datatypes.word_position import WordPosition
from process_words_post_merging import MERGED_DIR
class TestFixBoxes(unittest.TestCase):
    """Tests for fixes/fix_boxes.py."""

    def setUp(self):
        fix_boxes.UNITTESTING = True
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.fix_boxes = DATADIR + sep + 'Mp_XIV_page416.xml'
        self.fix_boxes_src = DATADIR + sep + '03.svg'

    def test_fix_boxes(self):
        """fix_boxes replaces every word flagged with a debug node and records its history."""
        page = Page(self.fix_boxes)
        page.source = self.fix_boxes_src
        fixed_word_ids = sorted([ int(id) for id in set([ node.getparent().get('id') for node in page.page_tree.xpath('//' + Word.XML_TAG + f'/debug[@msg="{fix_boxes.DEBUG_MSG}"]')])])
        self.assertEqual(len(fixed_word_ids), 2)
        self.assertEqual(fix_boxes.fix_boxes(page), 0)
        self.assertEqual(page.words[fixed_word_ids[0]].earlier_version.text, 'Wink.')
        self.assertEqual(page.words[fixed_word_ids[1]].overwrites_word.text, 'ist')

    def test_init_word_with_box(self):
        """WordWithBoxes.create_cls attaches boxes to the expected word parts."""
        page = Page(self.fix_boxes)
        word_node = [ node.getparent() for node in page.page_tree.xpath('//' + Word.XML_TAG + f'/debug[@msg="{fix_boxes.DEBUG_MSG}"]')][0]
        word = fix_boxes.WordWithBoxes.create_cls(word_node)
        self.assertEqual(word.word_parts[1].text, 'endung')
        self.assertTrue(word.word_parts[1].word_box is not None)
        self.assertEqual(word.word_parts[2].text, ',')
        self.assertTrue(word.word_parts[2].word_box is not None)
        word_node = [ node.getparent() for node in page.page_tree.xpath('//' + Word.XML_TAG + f'/debug[@msg="{fix_boxes.DEBUG_MSG}"]')][2]
        word = fix_boxes.WordWithBoxes.create_cls(word_node)
        self.assertEqual(len(word.word_parts), 0)
        # Disabled debug output kept as an inert string literal:
        """
        tree = ET.Element('page')
        word.attach_word_to_tree(tree)
        print(ET.dump(tree))
        """

    def test_split_and_attach(self):
        """split_into_parts_and_attach_box puts the box on the 'endung' part."""
        page = Page(self.fix_boxes)
        word = [ word for word in page.words if word.text == 'Wendung,' ][0]
        fix_boxes.split_into_parts_and_attach_box(word, 0, 'ink', True, 'endung')
        self.assertEqual(word.word_parts[1].text, 'endung')
        self.assertTrue(word.word_parts[1].word_box is not None)

if __name__ == "__main__":
    unittest.main()
Index: fixes/test_server.py
===================================================================
--- fixes/test_server.py (revision 0)
+++ fixes/test_server.py (revision 101)
@@ -0,0 +1,32 @@
+import lxml.etree as ET
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname, basename
+import shutil
+import sys
+import tempfile
+import unittest
+import warnings
+
+import server
+
+sys.path.append('svgscripts')
+from datatypes.page import Page
+
+
class TestServer(unittest.TestCase):
    """Tests for fixes/server.py."""

    def setUp(self):
        server.UNITTESTING = True
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.fix_boxes = DATADIR + sep + 'Mp_XIV_page416.xml'
        self.fix_boxes_src = DATADIR + sep + '03.svg'

    @unittest.skip('runs forever')
    def test_run(self):
        """Smoke test for the blocking http server loop (skipped: never returns)."""
        server.run()

    def test_get_local_dictionary(self):
        """get_local_file_dictionary exposes at least the XML file key."""
        local_dictionary = server.Server.get_local_file_dictionary()
        self.assertTrue(server.Server.XML in local_dictionary.keys())

if __name__ == "__main__":
    unittest.main()
Index: fixes/fix_boxes.py
===================================================================
--- fixes/fix_boxes.py (revision 100)
+++ fixes/fix_boxes.py (revision 101)
@@ -1,215 +1,215 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
from fix_old_data import save_page
sys.path.append('svgscripts')
from convert_wordPositions import HTMLConverter
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, update_transkription_position_ids
from join_faksimileAndTranskription import sort_words
from util import back_up, back_up_svg_file, copy_faksimile_svg_file
from process_files import update_svgposfile_status
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
MAX_SVG_XY_THRESHOLD = 10
BOX_ERROR_STATUS = 'box error'
DEBUG_MSG = 'TODO: should have a box'
class WordWithBoxes(Word):
    """Word that reconstructs its boxes from <debug> child nodes."""

    @classmethod
    def create_cls(cls, word_node):
        """Creates a word from a (lxml.Element) node.
        [:return:] WordWithBoxes
        """
        word = super(WordWithBoxes,cls).create_cls(word_node)
        word.missing_boxes = []
        for index, debug_node in enumerate(word_node.xpath('./debug')):
            missing_text = debug_node.get('text')
            # 'earlier-version' is serialized as the string 'true'/'false'.
            is_earlier_version = bool(debug_node.get('earlier-version'))\
                    and debug_node.get('earlier-version') == 'true'
            overwritten_by = debug_node.get('overwritten-by')
            if overwritten_by is not None:
                split_into_parts_and_attach_box(word, index, missing_text, is_earlier_version, overwritten_by)
            else:
                attach_box(word, 0, missing_text, False)
            # NOTE(review): grouping reconstructed from the mangled diff —
            # history/cleanup assumed to run for both branches; confirm.
            word.create_correction_history()
            if len(word.corrections) > 0:
                for wp in word.word_parts:
                    wp.overwrites_word = None
        return word
def attach_box(target_word, box_index, earlier_text, is_earlier_version):
    """Attach box to word.

    Covers the word's whole area: if the word spans several transkription
    positions, their positional word parts are merged into a single
    transkription position first.
    """
    transkription_position = target_word.transkription_positions[0]
    if len(target_word.transkription_positions) > 1:
        positional_word_parts = []
        for tp in target_word.transkription_positions:
            positional_word_parts += tp.positional_word_parts
        transkription_position = TranskriptionPosition(positional_word_parts=positional_word_parts)
    target_word.word_box = Box(id=box_index, path=Path.create_path_from_transkription_position(transkription_position).path,\
            earlier_text=earlier_text, earlier_version=is_earlier_version)
def split_into_parts_and_attach_box(target_word, box_index, missing_text, is_earlier_version, overwritten_by, child_process=False)->list:
    """Split word into word parts and attach a box to the part with text == overwritten_by.

    Recurses into existing word parts; child_process=True marks recursive calls
    whose results are collected by the caller instead of spliced in directly.
    """
    if len(target_word.word_parts) > 0:
        index = 0
        if True in [ wp.word_box is not None for wp in target_word.word_parts ]:
            # Resume after the last part that already carries a box.
            latest_word_with_box = [ wp for wp in target_word.word_parts if wp.word_box is not None ][-1]
            index = target_word.word_parts.index(latest_word_with_box)+1
        child_word_parts = []
        for wp in target_word.word_parts[index:]:
            word_parts = split_into_parts_and_attach_box(wp, box_index, missing_text, is_earlier_version, overwritten_by, child_process=True)
            if child_process:
                child_word_parts += word_parts
            elif len(word_parts) > 0:
                # Replace the split part by its pieces, keeping their order.
                old_index = target_word.word_parts.index(wp)
                target_word.word_parts[old_index] = word_parts[0]
                for new_wp in word_parts[1:]:
                    target_word.word_parts.insert(old_index+1, new_wp)
                if overwritten_by in [ new_wp.text for new_wp in word_parts ]:
                    # Box attached; no need to split further parts.
                    break
        if child_process:
            return child_word_parts
        return target_word.word_parts
    elif overwritten_by in target_word.text:
        new_words_triple = target_word.split(overwritten_by)
        word_with_box = [ wp for wp in new_words_triple if wp is not None and wp.text == overwritten_by ][0]
        attach_box(word_with_box, box_index, missing_text, is_earlier_version)
        if not child_process:
            if len(new_words_triple) > 1:
                target_word.word_parts = [ i for i in new_words_triple if i is not None ]
                target_word.transkription_positions = []
            else:
                # The whole word was overwritten: box the word itself.
                target_word.word_box = word_with_box.word_box
        return [ i for i in new_words_triple if i is not None ]
    return []
def fix_boxes(page)->int:
    """Fix boxes and return exit code

    Re-creates every word flagged with a <debug msg=DEBUG_MSG> node as a
    WordWithBoxes and swaps it into page.words; saves the page unless
    UNITTESTING is set. Returns 2 when a flagged word cannot be matched.
    """
    exit_status = 0
    for word_node in set([ node.getparent() for node in page.page_tree.xpath('//' + Word.XML_TAG + f'/debug[@msg="{DEBUG_MSG}"]')]):
        word = WordWithBoxes.create_cls(word_node)
        try:
            replace_word = [ w for w in page.words if w.id == word.id and w.text == word.text ][0]
            page.words[page.words.index(replace_word)] = word
        except IndexError:
            # No matching word on the page: abort without saving.
            return 2
    if not UNITTESTING:
        save_page(page, attach_first=True)
    return exit_status
def usage():
    """prints information on how to use the script
    """
    # The usage text lives in main's docstring.
    print(main.__doc__)
def main(argv):
    """This program can be used to fix boxes.
    svgscripts/fix_boxes.py [OPTIONS] a xml file about a manuscript, containing information about its pages.
    a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help show help
    :return: exit code (int)
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    xml_file = args[0]
    if isfile(xml_file):
        counter = 0
        for page in Page.get_pages_from_xml_file(xml_file, status_contains=BOX_ERROR_STATUS):
            # BUGFIX: 'counter' was reset to 0 at the top of every iteration,
            # so the final report only reflected the last page.
            if not UNITTESTING:
                print(Fore.CYAN + f'Fixing boxes of {page.title}, {page.number} ...' + Style.RESET_ALL)
                back_up(page, page.xml_file)
            if fix_boxes(page) == 0:
                counter += 1
        if not UNITTESTING:
            print(Style.RESET_ALL + f'[{counter} pages changed]')
    else:
        raise FileNotFoundError('File {} does not exist!'.format(xml_file))
    return exit_status

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: fixes/server.py
===================================================================
--- fixes/server.py (revision 0)
+++ fixes/server.py (revision 101)
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This program can be used to send xml data as json over http.
+"""
+# Copyright (C) University of Basel 2020 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+import getopt
+from http.server import BaseHTTPRequestHandler, HTTPServer, SimpleHTTPRequestHandler
+import http.client
+import simplejson as json
+from os.path import exists, isfile, isdir, dirname, basename
+import cgi
+import sys
+
+
+from interactive_editor import ResponseOrganizer
+
+sys.path.append('svgscripts')
+from convert_wordPositions import JSONConverter
+from datatypes.page import Page
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+UNITTESTING = False
+
+class Server(BaseHTTPRequestHandler):
+ CONTENT_TYPE = 'Content-Type'
+ CONTENT_LENGTH = 'Content-Length'
+ CONFIG_FILE = '.local_variables'
+ XML = 'xml'
+ SVG = 'svg'
+ MANUSCRIPT = 'manuscript'
+
+ @classmethod
+ def get_local_file_dictionary(cls) ->dict:
+ """Return a dictionary about local files with keys: XML, SVG, MANUSCRIPT.
+ """
+ local_file_dictionary = {}
+ if isfile(cls.CONFIG_FILE):
+ with open(cls.CONFIG_FILE, 'r') as reader:
+ for raw_line in reader.readlines():
+ line = raw_line.replace('\n', '')
+ line_content = line.split('=')
+ if len(line_content) == 2\
+ and isfile(line_content[1]):
+ local_file_dictionary.update({line_content[0]: line_content[1]})
+ return local_file_dictionary
+
+ def _set_headers(self, response_code):
+ self.send_response(response_code)
+ self.send_header('Content-type', 'application/json')
+ #self.send_header('Access-Control-Allow-Credentials', 'true')
+ self.send_header("Cache-Control", "no-cache")
+ self.send_header("Access-Control-Allow-Origin", "*")
+ self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
+ self.send_header("Access-Control-Allow-Headers", "X-Requested-With")
+ self.send_header("Access-Control-Allow-Headers", "Content-Type")
+ self.end_headers()
+
+ def do_HEAD(self):
+ self._set_headers(200)
+
+ def do_OPTIONS(self):
+ """Process OPTIONS.
+ """
+ self.send_response(200, "ok")
+ self.send_header('Access-Control-Allow-Origin', '*')
+ self.send_header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS')
+ self.send_header("Access-Control-Allow-Headers", "X-Requested-With")
+ self.send_header("Access-Control-Allow-Headers", "Content-Type")
+ self.end_headers()
+
+ def do_GET(self):
+ """Process GET.
+ """
+ self._set_headers(200)
+ local_file_dictionary = Server.get_local_file_dictionary()
+ if self.XML in local_file_dictionary.keys():
+ response_organizer = ResponseOrganizer()
+ json_dict = response_organizer.create_json_dict(local_file_dictionary[self.XML])
+ self.wfile.write(str.encode(json.dumps(json_dict)))
+
+ def _parse_header(self, key) ->str:
+ """Return content of header for key.
+ """
+ headers = [ header for header in self.headers._headers if key in header ]
+ if len(headers) > 0:
+ return headers[0][1]
+ return ''
+
+ def do_POST(self):
+ """Process POST.
+ """
+ ctype = self._parse_header(self.CONTENT_TYPE)
+
+        if ctype != 'application/json':
+            self._send_error()
+ return
+
+ # read the message and convert it into a python dictionary
+ length = int(self._parse_header(self.CONTENT_LENGTH))
+ response = json.loads(self.rfile.read(length))
+ response_organizer = ResponseOrganizer()
+ json_dict = response_organizer.handle_response(response)
+ self._set_headers(200)
+ self.wfile.write(str.encode(json.dumps(json_dict)))
+
+ def _send_error(self):
+ """Send error msg.
+ """
+        self._set_headers(400)
+
+def run(port=8008):
+ server_address = ('', port)
+ httpd = HTTPServer(server_address, Server)
+ print(f'Starting httpd on port {port}...')
+ httpd.serve_forever()
+
+def usage():
+ """prints information on how to use the script
+ """
+ print(main.__doc__)
+
+def main(argv):
+ """This program can be used to send xml data as json over http.
+
+ fixes/server.py OPTIONS
+
+ OPTIONS:
+
+ -h|--help: show help
+
+ :return: exit code (int)
+ """
+ try:
+ opts, args = getopt.getopt(argv, "h", ["help"])
+ except getopt.GetoptError:
+ usage()
+ return 2
+ for opt, arg in opts:
+ if opt in ('-h', '--help') or not args:
+ usage()
+ return 0
+ run()
+    return 0
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
+