Index: shared_util/myxmlwriter.py
===================================================================
--- shared_util/myxmlwriter.py (revision 112)
+++ shared_util/myxmlwriter.py (revision 113)
@@ -1,203 +1,212 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to pretty-write a xml string to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
import inspect
import xml.dom.minidom as MD
import xml.etree.ElementTree as ET
import lxml.etree as LET
from datetime import datetime
from rdflib import URIRef
from os import makedirs
from os.path import sep, basename, dirname, isfile
import sys
+import shutil
import warnings
sys.path.append('svgscripts')
from datatypes.page import FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
FILE_TYPE_SVG_WORD_POSITION = FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = FILE_TYPE_XML_MANUSCRIPT
FILE_TYPE_XML_DICT = 'xml-dictionary'
def attach_dict_to_xml_node(dictionary, xml_node):
    """Append the content of a (possibly nested) dictionary to xml_node.

    Every key becomes a child element of xml_node. A value that is a dict
    is attached recursively below that child; any other value is written as
    the child's text, with the name of its type stored in the attribute 'type'.
    """
    for key, value in dictionary.items():
        value_type = type(value)
        if value_type == dict:
            attach_dict_to_xml_node(value, LET.SubElement(xml_node, key))
            continue
        leaf = LET.SubElement(xml_node, key, attrib={'type': value_type.__name__})
        leaf.text = str(value)
def copy_to_bak_dir(source_file: str, bak_dir='./bak'):
    """Copy file to backup directory.

    The copy is named like the source file with '_' and the current
    timestamp appended; bak_dir is created if it does not exist yet.

    :return: (str) path of the backup copy
    """
    date_string = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    target_file = bak_dir + sep + basename(source_file) + '_' + date_string
    shutil.copy(source_file, target_file)
    # hand the location back to the caller, like write_backup does
    return target_file
+
def dict2xml(dictionary, target_file_name):
    """Write dictionary to target_file_name as a xml file of type FILE_TYPE_XML_DICT.

    The dictionary is attached below the node /root/dict.
    """
    root = LET.Element('root')
    xml_tree = LET.ElementTree(root)
    dict_node = LET.SubElement(root, 'dict')
    attach_dict_to_xml_node(dictionary, dict_node)
    # the name of this function is recorded as the script in the metadata
    this_function = inspect.currentframe().f_code.co_name
    write_pretty(xml_element_tree=xml_tree, file_name=target_file_name,
                 script_name=this_function, file_type=FILE_TYPE_XML_DICT)
def get_dictionary_from_node(node):
    """Return dictionary from node.

    A node with children yields { tag: { ...children... } }. A leaf node
    yields { tag: value }, where value is the node's text converted to the
    type named by its attribute 'type' (str if the attribute is missing)
    or None if the node has no text.

    :return: dict
    """
    child_nodes = list(node)
    if len(child_nodes) > 0:
        child_dict = {}
        for child_node in child_nodes:
            child_dict.update(get_dictionary_from_node(child_node))
        return { node.tag: child_dict }
    if not bool(node.text):
        return { node.tag: None }
    type_name = node.get('type')
    if not bool(type_name):
        return { node.tag: str(node.text) }
    if type_name == 'bool':
        # bool('False') is True, so booleans are restored by comparing the text
        return { node.tag: node.text == 'True' }
    # NOTE(review): eval executes whatever the 'type' attribute contains.
    # This is only safe for files written by attach_dict_to_xml_node;
    # do not use this function on xml files from untrusted sources.
    elem_cls = eval(type_name)
    return { node.tag: elem_cls(node.text) }
def lock_xml_tree(xml_element_tree, **locker_dict):
    """Add a lock node to the metadata of xml_element_tree.

    Nothing happens if the tree is None or already locked. The keyword
    arguments 'reference_file' and 'message' are stored below the lock node;
    a missing metadata node is created.
    """
    if xml_element_tree is None or test_lock(xml_element_tree, silent=True):
        return
    message = locker_dict.get('message') or ''
    reference_file = locker_dict.get('reference_file') or ''
    metadata_nodes = xml_element_tree.xpath('./metadata')
    if len(metadata_nodes) > 0:
        metadata = metadata_nodes[0]
    else:
        metadata = LET.SubElement(xml_element_tree.getroot(), 'metadata')
    lock = LET.SubElement(metadata, 'lock')
    reference_node = LET.SubElement(lock, 'reference-file')
    reference_node.text = reference_file
    if message != '':
        message_node = LET.SubElement(lock, 'message')
        message_node.text = message
def parse_xml_of_type(xml_source_file, file_type):
    """Return the xml tree of xml_source_file if the file is of type file_type.

    The file is parsed with blank text removed.
    Raises an Exception if the file has a different type.
    """
    xml_tree = LET.parse(xml_source_file, LET.XMLParser(remove_blank_text=True))
    if xml_has_type(file_type, xml_tree=xml_tree):
        return xml_tree
    raise Exception('File {} is not of type {}!'.format(xml_source_file, file_type))
def test_lock(xml_element_tree=None, silent=False):
    """Test whether xml_element_tree contains a lock node in its metadata.

    Unless silent is set, a warning is issued for a locked tree that names
    the file, the reference file and the lock message (if present).

    :return: True if locked
    """
    if xml_element_tree is None:
        return False
    if len(xml_element_tree.findall('./metadata/lock')) == 0:
        return False
    reference_nodes = xml_element_tree.findall('./metadata/lock/reference-file')
    message_nodes = xml_element_tree.findall('./metadata/lock/message')
    if silent:
        return True
    warning_msg = 'File {0} is locked!'.format(xml_element_tree.docinfo.URL)
    if len(reference_nodes) > 0:
        warning_msg = warning_msg.replace('!', ' ') + 'on {0}.'.format(reference_nodes[0].text)
    if len(message_nodes) > 0:
        warning_msg = warning_msg + '\n{0}'.format(message_nodes[0].text)
    warnings.warn(warning_msg)
    return True
def update_metadata(xml_element_tree, script_name, file_type=None):
    """Update the metadata node of xml_element_tree.

    If there is no metadata node, it is created together with the type
    (if file_type is given) and a createdBy entry for script_name.
    Otherwise the modifiedBy entry of script_name gets the current
    timestamp; the entry is created if it is missing.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    root = xml_element_tree.getroot()
    metadata = root.find('./metadata')
    if metadata is None:
        metadata = LET.SubElement(root, 'metadata')
        if file_type is not None:
            LET.SubElement(metadata, 'type').text = file_type
        created_by = LET.SubElement(metadata, 'createdBy')
        LET.SubElement(created_by, 'script').text = script_name
        LET.SubElement(created_by, 'date').text = datetime.now().strftime(timestamp_format)
        return
    modified_by_nodes = metadata.findall('./modifiedBy[@script="{}"]'.format(script_name))
    if len(modified_by_nodes) == 0:
        modified_by = LET.SubElement(metadata, 'modifiedBy', attrib={'script': script_name})
    else:
        modified_by = modified_by_nodes[0]
    modified_by.text = datetime.now().strftime(timestamp_format)
def write_backup(xml_element_tree: LET.ElementTree, file_type=None, bak_dir='./bak') -> str:
    """Write xml_element_tree to a timestamped file in bak_dir.

    The backup is named after the tree's docinfo.URL; bak_dir is created
    if it is missing.

    :return: target_file_name
    """
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    reference_file = xml_element_tree.docinfo.URL
    target_file_name = sep.join([bak_dir, basename(reference_file) + '_' + timestamp])
    # this function's name and the backed up file are recorded as script name
    origin = '({0},{1})'.format(inspect.currentframe().f_code.co_name, reference_file)
    write_pretty(xml_element_tree=xml_element_tree, file_name=target_file_name,
                 script_name=__file__ + origin, file_type=file_type)
    return target_file_name
def write_pretty(xml_string=None, xml_element_tree=None, file_name=None, script_name=None, backup=False, file_type=None, **locker_dict):
    """Writes a xml string pretty to a file.

    Either xml_string or xml_element_tree must be given. A locked tree is
    not written at all. For an unlocked tree the function optionally locks
    it (keyword 'reference_file', optional 'message'), updates its metadata
    (if script_name is given) and writes a backup (if backup is set).
    If file_name is missing, the tree's docinfo.URL is used.

    Raises an Exception if neither string nor tree is given or if no
    file name can be determined.
    """
    if not bool(xml_string) and not bool(xml_element_tree):
        raise Exception("write_pretty needs a string or a xml.ElementTree!")
    if test_lock(xml_element_tree):
        # test_lock has issued its warning; a locked tree is never written
        return
    if len(locker_dict) > 0 and bool(locker_dict.get('reference_file')):
        lock_xml_tree(xml_element_tree, **locker_dict)
    if script_name is not None and xml_element_tree is not None:
        update_metadata(xml_element_tree, script_name, file_type=file_type)
    if file_name is None and xml_element_tree is not None\
            and xml_element_tree.docinfo is not None and xml_element_tree.docinfo.URL is not None:
        file_name = xml_element_tree.docinfo.URL
    if file_name is None:
        raise Exception("write_pretty needs a file_name or a xml.ElementTree with a docinfo.URL!")
    if backup and xml_element_tree is not None:
        write_backup(xml_element_tree, file_type=file_type)
    dom = MD.parseString(xml_string) if bool(xml_string)\
            else MD.parseString(ET.tostring(xml_element_tree.getroot()))
    # the xml declaration announces utf-8, so the file has to be written as
    # utf-8 regardless of the platform default; 'with' closes it on errors
    with open(file_name, "w", encoding='utf-8') as f:
        dom.writexml(f, addindent="\t", newl='\n', encoding='utf-8')
def xml2dict(xml_source_file):
    """Create a dict from xml_source_file of type FILE_TYPE_XML_DICT.

    Raises an Exception if the file is not of that type or has no
    /root/dict node.

    :return: dict
    """
    xml_tree = LET.parse(xml_source_file)
    dict_nodes = []
    if xml_has_type(FILE_TYPE_XML_DICT, xml_tree=xml_tree):
        dict_nodes = xml_tree.xpath('/root/dict')
    if len(dict_nodes) == 0:
        msg = 'File {} is not of type {}!'.format(xml_source_file, FILE_TYPE_XML_DICT)
        raise Exception(msg)
    new_dict = {}
    for node in dict_nodes[0].getchildren():
        new_dict.update(get_dictionary_from_node(node))
    return new_dict
def xml_has_type(file_type, xml_source_file=None, xml_tree=None):
    """Return true if xml_source_file/xml_tree has file type == file_type.

    The type is read from the first //metadata/type node. If no tree is
    given, xml_source_file is parsed. False is returned if there is neither
    a tree nor an existing file, or if the tree has no type node.
    """
    if xml_tree is None:
        # without a tree there must be an existing file to parse
        if xml_source_file is None or not isfile(xml_source_file):
            return False
        xml_tree = LET.parse(xml_source_file)
    type_texts = xml_tree.xpath('//metadata/type/text()')
    if len(type_texts) < 1:
        return False
    return type_texts[0] == file_type
Index: replace.vim
===================================================================
--- replace.vim (revision 0)
+++ replace.vim (revision 113)
@@ -0,0 +1,12 @@
" Back up the current buffer to bak/<file name>_<timestamp>, then normalize
" characters in the whole buffer.
:let fname = 'bak/' . expand('%:t') . '_' . strftime('%Y_%m_%d_%H.%M.%S')
" :write cannot create the backup if the directory is missing
:if !isdirectory('bak') | call mkdir('bak', 'p') | endif
" the name is held in a variable, so :write has to be built with :execute
" (a plain ':w fname' would write a file literally named 'fname')
:silent execute 'write' fname
" NOTE(review): pattern and replacement of several substitutions below look
" identical here; presumably they differ in Unicode normalization or
" encoding -- verify the actual bytes before relying on them.
:%s/ß/ß/ge
:%s/\s=\s/=/ge
:%s/ä/ä/ge
:%s/ö/ö/ge
:%s/ü/ü/ge
:%s/uü/ü/ge
:%s/ü/ü/ge
:%s/\.\.\./…/ge
+:%s/>->– 1}}}
from colorama import Fore, Style
from datetime import datetime
from functools import cmp_to_key
import getopt
import inspect
import itertools
import lxml.etree as ET
import re
import shutil
import signal
import string
import subprocess
from svgpathtools import svg_to_paths
import sys
import tempfile
import os
from os import listdir, sep, path, setpgrp, devnull, makedirs
from os.path import basename, commonpath, dirname, exists, isfile, isdir, realpath, splitext
import warnings
import wget
import xml.etree.ElementTree as XET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.faksimile_image import FaksimileImage
from datatypes.lineNumber import LineNumber
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, update_transkription_position_ids
from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
class ExternalViewer:
    """This class can be used to show files with external viewers.
    """
    # maps a file extension to the program that displays such files
    file_format_viewer_dict = { '.pdf': PDF_READER, '.svg': SVG_EDITOR }

    @classmethod
    def show_files(cls, single_file=None, list_of_files=None):
        """Opens file(s) with corresponding external viewer(s).

        All files but the last one are opened in background processes with
        their output discarded. The last file is opened in the foreground;
        when its viewer terminates, the background viewers are terminated
        as well. Files without a known viewer are skipped.

        :param single_file: a file name, or a list of file names (then
            list_of_files is ignored)
        :param list_of_files: file names that are opened before single_file
        """
        # work on a copy: neither a shared default nor the caller's list is modified
        if isinstance(single_file, list):
            files = list(single_file)
        else:
            files = list(list_of_files) if list_of_files is not None else []
            if single_file is not None:
                files.append(single_file)
        process_list = []
        last_index = len(files) - 1
        try:
            for index, file2open in enumerate(files):
                viewer = cls.file_format_viewer_dict.get(splitext(file2open)[1])
                if viewer is None:
                    continue
                if index < last_index:
                    # own session/process group, so the viewer and its children can be killed together
                    process_list.append(\
                            subprocess.Popen([viewer, file2open], stdout=subprocess.DEVNULL,\
                            stderr=subprocess.DEVNULL, preexec_fn=os.setsid))
                else:
                    subprocess.run([viewer, file2open])
        finally:
            # terminate background viewers even if the foreground viewer failed
            for process in process_list:
                try:
                    os.killpg(os.getpgid(process.pid), signal.SIGTERM)
                except ProcessLookupError:
                    # the viewer is already gone
                    pass
def back_up(page: Page, reference_file, bak_dir='./bak') -> str:
    """Write the tree of page to a timestamped file in bak_dir.

    The name of the backup file is stored in page.bak_file; bak_dir is
    created if it is missing.

    :return: target_file_name
    """
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    source_name = basename(page.page_tree.docinfo.URL)
    page.bak_file = sep.join([bak_dir, source_name + '_' + timestamp])
    # this function's name and the reference file are recorded as script name
    origin = '({0},{1})'.format(inspect.currentframe().f_code.co_name, reference_file)
    write_pretty(xml_element_tree=page.page_tree, file_name=page.bak_file,
                 script_name=__file__ + origin, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page.bak_file
def back_up_svg_file(svg_tree: ET.ElementTree, namespaces=None, bak_dir='./bak') -> str:
    """Copy svg_tree to a file in bak_dir whose name starts with a timestamp.

    If no namespaces are given, they are taken from the root's nsmap,
    with the default namespace mapped to the prefix 'ns'.

    :return: target_file_name
    """
    if namespaces is None:
        namespaces = {}
        for prefix, uri in svg_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    bak_file = sep.join([bak_dir, timestamp + '_' + basename(svg_tree.docinfo.URL)])
    copy_faksimile_svg_file(target_file=bak_file, faksimile_tree=svg_tree, namespaces=namespaces)
    return bak_file
def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, abs_image_path=None, local_image_path=None, namespaces=None):
    """Copy a faksimile_svg_file to target_file.

    The source is either faksimile_tree (its content is written) or
    faksimile_source_file. The target is target_file, optionally below
    target_directory; with only target_directory the source's base name
    is used. If local_image_path/abs_image_path are given, they are set
    on the first image node as xlink:href/sodipodi:absref.

    Raises an Exception if neither source nor target can be determined.
    """
    if faksimile_source_file is None:
        if faksimile_tree is None:
            raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
        faksimile_source_file = faksimile_tree.docinfo.URL
    if target_directory is not None:
        target_name = target_file if target_file is not None else basename(faksimile_source_file)
        target_file = target_directory + sep + target_name
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    # only the attributes of the svg root are needed here
    _, _, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True)
    for key in [ key for key in svg_attributes.keys() if key.startswith('xmlns:') ]:
        try:
            XET.register_namespace(key.replace('xmlns:', ''), svg_attributes[key])
        except ValueError:
            # register_namespace refuses reserved or invalid prefixes;
            # such a prefix is left unregistered on purpose
            pass
    XET.register_namespace('', 'http://www.w3.org/2000/svg')
    if namespaces is None:
        namespaces = { 'ns': svg_attributes['xmlns'], 'xlink': svg_attributes.get('xmlns:xlink'),\
                'sodipodi': svg_attributes.get('xmlns:sodipodi') or svg_attributes.get('sodipodi') }
    else:
        # defaults are added below: do not modify the caller's dictionary
        namespaces = dict(namespaces)
    # namespaces taken from a file or passed by a caller may lack these
    # prefixes, although they are needed to set the image attributes
    if not bool(namespaces.get('sodipodi')):
        namespaces['sodipodi'] = 'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd'
    if not bool(namespaces.get('xlink')):
        namespaces['xlink'] = 'http://www.w3.org/1999/xlink'
    if faksimile_tree is not None:
        # convert the tree to a xml.etree tree, whatever library created it
        if isinstance(faksimile_tree, ET._ElementTree):
            element = XET.fromstring(ET.tostring(faksimile_tree))
        else:
            element = XET.fromstring(XET.tostring(faksimile_tree.getroot()))
        target_tree = XET.ElementTree(element)
    else:
        target_tree = XET.parse(faksimile_source_file)
    if local_image_path is not None or abs_image_path is not None:
        image_nodes = target_tree.findall('.//ns:image', namespaces=namespaces)
        if len(image_nodes) > 0:
            image_node = image_nodes[0]
            if local_image_path is not None:
                image_node.set('{%s}href' % namespaces['xlink'], local_image_path)
            if abs_image_path is not None:
                image_node.set('{%s}absref' % namespaces['sodipodi'], abs_image_path)
    target_tree.write(target_file)
def copy_faksimile_update_image_location(faksimile_source_file=None, faksimile_tree=None, target_file=None, target_directory=None, overwrite=False):
    """Copy a faksimile_svg_file to target_file and update image location.

    The source is faksimile_source_file or, if that is missing, the
    docinfo.URL of faksimile_tree. The target is target_file, optionally
    below target_directory; with only target_directory the source's base
    name is used. The file is copied only if the target does not exist
    yet or overwrite is set; otherwise a warning is issued.

    Raises an Exception if neither source nor target can be determined.
    """
    if faksimile_source_file is None and faksimile_tree is not None:
        faksimile_source_file = faksimile_tree.docinfo.URL
    elif faksimile_source_file is None:
        raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
    if target_file is not None and target_directory is not None:
        target_file = target_directory + sep + target_file
    elif target_file is None and target_directory is not None:
        target_file = target_directory + sep + basename(faksimile_source_file)
    elif target_directory is None and target_file is not None:
        target_directory = dirname(target_file)
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    source_tree = ET.parse(faksimile_source_file) if faksimile_tree is None else faksimile_tree
    # the default namespace (prefix None) is addressed as 'ns' in xpath queries
    namespaces = { k if k is not None else 'ns': v for k, v in source_tree.getroot().nsmap.items() }
    image_nodes = source_tree.xpath('//ns:image', namespaces=namespaces)
    local_image_path = None
    abs_image_path = None
    user_abs_image_path = None
    if len(image_nodes) > 0:
        # only the first image node is considered
        # NOTE(review): presumably image.local_path is the location of the image on this machine -- confirm in FaksimileImage
        image = FaksimileImage.CREATE_IMAGE(image_nodes[0], source_file=faksimile_source_file)
        abs_image_path = image.local_path
        # if the name of a known user occurs in target_directory, the absolute path
        # is rewritten from FAKSIMILE_LOCATION to that user's root location
        for user_name in USER_ROOT_LOCATION_DICT.keys():
            if user_name in target_directory:
                user_abs_image_path = abs_image_path.replace(FAKSIMILE_LOCATION, USER_ROOT_LOCATION_DICT[user_name]).replace('//','/')
                break
        # if target_directory is subdir of FAKSIMILE_LOCATION
        if realpath(target_directory).startswith(realpath(FAKSIMILE_LOCATION)):
            # build a relative path: one '..' per directory level of target_directory
            # below the common path, followed by the image's path below the common path
            common_path = commonpath([ realpath(target_directory), realpath(dirname(image.local_path)) ])
            relative_directory = '/'.join(\
                    [ '..' for d in realpath(target_directory).replace(common_path + '/', '').split('/') ])
            local_image_path = relative_directory + realpath(image.local_path).replace(common_path, '')
            # the relative path is dropped if it does not lead to a file
            if not isfile(target_directory + sep + local_image_path):
                local_image_path = None
        elif abs_image_path is not None:
            local_image_path = abs_image_path
        # download the image if it is not available at its absolute path
        if abs_image_path is not None and not isfile(abs_image_path):
            wget.download(image.URL, out=dirname(abs_image_path))
    if not isfile(target_file) or overwrite:
        # the user specific absolute path takes precedence
        abs_image_path = user_abs_image_path if user_abs_image_path is not None else abs_image_path
        copy_faksimile_svg_file(target_file=target_file, faksimile_source_file=faksimile_source_file,\
                faksimile_tree=faksimile_tree, abs_image_path=abs_image_path,\
                local_image_path=local_image_path, namespaces=namespaces)
    else:
        msg = 'File {0} not copied to directory {1}, it already contains a file {2}.'.format(faksimile_source_file, target_directory, target_file)
        warnings.warn(msg)
def copy_xml_file_word_pos_only(xml_source_file, target_directory):
    """Create a page file in target_directory that contains the words of xml_source_file.

    The new page gets title, number and orientation of the source page.

    :return: (str) xml_target_file
    """
    xml_target_file = sep.join([target_directory, basename(xml_source_file)])
    source_page = Page(xml_source_file)
    target_page = PageCreator(xml_target_file, title=source_page.title,
                              page_number=source_page.number, orientation=source_page.orientation)
    target_page.words = source_page.words
    target_page.update_and_attach_words2tree()
    # this function's name is recorded as part of the script name
    origin = '({})'.format(inspect.currentframe().f_code.co_name)
    write_pretty(xml_element_tree=target_page.page_tree, file_name=xml_target_file,
                 script_name=__file__ + origin, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return xml_target_file
def create_highlighted_svg_file(faksimile_tree, node_ids, nodes_color_dict=None, target_file=None, target_directory=None, local_image_path=None, namespaces=None, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY):
    """Highlight the rect and path nodes with the given node_ids and write the tree to a file.

    Highlighted nodes get highlight_color as fill, the given opacity
    and an empty style attribute.
    """
    if namespaces is None:
        namespaces = {}
        for prefix, uri in faksimile_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    # collect all nodes first, then change their attributes
    nodes_to_highlight = []
    for node_id in node_ids:
        query = '//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id)
        nodes_to_highlight.extend(faksimile_tree.xpath(query, namespaces=namespaces))
    for node in nodes_to_highlight:
        node.set('fill', highlight_color)
        node.set('opacity', opacity)
        node.set('style', '')
    copy_faksimile_update_image_location(target_file=target_file, faksimile_tree=faksimile_tree, target_directory=target_directory)
def get_empty_node_ids(faksimile_tree, x_min=0.0, x_max=0.0, y_min=0.0, y_max=0.0, text_field_id=None, faksimile_page=None, namespaces={}):
    """Return the ids of rect and path nodes inside an area that have no title element.

    If faksimile_page is given, area and text_field_id are taken from its
    text field, shifted by the position of its faksimile image.
    """
    THRESHOLD_X = 10
    if faksimile_page is not None:
        text_field = faksimile_page.text_field
        offset_x = faksimile_page.faksimile_image.x
        offset_y = faksimile_page.faksimile_image.y
        x_min = text_field.xmin + offset_x
        x_max = text_field.xmax + offset_x - THRESHOLD_X
        y_min = text_field.ymin + offset_y
        y_max = text_field.ymax + offset_y
        text_field_id = text_field.id
    if len(namespaces) == 0:
        namespaces = { ('ns' if prefix is None else prefix): uri\
                for prefix, uri in faksimile_tree.getroot().nsmap.items() }
    rect_query = '//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\
            x_min, x_max, y_min, y_max, text_field_id)
    untitled_nodes = faksimile_tree.xpath(rect_query, namespaces=namespaces)
    untitled_nodes.extend(get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]',\
            x_min, x_max, y_min, y_max, text_field_id, namespaces=namespaces))
    return [ untitled_node.get('id') for untitled_node in untitled_nodes ]
def get_mismatching_ids(words, faksimile_positions):
    """Return the words and the faksimile_positions whose text has no counterpart.

    Before comparing, the texts of the faksimile_positions are adjusted
    by replace_chars. Words with the text '.' are ignored.

    :return: 2-tuple (mismatching words, mismatching faksimile_positions)
    """
    faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions)
    word_texts = [ word.text for word in words if word.text != '.' ]
    unique_word_texts = set(word_texts)
    mismatching_words = []
    for word_text in unique_word_texts:
        if word_text in unique_faksimile_words:
            continue
        mismatching_words.extend(word for word in words if word.text == word_text)
    mismatching_faksimile_positions = []
    for faksimile_text in unique_faksimile_words:
        if faksimile_text in unique_word_texts:
            continue
        mismatching_faksimile_positions.extend(\
                position for position in faksimile_positions if position.text == faksimile_text)
    return mismatching_words, mismatching_faksimile_positions
def process_warnings4status(warnings, warning_messages, current_status, ok_status, status_prefix='') ->str:
    """Derive a status string from the warnings that occurred.

    For each entry of warning_messages that is the start of at least one
    warning's message, ':<entry>:' is appended to status_prefix. If no
    entry matches (or there are no warnings), the status is
    '<current_status>:<ok_status>:'.
    """
    ok_result = f'{current_status}:{ok_status}:'
    if warnings is None or len(warnings) == 0:
        return ok_result
    status = status_prefix
    for warning_message in warning_messages:
        if any(str(warn.message).startswith(warning_message) for warn in warnings):
            status += f':{warning_message}:'
    return ok_result if status == status_prefix else status
def change_title_of_svg(svg_file, node_id, text):
    """Set the text of the title of the rect/path node with id node_id and save svg_file.

    Nothing is written if there is no such title.
    """
    svg_tree = ET.parse(svg_file)
    namespaces = {}
    for prefix, uri in svg_tree.getroot().nsmap.items():
        namespaces['ns' if prefix is None else prefix] = uri
    title_query = '//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id)
    titles = svg_tree.xpath(title_query, namespaces=namespaces)
    if len(titles) == 0:
        return
    titles[0].text = text
    copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
def change_id_of_textfield(svg_file, manuscript_title, page_number, faksimie_page_number):
    """Change the id of a rect node to '<manuscript_title>_<page_number>' and save svg_file.

    The rect is the first one whose id does not start with 'rect' and ends
    with faksimie_page_number. Blanks of manuscript_title are replaced
    by '-'. Nothing is written if there is no such rect.
    """
    svg_tree = ET.parse(svg_file)
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    # page numbers may be passed as numbers: they are only used as text here
    faksimile_suffix = str(faksimie_page_number)
    aliases = [ rect_id for rect_id in svg_tree.xpath('//ns:rect/@id', namespaces=namespaces)\
            if not rect_id.startswith('rect') and rect_id.endswith(faksimile_suffix) ]
    if len(aliases) == 0:
        return
    alias = aliases[0]
    new_id = manuscript_title.replace(' ', '-') + '_' + str(page_number)
    text_fields = svg_tree.xpath(f'//ns:rect[@id="{alias}"]', namespaces=namespaces)
    if len(text_fields) > 0:
        text_fields[0].set('id', new_id)
        copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
+
def record_changes(original_svg_file, changed_svg_file, node_ids, namespaces={}):
    """Copy the changes made to the nodes of changed_svg_file to original_svg_file.

    For each id of node_ids: if the changed file has a title for that node,
    its text is copied to the original node's title (the title is created
    if missing); if the changed file has no title for it, the original
    nodes with that id are removed. Finally original_svg_file is rewritten.
    """
    old_tree = ET.parse(original_svg_file)
    new_tree = ET.parse(changed_svg_file)
    if len(namespaces) == 0:
        namespaces = { ('ns' if prefix is None else prefix): uri\
                for prefix, uri in new_tree.getroot().nsmap.items() }
    title_query = '//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'
    node_query = '//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'
    for node_id in node_ids:
        new_titles = new_tree.xpath(title_query.format(node_id), namespaces=namespaces)
        old_nodes = old_tree.xpath(node_query.format(node_id), namespaces=namespaces)
        if len(old_nodes) == 0:
            continue
        if len(new_titles) == 0:
            # no title in the changed file: remove the nodes from the original
            for old_node in old_nodes:
                old_node.getparent().remove(old_node)
            continue
        old_title = old_nodes[0].find('ns:title', namespaces=namespaces)
        if old_title is None:
            old_title = ET.SubElement(old_nodes[0], 'title', attrib={ 'id': new_titles[0].get('id') })
        old_title.text = new_titles[0].text
    copy_faksimile_svg_file(target_file=original_svg_file, faksimile_tree=old_tree)
def record_changes_on_svg_file_to_page(xml_source_file, svg_file, word_ids=None):
    """Copy changes made to svg_file to xml_source_file.

    The rect nodes of the svg group 'Transkription' are matched to the
    transkription positions of the page's words by ids of the form
    'word_<word id>_<transkription position id>'. Unless UNITTESTING is
    set, the updated page is written to xml_source_file.

    :param word_ids: restrict the update to words with these ids (default: all words)
    :return: datatypes.page.Page
    """
    svg_tree = ET.parse(svg_file)
    # the default namespace (prefix None) is addressed as 'ns' in xpath queries
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    transkription_field = TranskriptionField(svg_file)
    page = Page(xml_source_file)
    words = [ word for word in page.words if word.id in word_ids ]\
            if word_ids is not None else page.words
    new_page_words = []
    for word in words:
        word_id = 'word_' + str(word.id) + '_'
        recorded_ids = []
        # update the positions that still have a rect with their id
        for transkription_position in word.transkription_positions:
            transkription_position_id = word_id + str(transkription_position.id)
            tp_nodes = svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]'.format(transkription_position_id), namespaces=namespaces)
            if len(tp_nodes) > 0:
                record_changes_to_transkription_position(tp_nodes[0], transkription_position,\
                        transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                recorded_ids.append(transkription_position_id)
        # rects whose id contains this word's id prefix, but that match none of its positions
        extra_nodes = [ node for node in\
                svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[contains(@id, "{0}")]'.format(word_id), namespaces=namespaces)\
                if node.get('id') not in recorded_ids ]
        if len(extra_nodes) > 0:
            for extra_node in extra_nodes:
                # the inkscape label (without '#') is read as the id the rect had before
                old_ids = [ inkscape_id.replace('#','') for inkscape_id in\
                        svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]/@inkscape:label'.format(extra_node.get('id')),\
                        namespaces=namespaces) ]
                if len(old_ids) > 0 and re.match(r'word_[0-9]+_[0-9]+', old_ids[0]):
                    old_id_list = old_ids[0].split('_')
                    ref_word_id = int(old_id_list[1])
                    # NOTE(review): ref_tp_id stays a str, while ref_word_id is converted to int;
                    # if tp.id is an int, the comparison below never matches -- confirm
                    ref_tp_id = old_id_list[2]
                    ref_words = [ word for word in page.words if word.id == ref_word_id ]
                    if len(ref_words) > 0:
                        ref_tps = [ tp for tp in ref_words[0].transkription_positions\
                                if tp.id == ref_tp_id ]
                        if len(ref_tps) > 0:
                            # move the position from the word it belonged to over to the current word
                            ref_words[0].transkription_positions.remove(ref_tps[0])
                            record_changes_to_transkription_position(extra_node,\
                                    ref_tps[0], transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                            word.transkription_positions.append(ref_tps[0])
    # rebuild the list of words: words without transkription positions are dropped
    for word in page.words:
        if word.has_mixed_status('text'):
            new_page_words += [ word for word in word.split_according_to_status('text') if word.text is not None and word.text != '' ]
        elif len(word.transkription_positions) > 0:
            # the word takes the first non-empty text of its positions
            new_text = [ tp.text for tp in word.transkription_positions if tp.text is not None and tp.text != '' ]
            if len(new_text) > 0:
                word.text = new_text[0]
            new_page_words.append(word)
    page.words = new_page_words
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    page.unlock()
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + ' -> ' + inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_on_xml_file_to_page(xml_source_file, xml_file) -> Page:
    """Copy changes made to xml_file to xml_source_file.

    The page of xml_source_file is unlocked and backed up, then its words
    are replaced by the words of xml_file: words with split_strings are
    split, words with a join_string are joined with their neighbour.
    Unless UNITTESTING is set, the page is written to xml_source_file.

    :return: datatypes.page.Page
    """
    copy_page = Page(xml_file)
    page = Page(xml_source_file)
    page.unlock()
    back_up(page, xml_file)
    page.words = []
    for word in copy_page.words:
        if word.split_strings is None\
                or len(word.split_strings) == 0:
            page.words.append(word)
        else:
            # split off one new word per split string, the rest is split further
            next_word = word
            for split_string in word.split_strings:
                _, new_word, next_word = next_word.split(split_string)
                page.words.append(new_word)
            if next_word is not None:
                page.words.append(next_word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    remove_words_if_done = []
    # NOTE(review): word.id is used as index into page.words; presumably
    # update_and_attach_words2tree makes ids and indices agree -- confirm
    for word in page.words:
        if 'join_string' in word.__dict__.keys()\
                and word.join_string is not None:
            if word.id > 0\
                    and page.words[word.id-1].text + word.text == word.join_string:
                page.words[word.id-1].join(word)
                remove_words_if_done.append(word)
            # the next word must exist: the last word has no successor
            elif word.id + 1 < len(page.words)\
                    and word.text + page.words[word.id+1].text == word.join_string:
                word.join(page.words[word.id+1])
                remove_words_if_done.append(page.words[word.id+1])
    for word in remove_words_if_done:
        page.words.remove(word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, xml_file), file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_to_transkription_position(node, transkription_position, xmin=0.0, ymin=0.0, namespaces=None):
    """Copy position, size and title text of node to transkription_position.

    The attributes x and y are stored relative to xmin and ymin as left
    and top. Missing or empty attributes leave the corresponding value
    of transkription_position untouched.
    """
    if namespaces is None:
        namespaces = { ('ns' if prefix is None else prefix): uri for prefix, uri in node.nsmap.items() }
    x_value = node.get('x')
    if bool(x_value):
        transkription_position.left = float(x_value) - xmin
    y_value = node.get('y')
    if bool(y_value):
        transkription_position.top = float(y_value) - ymin
    # width and height are taken over as they are
    for attribute_name in ('width', 'height'):
        attribute_value = node.get(attribute_name)
        if bool(attribute_value):
            setattr(transkription_position, attribute_name, float(attribute_value))
    title_texts = node.xpath('./ns:title/text()', namespaces=namespaces)
    if len(title_texts) > 0:
        transkription_position.text = title_texts[0]
def replace_chars(words, faksimile_positions, unique_faksimile_words=None):
    """Adjust the texts of faksimile_positions to the characters used by the transcription words.

    A faksimile text that matches no word is replaced if one of the
    substitutions '"' -> '“', 'ss' -> 'ß' or '-' -> '–' turns it into the
    text of a word. If unique_faksimile_words is not given, it is the
    list of distinct faksimile texts sorted by length.

    :return: 2-tuple (faksimile_positions, unique_faksimile_words)
    """
    if unique_faksimile_words is None:
        distinct_texts = set(faksimile_position.text for faksimile_position in faksimile_positions)
        unique_faksimile_words = sorted(distinct_texts, key=lambda text: len(text))
    substitutions = ( (r'.*".*', '"', '“'), (r'.*ss.*', 'ss', 'ß'), (r'.*-.*', '-', '–') )

    def is_word_text(text):
        # True if at least one transcription word has this text
        return len([ word for word in words if word.text == text ]) > 0

    for index, word_text in enumerate(unique_faksimile_words):
        if not is_word_text(word_text):
            # the first substitution that leads to a known word wins
            for pattern, old_chars, new_chars in substitutions:
                if re.match(pattern, word_text) and is_word_text(word_text.replace(old_chars, new_chars)):
                    unique_faksimile_words[index] = word_text.replace(old_chars, new_chars)
                    break
            matching_positions = [ faksimile_position for faksimile_position in faksimile_positions\
                    if faksimile_position.text == word_text ]
            for faksimile_position in matching_positions:
                faksimile_position.text = unique_faksimile_words[index]
        elif word_text == '-' and is_word_text('–'):
            print([ word.text for word in words if word.text == word_text ])
            print([ word.text for word in words if word.text == '–' ])
    return faksimile_positions, unique_faksimile_words
def reset_tp_with_matrix(transkription_positions, new_left=0, new_top=-5, tr_xmin=0.0, tr_ymin=0.0):
    """Recalculate the positions that have a rotation matrix as transform.

    tr_xmin/tr_ymin are added to the matrix' x/y entries; left, bottom
    and top of the position are set relative to these entries.
    Positions without such a transform are left untouched.
    """
    if len(transkription_positions) == 0:
        return
    for tp in transkription_positions:
        if tp.transform is None or not tp.transform.isRotationMatrix():
            continue
        tp.transform.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + tr_xmin, 3)
        # an offset of at most 1 is treated as no offset
        if abs(round(tp.left, 3) - tp.transform.matrix[Matrix.XINDEX]) > 1:
            tp.left = round(tp.left, 3) - tp.transform.matrix[Matrix.XINDEX]
        else:
            tp.left = 0
        # bottom is calculated with the y entry before tr_ymin is added
        tp.bottom = round(tp.bottom, 3) - tp.transform.matrix[Matrix.YINDEX]
        tp.transform.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + tr_ymin, 3)
        tp.top = tp.bottom - tp.height + 2
def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True):
    """Update the status attribute of a svg position file.

    The status is a ':'-separated list stored on the root node of file_name.

    - If there is no old status, or it does not contain 'OK', it is replaced by status.
    - Else, if append is set, status is appended to the old status unless it is
      already part of it (in which case the old status is kept).
    - Else (append is not set) the old status is replaced by status.

    The file is rewritten afterwards.  If manuscript_file is an existing file, the
    page entry of this file is updated there as well.  Nothing happens if file_name
    does not exist.
    """
    if not isfile(file_name):
        # without the svg position file there is no tree and no page number to report
        return
    parser = ET.XMLParser(remove_blank_text=True)
    file_tree = ET.parse(file_name, parser)
    root = file_tree.getroot()
    old_status = root.get('status')
    if old_status is None or 'OK' not in old_status.split(':'):
        root.set('status', status)
    elif append:
        if status not in old_status.split(':'):
            root.set('status', old_status + ':' + status)
    else:
        # the original referenced an unbound 'new_status' here
        root.set('status', status)
    write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    if manuscript_file is not None and isfile(manuscript_file):
        page_number = root.get('number')
        update_manuscript_file(manuscript_file, page_number, file_name, status=status)
def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True):
    """Update the manuscript file with status information about a page.

    If the manuscript file contains a page node with number page_number, its status
    is updated:

    - no old status, or old status without 'OK': replaced by status;
    - else, with append set: status is appended (':'-separated) unless already present;
    - else: replaced by status.

    The node's output attribute is set to file_name if it is missing or empty.

    If there is no such page node, a new page node is appended to the pages node
    (which is created if necessary) with a running id, the page number, status and
    file_name as output.

    The manuscript file is rewritten afterwards.  Nothing happens if manuscript_file
    does not exist.
    """
    if not isfile(manuscript_file):
        return
    parser = ET.XMLParser(remove_blank_text=True)
    manuscript_tree = ET.parse(manuscript_file, parser)
    root = manuscript_tree.getroot()
    matching_pages = root.xpath('//page[@number="%s"]' % page_number)
    if len(matching_pages) > 0:
        node = matching_pages[0]
        old_status = node.get('status')
        if old_status is None or 'OK' not in old_status.split(':'):
            node.set('status', status)
        elif append:
            if status not in old_status.split(':'):
                node.set('status', old_status + ':' + status)
        else:
            # the original referenced an unbound 'new_status' here
            node.set('status', status)
        if not bool(node.get('output')):
            node.set('output', file_name)
    else:
        pages_node = root.find('pages')
        if pages_node is None:
            pages_node = ET.SubElement(root, 'pages')
        new_id = len(pages_node.findall('page')) + 1
        ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name})
    write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT)
Index: svgscripts/process_footnotes.py
===================================================================
--- svgscripts/process_footnotes.py (revision 112)
+++ svgscripts/process_footnotes.py (revision 113)
@@ -1,294 +1,296 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path as PathlibPath
from progress.bar import Bar
import inspect
import re
import shutil
import sys
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.atypical_writing import AtypicalWriting
from datatypes.clarification import Clarification
from datatypes.editor_comment import EditorComment
from datatypes.editor_correction import EditorCorrection
from datatypes.footnotes import extract_footnotes
from datatypes.imprint import extract_imprints
from datatypes.line_continuation import LineContinuation
from datatypes.standoff_tag import StandoffTag
from datatypes.text import Text
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.uncertain_decipherment import UncertainDecipherment
from util import back_up
from process_files import update_svgposfile_status
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# While True, this module skips writing files, backing up pages and printing progress
# (presumably switched on by the unit tests).
UNITTESTING = False
# Patterns applied to footnote.content with re.match, i.e. anchored at the start.
# Word-level comment with the sign '¿': (prefix up to ']' after a ':')('¿')(rest).
ATYPICAL_GROUP = re.compile(r'(.*:.*]\s*)(¿)(.*)')
# Word-level comment introduced by 'Vk': (prefix up to ']')('Vk')(rest).
CLARIFICATION_GROUP = re.compile(r'(.*:.*]\s*)(Vk)(.*)')
# Line-level comment of the form '...: Fortsetzung ' -- the reference follows the match.
CONTINUATION_GROUP = re.compile(r'(.*:\s*)(Fortsetzung\s*)')
# Prefix of a word-level comment; the comment text follows the match.
COMMENT_GROUP = re.compile(r'(.*:.*])')
# Word-level correction: (prefix up to ']')('>' optionally followed by '?')(correction text).
EDITOR_CORRECTION_GROUP = re.compile(r'(.*:.*]\s*)(>[?]*)(.*)')
# Line reference at the start of a footnote: optional 'N-' or 'd/N/...' prefixes,
# then the last line number, then ':' and the rest of the footnote.
LINE_REFERENCE_GROUP = re.compile(r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)')
# Group indices of LINE_REFERENCE_GROUP: prefix, '/'-separated middle part, last line number.
LINE_REFERENCE_GROUP_START_INDEX = 1
LINE_REFERENCE_GROUP_MID_INDEX = 2
LINE_REFERENCE_GROUP_END_INDEX = 3
# Prefix of a line-level comment ('<digits>:'); the comment text follows the match.
LINE_COMMENT_GROUP = re.compile(r'(.*\d+:)')
# Word-level uncertainty: (prefix up to ']')(optional '>' followed by '?')(rest).
UNCERTAINTY_WORD_GROUP = re.compile(r'(.*:.*]\s*)([>]*\?)(.*)')
# Footnote containing a '?'; the greedy first group makes the match end after the last '?'.
UNCERTAINTY_EDITOR_GROUP = re.compile(r'(.*)(\?)')
# Word reference: ('<digits>:' prefix)(referred word text)(']' and rest).
WORD_REFERENCE_GROUP = re.compile(r'(.*[0-9]+:\s*)(.*)(].*)')
# Module-level debug flag; not read anywhere in the visible part of this file.
DEBUG = False
def categorize_footnotes(page, footnotes=None, debug=False, skip_after=-1.0, find_content=False):
    """Categorize the footnotes of page and attach the results to its tree.

    Every footnote whose content starts with a line reference is processed by
    _process_line_match; for all other footnotes a warning is issued.

    :param page: the page the footnotes belong to
    :param footnotes: footnotes to process; extracted from page if None
    :param debug: kept for backward compatibility.  The original only assigned it to
        a function-local name DEBUG (there was no global statement), so it never had
        an effect; those dead assignments have been removed.
    :param skip_after: passed on to extract_footnotes if footnotes is None
    :param find_content: if set and page has text connection marks, let
        TextConnectionMark look for their content in the footnotes

    Unless UNITTESTING is set, the page tree is written back to its source file.
    """
    if footnotes is None:
        footnotes = extract_footnotes(page, skip_after=skip_after)
    for footnote in footnotes:
        line_match = re.match(LINE_REFERENCE_GROUP, footnote.content)
        if line_match is not None:
            _process_line_match(page, footnote, line_match)
        else:
            warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>')
    if find_content and len(page.text_connection_marks) > 0:
        TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes)
    page.update_and_attach_words2tree()
    for line in page.lines:
        line.attach_object_to_tree(page.page_tree)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
                script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def save_imprints(page):
    """Extract the imprints of page, attach them to its tree and save the tree.

    Unless UNITTESTING is set, the page tree is written back to its source file;
    the script name recorded for the write contains the name of the calling function.
    """
    for imprint in extract_imprints(page):
        imprint.attach_object_to_tree(page.page_tree)
    if UNITTESTING:
        return
    caller_name = inspect.currentframe().f_back.f_code.co_name
    target_file = page.page_tree.docinfo.URL
    write_pretty(xml_element_tree=page.page_tree, file_name=target_file,\
            script_name=f'{__file__}:{caller_name}', file_type=FILE_TYPE_SVG_WORD_POSITION)
def _is_uncertain(footnote) -> bool:
    """Return whether footnote contains a sign for uncertainty.

    This is the case if the content matches UNCERTAINTY_EDITOR_GROUP and the end of
    that match lies within a standoff markup whose css string ends with 'italic;'.
    """
    uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content)
    if uncertain_match is None:
        return False
    match_end = uncertain_match.end()
    return any(markup.css_string.endswith('italic;')
               and match_end >= markup.startIndex
               and match_end <= markup.endIndex
               for markup in footnote.standoff_markups)
def _process_line_match(page, footnote, line_match):
    """Process a footnote whose content starts with a line reference.

    The referred lines are determined from line_match (a match of
    LINE_REFERENCE_GROUP):

    - no prefix group: the single line with the final line number;
    - prefix with a '/'-separated part: the listed lines plus the final line;
    - otherwise ('N-' prefix): all lines from N up to the final line number.

    If the footnote also contains a word reference, it is processed as a comment on
    that word (on the final line).  Otherwise it is attached to each referred line,
    and a warning is issued if no such line exists on page.
    """
    word_match = re.match(WORD_REFERENCE_GROUP, footnote.content)
    end_line_number = int(line_match.group(LINE_REFERENCE_GROUP_END_INDEX))
    start_group = line_match.group(LINE_REFERENCE_GROUP_START_INDEX)
    if start_group is None:
        lines = [ line for line in page.lines if line.id == end_line_number ]
    elif line_match.group(LINE_REFERENCE_GROUP_MID_INDEX) is not None:
        line_ids = [ int(line_id) for line_id in start_group.split('/') if line_id != '' ]\
                + [ end_line_number ]
        lines = [ line for line in page.lines if line.id in line_ids ]
    else:
        # start_group has the form 'N-': strip the trailing '-'
        start_line_number = int(start_group[0:-1])
        lines = [ line for line in page.lines if line.id >= start_line_number and line.id <= end_line_number ]
    if word_match is not None:
        _process_word_match(page.words, footnote, line_match, word_match.group(2), end_line_number)
    elif len(lines) > 0:
        is_uncertain = _is_uncertain(footnote)
        for line in lines:
            _process_line_reference(page, footnote, line, is_uncertain)
    else:
        # the original formatted an undefined name 'line_number' here (NameError)
        warnings.warn(f'Footnote refers to missing line {end_line_number}: {footnote}')
def _process_line_reference(page, footnote, line, is_uncertain):
    """Attach the footnote as an editor comment to line.

    A footnote matching CONTINUATION_GROUP becomes a LineContinuation whose reference
    is the text after the match (without its last character if is_uncertain is set).
    A footnote matching LINE_COMMENT_GROUP becomes an EditorComment; its uncertainty
    is re-evaluated from the footnote and, if uncertain, the last character of the
    comment is dropped.  Anything else results in a warning.
    """
    content = footnote.content
    continuation_match = re.match(CONTINUATION_GROUP, content)
    if continuation_match is not None:
        reference_string = content[continuation_match.end():]
        if is_uncertain:
            reference_string = reference_string[:-1]
        continuation = LineContinuation.create_cls(reference_string=reference_string, is_uncertain=is_uncertain)
        line.editor_comments.append(continuation)
        return
    comment_match = re.match(LINE_COMMENT_GROUP, content)
    if comment_match is None:
        warnings.warn(f'Unknown editor comment for line "{line.id}": <{footnote}>')
        return
    is_uncertain = _is_uncertain(footnote)
    comment_stop = -1 if is_uncertain else None
    comment = content[comment_match.end():comment_stop].strip()
    line.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain))
def _process_word_match(words, footnote, line_match, word_text, line_number, parent_word_composition=None):
"""Attach the editor comment expressed by footnote to the word it refers to.

Candidates are looked up among words on line line_number: words whose text or
edited_text equals word_text (or whose text matches word_text surrounded by
non-word characters), words having a word part with that text, and words having a
word part that overwrites a word with that text.  The footnote content is then
matched against the correction, clarification, atypical-writing and uncertainty
patterns and a corresponding object is appended to the editor_comments of the word.

If there is no candidate, a word_text containing whitespace is processed once per
space-separated part; failing that, the word parts of words with line_number -1 are
searched; failing that, a warning is issued.

line_match is only passed on to the recursive calls; parent_word_composition is
accepted but not read in this function.
"""
# NOTE(review): word_text is interpolated into the pattern unescaped, so regex
# metacharacters coming from the footnote are interpreted as regex syntax and can
# raise re.error -- consider re.escape(word_text).
referred_words = [ word for word in words\
if word.line_number == line_number\
and (word.text == word_text\
or re.match(rf'\W*{word_text}\W', word.text)\
or word.edited_text == word_text) ]
# lists of word parts of those words that contain a part with exactly this text
referred_word_parts = [ word.word_parts for word in words\
if word.line_number == line_number\
and len(word.word_parts) > 0\
and word_text in [ wp.text for wp in word.word_parts ] ]
# words with a part that overwrites a word with exactly this text
overwritten_word_matches = [ word for word in words\
if word.line_number == line_number\
and len(word.word_parts) > 0\
and len([word_part for word_part in word.word_parts\
if word_part.overwrites_word is not None\
and word_part.overwrites_word.text == word_text]) > 0]
if len(referred_words) > 0\
or len(overwritten_word_matches) > 0\
or len(referred_word_parts) > 0:
# choose the target: a unique referred word, else an overwritten word, else a word
# part, else one of several referred words
word = None
if len(referred_words) == 1:
word = referred_words[0]
elif len(overwritten_word_matches) > 0:
word = [ word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts\
if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text][0]
elif len(referred_word_parts) > 0:
word = [ word_part for word_part in referred_word_parts[0] if word_part.text == word_text ][0]
# NOTE(review): the lines prefixed with '-' and '+' below are unified-diff markers
# (revision 112 -> 113): the plain 'else' is replaced by an 'elif' that prefers an
# exact text match, with a new 'else' falling back to the first referred word.
- else:
+ elif len([ better_word for better_word in referred_words if better_word.text == word_text]) > 0:
word = [ better_word for better_word in referred_words if better_word.text == word_text][0]
+ else:
+ word = referred_words[0]
atypical_match = re.match(ATYPICAL_GROUP, footnote.content)
correction_match = re.match(EDITOR_CORRECTION_GROUP, footnote.content)
clarification_match = re.match(CLARIFICATION_GROUP, footnote.content)
is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, footnote.content) is not None
if correction_match is not None:
correction = correction_match.group(3).strip()
word.editor_comments.append(EditorCorrection(correction_text=correction, is_uncertain=is_uncertain))
# only a certain correction replaces the edited text of the word
if not is_uncertain:
word.edited_text = correction
elif clarification_match is not None:
word.editor_comments.append(Clarification(text=footnote.extract_part(word_text, css_filter='bold;')))
elif atypical_match is not None:
text = footnote.extract_part(word_text, css_filter='bold;')\
if footnote.markup_contains_css_filter('bold;')\
else None
word.editor_comments.append(AtypicalWriting(text=text))
elif is_uncertain:
word.editor_comments.append(UncertainDecipherment())
else:
comment_match = re.match(COMMENT_GROUP, footnote.content)
if comment_match is not None:
# uncertainty of a plain comment is re-evaluated with _is_uncertain; if uncertain,
# the last character of the comment is dropped
is_uncertain = _is_uncertain(footnote)
comment = footnote.content[comment_match.end():-1].strip()\
if is_uncertain\
else footnote.content[comment_match.end():].strip()
word.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain))
else:
warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>')
# no candidate found: retry with each space-separated part of a multi-word reference
elif re.match(r'.*\s.*', word_text):
for word_part in word_text.split(' '):
_process_word_match(words, footnote, line_match, word_part, line_number, parent_word_composition=word_text)
# still nothing: search the word parts of words that have no line number (-1)
elif len([word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]) > 0:
new_words = []
for word in [word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]:
new_words += word.word_parts
_process_word_match(new_words, footnote, line_match, word_text, line_number)
else:
warnings.warn(f'No word found with text "{word_text}" on line {line_number}: <{footnote}>')
def usage():
    """Print the command line help of this script, i.e. the docstring of main."""
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to process the footnotes of a page.
    svgscripts/process_footnotes.py [OPTIONS]
    a xml file about a manuscript, containing information about its pages.
    a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help show help
    -s|--skip-until=left skip all nodes.get('X') < left
    :return: exit code (int)
    """
    skip_after = -1.0
    try:
        opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until=" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        if opt in ('-s', '--skip-until'):
            skip_after = float(arg)
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # the result is not used afterwards, but the check itself is kept as in the original
    manuscript_file = file_a\
            if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
            else None
    counter = 0
    # only pages whose status contains STATUS_MERGED_OK are processed
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
        if not UNITTESTING:
            print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
            back_up(page, page.xml_file)
        categorize_footnotes(page, skip_after=skip_after, find_content=True)
        save_imprints(page)
        counter += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages processed]')
    return exit_status
# Run as a script: pass the command line arguments (without the program name) to main
# and use its return value as the exit code of the process.
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py (revision 112)
+++ svgscripts/extractWordPosition.py (revision 113)
@@ -1,711 +1,716 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import inspect
import getopt
from lxml import etree as ET
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from progress.bar import Bar
import re
import sys
import warnings
from datatypes.lineNumber import LineNumber
from datatypes.matrix import Matrix
from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.pdf import PDFText
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from datatypes.word_insertion_mark import WordInsertionMark
from util import process_warnings4status, reset_tp_with_matrix
sys.path.append('shared_util')
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Extractor:
"""
This class can be used to extract the word positions in a svg file and write it to a xml file.
Args:
[xml_dir (str): target directory]
[title (str): title of document]
[manuscript_file (str): xml file containing information about the archival unity to which the current page belongs
"""
UNITTESTING = False
SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]
SET_POSITIONS_TO_TEXTFIELD_0_0 = False
def __init__(self, xml_dir=None, title=None, manuscript_file=None, compare2pdf=False):
if bool(xml_dir):
self.xml_dir = xml_dir
not isdir(self.xml_dir) and mkdir(self.xml_dir)
else:
self.xml_dir = 'xml' if(isdir('xml')) else ''
self.latest_status = None
self.compare2pdf = compare2pdf
self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
self.title = title
self.manuscript_file = manuscript_file
self.manuscript_tree = None
self.svg_tree = None
if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
self.manuscript_tree = ET.parse(self.manuscript_file)
self.title = self.manuscript_tree.getroot().get('title')
elif bool(self.manuscript_file):
raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
elif bool(self.title):
self.update_title_and_manuscript(self.title, False)
def _get_pwps_break_points(self, page, pwps) ->list:
    """Return the break points at which pwps has to be split into several words.

    A break point is a tuple (end index of the preceding part, start index of the
    following part).  Two kinds are produced:

    - (i, i+1) for each pwp at index i whose text is one of SONDERZEICHEN_LIST and
      whose style class contains an entry of page.sonderzeichen_list; a
      WordInsertionMark is appended to page.word_insertion_marks for each of them;
    - (i, i) wherever the horizontal gap to the preceding pwp exceeds a threshold,
      provided the joined text contains a digit directly followed by a letter (a
      line number glued to a word).
    """
    break_points = []
    if len(page.sonderzeichen_list) > 0:
        for mark_type in self.SONDERZEICHEN_LIST:
            for position, pwp in enumerate(pwps):
                if pwp.text != mark_type\
                        or not any(sz in pwp.style_class for sz in page.sonderzeichen_list):
                    continue
                break_points.append((position, position+1))
                mark_top = pwp.top-pwp.height
                mark = WordInsertionMark(id=len(page.word_insertion_marks), x=pwp.left, y=mark_top,\
                        height=pwp.height, width=pwp.width,\
                        line_number=page.get_line_number(mark_top-1), mark_type=mark_type)
                page.word_insertion_marks.append(mark)
    if re.search(r'\d[A-Za-z]', self.get_word_from_pwps(pwps)):
        THRESHOLDX = 20 # Threshold between line number and text
        previous_x = -1
        for position, pwp in enumerate(pwps):
            current_x = float(pwp.left)
            if previous_x > -1 and current_x - previous_x > THRESHOLDX:
                break_points.append((position, position))
            previous_x = current_x
    return break_points
def _get_break_points(self, page, word_part_objs, transkription_field=None) ->list:
    """Return the break points at which word_part_objs has to be split into several words.

    A break point is a tuple (end index of the preceding part, start index of the
    following part).  Two kinds are produced:

    - (i, i+1) for each part at index i whose text is one of SONDERZEICHEN_LIST and
      whose class contains an entry of page.sonderzeichen_list.  If page.svg_file
      exists (and, with SET_POSITIONS_TO_TEXTFIELD_0_0, a transkription_field is
      given), a word insertion mark is created for it and appended to
      page.word_insertion_marks;
    - (i, i) wherever the horizontal gap to the preceding part exceeds a threshold,
      provided the joined text contains a digit directly followed by a letter (a
      line number glued to a word).

    Insertion marks are created only for the positions of the Sonderzeichen currently
    looked at.  The original iterated over all break points collected so far and thus
    created duplicate marks, with a wrong mark_type, for earlier Sonderzeichen.
    """
    break_points = []
    if len(page.sonderzeichen_list) > 0: # check for Sonderzeichen and special chars -> mark for word insertion, create break points
        for Sonderzeichen in self.SONDERZEICHEN_LIST:
            sz_points = [ i for i, part in enumerate(word_part_objs)\
                    if part['text'] == Sonderzeichen\
                    and any(sz in part['class'] for sz in page.sonderzeichen_list) ]
            break_points += [ (sz_point, sz_point + 1) for sz_point in sz_points ]
            for sz_point in sz_points:
                wim_index = len(page.word_insertion_marks)
                x = float(word_part_objs[sz_point]['x'])
                y = float(word_part_objs[sz_point]['y'])
                if page.svg_file is not None and isfile(page.svg_file)\
                        and (not self.SET_POSITIONS_TO_TEXTFIELD_0_0 or transkription_field is not None):
                    svg_path_tree = ET.parse(page.svg_file)
                    # the default namespace has the key None; map it to the prefix 'ns'
                    namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
                    xmin = 0 if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field.xmin
                    ymin = 0 if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field.ymin
                    wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
                            line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
                    page.word_insertion_marks.append(wim)
    if re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)): # case: digits from line number and chars from words -> create break points
        THRESHOLDX = 20 # Threshold between line number and text
        last_x = -1
        for i, part in enumerate(word_part_objs):
            x = float(part['x'])
            if last_x > -1 and x - last_x > THRESHOLDX:
                break_points.append((i, i))
            last_x = x
    return break_points
def _process_pwps_break_points(self, break_points, page, index, pwps) ->int:
"""Split pwps at break_points, create a word for each part and return the new index.

Each break point is a tuple (end index of the current part, start index of the next
part).  The parts are turned into words with create_word_from_pwps, which returns
the running word index.  If the last word of the page then starts with a character
that is neither a word character nor whitespace, it is removed from page.words,
joined to the word before it, and its id is returned instead of the running index.
"""
from_index = 0
debug_msg = 'process break points'
for end_point, next_from_index in break_points:
new_pwps = pwps[from_index:end_point]
from_index = next_from_index
index = self.create_word_from_pwps(page, index, new_pwps, debug_msg=debug_msg)
# whatever follows the last break point forms the final part
if from_index > 0 and from_index < len(pwps):
new_pwps = pwps[from_index:]
index = self.create_word_from_pwps(page, index, new_pwps, debug_msg=debug_msg + ' ... end point')
# NOTE(review): with the indentation lost it is not visible whether this check is
# nested in the previous 'if' -- confirm against the original file.
if len(page.words) > 1\
and re.match(r'[^\w\s]', page.words[-1].text):
last_word = page.words.pop()
page.words[-1].join(last_word)
# the id of the removed word is handed back, presumably so that it gets reused
return last_word.id
return index
def _process_break_points(self, break_points, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None) ->int:
"""Process break points on word_part_objs and return new index.
"""
from_index = 0
for end_point, next_from_index in break_points:
new_word_part_objs = word_part_objs[from_index:end_point]
new_endX = word_part_objs[end_point]['x']
from_index = next_from_index
index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
if from_index > 0 and from_index < len(word_part_objs):
new_word_part_objs = word_part_objs[from_index:]
index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
return index
def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None) ->int:
"""Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).
If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.

The line number of the new word is looked up on page with the vertical middle of the
first transkription position; if that fails (-1), the y of its transform is tried,
and after that the line number is derived from the last word of the page.
The new word is appended to page.words.

:returns: the new word counter (int): index + 1 if a word has been created, index if word_part_objs is empty,
or the counter returned by _process_break_points if the word has been split.
"""
break_points = self._get_break_points(page, word_part_objs, transkription_field=transkription_field)
if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
return self._process_break_points(break_points, page, index, word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
else:
if len(word_part_objs) > 0:
# the transkription field is only passed on if positions are set relative to the text field
provide_tf = None if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
debug_msg_string=debug_msg, transkription_field=provide_tf, svg_path_tree=self.svg_tree)
text = self.get_word_from_part_obj(word_part_objs)
line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
# NOTE(review): with the indentation lost, the exact nesting of the following
# fallbacks is not visible -- confirm against the original file.
if line_number == -1:
if transkription_positions[0].transform is not None:
line_number = page.get_line_number(transkription_positions[0].transform.getY())
if line_number == -1 and len(page.words) > 0:
lastWord = page.words[-1]
lastWord_lastTP = lastWord.transkription_positions[-1]
lastTP = transkription_positions[-1]
# right of the last word and at about the same height: same line, else the next line
if transkription_positions[0].left > lastWord_lastTP.left\
and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2:
line_number = lastWord.line_number
else:
line_number = lastWord.line_number+1
#reset_tp_with_matrix(transkription_positions)
newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
page.words.append(newWord)
return int(index) + 1
else:
return int(index)
def create_word_from_pwps(self, page, index, pwps, debug_msg=None) ->int:
"""Creates transkription_positions and a new word from pwps (i.e. a list of PositionalWordPart).
If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, pwps will be split and several words are created.

The line number of the new word is looked up on page with the vertical middle of the
first transkription position; if that fails (-1), the y of its transform is tried,
and after that the line number is derived from the last word of the page.
The new word is appended to page.words.

:returns: the new word counter (int): index + 1 if a word has been created, index if pwps is empty,
or the counter returned by _process_pwps_break_points if the word has been split.
"""
break_points = self._get_pwps_break_points(page, pwps)
if(len(break_points) > 0): # if there are break points -> split pwps and add the corresponding words
return self._process_pwps_break_points(break_points, page, index, pwps)
else:
if len(pwps) > 0:
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps, debug_msg_string=debug_msg)
text = self.get_word_from_pwps(pwps)
line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
# NOTE(review): with the indentation lost, the exact nesting of the following
# fallbacks is not visible -- confirm against the original file.
if line_number == -1:
if transkription_positions[0].transform is not None:
line_number = page.get_line_number(transkription_positions[0].transform.getY())
if line_number == -1 and len(page.words) > 0:
lastWord = page.words[-1]
lastWord_lastTP = lastWord.transkription_positions[-1]
lastTP = transkription_positions[-1]
# right of the last word and at about the same height: same line, else the next line
if transkription_positions[0].left > lastWord_lastTP.left\
and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2:
line_number = lastWord.line_number
else:
line_number = lastWord.line_number+1
#reset_tp_with_matrix(transkription_positions)
newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
page.words.append(newWord)
return int(index) + 1
else:
return int(index)
def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default', multipage_index=-1, marginals_page=None):
"""Extracts information about positions of text elements and writes them to a xml file.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
exit_status = 0
with warnings.catch_warnings(record=record_warnings) as w:
warnings.simplefilter(warning_filter)
page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile,\
multipage_index=multipage_index, marginals_page=marginals_page)
status_message = process_warnings4status(w, [ PageCreator.WARNING_MISSING_USE_NODE4PWP, PageCreator.WARNING_MISSING_GLYPH_ID4WIM ],\
'', 'OK', 'with warnings')
if status_message != 'OK':
self.latest_status = status_message
exit_status = 1
else:
self.latest_status = None
page.page_tree.getroot().set('status', status_message)
write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
return exit_status
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, multipage_index=-1, marginals_page=None) -> PageCreator:
    """Extract the positions of the text elements of file_name.

    The svg file is parsed (and kept in self.svg_tree), a PageCreator is set up for
    the target file, styles and line numbers are initialized, the word positions are
    extracted, and writing processes, words and word insertion marks are attached to
    the page tree.

    :returns: the page (PageCreator)
    :raises FileNotFoundError: if file_name is not an existing file
    """
    if not isfile(file_name):
        raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
    if not xml_target_file:
        xml_target_file = self.get_file_name(file_name, page_number)
    if self.xml_dir and not path.dirname(xml_target_file):
        xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
    transkription_field = TranskriptionField(file_name, multipage_index=multipage_index)
    text_field = transkription_field.convert_to_text_field()
    self.svg_tree = ET.parse(file_name)
    page = PageCreator(xml_target_file, title=self.title, multipage_index=multipage_index,\
            page_number=page_number, pdfFile=pdfFile, svg_file=svg_file,\
            svg_text_field=text_field, source=file_name, marginals_source=marginals_page)
    sonderzeichen_list, letterspacing_list, style_dict = self.get_style(self.svg_tree.getroot())
    page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
    line_numbers = LineNumber.extract_line_numbers(self.svg_tree, transkription_field,\
            set_to_text_field_zero=self.SET_POSITIONS_TO_TEXTFIELD_0_0)
    page.init_line_numbers(line_numbers, transkription_field.ymax)
    self.improved_extract_word_position(self.svg_tree, page, transkription_field=transkription_field)
    page.create_writing_processes_and_attach2tree()
    page.update_and_attach_words2tree()
    for insertion_mark in page.word_insertion_marks:
        # it is not clear if we really need to know this alternative word ordering. See 'TODO.md'
        #insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, insertion_mark)
        insertion_mark.attach_object_to_tree(page.page_tree)
    return page
def improved_extract_word_position(self, svg_tree, page, transkription_field=None):
"""Extracts word positions.

The text items of svg_tree are collected as positional word parts (pwps) using the
glyph information of page.svg_file; whenever a word boundary is detected, a word is
created from the collected pwps with create_word_from_pwps and the collection starts
anew.  If page.svg_file is missing, a warning is issued and extract_word_position is
used instead.
"""
if page.svg_file is None or not isfile(page.svg_file):
warnings.warn('There is no page.svg_file or it does not exist ... using old function "extract_word_position"!')
self.extract_word_position(svg_tree, page, transkription_field=transkription_field)
else:
svg_path_tree = ET.parse(page.svg_file)
# the default namespace has the key None; map it to the prefix 'ns'
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
counter = 0
# NOTE: word_part_obj, endSign and endX are assigned in this method but never read here.
word_part_obj = []
pwps = []
endSign = '%'
last_matrix = None
# thresholds of the line break check below: vertical distance to the previous item,
# backward horizontal jump, and gap after the right edge of the last collected pwp
MAXBOTTOMDIFF = 5
MAXXDIFF = 11
INTERCHARSPACE = 1.1
if not Extractor.UNITTESTING:
bar = Bar('(improved) extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
current_matrix = Matrix(text_item.get('transform'))
# check for line breaks
if last_matrix is not None and len(pwps) > 0 and (\
(current_matrix.getX() > pwps[-1].left+pwps[-1].width + INTERCHARSPACE or last_matrix.getX()-current_matrix.getX() > MAXXDIFF) or\
(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF)):
endSign = '%'
if(self.get_word_from_pwps(pwps) != ''):
debug_msg = 'check for line breaks, diffx: {}, diffy: {}, current_matrix: {}, last_matrix: {}'.format(\
round(current_matrix.getX() - (pwps[-1].left+pwps[-1].width), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
current_matrix.toString(), last_matrix.toString())
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg=debug_msg)
pwps = []
endX = current_matrix.getX()
if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT
if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))):
# for rotated items x and y are set to 0.0; the matrix itself is passed along
x = current_matrix.getX() if not current_matrix.isRotationMatrix() else 0.0
y = current_matrix.getY() if not current_matrix.isRotationMatrix() else 0.0
pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST({ "text": text_item.text, "x": x, "y": y, "class": text_item.get('class'), "matrix": current_matrix},\
svg_path_tree, namespaces, page=page)
else:
# an empty or whitespace-only text closes the word collected so far
if(self.get_word_from_pwps(pwps) != ''):
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="text: next string empty")
pwps = []
for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT
endX = current_matrix.add2X(tspan_item.get('x'))
if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))):
y = current_matrix.add2Y(tspan_item.get('y'))
pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST({ "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'),\
"matrix": current_matrix }, svg_path_tree, namespaces, page=page)
# NOTE(review): tspan_item.get('class') would be None for a tspan without a class
# attribute and .split would then fail -- confirm that every tspan has a class.
if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0:
"""text_item has letterspacing class
(set s & set t = new set with elements common to s and t)
"""
if(self.get_word_from_pwps(pwps) != ''):
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="letterspacing class")
pwps = []
else:
# an empty or whitespace-only tspan closes the word collected so far
if(self.get_word_from_pwps(pwps) != ''):
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="tspan: next string empty")
pwps = []
last_matrix = current_matrix
not bool(Extractor.UNITTESTING) and bar.next()
# whatever is still collected after the last text item forms the last word
if(self.get_word_from_pwps(pwps) != ''):
counter = self.create_word_from_pwps(page, counter, pwps, debug_msg='end of loop')
pwps = []
not bool(Extractor.UNITTESTING) and bar.finish()
def extract_word_position(self, svg_tree, page, transkription_field=None):
"""Extract the words of a svg tree together with their positions.

Iterates over the text items returned by self.get_text_items, collects the text of
each item (or of its tspan children) as dictionaries in word_part_obj and hands the
collected parts to self.add_word whenever a word boundary is detected: a line break,
a text that is empty or consists of whitespace only, a tspan with a letterspacing
class, or the end of the input.

:param svg_tree: tree of the svg file; its root is searched for text items
:param page: page object that is passed on to self.add_word; its letterspacing_list is read
:param transkription_field: optional field used to filter the text items and,
if self.SET_POSITIONS_TO_TEXTFIELD_0_0 is set, passed on to Matrix
"""
counter = 0
word_part_obj = []
endSign = '%'
last_matrix = None
# thresholds used by the line break check below
MAXBOTTOMDIFF = 5
MAXXDIFF = 6
# the progress bar is only created (and later advanced/finished) outside of unit tests
if not Extractor.UNITTESTING:
bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
provide_tf = None if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field
current_matrix = Matrix(text_item.get('transform'), transkription_field=provide_tf)
# check for line breaks
if (last_matrix is not None and len(word_part_obj) > 0 and (\
Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
endSign = '%'
if(self.get_word_from_part_obj(word_part_obj) != ''):
debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
word_part_obj = []
endX = current_matrix.getX()
if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: the text element has no tspan children, its text is read directly
if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))):
word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} )
else:
# empty or whitespace-only text ends the word collected so far
endSign = text_item.text
if(self.get_word_from_part_obj(word_part_obj) != ''):
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field)
word_part_obj = []
endSign = '%'
for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: the text is contained in tspan children of the text element
endX = current_matrix.add2X(tspan_item.get('x'))
if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))):
y = current_matrix.add2Y(tspan_item.get('y'))
word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix })
if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0:
"""text_item has letterspacing class
(set s & set t = new set with elements common to s and t)
"""
endSign = '%'
if(self.get_word_from_part_obj(word_part_obj) != ''):
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
debug_msg='tspan with letterspacing', transkription_field=transkription_field)
word_part_obj = []
else:
# empty or whitespace-only tspan ends the word collected so far
endSign = tspan_item.text
if(self.get_word_from_part_obj(word_part_obj) != ''):
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
debug_msg='svg/text/tspan/\s', transkription_field=transkription_field)
word_part_obj = []
endSign = '%'
last_matrix = current_matrix
not bool(Extractor.UNITTESTING) and bar.next()
# flush the parts that are still pending after the last text item
if(self.get_word_from_part_obj(word_part_obj) != ''):
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\
transkription_field=transkription_field)
word_part_obj = []
endSign = '%'
not bool(Extractor.UNITTESTING) and bar.finish()
def find_inserted_words_by_position(self, target_tree, x, y):
"""Returns an Array with the words that are inserted above the x, y position or [] if not found.

NOTE: the function emits a warning on every call stating that it does not work.

:param target_tree: xml tree whose root is searched for word nodes
:param x: x coordinate of the position
:param y: y coordinate of the position
:return: list of words created with Word.CREATE_WORD, or [] if the tree contains no word with an id
"""
warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.')
MINY = 31.0
MAXY = 10.0
DIFFX = 9.0
if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
result_list = []
minus2left = 20.0
minus2top = 19.0
# widen the search window upwards and narrow it on the left, step by step, until a word is found or a limit is reached
while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX :
result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ]
minus2left -= 1
minus2top += 1
if len(result_list) > 0:
# collect further words with the same bottom value that follow the last word found
result_bottom = result_list[len(result_list)-1].bottom
result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)):
result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
result_left_max = result_left_min + DIFFX
if float(item.get('left')) - result_left_max < DIFFX:
result_list.append(Word.CREATE_WORD(item))
else:
break
return result_list
else:
return []
def find_inserted_words(self, target_tree, word_insertion_mark):
"""Returns an Array with the words that are inserted above/underneath the word_insertion_mark.

NOTE: the function emits a warning on every call stating that it does not work.

:param target_tree: xml tree whose root is searched for word nodes
:param word_insertion_mark: mark whose line_number, x, y and mark_type are read
:return: list of words, or [] if the tree contains no word with an id
"""
warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.')
# for line numbers below 2 and for odd line numbers the search is done by position only
if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1:
return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y)
if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
MINY = 31.0
MAXY = 10.0
DIFFX = 9.0
result_list = []
x = word_insertion_mark.x
y = word_insertion_mark.y
if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line
line_number = word_insertion_mark.line_number - 1
words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@line-number={0}]'.format(line_number)) ]
if len(words_on_line) > 0:
minus2top = 1.0
# lower the top threshold step by step until a word is found or MINY is reached
while len(result_list) == 0 and minus2top < MINY:
for word in words_on_line:
for transkription_position in word.transkription_positions:
if transkription_position.top > y - minus2top\
and transkription_position.left > x - DIFFX\
and transkription_position.left < x + DIFFX:
result_list.append(word)
break
minus2top += 1
elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line
line_number = word_insertion_mark.line_number + 1
words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
'//word[@line-number={0}]'.format(line_number)) ]
if len(words_on_line) > 0:
plus2top = 1.0
# raise the top threshold step by step until a word is found or MINY is reached
while len(result_list) == 0 and plus2top < MINY :
for word in words_on_line:
for transkription_position in word.transkription_positions:
if transkription_position.top > y + plus2top\
and transkription_position.left > x - DIFFX\
and transkription_position.left < x + DIFFX:
result_list.append(word)
break
plus2top += 1
if len(result_list) > 0: # now, collect more words that are right of already collected words
result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom
result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
+ result_list[len(result_list)-1].transkription_positions[0].width
for item in target_tree.getroot().xpath(\
'//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)):
result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
+ result_list[len(result_list)-1].transkription_positions[0].width
result_left_max = result_left_min + DIFFX
if float(item.get('left')) - result_left_max < DIFFX:
result_list.append(Word.CREATE_WORD(item))
else:
break
return result_list
else:
return []
def get_file_name(self, file_name, page_number=None):
    """Build the file name of the target xml file that belongs to a svg file.

    The result is prefixed with the directory name of self.xml_dir (if set).
    If self.title is set, the name is composed of the title and the page
    number, otherwise the base name of file_name is used with '.svg'
    replaced by '.xml'.

    :param file_name: name of the source svg file
    :param page_number: optional page number, passed on to self.get_page_number
    :return: file name of the target xml file (str)
    """
    target_prefix = ''
    if bool(self.xml_dir):
        target_prefix = path.dirname(self.xml_dir) + sep
    if not bool(self.title):
        xml_name = path.basename(file_name).replace('.svg', '.xml')
        return '{}{}'.format(target_prefix, xml_name)
    title_part = self.title.replace(' ', '_')
    page_part = self.get_page_number(file_name, page_number=page_number)
    return target_prefix + title_part + '_page' + page_part + '.xml'
def get_page_number(self, file_name, page_number=None):
    """Return the page number as a string, left-padded with zeros to three characters.

    If no page number is passed and file_name contains digits, the last
    group of consecutive digits of file_name is used as the page number.

    :param file_name: file name that may contain the page number
    :param page_number: optional page number (str or int)
    :return: the padded page number, or '' if none could be determined
    """
    if not bool(page_number) and bool(re.search(r'\d', file_name)):
        # the last run of digits in the file name is taken as the page number
        page_number = re.findall(r'\d+', file_name)[-1]
    if not bool(page_number):
        return ''
    # convert first: an int page number has no len() and used to raise a TypeError
    return str(page_number).rjust(3, '0')
def get_style(self, etree_root):
    """Return the style specification of a svg tree.

    The text of the style element is split into rules at "\\n\\t"; each rule
    has the form ``.class{property:value;...}``.

    :param etree_root: root node that provides ``find`` and ``nsmap``
    :returns:
        sonderzeichen_list: list of keys for classes that are 'Sonderzeichen'
        letterspacing_list: list of keys for classes that specify a 'letter-spacing'
        style_dict: dictionary: key = class name (str), value = style specification (dictionary)
    """
    style_dict = {}
    sonderzeichen_list = []
    letterspacing_list = []
    style = etree_root.find('style', etree_root.nsmap)
    # a missing style element or a style element without text yields empty results
    if style is None or not style.text:
        return sonderzeichen_list, letterspacing_list, style_dict
    for style_item in style.text.split("\n\t"):
        if '{' not in style_item:
            # empty chunks and chunks without a declaration block are no style rules
            continue
        rule_parts = style_item.split('{')
        style_key = rule_parts[0].replace('.', '')
        declarations = rule_parts[1].replace('}', '').replace('\n', '').split(';')
        style_value_dict = {}
        for declaration in declarations:
            # split at the first colon only, so that values containing ':' stay complete
            property_name, colon, property_value = declaration.partition(':')
            if colon:
                style_value_dict[property_name] = property_value.replace('\'', '')
        style_dict[style_key] = style_value_dict
        if 'Sonderzeichen' in (style_value_dict.get('font-family') or ''):
            sonderzeichen_list.append(style_key)
        if bool(style_value_dict.get('letter-spacing')):
            letterspacing_list.append(style_key)
    return sonderzeichen_list, letterspacing_list, style_dict
def get_text_items(self, tree_root, transkription_field=None):
    """Return an iterator over the text elements of tree_root.

    Without a transkription_field all text elements found below tree_root
    are returned; otherwise only those for which
    Matrix.IS_PART_OF_TRANSKRIPTION_FIELD is true.
    """
    text_nodes = tree_root.iterfind(".//text", tree_root.nsmap)
    if transkription_field is None:
        return text_nodes

    def is_inside_field(text_node):
        return Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=text_node)

    return filter(is_inside_field, text_nodes)
def get_word_from_pwps(self, pwps):
    """Concatenate the 'text' attributes of a list of positional word parts to one string.
    """
    texts = []
    for positional_word_part in pwps:
        texts.append(positional_word_part.text)
    return ''.join(texts)
def get_word_from_part_obj(self, word_part_obj):
    """Extract all 'text' values from a list of dictionaries and concatenate them to a string.

    :param word_part_obj: list of dictionaries that each contain the key 'text'
    :return: the concatenated text (str), '' for an empty list
    """
    # the loop variable is deliberately not called "dict": that name would shadow the builtin
    return ''.join(part['text'] for part in word_part_obj)
def get_word_object_multi_char_x(self, word_part_obj_dict):
    """Estimate the x position of the last char of a word part.

    For texts with fewer than two chars this is the x of the word part
    itself, otherwise the x is shifted by a fixed width per char.
    TODO: get real widths from svg_file!!!
    """
    WIDTHFACTOR = 2.6
    char_count = len(word_part_obj_dict['text'])
    start_x = word_part_obj_dict['x']
    if char_count < 2:
        return start_x
    return start_x + char_count * WIDTHFACTOR
def update_title_and_manuscript(self, title, update_manuscript=True):
"""Updates title and manuscript.

:param title: new title; stored as self.title
:param update_manuscript: if true (or if no manuscript file is set so far), self.manuscript_file
is derived from self.xml_dir and the title (blanks replaced by underscores)
"""
self.title = title
if update_manuscript or not bool(self.manuscript_file):
self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml'
# a manuscript file that does not exist yet is created with an empty manuscript element
if not isfile(self.manuscript_file):
self.manuscript_tree = ET.ElementTree(ET.Element('manuscript', attrib={"title": self.title}))
write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile')
def usage():
    """Print the usage information of the script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
"""This program can be used to extract the position of the words in a svg file and write them to a xml file.
svgscripts/extractWordPosition.py [OPTIONS]
svg file OR xml target file containing file name of svg file as "/page/@source".
directory containing svg files
OPTIONS:
-h|--help: show help
-c|--compare-to-pdf compare words to pdf and autocorrect
-d|--xml-dir=xmlDir: target directory for the xml output file(s)
-m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s)
-p|--page=pageNumber: page number of the current page. For use with _one_ file only.
-P|--PDF=pdfFile: pdf file - used for word correction
-s|--svg=svgFile: svg web file
-t|--title=title: title of the manuscript to which the current page(s) belong(s)
-x|--xml-target-file=xmlOutputFile: xml target file
:return: exit code (int)
"""
# NOTE: the docstring above is printed by usage() and is therefore part of the program output.
# NOTE(review): compare2pdf already defaults to True, so the option -c does not change anything.
compare2pdf = True
manuscript_file = None
# -1 is used if the xml target file has no "multipage-index" attribute
+ multipage_index = -1
page_number = None
pdfFile = None
svg_file = None
title = None
xml_target_file = None
xml_dir = ".{}xml".format(sep)
try:
opts, args = getopt.getopt(argv, "hcd:m:t:p:s:x:P:", ["help", "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
# NOTE(review): "or not args" shows the help only if at least one option was passed;
# a call without options and without arguments is handled further below.
if opt in ('-h', '--help') or not args:
usage()
return 0
elif opt in ('-c', '--compare-to-pdf'):
compare2pdf = True
elif opt in ('-d', '--xml-dir'):
xml_dir = arg
elif opt in ('-m', '--manuscript-file'):
manuscript_file = arg
elif opt in ('-t', '--title'):
title = arg
elif opt in ('-p', '--page'):
page_number = str(arg)
elif opt in ('-s', '--svg'):
svg_file = arg
elif opt in ('-P', '--PDF'):
pdfFile = arg
elif opt in ('-x', '--xml-target-file'):
xml_target_file = str(arg)
files_to_process = list()
for arg in args:
if isfile(arg):
files_to_process.append(arg)
elif isdir(arg):
# NOTE(review): listdir returns the bare file names without the directory prefix - confirm that this is intended.
files_to_process = files_to_process + list(filter(lambda file: '.svg' in file, listdir(arg)))
else:
print("'{}' does not exist!".format(arg))
return 2
# no input file, or the first argument is a xml file: read source, title, page number and svg file from the xml target file
if len(files_to_process) < 1 or args[0].endswith('xml'):
if xml_target_file is None:
xml_target_file = args[0] if len(args) > 0 else None
if xml_target_file is not None and isfile(xml_target_file):
target_file_tree = ET.parse(xml_target_file)
file_name = target_file_tree.getroot().get('source')
+ multipage_index = int(target_file_tree.getroot().get('multipage-index'))\
+ if bool(target_file_tree.getroot().get('multipage-index'))\
+ else multipage_index
# NOTE(review): looks like a leftover debug output - confirm
+ print(multipage_index)
title = target_file_tree.getroot().get('title') if title is None else title
page_number = target_file_tree.getroot().get('number') if page_number is None else page_number
if svg_file is None:
if len(target_file_tree.xpath('//svg-image')) > 0:
svg_file = target_file_tree.xpath('.//svg-image/@file-name')[0]\
if len(target_file_tree.xpath('.//svg-image/@file-name')) > 0 else None
else:
svg_file = target_file_tree.xpath('.//svg/@file')[0]\
if len(target_file_tree.xpath('.//svg/@file')) > 0 else None
files_to_process.insert(0, file_name)
if xml_target_file in files_to_process:
files_to_process.remove(xml_target_file)
else:
usage()
return 2
# options that refer to a single page cannot be combined with several input files
if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)):
print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!")
usage()
return 2
extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, compare2pdf=compare2pdf)
for file in files_to_process:
- extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file)
+ extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file, multipage_index=multipage_index)
return 0
# run main with the command line arguments and use its return value as exit code
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: svgscripts/convertPDF2SVG4Web.py
===================================================================
--- svgscripts/convertPDF2SVG4Web.py (revision 112)
+++ svgscripts/convertPDF2SVG4Web.py (revision 113)
@@ -1,215 +1,216 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert pdf to svg files with the external program 'pdf2svg'.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
import getopt
import re
import subprocess
import sys
import PyPDF2
from os import system, sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
# Module metadata (author, license and version information).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Converter:
    """Convert pdf files to svg files with the external program 'pdf2svg'.

    Args:
        target_dir (str, optional): target directory; it is created if it does not exist.
            Without target_dir, 'svg' is used if that directory exists, else ''.
        title (str, optional): title as first part of the target file names
            (blanks are replaced by underscores).
        add_to_page_number (int, optional): correction that is added to the page
            number derived from the name of the source file.

    Raises:
        FileNotFoundError: if 'which pdf2svg' does not return an existing file.
        subprocess.CalledProcessError: if 'which pdf2svg' fails.
    """

    def __init__(self, target_dir=None, title=None, add_to_page_number=0):
        if bool(target_dir):
            self.target_dir = target_dir
            if not isdir(self.target_dir):
                mkdir(self.target_dir)
        else:
            self.target_dir = 'svg' if isdir('svg') else ''
        self.title = title.replace(' ', '_') if bool(title) else None
        self.page_number = None
        self.add_to_page_number = add_to_page_number
        # svg files written by the latest call of pdf2svg
        self.latest_converted_files = []
        try:
            cp = subprocess.run(["which", "pdf2svg"], stdout=subprocess.PIPE, check=True)
            self.path_to_pdf2svg = cp.stdout.decode().strip()
            if not bool(self.path_to_pdf2svg) or not isfile(self.path_to_pdf2svg):
                raise FileNotFoundError("External command 'pdf2svg' not found!\nPlease install 'pdf2svg', check the output of 'which pdf2svg' and retry.")
        except subprocess.CalledProcessError:
            print("External command 'pdf2svg' not found!\nPlease install 'pdf2svg', check the output of 'which pdf2svg' and retry.")
            raise

    def get_page_number(self, file_name, page_number=None):
        """Return the page number as a string, left-padded with zeros to three characters.

        If no page number is passed and file_name contains digits, the last
        group of consecutive digits of file_name is used and
        self.add_to_page_number (if > 0) is added to it.

        :param file_name: file name that may contain the page number
        :param page_number: optional page number (str or int)
        :return: the padded page number, or '' if none could be determined
        """
        if not bool(page_number) and bool(re.search(r'\d', file_name)):
            # the last run of digits in the file name is taken as the page number
            page_number = re.findall(r'\d+', file_name)[-1]
            if self.add_to_page_number > 0:
                page_number = str(self.add_to_page_number + int(page_number))
        if not bool(page_number):
            return ''
        # convert first: an int page number has no len() and used to raise a TypeError
        return str(page_number).rjust(3, '0')

    def get_file_name(self, file_name, is_part_of_multi_page_doc=False, page_number=None, pdf_name_dictionary=None):
        """Return the file name of the target svg file.

        For multi page documents the result contains the placeholder '%03d'
        for the page number.

        :param file_name: name of the source pdf file
        :param is_part_of_multi_page_doc: whether the source pdf has several pages
        :param page_number: optional page number, passed on to get_page_number
        :param pdf_name_dictionary: not used
        :return: file name of the target svg file (str)
        """
        dir_name = self.target_dir + sep if bool(self.target_dir) else ''
        if is_part_of_multi_page_doc:
            if bool(self.title):
                return dir_name + self.title.replace(' ', '_') + '_page%03d_web.svg'
            return dir_name + path.basename(file_name).replace('.pdf', '_page%03d_web.svg')
        if bool(self.title):
            return dir_name + self.title + '_page' + self.get_page_number(file_name, page_number=page_number) + '_web.svg'
        return '{}{}'.format(dir_name, path.basename(file_name).replace('.pdf', '_web.svg'))

    def pdf2svg(self, file_name, page_number=None, svg_file_name=None, name_dictionary=None):
        """Convert a pdf file to svg file(s) using the external program 'pdf2svg'.

        The names of the written files are stored in self.latest_converted_files.

        :param file_name: name of the source pdf file
        :param page_number: page number of the target file (single page pdf only)
        :param svg_file_name: name of the target file (single page pdf only)
        :param name_dictionary: for a multi page pdf: page index -> target file name;
            if empty, all pages are converted
        :returns: return_code (int) of subprocess executing pdf2svg
        :raises FileNotFoundError: if file_name is not an existing file
        :raises subprocess.CalledProcessError: if pdf2svg exits with an error
        """
        if not isfile(file_name):
            raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
        self.latest_converted_files = []
        return_code = 0
        # the context manager closes the pdf even if PyPDF2 or pdf2svg raises
        with open(file_name, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False)
            if pdfReader.numPages == 1:
                if svg_file_name is None:
                    svg_file_name = self.get_file_name(file_name, page_number=page_number)
                cp = subprocess.run([self.path_to_pdf2svg, file_name, svg_file_name], check=True)
                return_code = cp.returncode
                print(svg_file_name)
                if return_code == 0:
                    self.latest_converted_files.append(svg_file_name)
            else:
                if name_dictionary is None:
                    name_dictionary = {}
                dir_name = self.target_dir + sep if bool(self.target_dir) else ''
                # normalize the target names: prepend the directory, ensure the suffix '.svg'
                # and replace the placeholder 'TITLE' if a title is set
                if bool(self.title):
                    name_dictionary = { index: dir_name + svg_file_name.replace('TITLE', self.title).replace('.svg', '') + '.svg' for index, svg_file_name in name_dictionary.items() }
                else:
                    name_dictionary = { index: dir_name + svg_file_name.replace('.svg', '') + '.svg' for index, svg_file_name in name_dictionary.items() }
                if len(name_dictionary) == 0:
                    # "all" makes pdf2svg convert every page; the file name contains '%03d'
                    name_dictionary = { "all": self.get_file_name(file_name, True) }
                for index, svg_file_name in name_dictionary.items():
                    cp = subprocess.run([self.path_to_pdf2svg, file_name, svg_file_name, str(index)], check=True)
                    return_code = cp.returncode
                    if return_code == 0:
                        if str(index) == "all":
                            for i in range(1, pdfReader.numPages+1):
                                self.latest_converted_files.append(svg_file_name % i)
                        else:
                            self.latest_converted_files.append(svg_file_name)
        return return_code
def usage():
    """Print the usage information of the script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program converts pdf to svg files with the help of the external program 'pdf2svg'.
    svgscripts/convertPDF2SVG4Web.py [-h|--help, -a|--add-to-page-number=value, -d|--dir=targetDir -t|--title=title] ...
    svgscripts/convertPDF2SVG4Web.py [-h|--help, -a|--add-to-page-number=value, -t|--title=title, -p|--page=pageNumber,
    -d|--dir=targetDir -n|--name-dict='{"pageNumber": "file_name", ...}']
    OPTIONS:
    -h|--help: show help
    -a|--add-to-page-number=value: value to add to the page number specification of the pdf file that will be used as the
    file name of the target svg file, e.g. -a 2 TITLE_page001.pdf -> TITLE_page003.svg
    -d|--dir=targetDir: target directory for the svg file(s)
    -t|--title=title: title that will be used as part of the target svg file(s)' filename
    -p|--page=pageNumber: page number of the target svg file. For use with _one_ file only.
    -n|--name-dict='{"pageNumber": "file_name", ...}': For a multipage pdf, --name-dict can be used to pass a dictionary with
    page numbers (str) as keys and file names (str) as values.
    The script will extract only those pages for whiche there are keys.
    E.g. -n '{"3":"TITLE_page001","5":"TITLE_page003"}' TITLE_multipage.pdf -> TITLE_page001.svg
    TITLE_page003.svg
    :return: exit code (int)
    """
    # NOTE: the docstring above is printed by usage() and is therefore part of the program output.
    import ast
    target_dir = ".{}svg".format(sep)
    title = None
    page_number = None
    add_to_page_number = 0
    name_dictionary = {}
    try:
        opts, args = getopt.getopt(argv, "ha:d:t:p:n:", ["help", "add-to-page-number=", "dir=", "title=", "page=", "name-dict="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-a', '--add-to-page-number'):
            add_to_page_number = int(arg)
        elif opt in ('-d', '--dir'):
            target_dir = arg
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-n', '--name-dict'):
            # SECURITY: this used to be eval(arg), which executes arbitrary code passed on the
            # command line. literal_eval accepts the documented dictionary literal only.
            name_dictionary = ast.literal_eval(arg)
    if not args:
        usage()
        return 2
    files_to_process = list()
    for arg in args:
        if isfile(arg):
            files_to_process.append(arg)
        elif isdir(arg):
            # listdir returns bare file names: join them with the directory,
            # otherwise the files are not found unless arg is the current directory
            files_to_process += [ path.join(arg, file) for file in listdir(arg) if '.pdf' in file ]
        else:
            print("'{}' does not exist!".format(arg))
            return 2
    converter = Converter(target_dir=target_dir, title=title, add_to_page_number=add_to_page_number)
    # options that refer to a single file cannot be combined with several input files
    if len(files_to_process) > 1 and (bool(page_number) or bool(name_dictionary)):
        print("ERROR: too many input files: option --page and --name-dict presuppose one input file!")
        usage()
        return 2
    for file in files_to_process:
        converter.pdf2svg(file, name_dictionary=name_dictionary, page_number=page_number)
    return 0


# run main with the command line arguments and use its return value as exit code
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/join_faksimileAndTranskription.py
===================================================================
--- svgscripts/join_faksimileAndTranskription.py (revision 112)
+++ svgscripts/join_faksimileAndTranskription.py (revision 113)
@@ -1,666 +1,667 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of a faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path
from progress.bar import Bar
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convert_wordPositions import create_pdf_with_highlighted_words, create_svg_with_highlighted_words
from create_task import CorrectWords
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.lineNumber import LineNumber
from datatypes.page import Page, STATUS_MERGED_OK
from datatypes.transkriptionField import TranskriptionField
from process_files import update_svgposfile_status
from process_words_post_merging import post_merging_processing_and_saving
from util import ExternalViewer, create_highlighted_svg_file, get_empty_node_ids, record_changes,\
record_changes_on_svg_file_to_page, record_changes_on_xml_file_to_page, get_mismatching_ids,\
replace_chars
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
# Module metadata (author, license and version information).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# While this flag is true, functions of this module skip their console output.
UNITTESTING = False
# string.punctuation contains characters that are special inside a character class
# ('\', ']', '^', '-'). Without escaping, '\]' was read as an escaped bracket and the
# backslash itself was silently missing from the class, hence re.escape.
# Matches any punctuation character.
PUNCTUATION_PATTERN = r"[{}]".format(re.escape(string.punctuation))
# Matches word characters followed by a double quote at the end of the string.
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
# Matches a string that consists of exactly one punctuation character or an en dash.
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(re.escape(string.punctuation))
# Matches a string that consists of exactly one word or punctuation character.
SINGLE_WORD_PATTERN = r"^[\w{}]$".format(re.escape(string.punctuation))
# Color and opacity used for highlighting.
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
def create_task_correct_words(target_dir, xml_source_file=None, source_svg_file=None, page=None, unmatched_word_ids=None, unmatched_node_ids=None):
"""Create a task CorrectWords or process corrected files.

If the task has not been created yet, it is created (and the page is locked if it is not locked already).
If the task has been finished, the corrections are recorded to the page and the faksimile is joined again.
Otherwise only a message about the existing task is printed.

:param target_dir: directory that is passed on to CorrectWords
:param xml_source_file: xml file of the page; if None, the source of the tree of page is used
:param source_svg_file: faksimile svg file; if None, page.faksimile_svgFile is used
:param page: page object; if None, it is created from xml_source_file
:param unmatched_word_ids: ids of words, used when changes are recorded from a svg file
:param unmatched_node_ids: ids of nodes that are passed on to CorrectWords
:raises Exception: if neither a file nor a page with a valid file is given
:return: exit status (int)
"""
exit_status = 0
# fall back to the files of the page for every file that has not been passed
if xml_source_file is None or source_svg_file is None:
if xml_source_file is None and page is not None and isfile(page.page_tree.docinfo.URL):
xml_source_file = page.page_tree.docinfo.URL if xml_source_file is None else xml_source_file
elif xml_source_file is None:
raise Exception('create_task_correct_words needs a xml_source_file or a page that has a valid tree source!')
if source_svg_file is None and page is not None and isfile(page.faksimile_svgFile):
source_svg_file = page.faksimile_svgFile if source_svg_file is None else source_svg_file
elif source_svg_file is None:
raise Exception('create_task_correct_words needs a source_svg_file or a page that has a faksimile_svgFile!')
if page is None:
page = Page(xml_source_file)
correct_words = CorrectWords(xml_source_file, source_svg_file, target_dir, page=page,\
unmatched_node_ids=unmatched_node_ids)
if not correct_words.has_been_created(page):
if not page.is_locked():
# the lock information is written to the xml file together with the command for processing the finished task
reference_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.(xml|svg)')
lock_dict = { 'reference_file': reference_file,\
'message': 'Run:$ python3 {0} -c {1} {2}'.format(__file__, target_dir, source_svg_file)}
write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file, script_name=__file__,\
file_type=FILE_TYPE_SVG_WORD_POSITION, **lock_dict)
correct_words.create()
if not UNITTESTING:
print('Created a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description))
elif correct_words.has_been_finished(page):
msg = 'Task "correct words" for page {} has been finished!'.format(str(page.number))
xml_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.xml', is_finished=True)
transkription_svg = correct_words.get_target_filepath(page, is_faksimile_svg=False, is_finished=True)
faksimile_svg = correct_words.get_target_filepath(page, is_finished=True)
# use the corrected faksimile if there is one, else the original svg file
faksimile_file = faksimile_svg if isfile(faksimile_svg) else source_svg_file
# a corrected xml file takes precedence over a corrected transkription svg
if isfile(xml_file):
msg += '\n Words loaded from file {}.'.format(xml_file)
page = record_changes_on_xml_file_to_page(xml_source_file, xml_file)
page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=xml_file)
elif isfile(transkription_svg):
msg += '\n Words loaded from file {}.'.format(transkription_svg)
page = record_changes_on_svg_file_to_page(xml_source_file, transkription_svg, word_ids=unmatched_word_ids)
page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=transkription_svg)
msg += '\n Faksimile loaded from file {}.'.format(faksimile_file)
if not UNITTESTING:
print(msg)
exit_status = old_join_faksimileAndTranskription(faksimile_file, page=page)
elif not UNITTESTING:
print('There is a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description))
return exit_status
def debug_function(words, input=''):
"""Custom debug function: prints the text of all words that are marked in their debug_container.

:param words: list of words that have a debug_container and a text
:param input: optional string that is printed in addition
"""
if len([ word for word in words if word.debug_container.get('marked') ]) > 0:
print(Fore.RED + 'marked word(s): {}'.format([ word.text for word in words if word.debug_container.get('marked') ]))
if input != '':
print('input: {}'.format(input))
# reset the color of the console output
print(Fore.RESET)
def fix_errors(faksimile_file, unmerged_faksimile_positions, unmerged_words, text_field_id=None, faksimile_page=None, xml_source_file=None, manuscript_file=None, namespaces={}):
"""Creates a faksimile svg file and a svg file highlighting the positions of the word positions
that could not be merged. After correction, results are inserted into the original file and processed again.

:param faksimile_file: faksimile svg file
:param unmerged_faksimile_positions: faksimile positions that could not be merged; their ids are highlighted
:param unmerged_words: words that could not be merged
:param text_field_id: optional id of the text field that selects the faksimile page
:param faksimile_page: optional faksimile page; if None, it is retrieved from faksimile_file
:param xml_source_file: optional xml file of the page
:param manuscript_file: optional manuscript file
:param namespaces: optional namespaces; if empty, they are taken from the root of the faksimile tree
:return: exit status (int)
"""
parser = ET.XMLParser(remove_blank_text=True)
faksimile_tree = ET.parse(faksimile_file, parser)
if len(namespaces) == 0:
# the default namespace (key None) is registered with the prefix 'ns'
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
if faksimile_page is None:
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
if text_field_id is not None\
and text_field_id in [ faksimile_page.text_field.id for faksimile_page in faksimile_pages ]:
faksimile_page = [ faksimile_page for faksimile_page in faksimile_pages if faksimile_page.text_field.id == text_field_id ][0]
else:
faksimile_page = faksimile_pages[0]
if xml_source_file is None or manuscript_file is None:
xml_source_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
# the files that are shown for correction are created in a temporary directory
tmp_dir = tempfile.mkdtemp()
tmp_pdf_file = tmp_dir + sep + 'output.pdf'
tmp_svg_file = tmp_dir + sep + 'output.svg'
tmp_faksimile = tmp_dir + sep + 'faksimile.svg'
# empty nodes are highlighted only if there are fewer unmerged faksimile positions than unmerged words
empyt_node_ids = get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)\
if len(unmerged_faksimile_positions) < len(unmerged_words) else []
highlight_node_ids = [ faksimile_position.id for faksimile_position in unmerged_faksimile_positions ]
highlight_node_ids += empyt_node_ids
create_highlighted_svg_file(faksimile_tree, highlight_node_ids, target_file=tmp_faksimile,
local_image_path=faksimile_page.faksimile_image.local_path, namespaces=namespaces, highlight_color=HIGHLIGHT_COLOR)
#create_pdf_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, pdf_file_name=tmp_pdf_file, bg_color=HIGHLIGHT_COLOR)
create_svg_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, svg_file_name=tmp_svg_file, bg_color=HIGHLIGHT_COLOR)
# exit status 2 is returned if the highlighted files could not be created
exit_status = 2
if isfile(tmp_svg_file) and isfile(tmp_faksimile):
ExternalViewer.show_files(list_of_files=[tmp_svg_file, tmp_faksimile])
record_changes_on_svg_file_to_page(xml_source_file, tmp_svg_file, word_ids=[ word.id for word in unmerged_words ])
record_changes(faksimile_file, tmp_faksimile, highlight_node_ids, namespaces=namespaces)
shutil.rmtree(tmp_dir)
exit_status = old_join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False, join_single_char_words=True)
return exit_status
def get_filelist_and_manuscript_file(file_a, file_b=None, correction_dir=None):
    """Returns a file list and a manuscript file (or None).

    The arguments are interpreted by what exists on disk:
        - file_a is a svg file: it is the only list entry, file_b (if a file) is the manuscript file.
        - file_a is a xml file: it is the manuscript file; file_b is either a single file for the
          list or a directory whose svg files make up the list. Without a usable file_b, the
          'finish' directory of correction_dir is consulted: for each of its xml files the
          matching page/@output entry of the manuscript file is added to the list.
        - file_a is a directory: its svg files make up the list, file_b (if a file) is the
          manuscript file.

    Files found in a directory are returned with the directory prefixed, so that the
    entries of the list can be opened regardless of the current working directory.

    :param file_a: a svg file, a xml manuscript file or a directory
    :param file_b: a manuscript file, a file for the list or a directory (optional)
    :param correction_dir: directory containing CorrectWords.finish_dir (optional)
    :return: (file_list, manuscript_file)
    """
    file_list = []
    manuscript_file = None
    if isfile(file_a) and file_a.endswith('svg'):
        file_list.append(file_a)
        if file_b is not None and isfile(file_b):
            manuscript_file = file_b
    elif isfile(file_a) and file_a.endswith('xml'):
        manuscript_file = file_a
        if file_b is not None and isfile(file_b):
            file_list.append(file_b)
        elif file_b is not None and isdir(file_b):
            # prefix the directory, just like the directory branch for file_a below
            file_list = [ file_b + sep + svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ]
        elif correction_dir is not None and isdir(correction_dir)\
                and Path(correction_dir, CorrectWords.finish_dir).is_dir():
            finish_dir = Path(correction_dir, CorrectWords.finish_dir)
            xml_files = list(finish_dir.glob('*.xml'))
            svg_files = list(finish_dir.glob('*.svg'))
            if len(xml_files + svg_files) > 1:
                manuscript_tree = ET.parse(manuscript_file)
                for xml_file in xml_files:
                    output = manuscript_tree.xpath(f'.//page[contains(@output, "{xml_file.name}")]/@output')
                    if len(output) > 0:
                        file_list.append(output[0])
    elif isdir(file_a):
        file_list = [ file_a + sep + svgfile for svgfile in listdir(file_a) if svgfile.endswith('svg') ]
        if file_b is not None and isfile(file_b):
            manuscript_file = file_b
    return file_list, manuscript_file
def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False):
    """Look up the svg position file of a faksimile page in the manuscript file.

    If no manuscript file is passed, its name is derived from the title of the
    faksimile page (inside ./xml if that directory exists). The svg position
    file is only returned if the manuscript file lists the page with status "OK"
    or, with redo_ok, with a status that contains "OK".

    :return: (svg_pos_file or None, manuscript_file)
    """
    manuscript_tree = None
    if manuscript_file is None:
        title_string = faksimile_page.title.replace(' ', '_')
        xml_dir = '.{}xml'.format(sep)
        if isdir(xml_dir):
            manuscript_file = xml_dir + sep + title_string + '.xml'
        else:
            manuscript_file = title_string + '.xml'
        if isfile(manuscript_file):
            manuscript_tree = ET.parse(manuscript_file)
    else:
        manuscript_tree = ET.parse(manuscript_file)
    svg_pos_file = None
    if manuscript_tree is None:
        return svg_pos_file, manuscript_file
    root = manuscript_tree.getroot()
    page_number = faksimile_page.page_number
    xpath_status_contains_ok = '//page[@number="%s" and contains(@status,"OK")]/@output' % page_number
    xpath_status_is_ok = '//page[@number="%s" and @status="OK"]/@output' % page_number
    xpath_any_status = '//page[@number="%s"]/@output' % page_number
    if redo_ok and len(root.xpath(xpath_status_contains_ok)) > 0:
        svg_pos_file = root.xpath(xpath_any_status)[0]
    if len(root.xpath(xpath_status_is_ok)) > 0:
        svg_pos_file = root.xpath(xpath_any_status)[0]
    elif not UNITTESTING:
        # tell the user why this page is not (or no longer) ready for joining
        merged_outputs = root.xpath(xpath_status_contains_ok)
        if len(merged_outputs) > 0:
            msg = Fore.LIGHTBLUE_EX +'->' + Fore.CYAN + 'Data from page {0} already merged with {1}!'.format(page_number, merged_outputs[0])
        else:
            msg = Fore.MAGENTA + 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, page_number)
        print(msg, end='')
        print(Style.RESET_ALL)
    return svg_pos_file, manuscript_file
def old_join_faksimileAndTranskription(faksimile_file, manuscript_file=None, page=None, do_fix_errors=False, redo_ok=False, debug_word_text='', **kwargs):
"""Joins the data of a faksimile file with the data of svgposfile.

For each faksimile page of faksimile_file that has a svg position file (see
get_svgPosFile_and_manuscriptFile), the words of the page are matched with
the faksimile word positions by text, using old_process_word_text.

:param faksimile_file: faksimile svg file
:param manuscript_file: xml manuscript file (optional)
:param page: a page; if given, only faksimile pages whose svg position file
equals page.page_tree.docinfo.URL are processed
:param do_fix_errors: read from the arguments, not used any further in this function
:param redo_ok: passed on to get_svgPosFile_and_manuscriptFile
:param debug_word_text: words with this text get flagged 'marked' in their debug_container
:param kwargs: 'join_single_char_words' switches on join_single_char_words
:return: exit status (int): 0, or 2 if words or positions remain unjoined
"""
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
print(Style.RESET_ALL)
# NOTE(review): do_fix_errors, redo_ok and debug_word_text are explicit parameters, so
# Python binds them there and they cannot show up in kwargs; these lookups look redundant.
if not do_fix_errors and 'do_fix_errors' in kwargs.keys():
do_fix_errors = kwargs.get('do_fix_errors')
if not redo_ok and 'redo_ok' in kwargs.keys():
redo_ok = kwargs.get('redo_ok')
if debug_word_text == '' and 'debug_word_text' in kwargs.keys():
debug_word_text = kwargs.get('debug_word_text')
faksimile_tree = ET.parse(faksimile_file)
# the default namespace has key None in nsmap; it is stored with the prefix 'ns' instead
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
# restrict processing to the faksimile page(s) that belong to the file of the given page
if page is not None:
faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\
if get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)[0]\
== page.page_tree.docinfo.URL ]
exit_status = 0
for faksimile_page in faksimile_pages:
svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)
if svg_pos_file is not None:
image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field)
if page is None:
page = Page(svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file)
write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\
file_type=FILE_TYPE_SVG_WORD_POSITION)
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + '->', end='')
print(Fore.CYAN + 'Joining data from page {0} with file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='')
words = sort_words(page)
# flag the words the caller wants to trace; debug_function prints the marked words
if debug_word_text != '' and len([ word for word in words if word.text == debug_word_text ]) > 0:
for word in words:
if word.text == debug_word_text:
word.debug_container.update({'marked': True})
if bool(kwargs.get('join_single_char_words')):
# join_single_char_words modifies words in place; removed_words is only used by the disabled print below
removed_words = join_single_char_words(words)
page.words = words
page.update_and_attach_words2tree()
#print([ word.text for word in page.words if word in removed_words ])
faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions)
new_words = []
# process the distinct texts of the faksimile positions, shortest texts first
unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\
key=lambda text: len(text))
#faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions, unique_faksimile_words)
# NOTE(review): the next line was added in this revision and looks like leftover debug output — confirm it is wanted
+ print('a', unique_faksimile_words)
for word_text in unique_faksimile_words:
old_process_word_text(new_words, word_text, faksimile_positions, words)
# success: every word (words with text '.' excepted) and every faksimile position has been joined
if False not in [ word.joined for word in words if word.text != '.' ]\
and False not in [ position.joined for position in faksimile_positions]\
and not UNITTESTING:
post_merging_processing_and_saving(svg_pos_file=svg_pos_file, new_words=new_words, page=page, manuscript_file=manuscript_file)
print(Fore.GREEN + '[OK]')
print(Style.RESET_ALL)
elif not UNITTESTING:
# failure: report the words and positions that could not be joined, sorted from top to bottom
mismatch_words, mismatch_faksimile_positions = get_mismatching_ids(words, faksimile_positions)
not_joined_fp = [ (position.id, position.text) for position in sorted(mismatch_faksimile_positions, key=lambda fp: fp.top) ]
plural_fp = '' if len(not_joined_fp) < 2 else 's'
not_joined_tw = [ (word.id, word.line_number, word.text) for word in sorted(mismatch_words, key=lambda word: word.transkription_positions[0].top) ]
plural_tw = '' if len(not_joined_tw) < 2 else 's'
print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp))
print([(position.id, position.text) for position in faksimile_positions if not position.joined])
print('--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
print([(word.id, word.line_number, word.text) for word in words if not word.joined ])
debug_function(new_words, input='new_words')
debug_function(words, input='words')
print(Style.RESET_ALL)
exit_status = 2
elif False in [ word.joined for word in words ]:
# reached only when UNITTESTING is set: nothing is saved, unjoined words are printed
print([ (word.id, word.text) for word in words if not word.joined ])
exit_status = 2
# presumably reset so that the next faksimile page gets its own Page instance (see 'if page is None' above)
page = None
return exit_status
def add_faksimile_image(page, faksimile_page):
    """Attach the faksimile image of faksimile_page to page, unless page already has one.

    The image inherits the text field of the faksimile page if it has none of its own.
    """
    if page.faksimile_image is not None:
        return
    image = faksimile_page.faksimile_image
    if image.text_field is None and faksimile_page.text_field is not None:
        image.text_field = faksimile_page.text_field
    page.faksimile_image = image
    page.faksimile_image.attach_object_to_tree(page.page_tree)
    page.update_data_source(faksimile_svgFile=faksimile_page.svg_source_file)
def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, page=None):
"""Joins the data of a faksimile file with the data of svgposfile.

For each faksimile page of faksimile_file that has a svg position file (see
get_svgPosFile_and_manuscriptFile), the words of the page are matched with
the faksimile word positions that have exactly the same text (process_word_text).

:param faksimile_file: faksimile svg file
:param manuscript_file: xml manuscript file (optional)
:param page: a page; if given, only faksimile pages with page_number == page.number are processed
:return: exit status (int): 0, or 2 if words or positions remain unjoined
"""
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
print(Style.RESET_ALL)
faksimile_tree = ET.parse(faksimile_file)
# the default namespace has key None in nsmap; it is stored with the prefix 'ns' instead
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
if page is not None:
faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\
if faksimile_page.page_number == page.number ]
exit_status = 0
for faksimile_page in faksimile_pages:
svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
if svg_pos_file is not None:
if page is None:
page = Page(svg_pos_file)
# a page without faksimile image gets the image of the faksimile page and is written back to its file
if page.faksimile_image is None:
add_faksimile_image(page, faksimile_page)
write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\
file_type=FILE_TYPE_SVG_WORD_POSITION)
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + '->', end='')
print(Fore.CYAN + 'Joining data from page {0} with file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='')
words = sort_words(page)
faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions)
new_words = []
# process the distinct texts of the faksimile positions, shortest texts first
unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\
key=lambda text: len(text))
#faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions, unique_faksimile_words)
for word_text in unique_faksimile_words:
process_word_text(new_words, word_text, faksimile_positions, words)
# success: every word (words with text '.' excepted) and every faksimile position has been joined
if False not in [ word.joined for word in words if word.text != '.' ]\
and False not in [ position.joined for position in faksimile_positions]\
and not UNITTESTING:
if page.is_locked():
page.unlock()
post_merging_processing_and_saving(svg_pos_file=svg_pos_file, new_words=new_words, page=page, manuscript_file=manuscript_file)
print(Fore.GREEN + '[OK]')
print(Style.RESET_ALL)
elif not UNITTESTING:
# failure: report the words and positions that could not be joined, sorted from top to bottom
mismatch_words, mismatch_faksimile_positions = get_mismatching_ids(words, faksimile_positions)
not_joined_fp = [ (position.id, position.text) for position in sorted(mismatch_faksimile_positions, key=lambda fp: fp.top) ]
plural_fp = '' if len(not_joined_fp) < 2 else 's'
not_joined_tw = [ (word.id, word.line_number, word.text) for word in sorted(mismatch_words, key=lambda word: word.transkription_positions[0].top) ]
plural_tw = '' if len(not_joined_tw) < 2 else 's'
print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp))
print([(position.id, position.text) for position in faksimile_positions if not position.joined])
print('--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
print([(word.id, word.line_number, word.text) for word in words if not word.joined ])
debug_function(new_words, input='new_words')
debug_function(words, input='words')
print(Style.RESET_ALL)
exit_status = 2
elif False in [ word.joined for word in words ]:
# reached only when UNITTESTING is set: nothing is saved, unjoined words are printed
print([ (word.id, word.text) for word in words if not word.joined ])
exit_status = 2
# presumably reset so that the next faksimile page gets its own Page instance (see 'if page is None' above)
page = None
return exit_status
def join_single_char_words(words, threshold_x=5, threshold_y=5):
"""Join single char words.

Words whose text matches SINGLE_WORD_PATTERN are visited line by line, from the
last to the first candidate of a line. A candidate that is close enough to its
predecessor is joined to the word that precedes it in words and is popped from
words, i.e. the list passed in is modified in place.

:param words: list of words (modified in place)
:param threshold_x: horizontal threshold for words_close_enough (non punctuation candidates)
:param threshold_y: vertical threshold for words_close_enough (non punctuation candidates)
:return: a list of removed words
"""
#all_single_char_words = [ word for word in words if re.match(r'^\w$', word.text) ]
removed_words = []
all_single_char_words = [ word for word in words if re.match(SINGLE_WORD_PATTERN, word.text) ]
if not UNITTESTING:
bar = Bar('Joining single char words', max=len(all_single_char_words))
line_numbers = sorted(set(word.line_number for word in all_single_char_words))
for line_number in line_numbers:
single_char_words = [ word for word in all_single_char_words if word.line_number == line_number ]
# walk backwards through the candidates of this line
index = len(single_char_words)
while index > 0:
index -= 1
# NOTE(review): 'word' is assigned here but not read anywhere below
word = None
# short-circuit expression used as statement: advance the progress bar unless unit testing
not UNITTESTING and bar.next()
if single_char_words[index] in words:
single_char_word_index = words.index(single_char_words[index])
# case 1: punctuation is attached to the preceding word, with the wider thresholds 15/12
# NOTE(review): for single_char_word_index == 0 the index -1 addresses the last word of the list — confirm this cannot happen
if re.match(SINGLE_PUNCTUATION_PATTERN, single_char_words[index].text)\
and words_close_enough(words[single_char_word_index-1], single_char_words[index], 15, 12):
words[single_char_word_index-1].join(single_char_words[index])
removed_words.append(words.pop(single_char_word_index))
#print('{0} -> {1}, {2}'.format(word.text, words[single_char_word_index-1].text))
# case 2: the previous single char candidate of this line is close enough
# NOTE(review): closeness is tested against single_char_words[index-1], but the join target is
# words[single_char_word_index-1] — presumably the same word; verify
elif index > 0\
and words_close_enough(single_char_words[index-1], single_char_words[index], threshold_x, threshold_y):
words[single_char_word_index-1].join(single_char_words[index])
removed_words.append(words.pop(single_char_word_index))
# case 3: the word preceding the candidate in words is on the same line and close enough
elif single_char_word_index > 0\
and words[single_char_word_index-1].line_number == line_number\
and words_close_enough(words[single_char_word_index-1], single_char_words[index], threshold_x, threshold_y):
words[single_char_word_index-1].join(single_char_words[index])
removed_words.append(words.pop(single_char_word_index))
not UNITTESTING and bar.finish()
return removed_words
def old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text='', min_length_split=5):
"""Joins faksimile_positions with text == word_text with words with text == word_text.

If there are as many unjoined words as faksimile positions, they are paired in
order (words sorted by id): position and word are flagged as joined and the word
is appended to new_words. If there are fewer words than positions, the function
calls itself with an alternative spelling ('ss' -> 'ß', '-' -> '–'); words with
the alternative text are then considered as well. All other cases are only
reported on stdout.

:param new_words: list that collects the joined words (modified in place)
:param word_text: text of the faksimile positions to process
:param faksimile_positions: all faksimile positions of the page
:param words: all words of the page
:param alt_word_text: alternative spelling; words with this text are added to the candidates
:param min_length_split: passed on in recursive calls; only read by the disabled code below
"""
text = word_text if alt_word_text == '' else alt_word_text
fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ]
words4word = [ word for word in words if word.text == word_text and not word.joined ]
if alt_word_text != '':
words4word += [ word for word in words if word.text == text and not word.joined ]
words4word = sorted(words4word, key=attrgetter('id'))
if len(fposition4word) == len(words4word):
# one word per position: pair them in order
for index, faksimile_position in enumerate(fposition4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
words[words.index(words4word[index])].joined = True
new_words.append(words4word[index])
elif len(words4word) < len(fposition4word):
# fewer words than positions: retry with an alternative spelling
if re.match(r'(.*)ss(.*)', text):
alt_word_text = re.sub(r'ss', 'ß', text)
old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
elif re.match(SINGLE_PUNCTUATION_PATTERN, text):
if text == '-':
alt_word_text = text.replace('-', '–')
old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
else:
print('single', word_text, len(fposition4word), len(words4word))
# NOTE: the triple-quoted string that follows is disabled code (further matching strategies:
# stripping punctuation, splitting words that start or end with the text, joining preceding
# words). It is evaluated as a bare string expression and has no effect.
"""
elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text):
alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text)
debug_function(words4word, input='elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text) text {0}'.format(text))
if alt_word_text != '':
pattern = r'(.*){0}(.*)'.format(alt_word_text)
words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ]
if len(words4word) < len(fposition4word):
old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
else:
words4word = sorted(words4word, key=attrgetter('id'))
for index, faksimile_position in enumerate(fposition4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\
and words.index(words4word[index])+1 < len(words)\
and words[words.index(words4word[index])+1].text == word_text[len(word_text)-1]:
words4word[index].join(words[words.index(words4word[index])+1])
words[words.index(words4word[index])+1].joined = True
words[words.index(words4word[index])].joined = True
words4word[index].text = word_text
new_words.append(words4word[index])
elif len(text) >= min_length_split and len([ word for word in words if word.text.startswith(text) and not word.joined ]) == len(fposition4word):
new_words4word = [ word for word in words if word.text.startswith(text) and not word.joined ]
debug_function(new_words4word, input='word.startswith {}'.format(text))
for index, fposition in enumerate(fposition4word):
old_word = new_words4word[index]
none_word, new_word, next_word = old_word.split(text, start_id=old_word.id)
fposition4word[index].joined = True
new_word.faksimile_positions = [ fposition4word[index] ]
words[words.index(old_word)] = new_word
if next_word is not None:
next_word.id = len(words)
next_word.joined = False
words.append(next_word)
new_word.joined = True
new_words.append(new_word)
elif len(text) >= min_length_split and len([ word for word in words if word.text.endswith(text) and not word.joined ]) == len(fposition4word):
new_words4word = [ word for word in words if word.text.endswith(text) and not word.joined ]
debug_function(new_words4word, input='word.endswith {}'.format(text))
for index, fposition in enumerate(fposition4word):
old_word = new_words4word[index]
before_word, new_word, none_word = old_word.split(text, start_id=old_word.id)
fposition4word[index].joined = True
new_word.faksimile_positions = [ fposition4word[index] ]
words[words.index(old_word)] = new_word
if before_word is not None:
before_word.id = len(words)
before_word.joined = False
words.append(before_word)
new_word.joined = True
new_words.append(new_word)
else:
if len(text) > 1:
new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ]
debug_function(new_words4word, input='else text {0}'.format(text))
if len(new_words4word) == 0:
alt_word_text = text[1:]
old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
else:
for new_word in new_words4word:
collected_text = new_word.text
current_word = new_word
while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0:
previous_word = words[current_word.id-1]
if word_text.endswith(previous_word.text + collected_text):
words[current_word.id].joined = True
previous_word.join(current_word)
current_word = previous_word
collected_text = current_word.text
else:
collected_text = previous_word.text + collected_text
words4word.append(current_word)
words4word = sorted(words4word, key=attrgetter('id'))
for index, faksimile_position in enumerate(fposition4word):
if index < len(words4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
words4word[index].text = word_text
words[words.index(words4word[index])].joined = True
new_words.append(words4word[index])
else:
print('<{0}> f{1}/t{2}, ids: {3}'.\
format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ]))
"""
else:
# more words than positions: report the numbers (f = faksimile positions, t = words)
print('<{0}> f{1}/t{2}'.format(word_text, len(fposition4word), len(words4word)))
def process_word_text(new_words, word_text, faksimile_positions, words):
    """Pair the faksimile positions with text word_text with the unjoined words of the same text.

    Pairing only takes place if both groups have the same size: position and word are
    then flagged as joined and the word is appended to new_words. Otherwise the sizes
    are printed (f = faksimile positions, t = words) and nothing is changed.
    """
    matching_positions = [ position for position in faksimile_positions if position.text == word_text ]
    candidate_words = [ word for word in words if word.text == word_text and not word.joined ]
    if len(matching_positions) != len(candidate_words):
        print('<{0}> f{1}/t{2}'.format(word_text, len(matching_positions), len(candidate_words)))
        return
    for position, word in zip(matching_positions, candidate_words):
        position.joined = True
        word.faksimile_positions = [ position ]
        words[words.index(word)].joined = True
        new_words.append(word)
def _compare_word_positions(wordA, wordB) ->int:
"""Return ordering of wordA and wordB
"""
tpA = wordA.transkription_positions[0] if len(wordA.word_parts) == 0 else wordA.word_parts[0].transkription_positions[0]
tpB = wordB.transkription_positions[0] if len(wordB.word_parts) == 0 else wordB.word_parts[0].transkription_positions[0]
if abs(tpA.bottom-tpB.bottom) < tpA.height/2:
return tpA.left - tpB.left
else:
return tpA.bottom - tpB. bottom
def sort_words(page)->list:
    """Return the words of page in reading order (from top left to bottom right).

    The words are collected line by line and sorted with _compare_word_positions.
    As a side effect each word gets its index in the result as id, and its joined
    flag is reset: a word only counts as joined if it has faksimile positions and
    is verified.
    """
    def is_on_line(word, line_number):
        # a word belongs to a line by its own line number or by the one of its first part
        return word.line_number == line_number.id\
                or (len(word.word_parts) > 0 and word.word_parts[0].line_number == line_number.id)

    ordering = cmp_to_key(_compare_word_positions)
    words = []
    # (an earlier variant, sorting the lines alternately by left position only, has been dropped)
    for line_number in page.line_numbers:
        words_on_line = [ word for word in page.words if is_on_line(word, line_number) ]
        words.extend(sorted(words_on_line, key=ordering))
    for index, word in enumerate(words):
        word.id = index
        word.joined = len(word.faksimile_positions) > 0 and word.verified
    return words
def sort_faksimile_positions(faksimile_positions, reference_list=None):
    """Return the faksimile positions sorted by their own ordering.

    Before sorting, the joined flag of every position is reset: to False without
    a reference_list, else to whether the position is contained in reference_list.
    """
    if reference_list is None:
        for position in faksimile_positions:
            position.joined = False
    else:
        for position in faksimile_positions:
            position.joined = position in reference_list
    return sorted(faksimile_positions)
@deprecated(reason="Writing process id is now set to word not word_position, TODO: check faksimile_positions for split candidates!")
def update_writing_process(word):
    """Synchronize the writing process of the first faksimile position of word
    with the writing process of its transkription positions.

    The update is skipped if the transkription positions belong to different
    writing processes (or if the word has no faksimile position): such faksimile
    positions are meant to be fixed manually by adding more word positions,
    which are processed in a later stage.
    """
    distinct_process_ids = list({ tp.writing_process_id for tp in word.transkription_positions })
    if len(distinct_process_ids) != 1:
        return
    if len(word.faksimile_positions) > 0:
        word.faksimile_positions[0].writing_process_id = distinct_process_ids[0]
def words_close_enough(wordA, wordB, threshold_x=10, threshold_y=5):
    """Return true if the last position of wordA and the first position of wordB
    are closer than the thresholds, horizontally (left) as well as vertically (bottom).
    """
    # (an earlier variant measured the horizontal distance from the right edge, left + width, of wordA)
    last_of_a = wordA.transkription_positions[-1]
    first_of_b = wordB.transkription_positions[0]
    if not abs(last_of_a.left - first_of_b.left) < threshold_x:
        return False
    return abs(last_of_a.bottom - first_of_b.bottom) < threshold_y
def usage():
    """Print the usage information of the script, i.e. the docstring of main."""
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.

    svgscripts/old_join_faksimileAndTranskription.py [OPTIONS] <FAKSIMILE_DIR|faksimile_svg_file> [xmlManuscriptFile]

        <FAKSIMILE_DIR>         a directory containing <faksimile_svg_file>
        <faksimile_svg_file>    a svg file containing information about the word positions on the faksimile.
        <xmlManuscriptFile>     a xml file about a manuscript, containing information about its pages.

        OPTIONS:
        -h|--help:      show help

        :return: exit code (int)
    """
    # NOTE: the docstring above is printed by usage(), it is the help text of the script.
    try:
        opts, args = getopt.getopt(argv, "h", ["help" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
    if len(args) < 1:
        usage()
        return 2
    file_a = args[0]
    if not exists(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # the second argument is optional and silently ignored if it does not exist
    file_b = args[1] if len(args) > 1 and exists(args[1]) else None
    file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
    exit_status = 0
    for faksimile_file in file_list:
        status = join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file)
        # report a failure of any file to the caller, but keep processing the remaining files
        if status:
            exit_status = status
    return exit_status
# Script entry point: run main with the command line arguments (script name excluded)
# and use its return value as exit code of the process.
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: svgscripts/process_files.py
===================================================================
--- svgscripts/process_files.py (revision 112)
+++ svgscripts/process_files.py (revision 113)
@@ -1,448 +1,527 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract information from all text svg files in directory.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import csv
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convertPDF2SVG4Web import Converter
+from datatypes.image import SVGImage
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
+from datatypes.text_field import TextField
from extractWordPosition import Extractor
from fix_missing_glyphs import fix_missing_glyphs
from util import update_svgposfile_status, update_manuscript_file
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# Console output in this module is guarded by 'if not UNITTESTING'
# (presumably switched on by the test suite — confirm).
UNITTESTING = False
# Status strings built from the PageCreator warnings; MyCSVHandler.process_files passes
# them to page_has_status in order to find pages that need fix_missing_glyphs.
WARN_MISSING_USE_NODE = f'with warnings:{PageCreator.WARNING_MISSING_USE_NODE4PWP}:'
WARN_MISSING_GLYPH = f'with warnings:{PageCreator.WARNING_MISSING_GLYPH_ID4WIM}:'
class MyCSVHandler:
- """This class can be used to handle csv files that contain information about the tile and layout of the svg files.
+ """This class can be used to handle csv files that contain information about the title and layout of the svg files.
"""
# keys of the entry dictionaries that _init_csv_entries creates
ENTRY_KEY_PAGE = 'pdf_page_number'
ENTRY_KEY_FILE = 'svg_source_file'
ENTRY_KEY_TITLE = 'manuscript_title'
ENTRY_KEY_PAGE_NAMES = 'page_names'
ENTRY_KEY_MARG_PAGE = 'marginals_page_entry'
# removes the letters a-e from the first group of MANUSCRIPT_TITLE_PARTS when the title is built
MANUSCRIPT_AE_REMOVAL = re.compile('[a-e]')
# name of the csv column that is read from each row
MANUSCRIPT_KEY = 'Ms'
# content of that column: group 1 is used as (pdf) page number, group 3 is matched against the title patterns below
MANUSCRIPT_PATTERN = re.compile(r'(\d+)(>\s)(.*)')
# prefix of the manuscript title
MANUSCRIPT_TITLE_EXTENSION = 'Mp'
# group 1 becomes part of the title, group 3 and the optional group 4 (after '/') become page names
MANUSCRIPT_TITLE_PARTS = re.compile(r'([I-X]+[a-e]*)(\s)(\d+\w*)(/\d+\w*)*')
# rows matching this pattern are kept as marginals page for the row of the following page number
MARGINALS_PAGE = re.compile(r'([I-X]+[a-e]*)(\s)(\d+\w*)(\s)(Marg)')
# NOTE(review): '\D' in a non-raw string literal is an invalid escape sequence; a raw string would avoid the warning
REMOVE_NONNUMERIC = re.compile('\D')
- def __init__(self, csv_file_name, pdf_file, svg_dir, title=None):
+ def __init__(self, csv_file_name, pdf_file, svg_dir=None, title=None, createBlanks=False):
+ self.createBlanks = createBlanks
self.csv_entries = []
self.pdf_file = pdf_file
self.svg_dir = svg_dir
self.title = title
# reads the csv file right away and fills self.csv_entries
self._init_csv_entries(csv_file_name)
def _init_csv_entries(self, csv_file_name):
"""Init csv entries by reading the csv_file.

For each row whose MANUSCRIPT_KEY column matches MANUSCRIPT_PATTERN and
MANUSCRIPT_TITLE_PARTS an entry dictionary (see the ENTRY_KEY_* constants) is
appended to self.csv_entries, provided that its manuscript title equals
self.title (if a title is set).
"""
with open(csv_file_name, newline='') as csvfile:
reader = csv.DictReader(csvfile)
# NOTE(review): svg_dir defaults to None in the new revision; os.listdir(None) would list
# the current working directory — confirm this is intended
list_of_svg_files = [ svg_file for svg_file in listdir(self.svg_dir) if svg_file.endswith('.svg') ]
marg_entry = None
for row in reader:
ms_string = row[self.MANUSCRIPT_KEY]
manuscript_match = re.match(self.MANUSCRIPT_PATTERN, ms_string)
if manuscript_match is not None:
page_number = int(manuscript_match.group(1))
# svg files whose name ends with the (optionally zero padded) page number followed by '.svg'
files_matching = [ svg_file for svg_file in list_of_svg_files\
if re.match(rf'([0]*{page_number})(.svg)', svg_file.replace(re.split(r'\d+\.svg', svg_file)[0], '')) ]
- if len(files_matching) > 0:
- svg_file = files_matching[0]
+ if self.createBlanks or len(files_matching) > 0:
+ svg_file = files_matching[0] if len(files_matching) > 0 else None
title_parts = re.match(self.MANUSCRIPT_TITLE_PARTS, manuscript_match.group(3))
marginals_page = re.match(self.MARGINALS_PAGE, manuscript_match.group(3))
if marginals_page is not None:
# remember the marginals page, it is attached to the entry of the next page number (see below)
marg_entry = { self.ENTRY_KEY_PAGE: page_number, self.ENTRY_KEY_FILE: svg_file }
elif title_parts is not None:
title = self.MANUSCRIPT_AE_REMOVAL.sub('', title_parts.group(1))
manuscript_title = f'{self.MANUSCRIPT_TITLE_EXTENSION} {title}'
entry = { self.ENTRY_KEY_PAGE: page_number,\
self.ENTRY_KEY_FILE: svg_file,\
self.ENTRY_KEY_TITLE: manuscript_title,\
self.ENTRY_KEY_PAGE_NAMES: [ f'{title_parts.group(3)}' ] }
# a second page name after '/' means that the svg file contains two pages
if title_parts.group(4) is not None:
entry[self.ENTRY_KEY_PAGE_NAMES].append(title_parts.group(4).replace('/',''))
if marg_entry is not None\
and marg_entry[self.ENTRY_KEY_PAGE] == page_number-1:
entry[self.ENTRY_KEY_MARG_PAGE] = marg_entry
marg_entry = None
if self.title is None\
or self.title == manuscript_title:
self.csv_entries.append(entry)
def process_files(self, svg_target_dir, xml_target_dir, error_handler=None) -> int:
"""Process files and return exit status.

:param svg_target_dir: target directory passed to the Converter
:param xml_target_dir: xml directory passed to the Extractor
:param error_handler: optional handler; exceptions are recorded with its record_error
and written with its write method
:return: exit status (int) of the last call of process_blank_file/process_file, 0 if none was called
"""
exit_status = 0
if len(self.csv_entries) > 0:
converter = Converter(target_dir=svg_target_dir)
extractor = Extractor(xml_dir=xml_target_dir)
for entry in self.csv_entries:
title = entry[self.ENTRY_KEY_TITLE]
extractor.update_title_and_manuscript(title)
#converter.title = title.replace(' ', '_')
pdf_page_number = entry[self.ENTRY_KEY_PAGE]
# NOTE(review): with createBlanks the file entry may be None (see _init_csv_entries),
# the path then ends in 'None' — verify that it is not used in that case
svgfile = f'{self.svg_dir}{sep}{entry[self.ENTRY_KEY_FILE]}'
for index, page_number in enumerate(entry[self.ENTRY_KEY_PAGE_NAMES]):
pdf_name_dictionary = { pdf_page_number: title.replace(' ', '_') + '_' + str(page_number) + '_web' }
# -1 signals a svg file with a single page, else the index of the page within the file
multipage_index = -1\
if len(entry[self.ENTRY_KEY_PAGE_NAMES]) == 1\
else index
marginals_page = None\
if not bool(entry.get(self.ENTRY_KEY_MARG_PAGE))\
else f'{self.svg_dir}{sep}{entry[self.ENTRY_KEY_MARG_PAGE][self.ENTRY_KEY_FILE]}'
try:
- if page_has_status(WARN_MISSING_USE_NODE,\
+ if self.createBlanks:
+ svg_pos_file = f'.{sep}xml{sep}' + title.replace(' ', '_') + '_page' + str(page_number) + '.xml'
+ exit_status = process_blank_file(converter, extractor, self.pdf_file, page_number, pdf_name_dictionary, pdf_name_dictionary[pdf_page_number], svg_pos_file,
+ multipage_index=multipage_index)
+ elif page_has_status(WARN_MISSING_USE_NODE,\
manuscript_file=extractor.manuscript_file, page_number=page_number)\
or page_has_status(WARN_MISSING_GLYPH,\
manuscript_file=extractor.manuscript_file, page_number=page_number):
svg_pos_file = get_page_output_file(page_number, manuscript_file=extractor.manuscript_file)
if svg_pos_file is not None and isfile(svg_pos_file):
fix_missing_glyphs(svg_pos_file, manuscript_file=extractor.manuscript_file)
elif not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
exit_status = process_file(converter, extractor, svgfile, self.pdf_file, page_number,\
pdf_name_dictionary=pdf_name_dictionary, multipage_index=multipage_index,\
marginals_page=marginals_page)
# deliberately broad: an error of one page is printed and recorded, the remaining pages are still processed
except Exception as err:
print(err)
if error_handler is not None:
error_handler.record_error(svgfile, self.pdf_file, title, page_number, error=err)
if not UNITTESTING:
print(Fore.RED)
print('There was an error ->', err)
print(Style.RESET_ALL)
if error_handler is not None:
error_handler.write()
return exit_status
class MyErrorHandler:
"""This class can be used to handle errors executing extractWordPosition.Extractor.extractAndWriteInformation.
"""
ERROR_LOG = 'error_log.xml'
def __init__(self):
self.tree = ET.ElementTree(ET.Element('error-log'))
if isfile(MyErrorHandler.ERROR_LOG):
parser = ET.XMLParser(remove_blank_text=True)
self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser)
def record_error(self, svgfile, pdffile, title, page_number, error=None):
"""Records an error.
"""
if len(self.tree.xpath('//error[@title="{0}" and @number="{1}"]'.format(title, page_number))) > 0:
error_node = self.tree.xpath('//error[@title="{0}" and @number="{1}"]'.format(title, page_number))[0]
else:
error_node = ET.SubElement(self.tree.getroot(), 'error', attrib={'title': title, 'number': page_number})
ET.SubElement(error_node, 'svgfile').text = svgfile
ET.SubElement(error_node, 'pdffile').text = pdffile
if error is not None:
error_node.set('type', str(type(error).__name__))
if str(error) != '':
error_msg = ET.SubElement(error_node, 'error-msg')
error_msg.text = str(error)
if str(type(error).__name__) == 'ExpatError':
error_msg.text += '->svgfile is empty!'
def run(self, title=None, page_number=None, error_type=None):
"""Run all or some errors
[:return:] exit status (int)
"""
xpath = '//error'
if title is not None and page_number is not None:
xpath = '//error[@title="{0}" and @number="{1}"]'.format(title, page_number)
elif title is not None:
xpath = '//error[@title="{0}"]'.format(title)
elif page_number is not None:
xpath = '//error[@number="{0}"]'.format(page_number)
if error_type is not None:
xpath = xpath + '[@type="{0}"]'.format(error_type)\
if title is None and page_number is None\
else xpath.replace(']', ' ') + 'and @type="{0}"]'.format(error_type)
exit_status = 0
for error in self.tree.xpath(xpath):
title = error.get('title')
page_number = error.get('number')
svgfile = error.xpath('./svgfile/text()')[0]\
if len(error.xpath('./svgfile/text()')) > 0 else None
pdffile = error.xpath('./pdffile/text()')[0]\
if len(error.xpath('./pdffile/text()')) > 0 else None
if svgfile is not None:
converter = Converter(title=title)
extractor = Extractor(title=title, compare2pdf=True)
status = process_file(converter, extractor, svgfile, pdffile, page_number)
if status > 0:
exit_status = status
if status < 2:
error.getparent().remove(error)
self.write()
return exit_status
def write(self):
"""Writes error log.
"""
write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG, script_name=__file__, file_type='xmlErrorLog')
def get_page_output_file(page_number: str, manuscript_file=None, manuscript_tree=None) ->str:
"""Return filename of xml output file for page with page number page_number.
"""
if manuscript_tree is None:
if manuscript_file is None or not isfile(manuscript_file):
msg = f'File {manuscript_file} does not exist!'\
if manuscript_file is not None\
else 'Please specify either manuscript_file or manuscript_tree'
raise Exception(msg)
manuscript_tree = ET.parse(manuscript_file)
if len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0:
return manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output')
return None
def is_page_ok(manuscript_file=None, page_number=None):
"""Returns true if page status is 'OK'.
"""
return page_has_status('OK', manuscript_file=manuscript_file, page_number=page_number)
def page_has_status(status, manuscript_file=None, page_number=None):
"""Returns true if page status is 'OK'.
"""
if manuscript_file is not None and isfile(manuscript_file):
manuscript_tree = ET.parse(manuscript_file)
if page_number is not None\
and len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0:
return manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('status') == status
return False
def is_svg_ok(manuscript_file=None, page_number=None):
"""Returns true if svgfile contains a valid svg graphic location.
"""
if manuscript_file is not None and isfile(manuscript_file):
manuscript_tree = ET.parse(manuscript_file)
if page_number is not None\
and len(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)) > 0\
and isfile(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output')):
xml_source_tree = ET.parse(manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)[0].get('output'))
return len(xml_source_tree.xpath('//svg/@file')) > 0 and isfile(xml_source_tree.xpath('//svg/@file')[0])
return False
+def _update_svg_image_according_to_extended_text_field(page, text_field):
+ page.svg_image.width = text_field.width
+ page.svg_image.height = text_field.height
+ page.svg_image.text_field = text_field
+ page.svg_image.attach_object_to_tree(page.page_tree)
+ tf = TranskriptionField(page.svg_image.file_name, multipage_index=page.multipage_index)
+ tf.xmin = text_field.left
+ tf.ymin = text_field.top
+ tf.width = text_field.width
+ tf.height = text_field.height
+ tf.shrink_svg_to_transkription_field(redo=True)
+
+
+def process_blank_file(converter, extractor, pdffile, page_number, pdf_name_dictionary, path_svg_file, xml_target_file, multipage_index):
+ """Processes file as blank.
+
+ [:return:] exit status (int)
+ """
+ if not UNITTESTING:
+ print(Fore.LIGHTBLUE_EX + f'Processing page {page_number} of {pdffile} ...')
+ print(Style.RESET_ALL)
+ if converter.pdf2svg(pdffile, page_number=page_number, name_dictionary=pdf_name_dictionary) == 0:
+ svg_file = converter.target_dir + sep + path_svg_file + '.svg'
+ if isfile(svg_file):
+ text_field = get_extended_text_field(svg_file, multipage_index=multipage_index)
+ if isfile(xml_target_file):
+ page = Page.create_cls(xml_target_file)
+ _update_svg_image_according_to_extended_text_field(page, text_field)
+ write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
+ else:
+ page = PageCreator(xml_target_file, title=extractor.title, page_number=page_number, svg_file=svg_file, multipage_index=multipage_index, svg_text_field=text_field)
+ _update_svg_image_according_to_extended_text_field(page, text_field)
+ write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
+ update_svgposfile_status(xml_target_file, manuscript_file=extractor.manuscript_file, status='blank')
+ return 0
+ return 2
+ return 2
+
+def get_extended_text_field(svg_file, multipage_index=-1) ->TextField:
+ """Get an extended text field, i.e. the text_field and the marginals.
+ """
+ tf = TranskriptionField(svg_file, multipage_index=multipage_index)
+ svg_tree = ET.parse(svg_file)
+ namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+ max_y = tf.second_field.ymin_without_title if tf.second_field is not None else tf.documentHeight-30
+ if tf.second_field is None and tf.is_shrunk():
+ max_y = sorted([ float(y) for y in svg_tree.xpath(f'//ns:use[@y>{tf.ymax}]/@y', namespaces=namespaces)], reverse=True)[0]-40\
+ if len(svg_tree.xpath(f'//ns:use[@y>{tf.ymax}]/@y', namespaces=namespaces)) > 0\
+ else tf.documentHeight+100
+ sorted_left_x = sorted([ float(x) for x in svg_tree.xpath(f'//ns:use[@x<{tf.xmin} and @y>{tf.ymin} and @y <{max_y}]/@x', namespaces=namespaces)])
+ sorted_right_x = sorted([ float(x) for x in svg_tree.xpath(f'//ns:use[@x>{tf.xmax} and @y>{tf.ymin} and @y <{max_y}]/@x', namespaces=namespaces)], reverse=True)
+ sorted_y = sorted([ float(y) for y in svg_tree.xpath(f'//ns:use[@y>{tf.ymax} and @y <{max_y}]/@y', namespaces=namespaces)], reverse=True)
+ xmin = sorted_left_x[0]-5 if len(sorted_left_x) > 0 and sorted_left_x[0]-5 < tf.xmin else tf.xmin
+ xmax = sorted_right_x[0]+5 if len(sorted_right_x) > 0 and sorted_right_x[0]+5 > tf.xmax else tf.xmax
+ ymax = sorted_y[0]+5 if len(sorted_y) > 0 else tf.ymax
+ return TextField(x=xmin, y=tf.ymin, width=xmax-xmin, height=ymax-tf.ymin)
+
def process_file(converter, extractor, svgfile, pdffile, page_number, pdf_name_dictionary=None, multipage_index=-1, marginals_page=None):
"""Processes file.
[:return:] exit status (int)
"""
exit_status = 0
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Processing file {} ...'.format(svgfile))
print(Style.RESET_ALL)
if converter.pdf2svg(pdffile, page_number=page_number, name_dictionary=pdf_name_dictionary) == 0:
for path_svg_file in converter.latest_converted_files:
transkriptionField = TranskriptionField(path_svg_file, multipage_index=multipage_index)
transkriptionField.shrink_svg_to_transkription_field()
xml_target_file = extractor.get_file_name(svgfile, page_number)
extraction_status = extractor.extractAndWriteInformation(svgfile, xml_target_file=xml_target_file,\
page_number=page_number, pdfFile=pdffile, svg_file=path_svg_file, record_warnings=True,\
multipage_index=multipage_index, marginals_page=marginals_page)
if extraction_status < 2 and extractor.manuscript_file is not None:
status = 'OK'
if extraction_status == 1:
status = extractor.latest_status
exit_status = 1
update_svgposfile_status(xml_target_file, manuscript_file=extractor.manuscript_file, status=status)
return exit_status
def update_graphical_svg(converter, svgfile, pdffile, page_number, xml_target_file):
"""Create a new graphical svg file and update xml output file.
[:return:] exit status (int)
"""
exit_status = 0
if isfile(xml_target_file):
path_svg_file = converter.get_file_name(pdffile, page_number=page_number)
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(path_svg_file))
print(Style.RESET_ALL)
if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) == 0:
transkriptionField = TranskriptionField(path_svg_file)
transkriptionField.shrink_svg_to_transkription_field()
page = PageCreator(xml_target_file, svg_file=path_svg_file)
write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
else:
exit_status = 2
return exit_status
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to extract information from all text svg files in a directory.
svgscripts/process_files.py [OPTIONS]
svgscripts/process_files.py [OPTIONS]
+ svgscripts/process_files.py --blank
svgscripts/process_files.py [OPTIONS]
Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
Directory containing svg files corresponding to pdf files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
OPTIONS:
-h|--help: show help
+ -b|--blank Create web svg files and empty xml files.
-e|--run-error Rerun error cases.
-g|--check-graphic-svg Check that graphical svg file exists or generate a new svg file.
-n|--number=pageNumber Use this with OPTION -e|--run-error in order to specify an error case.
-t|--title=title: title of the manuscript to which all files belong.
-T|--error-type: error type, use this with OPTION -e|--run-error in order to specify an error case.
-s|--svg-target-dir=svg-target-dir target directory for path svg files, i.e. svg files that can be displayed on the web.
-x|--xml-target-dir=xml-target-dir target directory for xml files.
:return: exit code (int)
"""
+ blank = False
check_graphic_svg_exists = False
csv_handler = None
error_handler = MyErrorHandler()
error_type = None
number = None
rerun_errors = False
svg_target_dir = ".{}svg".format(sep)
title = None
xml_target_dir = ".{}xml".format(sep)
try:
- opts, args = getopt.getopt(argv, "hegn:s:t:T:x:", ["help", "run-error", "check-graphic-svg", "number=", "svg-target-dir=", "title=", "error-type=", "xml-target-dir="])
+ opts, args = getopt.getopt(argv, "hbegn:s:t:T:x:", ["help", "blank", "run-error", "check-graphic-svg", "number=", "svg-target-dir=", "title=", "error-type=", "xml-target-dir="])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
+ elif opt in ('-b', '--blank'):
+ blank = True
elif opt in ('-e', '--run-error'):
rerun_errors = True
elif opt in ('-g', '--check-graphic-svg'):
check_graphic_svg_exists = True
elif opt in ('-t', '--title'):
title = arg
elif opt in ('-T', '--error-type'):
error_type = arg
elif opt in ('-n', '--number'):
number = arg
elif opt in ('-s', '--svg-target-dir'):
svg_target_dir = arg
elif opt in ('-x', '--xml-target-dir'):
xml_target_dir = arg
-
+
+ if blank:
+ if len(args) == 2\
+ and isfile(args[0]) and args[0].endswith('.csv')\
+ and isfile(args[1]) and args[1].endswith('.pdf'):
+ csv_handler = MyCSVHandler(args[0], args[1], title=title, createBlanks=blank)
+ return csv_handler.process_files(svg_target_dir, xml_target_dir, error_handler)
+ else:
+ print("Please specify both CSV- and PDF-file!")
+ usage()
+ return 2
if rerun_errors:
return error_handler.run(title=title, page_number=number, error_type=error_type)
if len(args) == 1 and args[0].endswith('.xml'):
source_tree = ET.parse(args[0])
if source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
svg_word_file_tree = ET.parse(source_tree.xpath('//page/@output')[0])
svg_dir = dirname(svg_word_file_tree.xpath('//page/@source')[0])
pdf_dir = dirname(svg_word_file_tree.xpath('//page/pdf/@file')[0])
else:
print('File {} is not of type {}'.format(args[0], FILE_TYPE_XML_MANUSCRIPT))
usage()
return 2
elif len(args) < 1 or\
(len(args) == 1\
and (True not in [ pdffile.endswith('pdf') for pdffile in listdir(args[0]) ]\
or True not in [ svgfile.endswith('svg') for svgfile in listdir(args[0]) ])\
):
print("Please specify both PDFDIR and TEXT_SVG_DIR!")
usage()
return 2
elif len(args) < 2:
pdf_dir, svg_dir = args[0], args[0]
elif isdir(args[0]) and isdir(args[1]):
pdf_dir, svg_dir = args[0], args[1]
if True in [ svgfile.endswith('pdf') for svgfile in listdir(args[1]) ]:
pdf_dir, svg_dir = args[1], args[0]
elif len(args) == 3\
and isfile(args[0]) and args[0].endswith('.csv')\
and isfile(args[1]) and args[1].endswith('.pdf')\
and isdir(args[2]):
csv_handler = MyCSVHandler(args[0], args[1], args[2], title=title)
return csv_handler.process_files(svg_target_dir, xml_target_dir, error_handler)
else:
not_existing = args[0] if not isdir(args[0]) else args[1]
print("ERROR directory {} does not exist!".format(not_existing))
return 2
list_of_svg = [ svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg') ]
list_of_pdf = [ pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf') ]
converter = Converter(target_dir=svg_target_dir, title=title)
extractor = Extractor(xml_dir=xml_target_dir, title=title, compare2pdf=True)
exit_status = 0
for svgfile in list_of_svg:
if svgfile.replace('.svg', '.pdf') in list_of_pdf:
title = re.split(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)[1].replace('_', ' ')
if extractor.title is None or extractor.title != title:
extractor.update_title_and_manuscript(title)
if converter.title is None or converter.title != title:
converter.title = title.replace(' ', '_')
if 'page' in svgfile:
page_number = svgfile.replace('.svg','').split('page')[1]
else:
page_number = svgfile.replace('.svg','').split('_')[-1]
pdffile = '{}{}{}'.format(pdf_dir, sep, svgfile.replace('.svg', '.pdf'))
if not check_graphic_svg_exists and not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
try:
svgfile = '{}{}{}'.format(svg_dir, sep, svgfile)
exit_status = process_file(converter, extractor, svgfile, pdffile, page_number)
except Exception as err:
error_handler.record_error(svgfile, pdffile, title, page_number, error=err)
if not UNITTESTING:
print(Fore.RED)
print('There was an error ->', err)
print(Style.RESET_ALL)
elif not is_svg_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
update_graphical_svg(converter, svgfile, pdffile, page_number, extractor.get_file_name(svgfile, page_number))
error_handler.write()
return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: svgscripts/datatypes/super_page.py
===================================================================
--- svgscripts/datatypes/super_page.py (revision 112)
+++ svgscripts/datatypes/super_page.py (revision 113)
@@ -1,296 +1,299 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a super page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename, dirname
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import sys
import warnings
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .mark_foreign_hands import MarkForeignHands
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .writing_process import WritingProcess
class SuperPage:
"""
This super class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
xml_target_file (str): name of the xml file to which page info will be written.
"""
FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
ADD2Y = 7
PAGE_RECTO = 'recto'
PAGE_VERSO = 'verso'
STATUS_MERGED_OK = 'faksimile merged'
STATUS_POSTMERGED_OK = 'words processed'
UNITTESTING = False
XML_TAG = 'page'
def __init__(self, xml_file, title=None, page_number='', orientation='North', multipage_index=-1, page_type=PAGE_VERSO, should_xml_file_exist=False):
self.properties_dictionary = {\
'faksimile_image': (FaksimileImage.XML_TAG, None, FaksimileImage),\
'faksimile_svgFile': ('data-source/@file', None, str),\
'multipage_index': ('page/@multipage-index', multipage_index, int),\
'marginals_source': ('page/@marginals-source', None, str),\
'number': ('page/@number', str(page_number), str),\
'orientation': ('page/@orientation', orientation, str),\
'page_type': ('page/@pageType', page_type, str),\
'pdfFile': ('pdf/@file', None, str),\
'source': ('page/@source', None, str),\
+ 'status': ('page/@status', 'blank', str),\
'svg_file': ('svg/@file', None, str),\
'svg_image': (SVGImage.XML_TAG, None, SVGImage),\
'text_field': (FaksimileImage.XML_TAG + '/' + TextField.XML_TAG, None, TextField),\
'title': ('page/@title', title, str),\
}
self.bak_file = None
self.online_properties = []
+ self.editor_comments = []
self.imprints = []
self.line_numbers = []
self.lines = []
self.mark_foreign_hands = []
self.page_tree = None
self.sonderzeichen_list = []
+ self.status = 'blank'
self.style_dict = {}
self.text_connection_marks = []
self.word_deletion_paths = []
self.word_insertion_marks = []
self.words = []
self.writing_processes = []
self.xml_file = xml_file
if not self.is_page_source_xml_file():
msg = f'ERROR: xml_source_file {self.xml_file} is not of type "{FILE_TYPE_SVG_WORD_POSITION}"'
raise Exception(msg)
self._init_tree(should_xml_file_exist=should_xml_file_exist)
def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
"""Adds a list of classes that are sonderzeichen and a style dictionary to page.
"""
self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else []
self.letterspacing_list = letterspacing_list if letterspacing_list is not None else []
self.style_dict = style_dict if style_dict is not None else {}
if style_node is not None:
self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') }
self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\
if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ]
self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\
if bool(item.get('letterspacing-list')) ]
elif bool(self.style_dict):
for node in self.page_tree.xpath('//style'): node.getparent().remove(node)
style_node = ET.SubElement(self.page_tree.getroot(), 'style')
if len(self.sonderzeichen_list) > 0:
style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
if len(self.letterspacing_list) > 0:
style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
for key in self.style_dict.keys():
self.style_dict[key]['name'] = key
ET.SubElement(style_node, 'class', attrib=self.style_dict[key])
fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value }
fontsizes = sorted(fontsize_dict.values(), reverse=True)
# create a mapping between fontsizes and word stages
self.fontsizekey2stage_mapping = {}
for fontsize_key, value in fontsize_dict.items():
if value >= fontsizes[0]-1:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION })
elif value <= fontsizes[-1]+1:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION })
else:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION })
def get_biggest_fontSize4styles(self, style_set={}):
"""Returns biggest font size from style_dict for a set of style class names.
[:returns:] (float) biggest font size OR 1 if style_dict is empty
"""
if bool(self.style_dict):
sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True)
return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1
else:
return 1
def get_line_number(self, y):
"""Returns line number id for element at y.
[:return:] (int) line number id or -1
"""
if len(self.line_numbers) > 0:
result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ]
return result_list[0] if len(result_list) > 0 else -1
else:
return -1
def init_all_properties(self, overwrite=False):
"""Initialize all properties.
"""
for property_key in self.properties_dictionary.keys():
if property_key not in self.online_properties:
self.init_property(property_key, overwrite=overwrite)
def init_property(self, property_key, value=None, overwrite=False):
"""Initialize all properties.
Args:
property_key: key of property in self.__dict__
value: new value to set to property
overwrite: whether or not to update values from xml_file (default: read only)
"""
if value is None:
if property_key not in self.online_properties:
xpath, value, cls = self.properties_dictionary.get(property_key)
if len(self.page_tree.xpath('//' + xpath)) > 0:
value = self.page_tree.xpath('//' + xpath)[0]
if value is not None:
if cls.__module__ == 'builtins':
self.update_tree(value, xpath)
self.__dict__.update({property_key: cls(value)})
else:
value = cls(node=value)\
if type(value) != cls\
else value
self.__dict__.update({property_key: value})
self.__dict__.get(property_key).attach_object_to_tree(self.page_tree)
else:
self.__dict__.update({property_key: value})
self.online_properties.append(property_key)
elif overwrite or property_key not in self.online_properties:
xpath, default_value, cls = self.properties_dictionary.get(property_key)
if cls.__module__ == 'builtins':
self.__dict__.update({property_key: cls(value)})
self.update_tree(value, xpath)
else:
self.__dict__.update({property_key: value})
self.__dict__.get(property_key).attach_object_to_tree(self.page_tree)
self.online_properties.append(property_key)
def is_locked(self):
"""Return true if page is locked.
"""
return len(self.page_tree.xpath('//metadata/lock')) > 0
def is_page_source_xml_file(self, source_tree=None):
"""Return true if xml_file is of type FILE_TYPE_SVG_WORD_POSITION.
"""
if not isfile(self.xml_file):
return True
if source_tree is None:
source_tree = ET.parse(self.xml_file)
return source_tree.getroot().find('metadata/type').text == self.FILE_TYPE_SVG_WORD_POSITION
def lock(self, reference_file, message=''):
"""Lock tree such that ids of words etc. correspond to ids
in reference_file, optionally add a message that will be shown.
"""
if not self.is_locked():
metadata = self.page_tree.xpath('./metadata')[0]\
if len(self.page_tree.xpath('./metadata')) > 0\
else ET.SubElement(self.page_tree.getroot(), 'metadata')
lock = ET.SubElement(metadata, 'lock')
ET.SubElement(lock, 'reference-file').text = reference_file
if message != '':
ET.SubElement(lock, 'message').text = message
def unlock(self):
"""Lock tree such that ids of words etc. correspond to ids
in reference_file, optionally add a message that will be shown.
"""
if self.is_locked():
lock = self.page_tree.xpath('//metadata/lock')[0]
lock.getparent().remove(lock)
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
"""Update word ids and attach them to page.page_tree.
"""
if not self.is_locked():
update_function_on_word = [ update_function_on_word ]\
if type(update_function_on_word) != list\
else update_function_on_word
for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
node.getparent().remove(node)
for index, word in enumerate(self.words):
word.id = index
for func in update_function_on_word:
if callable(func):
func(word)
word.attach_word_to_tree(self.page_tree)
for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
mark_foreign_hands.id = index
if MarkForeignHands in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(mark_foreign_hands)
mark_foreign_hands.attach_word_to_tree(self.page_tree)
for index, text_connection_mark in enumerate(self.text_connection_marks):
text_connection_mark.id = index
if TextConnectionMark in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(text_connection_mark)
text_connection_mark.attach_word_to_tree(self.page_tree)
else:
print('locked')
def update_property_dictionary(self, property_key, default_value):
"""Update properties_dictionary.
"""
content = self.properties_dictionary.get(property_key)
if content is not None:
self.properties_dictionary.update({property_key: (content[0], default_value, content[2])})
else:
msg = f'ERROR: properties_dictionary does not contain a key {property_key}!'
raise Exception(msg)
def update_tree(self, value, xpath):
"""Update tree.
"""
node_name = dirname(xpath)
node = self.page_tree.xpath('//' + node_name)[0]\
if len(self.page_tree.xpath('//' + node_name)) > 0\
else ET.SubElement(self.page_tree.getroot(), node_name)
node.set(basename(xpath).replace('@', ''), str(value))
def _init_tree(self, should_xml_file_exist=False):
"""Initialize page_tree from xml_file if it exists.
"""
if isfile(self.xml_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(self.xml_file, parser)
elif not should_xml_file_exist:
self.page_tree = ET.ElementTree(ET.Element('page'))
self.page_tree.docinfo.URL = self.xml_file
else:
msg = f'ERROR: xml_source_file {self.xml_file} does not exist!'
raise FileNotFoundError(msg)
Index: svgscripts/datatypes/archival_manuscript.py
===================================================================
--- svgscripts/datatypes/archival_manuscript.py (revision 112)
+++ svgscripts/datatypes/archival_manuscript.py (revision 113)
@@ -1,158 +1,198 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent an archival unity of manuscript pages, i.e. workbooks, notebooks, folders of handwritten pages.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
+import re
from os.path import isfile
import sys
from .color import Color
from .description import Description
from .earlier_description import EarlierDescription
from .manuscript import ManuscriptUnity
from .page import Page, FILE_TYPE_XML_MANUSCRIPT, FILE_TYPE_SVG_WORD_POSITION
from .reconstructed_konvolut import ReconstructedKonvolut
sys.path.append('py2ttl')
from class_spec import SemanticClass
sys.path.append('shared_util')
-from myxmlwriter import parse_xml_of_type, write_pretty, xml_has_type
+from myxmlwriter import copy_to_bak_dir, parse_xml_of_type, write_pretty, xml_has_type
class ArchivalManuscriptUnity(ManuscriptUnity):
"""
This class represents an archival unity of manuscript pages (workbooks, notebooks and portfolios of handwritten pages).
@label archival unity of manuscript pages
Args:
title title of archival unity
manuscript_type type of manuscript: 'Arbeitsheft', 'Notizheft', 'Mappe'
manuscript_tree lxml.ElementTree
"""
+ STATUS_COMPLETE = 'complete'
+ STATUS_MIXED = 'mixed'
+ STATUS_BLANK = 'blank'
XML_TAG = 'manuscript'
XML_COLORS_TAG = 'colors'
XML_GSA_PATH = 'signature/gsa'
UNITTESTING = False
- def __init__(self, title='', gsa_signature=None, manuscript_type='', manuscript_tree=None):
+ def __init__(self, title='', gsa_signature=None, manuscript_type='', manuscript_tree=None, status=STATUS_BLANK):
super(ArchivalManuscriptUnity,self).__init__(title=title, manuscript_type=manuscript_type,manuscript_tree=manuscript_tree)
self.colors = []
self.earlier_descriptions = []
self.gsa_signature = gsa_signature
self.reconstructed_konvoluts = []
+ self.status = status
self.styles = []
+ @staticmethod
+ def ADD_ALIASES(xml_manuscript_file, alias_file_name):
+ """Add alias for faksimile image names to pages.
+ """
+ with open(alias_file_name, 'r') as alias_file:
+ lines = alias_file.readlines()
+ manuscript_tree = parse_xml_of_type(xml_manuscript_file, FILE_TYPE_XML_MANUSCRIPT)
+ for line in lines:
+ parts = re.split('\s*->\s*', line)
+ if len(parts) == 2:
+ alias = parts[0]
+ rest = re.split('\s', parts[1])
+ for item in rest:
+ if re.match('\d+[a-z]+', item) and len(manuscript_tree.xpath(f'//pages/page[@number="{item}"]')) > 0:
+ page_node = manuscript_tree.xpath(f'//pages/page[@number="{item}"]')[0]
+ page_node.set('alias', alias)
+ write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_tree.docinfo.URL,\
+ script_name=__file__, backup=True, file_type=FILE_TYPE_XML_MANUSCRIPT)
+
+
@classmethod
def create_cls(cls, xml_manuscript_file, page_status_list=None, page_xpath='', update_page_styles=False):
"""Create an instance of ArchivalManuscriptUnity from a xml file of type FILE_TYPE_XML_MANUSCRIPT.

Args:
xml_manuscript_file file that is handed to the create_cls of the super class
page_status_list list of status strings; if page_xpath is empty, only pages whose status attribute contains every listed string are selected
page_xpath xpath that selects the output files of the pages to include; built from page_status_list if empty
update_page_styles if True, update_styles is called on every page that has an attribute xml_file

:return: ArchivalManuscriptUnity
"""
manuscript = super(ArchivalManuscriptUnity,cls).create_cls(xml_manuscript_file)
manuscript_tree = manuscript.manuscript_tree
manuscript.colors = [ Color.create_cls(node=color_node) for color_node in manuscript_tree.xpath('.//' + cls.XML_COLORS_TAG + '/' + Color.XML_TAG) ]
# Without an explicit page_xpath all pages are selected, optionally restricted
# to those whose status contains every entry of page_status_list.
if page_xpath == '':
page_status = ''
if page_status_list is not None\
and type(page_status_list) is list\
and len(page_status_list) > 0:
page_status = '[' + ' and '.join([ f'contains(@status, "{status}")' for status in page_status_list ]) + ']'
page_xpath = f'//pages/page{page_status}/@output'
# Status of the manuscript: pages with status "blank" and pages with status "OK"
# -> mixed, only "OK" pages -> complete, otherwise the status stays unchanged.
+ hasBlanks = len(manuscript_tree.xpath('//pages/page[contains(@status,"blank")]')) > 0
+ hasComplete = len(manuscript_tree.xpath('//pages/page[contains(@status,"OK")]')) > 0
+ if hasBlanks and hasComplete:
+ manuscript.status = cls.STATUS_MIXED
+ elif hasComplete:
+ manuscript.status = cls.STATUS_COMPLETE
included_page_list = [ page_source\
for page_source in manuscript_tree.xpath(page_xpath)\
if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ]
- manuscript.pages = [ Page.create_cls(page_source, create_dummy_page=(page_source not in included_page_list))\
+ blank_list = []
+ if hasBlanks:
+ blank_list = [ page_source for page_source in manuscript_tree.xpath('//pages/page[contains(@status,"blank")]/@output')\
+ if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ]
# Every existing page file of type FILE_TYPE_SVG_WORD_POSITION becomes a Page;
# pages that are neither selected by page_xpath nor blank are created as dummy pages.
+ manuscript.pages = [ Page.create_cls(page_source,\
+ isBlank=(page_source in blank_list),\
+ create_dummy_page=(page_source not in included_page_list and page_source not in blank_list))\
for page_source in manuscript_tree.xpath('//pages/page/@output')\
if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ]
if update_page_styles:
for page in manuscript.pages:
# NOTE(review): presumably dummy pages lack the attribute xml_file - confirm against Page.create_cls.
if 'xml_file' in page.__dict__.keys():
page.update_styles(manuscript=manuscript, add_to_parents=True, create_css=True)
description_node = manuscript_tree.xpath(Description.XML_TAG)[0]\
if len(manuscript_tree.xpath(Description.XML_TAG)) > 0\
else None
if description_node is not None:
manuscript.description = Description.create_cls_from_node(description_node.xpath(Description.ROOT_TAG)[0])\
if len(description_node.xpath(Description.ROOT_TAG)) > 0\
else None
for earlier_description_node in description_node.xpath(EarlierDescription.ROOT_TAG):
earlier_description = EarlierDescription.create_cls_from_node(earlier_description_node)
if earlier_description is not None:
manuscript.earlier_descriptions.append(earlier_description)
# The reconstructed konvoluts are created with the same page selection as this manuscript.
manuscript.reconstructed_konvoluts = [ ReconstructedKonvolut.create_cls(rk_node.get('output'), page_status_list=page_status_list, page_xpath=page_xpath)\
for rk_node in manuscript_tree.xpath(ReconstructedKonvolut.XML_TAG) ]
manuscript.gsa_signature = manuscript.manuscript_tree.xpath(f'//{cls.XML_GSA_PATH}')[0].text\
if len(manuscript.manuscript_tree.xpath(f'//{cls.XML_GSA_PATH}')) > 0\
else None
return manuscript
def get_color(self, hex_color) -> "Color":
    """Return the first manuscript color with this hex_color.

    :param hex_color: value that is compared with the hex_color of each color
    :return: the first matching color of self.colors or None if there is none
    """
    # Single pass that stops at the first hit instead of building two lists.
    return next((color for color in self.colors if color.hex_color == hex_color), None)
@classmethod
def get_semantic_dictionary(cls):
    """Create the semantic dictionary of this class as specified by SemanticClass.

    The dictionary of the super class is extended by the properties status,
    styles, gsa_signature, reconstructed_konvoluts and earlier_descriptions.
    """
    dictionary = super(ArchivalManuscriptUnity,cls).get_semantic_dictionary()
    properties = dictionary[cls.PROPERTIES_KEY]
    status_property = cls.create_semantic_property_dictionary('status', str,\
            name='archivalManuscriptUnityHasDataProcessingStatus', label='status of data processing',\
            comment='The status of the data processing of this archival manuscript unity')
    properties.update(status_property)
    styles_property = cls.create_semantic_property_dictionary('styles', list)
    properties.update(styles_property)
    signature_property = cls.create_semantic_property_dictionary('gsa_signature', str)
    properties.update(signature_property)
    konvolut_property = cls.create_semantic_property_dictionary('reconstructed_konvoluts', ReconstructedKonvolut,\
            name='partsBelongToReconstructedKonvolut',label='parts of manuscript belong to reconstructed convolut',\
            comment='Some of the pages of this manuscript belong to a reconstructed convolut of pages.')
    properties.update(konvolut_property)
    description_property = cls.create_semantic_property_dictionary('earlier_descriptions', EarlierDescription)
    properties.update(description_property)
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def update_colors(self, color):
    """Add color to the manuscript colors unless a color with the same hex_color exists.

    If a color has been added and there is a manuscript tree, the colors node
    of the tree is rebuilt from self.colors and the tree is written to its
    file (the writing is skipped if UNITTESTING is set).
    """
    if self.get_color(color.hex_color) is not None:
        return
    self.colors.append(color)
    if self.manuscript_tree is None:
        return
    colors_xpath = './/' + self.XML_COLORS_TAG
    # Only the first existing colors node is replaced.
    for old_colors_node in self.manuscript_tree.xpath(colors_xpath)[:1]:
        old_colors_node.getparent().remove(old_colors_node)
    colors_node = ET.SubElement(self.manuscript_tree.getroot(), self.XML_COLORS_TAG)
    for manuscript_color in self.colors:
        manuscript_color.attach_object_to_tree(colors_node)
    if self.UNITTESTING:
        return
    write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_tree.docinfo.URL,\
            script_name=__file__, backup=True,\
            file_type=FILE_TYPE_XML_MANUSCRIPT)
def update_styles(self, *styles):
    """Append every given style to the manuscript styles that is not yet contained.
    """
    for candidate in styles:
        if candidate in self.styles:
            continue
        self.styles.append(candidate)
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py (revision 112)
+++ svgscripts/datatypes/word.py (revision 113)
@@ -1,921 +1,942 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
import inspect
from lxml import etree as ET
from operator import attrgetter
import re
import string
import sys
import warnings
from .box import Box
from .editor_comment import EditorComment
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .style import Style
+from .standoff_tag import StandoffTag
from .word_deletion_path import WordDeletionPath
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess
# Matches a text that consists of exactly one punctuation character (or an en dash).
# The punctuation is escaped: unescaped, its "\]" is read as an escaped bracket
# inside the character class, so a single backslash would never match.
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(re.escape(string.punctuation))
def execute_function_on_parts(word_parts, func_name):
    """Execute a method on all parts and replace each part by the parts it got thereby.

    The method named func_name is called without arguments on every word of
    word_parts. If a word has word_parts afterwards, these parts take the
    place of the word in the returned list and word.word_parts is emptied.

    :param word_parts: list of words; the list itself is not changed
    :param func_name: name of the method that is called on each word
    :return: new word_parts, output from the last call (None if word_parts is empty)
    """
    copy_parts = word_parts[:]
    # Stays None for empty input instead of raising an UnboundLocalError.
    output = None
    for word in word_parts:
        # Attribute lookup instead of eval: func_name is never executed as code.
        output = getattr(word, func_name)()
        if len(word.word_parts) > 0:
            for part_word in word.word_parts:
                copy_parts.insert(copy_parts.index(word), part_word)
            copy_parts.remove(word)
            word.word_parts = []
    return copy_parts, output
def update_transkription_position_ids(word):
    """Renumber the transkription positions of word from left to right.

    If the ids of the word parts are not unique, the parts are renumbered
    according to their index first. Each transkription position gets its rank
    by 'left' as id; has_box is reset to None and deleted to False.
    """
    part_ids = [ part.id for part in word.word_parts ]
    has_duplicate_ids = len(set(part_ids)) != len(part_ids)
    if has_duplicate_ids:
        for part_index, part in enumerate(word.word_parts):
            part.id = part_index
    positions_from_left = sorted(word.transkription_positions, key=attrgetter('left'))
    for rank, position in enumerate(positions_from_left):
        position.id = rank
        position.has_box = None
        position.deleted = False
class Word(SimpleWord):
"""
This class represents a word.
"""
COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ]
APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' }
# Tag of the node with debug data; CREATE_WORD reads its 'part' children.
DATA = 'debug-data'
RDFS_SUBCLASSOF_LIST = ['https://www.e-editiones.ch/ontology/text#HandwrittenText']
XML_TAG = 'word'
# Tags of the child nodes that contain the earlier version and the overwritten word.
XML_EARLIER_VERSION = 'earlier-version'
XML_OVERWRITES = 'overwrites'
# Maps the correction properties of a word (keys) to the xml attributes (values)
# that attach_word_to_tree sets to 'true' and that create_cls reads.
# Revision 113 adds 'undosCorrectionOfWord'.
XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\
'isDeletionOfWord': 'deletesEarlierPart',\
'isExtensionOfWord': 'extendsEarlierVersion',\
- 'isTransformationOfWord': 'transformsEarlierPart' }
+ 'isTransformationOfWord': 'transformsEarlierPart',\
+ 'undosCorrectionOfWord': 'undosCorrectionOfPart'}
# id, text, line_number and the positions are handed on to the super class.
# NOTE(review): box_paths is accepted but not used in this constructor.
def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None):
super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
faksimile_positions=faksimile_positions)
self.corrections = []
self.clean_edited_text = None
self.deleted = deleted
self.deletion_paths = []
self.deletion_paths_near_word = []
self.debug_container = {}
self.debug_msg = None
self.earlier_version = earlier_version
self.edited_text = None
#self.editor_comment = None
self.editor_comments = []
# One attribute per key of XML_CORRECTION_DICT; None means "no such correction".
self.isClarificationOfWord = None
self.isDeletionOfWord = None
self.isExtensionOfWord = None
self.isTransformationOfWord = None
+ self.undosCorrectionOfWord = None
# A word without text takes the concatenated text of its transkription positions,
# provided that those of type TranskriptionPosition yield any text.
if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0:
self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ])
self.overwrites_word = None
+ self.markup4partThatOverwritesWord = None
self.process_flags = []
self.styles = styles\
if styles is not None\
else []
self.undeleted_from_deletion_paths = []
self.verified = None
self.writing_process_id = writing_process_id
self.writing_processes = []
self.word_insertion_mark = None
self.word_box = None
# None defaults are replaced by fresh lists, so instances never share a list.
self.word_parts = word_parts if word_parts is not None else []
self.word_part_objs = word_part_objs if word_part_objs is not None else []
def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Add those deletion paths to the word that intersect with it.

    A word with word parts delegates to its parts. A deleted word without
    parts goes through its transkription positions until one of them yields
    at least one intersecting deletion path; any other word is left untouched.
    """
    if len(self.word_parts) > 0:
        for part in self.word_parts:
            part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        return
    if not self.deleted:
        return
    for position in self.transkription_positions:
        # Stop as soon as a previous position has produced deletion paths.
        if len(self.deletion_paths) != 0:
            break
        include_pwps = (len(position.positional_word_parts) > 0
                and abs(position.left-position.positional_word_parts[0].left) < 10)
        word_path = Path.create_path_from_transkription_position(position,\
                tr_xmin=tr_xmin, tr_ymin=tr_ymin, include_pwps=include_pwps)
        self.deletion_paths += [ candidate for candidate in deletion_paths\
                if not Path.is_path_contained(self.deletion_paths, candidate)\
                and candidate.do_paths_intersect(word_path) ]
def attach_word_to_tree(self, target_tree):
    """Attach word to tree target_tree.

    The node is created by the super class; this method adds the attributes
    and child nodes for deletion, verification, edited text, editor comments,
    writing process, process flags, word parts, earlier version, overwritten
    word, word box, corrections, deletion paths and correction properties.

    :param target_tree: tree or node the word node is attached to
    :return: the word node
    """
    word_node = super(Word,self).attach_word_to_tree(target_tree)
    if self.deleted is not None:
        word_node.set('deleted', str(self.deleted).lower())
    if len(self.undeleted_from_deletion_paths) > 0:
        word_node.set('undeleted', 'true')
    if self.verified is not None:
        word_node.set('verified', str(self.verified).lower())
    if self.edited_text is not None:
        word_node.set('edited-text', self.edited_text)
    for editor_comment in self.editor_comments:
        editor_comment.attach_object_to_tree(word_node)
    if self.writing_process_id > -1:
        word_node.set('writing-process-id', str(self.writing_process_id))
    if len(self.process_flags) > 0:
        word_node.set('process-flags', ' '.join(self.process_flags))
    # The parts are renumbered so that their ids correspond to their index.
    for index, word_part in enumerate(self.word_parts):
        word_part.id = index
        word_part.attach_word_to_tree(word_node)
    if self.earlier_version is not None:
        earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
        self.earlier_version.attach_word_to_tree(earlier_node)
    if self.overwrites_word is not None\
            and len(self.overwrites_word.transkription_positions) > 0:
        overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES)
        self.overwrites_word.attach_word_to_tree(overwrite_node)
    if self.word_box is not None:
        self.word_box.attach_object_to_tree(word_node)
    if len(self.corrections) > 0:
        # dict.fromkeys removes duplicates but keeps the order of the ids, so
        # the attribute is the same on every run (a set has no stable order).
        correction_ids = dict.fromkeys(str(word.id) for word in self.corrections)
        word_node.set('corrections', ' '.join(correction_ids))
    for deletion_id, deletion_path in enumerate(self.deletion_paths):
        deletion_path.id = deletion_id
        deletion_path.tag = WordDeletionPath.XML_TAG
        deletion_path.attach_object_to_tree(word_node)
    for key in self.XML_CORRECTION_DICT.keys():
        if self.__dict__[key] is not None:
            word_node.set(self.XML_CORRECTION_DICT[key], 'true')
    return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
    """Return True if the word is spread over more than one writing process.

    With include_parts and existing word parts the writing_process_ids of the
    parts are compared, otherwise those of the transkription positions.
    """
    if len(self.word_parts) > 0 and include_parts:
        carriers = self.word_parts
    else:
        carriers = self.transkription_positions
    distinct_ids = { carrier.writing_process_id for carrier in carriers }
    return len(distinct_ids) > 1
def set_parent_word_writing_process_id(self):
"""Set writing_process_id for parent word.

The styles of the first transkription position of each word part are compared:
if there is more than one distinct style, the word gets the highest
writing_process_id of these styles. If the copies of these styles without
writing process id still differ, the writing_process_id is incremented by one.
"""
# Despite its name, ids is a set of styles (parts without positions or style are skipped).
ids = set(word.transkription_positions[0].style for word in self.word_parts\
if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None)
if len(ids) > 1:
self.writing_process_id = max([style.writing_process_id for style in ids])
# NOTE(review): distinctness of styles and of their copies depends on the
# __eq__/__hash__ of Style - confirm that Style defines both.
if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\
for word in self.word_parts\
if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\
> 1:
self.writing_process_id += 1
@classmethod
def create_cls(cls, word_node):
"""Creates a word from a (lxml.Element) node.

The word is created by the create_cls of the super class and completed with
the attributes and child nodes that attach_word_to_tree writes (word parts,
corrections, earlier version, overwritten word, word box, deletion paths ...).

Args:
word_node (lxml.Element) node of the word

Raises:
Exception if the joined 'split' attribute differs from the text of the word

[:return:] Word
"""
# From here on cls refers to the object returned by the super class,
# no longer to the class (NOTE(review): presumably a Word instance - confirm).
cls = super(Word,cls).create_cls(word_node)
cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
cls.split_strings = None
cls.join_string = word_node.get('join')
if bool(word_node.get('split')):
cls.split_strings = word_node.get('split').split(' ')
if ''.join(cls.split_strings) != cls.text:
error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
format(word_node.getroottree().docinfo.URL, str(cls.id))\
+ 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
+ 'Text attribute: "{0}".\n'.format(cls.text)
raise Exception(error_msg)
# verified and deleted are None if the attribute is missing or empty.
cls.verified = word_node.get('verified') == 'true'\
if bool(word_node.get('verified')) else None
cls.deleted = word_node.get('deleted') == 'true'\
if bool(word_node.get('deleted')) else None
cls.edited_text = word_node.get('edited-text')
if cls.edited_text is not None:
cls.clean_edited_text = cls._create_clean_text(cls.edited_text)
cls.editor_comments = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ]
cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ]
# 'corrections' contains space separated indices of word_parts. Revision 113
# turns any error while reading them (e.g. a non numeric entry) into a warning.
if bool(word_node.get('corrections')):
- for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
- if index < len(cls.word_parts):
- cls.corrections.append(cls.word_parts[index])
+ try:
+ for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
+ if index < len(cls.word_parts):
+ cls.corrections.append(cls.word_parts[index])
+ except Exception:
+ corrections = word_node.get('corrections')
+ msg = f'Word {cls.id} ln: {cls.line_number} "{cls.text}": There has been an error with the corrections of this cls: "{corrections}"!'
+ warnings.warn(msg)
cls.earlier_version = None
if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0:
cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0]
# A correction attribute that is 'true' is stored as a flag under the name of
# the xml attribute (the value of XML_CORRECTION_DICT, not its key).
for key_value in cls.XML_CORRECTION_DICT.values():
if word_node.get(key_value) == 'true':
cls.__dict__[key_value] = True
# The flags of the word parts are resolved to objects, stored under the property
# name (the key of XML_CORRECTION_DICT): attributes ending with 'Part' refer to
# the part of the earlier version with the same id, attributes ending with
# 'EarlierVersion' to the earlier version, attributes ending with 'Word' to this word.
if cls.earlier_version is not None:
for word_part in cls.word_parts:
for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]:
if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\
and len(cls.word_parts) <= len(cls.earlier_version.word_parts):
try:
word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id]
except Exception:
msg = f'{cls.id} {cls.text}: {word_part.id}'
raise Exception(msg)
for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]:
if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
word_part.__dict__[key] = cls.earlier_version
for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]:
if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
word_part.__dict__[key] = cls
cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\
if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\
else None
# New in revision 113: markup from the first 'bold' child of the overwrites node.
+ if cls.overwrites_word is not None:
+ cls.markup4partThatOverwritesWord = [ StandoffTag.create_cls_from_node(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/bold')][0]\
+ if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/bold')) > 0\
+ else None
cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\
if len(word_node.xpath('./' + Box.XML_TAG)) > 0\
else None
cls.deletion_paths = [ Path(node=node) for node in word_node.xpath(f'./{WordDeletionPath.XML_TAG}') ]
# Any non-empty 'undeleted' attribute makes all deletion paths paths the word has been undeleted from.
if bool(word_node.get('undeleted')):
cls.undeleted_from_deletion_paths = cls.deletion_paths
cls.process_flags = word_node.get('process-flags').split(' ')\
if bool(word_node.get('process-flags'))\
else []
return cls
@classmethod
def join_words(cls, list_of_words, add_white_space_between_words=False):
"""Creates a word from a list of words.

Args:
list_of_words words to join; words that have word parts are replaced by their parts inside this list
add_white_space_between_words if True, the texts are joined with a blank

A list with more than one word results in a new word that has the (flattened)
words as its word parts; a list with one word returns that word, an empty list None.

[:return:] Word
"""
if len(list_of_words) > 1:
# The new word is deleted only if every word is deleted.
deleted = True in [ word.deleted for word in list_of_words ]\
and len(set([ word.deleted for word in list_of_words ])) == 1
line_number = list_of_words[0].line_number\
if len(set([ word.line_number for word in list_of_words ])) == 1\
else -1
faksimile_positions = []
# NOTE(review): list_of_words is changed while it is iterated and the list of
# the caller is changed thereby; the list becomes the word_parts of the new word.
for word in list_of_words:
if len(word.word_parts) > 0:
faksimile_positions += word.faksimile_positions
index = list_of_words.index(word)
list_of_words.remove(word)
for part_word in reversed(word.word_parts):
list_of_words.insert(index, part_word)
new_word_text = ''.join([word.text for word in list_of_words])\
if not add_white_space_between_words\
else ' '.join([word.text for word in list_of_words])
new_word = cls(id=list_of_words[0].id, text=new_word_text, faksimile_positions=faksimile_positions,\
line_number=line_number, deleted=deleted, word_parts=list_of_words)
# If a part other than the last ends with '-' or '=', the edited text is the
# text without the last character of the first such part.
if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]:
change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0]
new_word.edited_text = new_word.text.replace(change_text, change_text[:-1])
for id, word in enumerate(new_word.word_parts): word.id = id
return new_word
if len(list_of_words) > 0:
return list_of_words[0]
else:
return None
def create_earlier_version(self, root_word=None, id=0):
"""Create an earlier version of word.

Args:
root_word word the recursion started with (this word if None); its writing_process_id is set before the parts are processed
id id of the word that is returned

The earlier versions of the word parts are created recursively. Parts that
delete, transform or extend the earlier version are added to self.corrections.

:return: Word with a copy of the transkription positions of this word
"""
if root_word is None:
root_word = self
root_word.set_parent_word_writing_process_id()
word_parts = []
non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\
if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ]
non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts)
# If all parts that are not a single punctuation mark are deleted, the word
# itself counts as deleted and its parts do not.
if non_single_punctuation_word_parts_length > 0\
and len([ word_part for word_part in non_single_punctuation_word_parts\
if word_part.deleted ])\
== non_single_punctuation_word_parts_length:
self.deleted = True
for word_part in non_single_punctuation_word_parts: word_part.deleted = False
for id, word_part in enumerate(self.word_parts):
earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id)
# Deleted part: the part deletes its own earlier version.
if word_part.deleted:
word_part.isDeletionOfWord = earlierWordPart
word_parts.append(earlierWordPart)
if word_part not in self.corrections:
self.corrections.append(word_part)
# Part that overwrites a word in another style (or whose box has an earlier
# version): the overwritten word takes the place of the part.
# NOTE(review): word_part.word_box is accessed without a check for None - confirm
# that a part with overwrites_word always has a word_box.
elif word_part.overwrites_word is not None\
and ((len(word_part.transkription_positions) > 0\
and word_part.overwrites_word.transkription_positions[0].style is not None\
and word_part.transkription_positions[0].style is not None\
and word_part.transkription_positions[0].style\
!= word_part.overwrites_word.transkription_positions[0].style)
or word_part.word_box.earlier_version):
word_part.overwrites_word.id = word_part.id
word_parts.append(word_part.overwrites_word)
word_part.isTransformationOfWord = word_part.overwrites_word
#print(f'transform: {self.text}')
if word_part not in self.corrections:
self.corrections.append(word_part)
# Part written in the writing process of the root word: it extends the earlier
# version and therefore is not part of it.
elif root_word.writing_process_id > -1\
and (len(word_part.transkription_positions) > 0\
and word_part.transkription_positions[0].style is not None\
and word_part.transkription_positions[0].style.writing_process_id\
== root_word.writing_process_id):
word_part.extendsEarlierVersion = True
#print('extends')
if word_part not in self.corrections:
self.corrections.append(word_part)
else:
# NOTE(review): deleted parts are already handled by the first branch, so this
# test appears to be always False here.
if word_part.deleted:
word_part.isDeletionOfWord = earlierWordPart
word_parts.append(earlierWordPart)
if word_part not in self.corrections:
self.corrections.append(word_part)
else:
#print(f'default: {self.text}')
word_parts.append(earlierWordPart)
text = ''.join([ word.text for word in word_parts ])\
if len(word_parts) > 0\
else self.text
# A single earlier part is merged into the positions of this word instead of being kept as a part.
if len(word_parts) == 1:
self.transkription_positions += word_parts[0].transkription_positions
self.faksimile_positions += word_parts[0].faksimile_positions
word_parts = []
# The copies get the writing_process_id of the style of the first position.
new_transkription_positions = copy.deepcopy(self.transkription_positions)
if len(self.transkription_positions) > 0\
and self.transkription_positions[0].style is not None:
writing_process_id = self.transkription_positions[0].style.writing_process_id
for new_tp in new_transkription_positions:
new_tp.style.writing_process_id = writing_process_id
return Word(id=id, text=text, transkription_positions=new_transkription_positions,\
faksimile_positions=self.faksimile_positions, line_number=self.line_number,\
word_parts=word_parts)
def create_correction_history(self, page=None, box_style=None):
"""Create correction history.

Args:
page page that is used to create the style of the overwritten word; takes precedence over box_style
box_style style of the overwritten word if there is no page

A word with a word box gets an overwritten word (overwrites_word) that has the
earlier text of the box. The correction history of the word parts is created
recursively; a word with parts gets an earlier version if it has corrections.
"""
if self.word_box is not None:
manuscript = self.transkription_positions[0].style.manuscript\
if len(self.transkription_positions) > 0\
and self.transkription_positions[0].style is not None\
else None
style = Style()
if box_style is not None:
style = box_style
if page is not None:
style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript)
# If several font keys of the box are mapped to a stage, the last one determines the writing process.
for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]:
style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
# The overwritten word gets copies of the positions of this word, all with the style of the box.
transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions)
for transkription_position in transkription_positions:
transkription_position.style = style
self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\
line_number=self.line_number)
for word_part in self.word_parts:
word_part.create_correction_history(page=page, box_style=box_style)
if len(self.word_parts) > 0:
earlier_version = self.create_earlier_version()
extending_words = self._get_parts_with_property_key('extendsEarlierVersion')
if len(extending_words) > 0:
for word in extending_words:
word.isExtensionOfWord = earlier_version
# If only some of the parts are deleted, the edited text consists of the undeleted parts.
if self.has_mixed_status('deleted', include_parts=True):
self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ])
if len(self.corrections) > 0:
self.earlier_version = earlier_version
@staticmethod
# NOTE(review): word_part_objs=[] is a mutable default; it is only read in this
# function, a default of None would nevertheless be the safer choice.
def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
"""Creates a word from a (lxml.Element) node or word_part_objs.

Args:
word_node (lxml.Element) node of a word; takes precedence over word_part_objs
page page that provides style_dict, the biggest font size and the line number
word_part_objs list of dictionaries with the keys 'text', 'x', 'y' and 'class'
id id of the word (ignored if word_node is given)
height height of the word; recalculated if page has a style_dict
endX x coordinate of the end of the word
endSign sign at the end of the word; a '%' in it enlarges endX
matrix matrix that is handed to the transkription position
line_number line number that is used if neither word_node nor page provide one
debug_msg message that is stored on the word

Raises:
Exception if word_node is None and word_part_objs is empty

[:return:] Word
"""
if word_node is not None: # init word from xml node
id = int(word_node.get('id'))
line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
text = word_node.get('text')
deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
# The parts of the debug-data node are preferred, if there is such a node.
word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
if len(word_node.findall('.//' + Word.DATA)) > 0\
else [ item.attrib for item in word_node.findall('.//part')]
return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
WIDTH = 5
TOPCORRECTION = 2.0
FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
height = height
x = round(float(word_part_objs[0]['x']), 3)
# With the styles of a page, height and top correction depend on the biggest
# font size of the parts; a '%' in endSign enlarges endX by the scaled font size
# of the last part, without styles by the constant WIDTH.
if(page is not None and bool(page.style_dict)):
HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
if endSign is not None and '%' in endSign:
lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
if bool(page.style_dict[key].get('font-size'))]
lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
elif endSign is not None and '%' in endSign:
endX = float(endX) + WIDTH
# The y coordinate of the first part is the bottom of the word.
bottom = round(float(word_part_objs[0]['y']), 3)
y = round(bottom - height + TOPCORRECTION, 3)
width = round(float(endX) - x, 3)
transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
text = ''.join([ dict['text'] for dict in word_part_objs])
line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
word.debug_msg = debug_msg
return word
else:
error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
raise Exception('Error: {}'.format(error_msg))
@classmethod
def get_semantic_dictionary(cls):
    """Create and return the semantic dictionary of Word as specified by SemanticClass.

    The properties are registered in the order of the table below; afterwards
    every correction property (the keys of XML_CORRECTION_DICT) is updated with
    the super property 'isCorrectionOfWord'.
    """
    dictionary = super(Word,cls).get_semantic_dictionary()
    properties = dictionary[cls.PROPERTIES_KEY]
    # (property key, type of the property, keyword arguments)
    property_table = [
        ('styles', Style, dict(cardinality=1, cardinality_restriction='minCardinality',
            name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.')),
        ('corrections', Word, dict(
            name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.')),
        ('deletion_paths', WordDeletionPath, dict(
            name='wordIsDeletedByPath', label='word has been deleted with a deletion path',
            comment='Word has been deleted by the author using a deletion path.')),
        ('undeleted_from_deletion_paths', WordDeletionPath, dict(
            name='wordIsUndeletedFromPath', label='word has been undeleted',
            comment='Word has been undeleted by the author using dots.')),
        ('editor_comments', EditorComment, dict(
            name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.')),
        ('earlier_version', Word, dict(
            name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.')),
        ('edited_text', str, dict(
            name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')),
        ('clean_edited_text', str, dict(
            name='hasCleanEditedText', label='word has an edited text without punctuation',
            comment='Word has a text without punctuation that is edited automatically by removing deleted parts or hyphens.')),
        ('isClarificationOfWord', Word, dict(
            name='isClarificationOfWord', label='word is a clarification of word',
            comment='The author has used this part of the word in order to clarify the appearance of that word.')),
        ('isDeletionOfWord', Word, dict(
            name='isDeletionOfWord', label='word is a deletion of word',
            comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')),
        ('isExtensionOfWord', Word, dict(
            name='isExtensionOfWord', label='word is a extension of word',
            comment='The author has used this part of a word in order to extend an earlier version of this word.')),
        ('isTransformationOfWord', Word, dict(
            name='isTransformationOfWord', label='word is a transformation of word',
            comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')),
        ('undosCorrectionOfWord', Word, dict(
            name='undosCorrectionOfWord', label='word undos the correction done by word',
            comment='The author has used this part of a word in order to undo the correction done by that word.')),
        ('overwrites_word', Word, dict(
            name='overwritesWord', label='word overwrites word',
            comment='The author has used this word in order to overwrite that word.')),
        ('markup4partThatOverwritesWord', StandoffTag, dict(
            name='hasStandoffMarkup4PartThatOverwritesWord', label='word has standoff markup for the part that overwrites a word',
            comment='word has standoff markup that highlights the part of its text that overwrites a word.')),
        # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING,
        # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class.
        ('word_parts', list, dict(
            name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',
            subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)),
    ]
    for property_key, property_type, options in property_table:
        properties.update(cls.create_semantic_property_dictionary(property_key, property_type, **options))
    super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\
            name='isCorrectionOfWord', label='word is a correction of word',\
            comment='The author has used this word in order to correct that word.')
    for correction_key in cls.XML_CORRECTION_DICT.keys():
        correction_dict = properties.get(correction_key)
        correction_dict.update(super_property_dictionary)
        properties.update({correction_key: correction_dict})
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
    """Tell whether ``property_key`` takes more than one value across this word.

    Without ``include_parts`` (or if the word has no word_parts) the values found in
    the ``__dict__`` of the word's transkription_positions are compared.
    With ``include_parts`` and existing word_parts, the parts are compared instead:
    their own ``__dict__`` entries if ``concerns_word`` is true, otherwise the entries
    of the first transkription_position of every part that has one and knows the key.

    :return: True if more than one distinct value was found. False otherwise, and
        always False if a transkription_position (or, with ``concerns_word``,
        a word part) does not have ``property_key`` in its ``__dict__``.
    """
    def lacks_key(objects):
        # True as soon as one object does not carry the property at all.
        return any(property_key not in obj.__dict__ for obj in objects)

    if lacks_key(self.transkription_positions):
        return False
    if len(self.word_parts) > 0 and include_parts:
        if concerns_word:
            if lacks_key(self.word_parts):
                return False
            return len({part.__dict__[property_key] for part in self.word_parts}) > 1
        first_positions = [part.transkription_positions[0] for part in self.word_parts
                           if len(part.transkription_positions) > 0]
        position_values = {position.__dict__[property_key] for position in first_positions
                           if property_key in position.__dict__}
        return len(position_values) > 1
    return len({position.__dict__[property_key] for position in self.transkription_positions}) > 1
def init_word(self, page):
    """Connect this word, its parts and its related words with the objects of ``page``.

    After the super class initialization, the writing process matching
    ``writing_process_id`` is added, every word part is initialized and its lines and
    writing processes are collected (duplicates are removed afterwards).
    The overwritten word and the earlier version are initialized as well, and the
    word's deletion paths are replaced by the deletion paths of the page.

    :param page: the page this word belongs to.
    """
    super(Word, self).init_word(page)
    if self.writing_process_id > -1:
        self.writing_processes += [process for process in page.writing_processes
                                   if process.id == self.writing_process_id]
    for part in self.word_parts:
        part.init_word(page)
        self.lines += part.lines
        self.writing_processes += part.writing_processes
    # Parts may share lines/writing processes with the word: keep each object once.
    self.lines = list(set(self.lines))
    self.writing_processes = list(set(self.writing_processes))
    if self.overwrites_word is not None:
        self.overwrites_word.init_word(page)
    earlier = self.earlier_version
    if earlier is not None:
        # An earlier version without own data inherits from this word:
        # the preceding writing process and the same line.
        if earlier.writing_process_id == -1:
            earlier.writing_process_id = self.writing_process_id-1
        if earlier.line_number == -1:
            earlier.line_number = self.line_number
        earlier.init_word(page)
    self.deletion_paths = [page.get_word_deletion_path(path)
                           for path in self.deletion_paths
                           if path.path is not None]
def join(self, other_word, append_at_end_of_new_word=True, add_white_space_between_words=False):
    """Join ``other_word`` with this word.

    The text of this word is extended by the text of ``other_word`` and the
    transkription_positions and faksimile_positions of ``other_word`` are added to
    the corresponding lists of this word. Finally the transkription_positions are
    simplified.

    :param other_word: the word that is merged into this word.
    :param append_at_end_of_new_word: if True, text and positions of ``other_word``
        are appended, otherwise they are put in front of this word's text and positions.
    :param add_white_space_between_words: if True, a blank separates the two texts.
        NOTE(review): only honoured when appending; the prepend branch ignores it
        (behaviour kept as is) -- confirm whether that is intended.
    """
    def prepend(own_positions, new_positions):
        # Put new_positions in front (keeping their order and ids) and
        # renumber the positions that were shifted to the back.
        index = 0
        for position in new_positions:
            own_positions.insert(index, position)
            index += 1
        while index < len(own_positions):
            own_positions[index].id = str(index)
            index += 1

    if append_at_end_of_new_word:
        separator = ' ' if add_white_space_between_words else ''
        self.text = self.text + separator + other_word.text
        for position in other_word.transkription_positions:
            position.id = str(len(self.transkription_positions))
            self.transkription_positions.append(position)
        for position in other_word.faksimile_positions:
            position.id = str(len(self.faksimile_positions))
            self.faksimile_positions.append(position)
    else:
        self.text = other_word.text + self.text
        prepend(self.transkription_positions, other_word.transkription_positions)
        # Fixed: the original called insert(indexposition), an undefined name,
        # which raised a NameError as soon as other_word had faksimile positions.
        prepend(self.faksimile_positions, other_word.faksimile_positions)
    self.simplify_transkription_positions()
def partition_according_to_deletion(self):
    """Partition a word according to the deletion status of its transkription_positions.

    - Mixed status: consecutive positions with the same ``deleted`` value become a
      new partial word (with the deletion paths of its positions) that is appended
      to ``word_parts``; the word itself loses its positions, its line number (-1)
      and its deleted flag.
    - Word with parts: the partitioning is delegated to the parts.
    - Otherwise: a not yet deleted word whose first position is deleted is marked
      as deleted and collects the deletion paths of its positions.
    """
    if self.has_mixed_status('deleted'):
        def append_part(positions, status):
            # The id of the new part is its index in word_parts.
            part = Word(id=len(self.word_parts), line_number=self.line_number,
                        transkription_positions=positions, deleted=status,
                        writing_process_id=self.writing_process_id)
            for position in positions:
                part.deletion_paths += position._deletion_paths
            self.word_parts.append(part)

        pending = []
        previous_status = None
        for current in self.transkription_positions:
            if current.deleted != previous_status and len(pending) > 0:
                append_part(pending, previous_status)
                pending = []
            pending.append(current)
            previous_status = current.deleted
        if len(pending) > 0:
            append_part(pending, previous_status)
        self.transkription_positions = []
        self.line_number = -1
        self.deleted = False
    elif len(self.word_parts) > 0:
        self.word_parts, _ = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
    elif not self.deleted\
            and len(self.transkription_positions) > 0\
            and self.transkription_positions[0].deleted:
        self.deleted = True
        for position in self.transkription_positions:
            self.deletion_paths += position._deletion_paths
def partition_according_to_writing_process_id(self):
    """Partition a word according to the writing_process_ids of its transkription_positions.

    If the positions belong to several writing processes, consecutive positions with
    the same id become a new partial word in ``word_parts`` and the word loses its
    own positions. A word that already has parts delegates the partitioning to them.
    Afterwards the word's ``writing_process_id`` is set: to the highest id of its
    parts if these belong to several writing processes, else to the id of its first
    transkription_position (if there is one).
    """
    if self.belongs_to_multiple_writing_processes():
        def append_part(positions, process_id):
            # The id of the new part is its index in word_parts.
            self.word_parts.append(
                Word(id=len(self.word_parts), line_number=self.line_number,
                     transkription_positions=positions, writing_process_id=process_id))

        previous_id = -1
        pending = []
        for current in self.transkription_positions:
            if current.writing_process_id != previous_id and len(pending) > 0:
                append_part(pending, previous_id)
                pending = []
            pending.append(current)
            previous_id = current.writing_process_id
        if len(pending) > 0:
            append_part(pending, previous_id)
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        self.word_parts, _ = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
    if self.belongs_to_multiple_writing_processes(include_parts=True):
        part_ids = {part.writing_process_id for part in self.word_parts}
        self.writing_process_id = sorted(part_ids, reverse=True)[0]
    elif len(self.transkription_positions) > 0:
        self.writing_process_id = self.transkription_positions[0].writing_process_id
def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False):
"""Determines whether word is over a word box.

A word with word_parts delegates to its parts. Otherwise a path is created for each
transkription_position and compared with the box_paths; the first matching box is
assigned via _set_box_to_transkription_position and removed from box_paths.

:param box_paths: list of box paths; it is modified in place (matched boxes are removed).
:param tr_xmin: passed on to the path creation and to the box assignment.
:param tr_ymin: passed on to the path creation.
:param previous_word_has_box: if True, the first transkription_position is shifted
    to the right before it is compared with the boxes.
:return: the word (or partial word) over a box, or None.
"""
word_over_box = None
if len(self.word_parts) > 0:
# Delegate to the parts; a part is told whether a preceding part was found over a box.
for word in self.word_parts:
current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None))
if current_word is not None and current_word.word_box is not None:
word_over_box = current_word
else:
# Maps a transkription_position to the positions that shall replace it.
new_tp_dict = {}
for index, transkription_position in enumerate(self.transkription_positions):
if previous_word_has_box and index == 0:
# Shift the start of the first position: by half the width of its first
# positional word part, or by 1 if it has no positional word parts.
if len(transkription_position.positional_word_parts) > 0:
transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2
#print(f'{self.text}: {transkription_position.positional_word_parts[0].left}')
else:
transkription_position.left += 1
word_path = Path.create_path_from_transkription_position(transkription_position,\
tr_xmin=tr_xmin, tr_ymin=tr_ymin)
containing_boxes = [ box_path for box_path in box_paths\
if word_path.is_partially_contained_by(box_path)\
or box_path.do_paths_intersect(word_path) ]
if len(containing_boxes) > 0:
if previous_word_has_box:
# NOTE(review): looks like leftover debug output written to stdout -- confirm whether it can be removed.
print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}')
self._set_box_to_transkription_position(containing_boxes[0], word_path,\
transkription_position, new_tp_dict, tr_xmin)
# Only the first matching box is used; it is removed from the caller's list.
box_paths.remove(containing_boxes[0])
# Replace each split transkription_position by its new positions (inserted before the old one).
for replace_tp in new_tp_dict.keys():
for tp in new_tp_dict.get(replace_tp):
self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
self.transkription_positions.remove(replace_tp)
word_over_box = self._get_partial_word_over_box()
update_transkription_position_ids(self)
return word_over_box
def set_word_insertion_mark(self, word_insertion_mark):
    """Store ``word_insertion_mark`` as the insertion mark of this word.

    :param word_insertion_mark: the mark to attach; replaces any previous value.
    """
    self.word_insertion_mark = word_insertion_mark
def set_writing_process_id_to_transkription_positions(self, page):
    """Derive the writing process id of each transkription_position from its style.

    The style classes of the first positional word part of a position are looked up
    in ``page.fontsizekey2stage_mapping``; if several classes are mapped, the last
    one wins. Positions without positional word parts or without a mapped class
    are left untouched.

    :param page: the page that provides ``fontsizekey2stage_mapping``.
    """
    for position in self.transkription_positions:
        if len(position.positional_word_parts) == 0:
            continue
        style_keys = position.positional_word_parts[0].style_class.split(' ')
        stage_by_font_key = page.fontsizekey2stage_mapping
        for style_key in style_keys:
            if style_key in stage_by_font_key.keys():
                position.writing_process_id = stage_by_font_key.get(style_key)
def simplify_transkription_positions(self):
    """Merge neighbouring transkription_positions if possible.

    The positions are visited pairwise from the end of the list to its start.
    If the earlier position of a pair reports that it is mergeable with the later
    one, a new position list is created from their combined positional word parts;
    only if this yields exactly one position, the pair is replaced by it. The merged
    position gets the writing_process_id of the earlier position, or the id of the
    later one if the earlier position has none (-1).

    Nothing is merged if any position has no ``positional_word_parts`` attribute.
    """
    index = len(self.transkription_positions)-1
    while index > 0\
            and all('positional_word_parts' in tp.__dict__ for tp in self.transkription_positions):
        current_tp = self.transkription_positions[index]
        index -= 1
        previous_tp = self.transkription_positions[index]
        if previous_tp.is_mergebale_with(current_tp):
            # Build a new list: the original used "+=" on previous_tp's own list, so a
            # merge attempt that was not applied still left previous_tp holding the
            # parts of current_tp as well.
            positional_word_parts = list(previous_tp.positional_word_parts)\
                    + list(current_tp.positional_word_parts)
            merged_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                    positional_word_parts, debug_msg_string='simplifying transkription positions',\
                    transkription_position_id=previous_tp.id)
            if len(merged_positions) == 1:
                merged_positions[0].writing_process_id = previous_tp.writing_process_id\
                        if previous_tp.writing_process_id != -1\
                        else current_tp.writing_process_id
                self.transkription_positions.pop(index+1)
                self.transkription_positions[index] = merged_positions[0]
def split(self, split_string, start_id=0):
"""Splits the word at split_string and returns a 3-tuple of new words.

The text of the word is partitioned into the text before split_string, split_string
itself and the text after it. The positional word parts of all transkription_positions
are then distributed to up to three new words, which all get the line number and the
faksimile_positions (the same list object) of this word.

:param split_string: the string at which the word is split.
:param start_id: the id of the first word that is created.
:return: a tuple (previousWord, currentWord, nextWord); previousWord and nextWord
    are None if there is no text before/after split_string or if that text does not
    match the positional word parts (a warning is issued in the latter case).
"""
previousString, currentString, nextString = self.text.partition(split_string)
currentWord = None
previousWord = None
nextWord = None
previousIndex = 0
current_id = start_id
all_positional_word_parts = []
for position in self.transkription_positions:
all_positional_word_parts += position.positional_word_parts
if len(all_positional_word_parts) == 0:
warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
if len(previousString) > 0:
# Take positional word parts from the start until their joined text equals previousString.
previous_pwps = []
while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
previous_pwps.append(all_positional_word_parts[previousIndex])
previousIndex += 1
if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
else:
previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
previousWord.faksimile_positions = self.faksimile_positions
# The next id is reserved for the current word and the consumed parts are dropped.
# NOTE(review): presumably these two lines belong to the successful match only -- confirm (indentation not visible here).
current_id += 1
all_positional_word_parts = all_positional_word_parts[previousIndex:]
if len(nextString) > 0:
# Take the parts that make up currentString; whatever follows them becomes the next word.
tmp_pwps = []
index = 0
while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
tmp_pwps.append(all_positional_word_parts[index])
index += 1
if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
else:
next_pwps = all_positional_word_parts[index:]
next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
next_text = ''.join([ pwp.text for pwp in next_pwps ])
nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
nextWord.faksimile_positions = self.faksimile_positions
all_positional_word_parts = all_positional_word_parts[:index]
# The current word is always created, from the positional word parts that are left.
current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
currentWord.faksimile_positions = self.faksimile_positions
return previousWord, currentWord, nextWord
def split_according_to_status(self, status, splits_are_parts=False):
    """Split a word according to the value of ``status`` on its transkription_positions.

    If the positions have a mixed status, each run of consecutive positions with the
    same ``status`` value becomes a new word; the ids of the new words start at the
    id of this word.

    :param status: key of the property in the ``__dict__`` of the transkription_positions.
    :param splits_are_parts: if True, the new words are added to ``word_parts`` and
        the word loses its own transkription_positions if it has any parts then.
    :return: a list of new word.Word
    """
    new_words = []
    if self.has_mixed_status(status):
        def append_word(positions):
            # new_id is computed before the word is appended.
            new_words.append(
                self._create_new_word(positions, status, new_id=self.id+len(new_words)))

        previous_value = None
        pending = []
        for current in self.transkription_positions:
            if current.__dict__[status] != previous_value and len(pending) > 0:
                append_word(pending)
                pending = []
            pending.append(current)
            previous_value = current.__dict__[status]
        if len(pending) > 0:
            append_word(pending)
    if splits_are_parts:
        self.word_parts += new_words
        if len(self.word_parts) > 0:
            self.transkription_positions = []
    return new_words
def undo_partitioning(self):
    """Undo partitioning.

    The partitioning of every word part is undone first; as long as the text of the
    word's transkription_positions differs from the word's text, the positions of the
    part are handed back to the word. Afterwards everything that resulted from the
    partitioning (earlier version, edited text, word box, word parts, corrections,
    earlier versions and box paths) is reset.
    """
    for part in self.word_parts:
        part.undo_partitioning()
        recovered_text = ''.join([position.get_text() for position in self.transkription_positions])
        if self.text != recovered_text:
            self.transkription_positions += part.transkription_positions
    self.earlier_version = None
    self.edited_text = None
    self.word_box = None
    self.word_parts = []
    self.corrections = []
    self.earlier_versions = []
    self.box_paths = []
def _create_new_word(self, transkription_positions, status, new_id=0):
    """Create a new word from self and transkription_positions.

    The new word gets all properties of this word that are listed in
    ``COPY_PROPERTY_KEY`` (except ``status`` itself) and the ``status`` value of the
    first transkription_position: appended to the target list named in
    ``APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS`` if ``status`` is a key of it,
    otherwise stored directly under ``status``.

    :return: the new word.Word
    """
    fresh_word = Word(id=new_id, transkription_positions=transkription_positions)
    own_properties = self.__dict__
    for copy_key in self.COPY_PROPERTY_KEY:
        if copy_key == status or copy_key not in own_properties:
            continue
        fresh_word.__dict__[copy_key] = own_properties[copy_key]
    list_targets = self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS
    if status in list_targets.keys():
        target_list = fresh_word.__dict__[list_targets[status]]
        target_list.append(transkription_positions[0].__dict__[status])
    else:
        fresh_word.__dict__[status] = transkription_positions[0].__dict__[status]
    return fresh_word
def _get_parts_with_property_key(self, property_key):
    """Collect the word parts that have ``property_key`` in their ``__dict__``.

    A part without the key is searched recursively, i.e. its own matching parts
    are collected in its place.

    :return: a list of word parts, in document order.
    """
    matching_parts = []
    for part in self.word_parts:
        if property_key in part.__dict__:
            matching_parts.append(part)
        else:
            matching_parts.extend(part._get_parts_with_property_key(property_key))
    return matching_parts
def _get_partial_word_over_box(self):
    """Partition a word according to the ``has_box`` value of its transkription_positions.

    - Mixed status: consecutive positions with the same ``has_box`` value become a
      new partial word in ``word_parts``; a partial word whose positions have a box
      gets that box as ``word_box``. The word loses its own positions.
    - Word with parts: the parts are asked one after the other until one returns
      a word over a box.
    - Otherwise: if exactly one position has a box, the word itself gets that box.

    :return: the (partial) word over a box (the last one, if several partial words
        were created), or None if there is none.
    """
    word_over_box = None
    if self.has_mixed_status('has_box'):
        def append_part(positions):
            # The id of the new part is its index in word_parts.
            part = Word(id=len(self.word_parts), line_number=self.line_number,
                        transkription_positions=positions, deleted=self.deleted,
                        writing_process_id=self.writing_process_id)
            self.word_parts.append(part)
            return part

        pending = []
        previous_box = None
        for current in self.transkription_positions:
            if current.has_box != previous_box and len(pending) > 0:
                part = append_part(pending)
                if previous_box is not None:
                    word_over_box = part
                    word_over_box.word_box = previous_box
                pending = []
            pending.append(current)
            previous_box = current.has_box
        if len(pending) > 0:
            part = append_part(pending)
            if previous_box is not None:
                word_over_box = part
                word_over_box.word_box = previous_box
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        for part in self.word_parts:
            word_over_box = part._get_partial_word_over_box()
            if word_over_box is not None:
                break
    else:
        boxed_positions = [position for position in self.transkription_positions
                           if position.has_box is not None]
        if len(boxed_positions) == 1:
            word_over_box = self
            word_over_box.word_box = boxed_positions[0].has_box
    return word_over_box
def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
    """Set box_path to the transkription_position that is contained by box_path.

    If the box contains the whole word path, the position simply gets the box.
    Otherwise the position is split at the horizontal bounds of the box (taken from
    ``box_path.path.bbox()``, relative to ``tr_xmin``) and the box is set to the
    resulting position that lies inside the box; the new positions are registered in
    ``new_transkription_positions_dictionary`` under the old position. If splitting
    does not produce the expected positions, the unsplit position gets the box.
    """
    if box_path.contains_path(word_path):
        transkription_position.has_box = box_path
        return
    if box_path.contains_start_of_path(word_path):
        # Box covers the start of the word: split at the right edge of the box.
        pieces = transkription_position.split(box_path.path.bbox()[1] - tr_xmin)
        boxed_index, was_split = 0, len(pieces) == 2
    elif box_path.contains_end_of_path(word_path):
        # Box covers the end of the word: split at the left edge of the box.
        pieces = transkription_position.split(box_path.path.bbox()[0] - tr_xmin)
        boxed_index, was_split = 1, len(pieces) == 2
    else:
        # Box lies in the middle of the word path: split at both edges.
        left_edge = box_path.path.bbox()[0] - tr_xmin
        right_edge = box_path.path.bbox()[1] - tr_xmin
        pieces = transkription_position.split(left_edge, right_edge)
        boxed_index, was_split = 1, len(pieces) >= 2
    if was_split:
        pieces[boxed_index].has_box = box_path
        new_transkription_positions_dictionary.update({ transkription_position: pieces })
    else:
        transkription_position.has_box = box_path
def do_paths_intersect_saveMode(mypath1, mypath2):
    """Check in a fail-safe way whether two paths intersect.

    :return: the (truthy) result of the intersection test of the two paths or,
        if they do not intersect, whether mypath1 is partially contained by mypath2;
        False if an AssertionError is raised while checking.
    """
    try:
        intersection = mypath1.path.intersect(mypath2.path, justonemode=True)
        if intersection:
            return intersection
        return mypath1.is_partially_contained_by(mypath2)
    except AssertionError:
        return False
Index: svgscripts/datatypes/page.py
===================================================================
--- svgscripts/datatypes/page.py (revision 112)
+++ svgscripts/datatypes/page.py (revision 113)
@@ -1,432 +1,462 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import re
import sys
import warnings
from .box import Box
from .color import Color
from .image import Image, SVGImage
+from .editor_comment import EditorComment
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
from .imprint import Imprint
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .super_page import SuperPage
from .style import Style
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_deletion_path import WordDeletionPath
from .word_insertion_mark import WordInsertionMark
sys.path.append('py2ttl')
from class_spec import SemanticClass
sys.path.append('shared_util')
from main_util import extract_paths_on_tf, get_paths_near_position
FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT
STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK
STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK
class Page(SemanticClass,SuperPage):
"""
This class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
faksimile_image: FaksimileImage.
faksimile_svgFile: svg file containing information about word positions.
"""
UNITTESTING = False
def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_paths_near_words=False, warn=False, number=None):
"""Create a page from a xml source file or, without one, a page without a tree.

:param xml_source_file: name of the xml file to be instantiated; if None, only
    ``page_tree`` (None) and ``number`` are set.
:param faksimile_image: stored via update_property_dictionary.
:param faksimile_svgFile: stored via update_property_dictionary.
:param add_paths_near_words: passed on to add_deletion_paths_to_words.
:param warn: stored as ``self.warn``; read by add_deletion_paths_to_words.
:param number: the number of a page that is created without a xml source file.
"""
if xml_source_file is not None:
super(Page,self).__init__(xml_source_file)
self.update_property_dictionary('faksimile_image', faksimile_image)
self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile)
self.init_all_properties()
self.add_style(style_node=self.page_tree.getroot().find('.//style'))
# The text fields default to None; init_node_objects sets them if the images provide a text field.
self.faksimile_text_field = None
self.svg_text_field = None
self.init_node_objects()
# warn has to be set before add_deletion_paths_to_words, which reads self.warn.
self.warn = warn
self.add_deletion_paths_to_words(add_paths_near_words)
else:
self.page_tree = None
# NOTE(review): presumably part of the else branch (page without source file) -- confirm, indentation is not visible here.
self.number = number
def add_deletion_paths_to_words(self, add_paths_near_words=False):
"""Add deletion paths to words.

Concerns deleted words (or words with deleted word parts) that have no deletion paths
yet, and words flagged with 'add_paths_near_words'. Nothing is done unless there are
such words and the svg file (or, failing that, the source file) of the page exists.

:param add_paths_near_words: if True, the paths near the transkription positions are
    collected (``deletion_paths_near_word``) for words that still lack deletion paths
    or that are flagged with 'add_paths_near_words'.
"""
# Words without parts that are deleted but have no paths yet, or that are flagged ...
words = [ word for word in self.words if (len(word.word_parts) == 0 and word.deleted and len(word.deletion_paths) == 0)\
or 'add_paths_near_words' in word.process_flags ]
# ... plus words with at least one deleted part that has no paths yet.
words += [ word for word in self.words\
if len(word.word_parts) > 0 and True in\
[ (wp.deleted and len(wp.deletion_paths) == 0) for wp in word.word_parts ]]
if len(words) > 0 and ((self.svg_file is not None and isfile(self.svg_file))\
or (self.source is not None and isfile(self.source))):
svg_file = self.svg_file if self.svg_file is not None else self.source
transkription_field = TranskriptionField(svg_file)
# The offset of the transkription field is only used if the svg image has no text field.
tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0
tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0
word_deletion_paths = self.word_deletion_paths
index = 0
# True once the deletion paths have been re-extracted with extract_paths_on_tf.
dp_updated = False
while index < len(words):
word = words[index]
word.add_deletion_paths(word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
if len(word.deletion_paths) > 0 or True in [ len(w.deletion_paths) > 0 for w in word.word_parts ]:
# NOTE(review): deletion_paths is word.deletion_paths itself, so "+=" also adds the paths of the parts to the word -- confirm this is intended.
deletion_paths = word.deletion_paths
for wp in word.word_parts: deletion_paths += wp.deletion_paths
for deletion_path in deletion_paths:
if deletion_path not in self.word_deletion_paths:
self.word_deletion_paths.append(deletion_path)
elif not dp_updated:
# No path found: extract the paths from the transkription field (once) and,
# presumably, retry the same word -- index is decremented here and incremented at the end of the loop.
word_deletion_paths = extract_paths_on_tf(self)
dp_updated = True
index -= 1
if add_paths_near_words\
and ('add_paths_near_words' in word.process_flags\
or ((word.deleted and len(word.deletion_paths) == 0)\
or True in [ (w.deleted and len(w.deletion_paths) == 0) for w in word.word_parts ])):
if not dp_updated\
and 'add_paths_near_words' in word.process_flags:
word_deletion_paths = extract_paths_on_tf(self)
dp_updated = True
transform = None
tp = None
target_word = word
paths_near_word = []
if word.deleted and len(word.transkription_positions) > 0:
transform = word.transkription_positions[0].transform
for tp in word.transkription_positions:
word.deletion_paths_near_word += get_paths_near_position(tp, word_deletion_paths)
elif len(word.word_parts) > 0:
for wp in word.word_parts:
if wp.deleted and len(wp.transkription_positions) > 0:
target_word = wp
for tp in wp.transkription_positions:
# NOTE(review): assignment, not "+=" as for whole words above: only the paths near the last position are kept -- confirm.
wp.deletion_paths_near_word = get_paths_near_position(tp, word_deletion_paths)
if self.warn and (word.deleted and len(word.deletion_paths) == 0):
warnings.warn(\
f'WARNING: {self.title} {self.number}: {word.id} on {word.line_number}, {word.text} has no deletion paths! {target_word.deletion_paths_near_word}, {transform}')
index += 1
@classmethod
# NOTE(review): the new parameter isBlank is inserted before page_node, which shifts page_node for positional callers -- confirm that callers use keywords.
- def create_cls(cls, xml_source_file=None, create_dummy_page=False, page_node=None):
+ def create_cls(cls, xml_source_file=None, create_dummy_page=False, isBlank=False, page_node=None):
"""Create a Page.

Without create_dummy_page the page is instantiated from xml_source_file; the new
revision sets its status to 'complete' or, if isBlank, to 'blank' and then empties
its words, lines, word_deletion_paths and word_insertion_marks.
With create_dummy_page only a page number is derived from the file name.
page_node is not used in the code shown here.
"""
if not create_dummy_page:
- return cls(xml_source_file)
+ page = cls(xml_source_file)
+ page.status = 'complete'
+ if isBlank:
+ page.status = 'blank'
+ page.words = []
+ page.lines = []
+ page.word_deletion_paths = []
+ page.word_insertion_marks = []
+ return page
else:
# The number is the text between 'page' (plus any zeros following it) and '.xml';
# if the file name does not match, the base name without '.xml' is used.
m = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file)
if m is not None and len(m.groups()) > 3:
number = m.group(3)
else:
number = basename(xml_source_file).replace('.xml','')
return cls(number=number)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
    """Return the pages that are described by ``xml_file``.

    A file of type FILE_TYPE_SVG_WORD_POSITION is instantiated as a single page.
    For a file of type FILE_TYPE_XML_MANUSCRIPT, the existing files referenced by
    the ``output`` attribute of its page nodes are instantiated. Any other type
    yields an empty list.

    :param xml_file: the file name of the page or manuscript file.
    :param status_contains: only use manuscript pages whose status contains this string.
    :param status_not_contain: only use manuscript pages whose status does not contain this string.
    :param word_selection_function: optional function that gets the words of a page
        and returns a selection; pages with an empty selection are omitted.
    :return: a list of pages.
    """
    source_tree = ET.parse(xml_file)
    if source_tree.getroot().find('metadata/type').text == cls.FILE_TYPE_SVG_WORD_POSITION:
        page = cls(xml_file)
        if word_selection_function is not None and len(word_selection_function(page.words)) == 0:
            return []
        return [ page ]
    if source_tree.getroot().find('metadata/type').text != FILE_TYPE_XML_MANUSCRIPT:
        return []
    must_contain = status_contains != ''
    must_not_contain = status_not_contain != ''
    if must_contain and must_not_contain:
        xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
    elif must_contain:
        xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
    elif must_not_contain:
        xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
    else:
        xpath = '//page/@output'
    pages = []
    for xml_source_file in source_tree.xpath(xpath):
        if not isfile(xml_source_file):
            continue
        # The status filter applies to the manuscript only, the word selection to each page.
        pages.extend(cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function))
    return pages
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.

The dictionary contains the class dictionary (under cls.CLASS_KEY) and the semantic
properties of a page (under cls.PROPERTIES_KEY); it is returned after the super
classes have been updated.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = { 'number': { 'class': str, 'cardinality': 1}}
+ properties.update(cls.create_semantic_property_dictionary('status', str,\
+ name='pageHasDataProcessingStatus', label='status of data processing',\
+ comment='The status of the data processing of this page'))
properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE))
properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\
name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\
comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
properties.update(cls.create_semantic_property_dictionary('orientation', str))
# NOTE(review): 'status' is registered a second time here with the same arguments as above -- looks redundant, one of the two updates can presumably be dropped.
+ properties.update(cls.create_semantic_property_dictionary('status', str,\
+ name='pageHasDataProcessingStatus', label='status of data processing',\
+ comment='The status of the data processing of this page'))
properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE))
# NOTE(review): this comment string mentions the faksimile image and the one for faksimile_text_field the svg image -- the two look swapped, confirm.
properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\
name='pageIsOnSVGTextField', label='page is on svg text field',\
comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
# All list properties of a page are registered with the class list.
- for key in [ 'lines','imprints', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']:
+ for key in [ 'lines','imprints', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks', 'editor_comments']:
properties.update(cls.create_semantic_property_dictionary(key, list))
dictionary.update({cls.CLASS_KEY: class_dict})
dictionary.update({cls.PROPERTIES_KEY: properties})
return cls.return_dictionary_after_updating_super_classes(dictionary)
def get_word_deletion_path(self, path=None, d_attribute=None) ->WordDeletionPath:
    """Return the word deletion path of this page that has the requested d_attribute.

    If the page does not know such a path yet, a new one is created; a successfully
    created path gets the next free id, is added to ``word_deletion_paths`` and is
    attached to the page tree.

    :param path: a path; its d_attribute is used if ``d_attribute`` is None.
    :param d_attribute: the d attribute of the requested path.
    :return: the known or new word deletion path (None if none could be created).
    :raises Exception: if neither ``path`` nor ``d_attribute`` is given.
    """
    if path is None and d_attribute is None:
        raise Exception('ERROR: get_word_deletion_path needs a path or a d_attribute!')
    wanted_d_attribute = path.d_attribute if d_attribute is None else d_attribute
    for known_path in self.word_deletion_paths:
        if known_path.d_attribute == wanted_d_attribute:
            return known_path
    new_path = WordDeletionPath.create_cls(self, path=path, d_attribute=wanted_d_attribute)
    if new_path is not None:
        new_path.id = len(self.word_deletion_paths)
        self.word_deletion_paths.append(new_path)
        new_path.attach_object_to_tree(self.page_tree)
    return new_path
def init_node_objects(self):
"""Initialize all node objects.

Instantiates the objects of the page (word insertion marks, words, marks of foreign
hands, line numbers, lines, imprints, writing processes, word deletion paths and,
in the new revision, editor comments) from the nodes of the page tree, sets the text
fields of the images and finally initializes all words with this page.
"""
self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ]
# The new revision only selects nodes that are direct children of the root ('./' instead of '//').
- self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ]
+ self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('./' + MarkForeignHands.XML_TAG) ]
#self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ]
# Text connection marks are appended to the words; their ids continue the ids of the words.
self.words += [ TextConnectionMark.instantiate_as_word(node, id=index+len(self.words))\
for index, node in enumerate(self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG)) ]
# Line numbers and lines are both created from the same nodes (LineNumber.XML_TAG).
self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.imprints = [ Imprint.create_cls_from_node(imprint_node, self.lines) for imprint_node in self.page_tree.getroot().xpath('//' + Imprint.XML_TAG) ]
self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
self.word_deletion_paths = [ WordDeletionPath.create_cls(self, node=node) for node in self.page_tree.xpath('./' + WordDeletionPath.XML_TAG) ]
+ self.editor_comments = [ EditorComment.create_cls_from_node(node=node) for node in self.page_tree.xpath('./' + EditorComment.XML_TAG) ]
if self.faksimile_image is not None and self.faksimile_image.text_field is not None:
self.faksimile_text_field = self.faksimile_image.text_field
if self.svg_image is not None and self.svg_image.text_field is not None:
self.svg_text_field = self.svg_image.text_field
# NOTE(review): self.text_connection_marks is not set in this method (the assignment above is commented out); presumably it is initialized elsewhere -- confirm.
for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks:
simple_word.init_word(self)
for wim in self.word_insertion_marks:
if wim.line_number > -1:
# NOTE(review): raises an IndexError if no line has the id wim.line_number.
wim.line = [ line for line in self.lines if line.id == wim.line_number ][0]
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
    """Renumber all words and special words and re-attach them to self.page_tree.

    Existing word, mark-foreign-hands and text-connection-mark nodes are removed
    from the tree first; afterwards every object gets its list index as id and
    is attached again. Nothing is changed if the page is locked.

    Args:
        update_function_on_word: a callable or a list of callables that is
            applied to each word before it is attached (non-callables are ignored).
        include_special_words_of_type: classes (MarkForeignHands,
            TextConnectionMark) whose instances should be passed to the
            update functions as well. The default list is never mutated.
    """
    if self.is_locked():
        print('locked')
        return
    update_functions = update_function_on_word\
            if isinstance(update_function_on_word, list)\
            else [ update_function_on_word ]
    # Filter once: the original tested callable() on the list itself for special
    # words, so the functions were never applied to them.
    update_functions = [ func for func in update_functions if callable(func) ]
    stale_nodes_query = './/word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG
    for node in self.page_tree.xpath(stale_nodes_query):
        node.getparent().remove(node)
    for index, word in enumerate(self.words):
        word.id = index
        for func in update_functions:
            func(word)
        word.attach_word_to_tree(self.page_tree)
    special_word_groups = [ (MarkForeignHands, self.mark_foreign_hands),\
            (TextConnectionMark, self.text_connection_marks) ]
    for special_type, special_words in special_word_groups:
        for index, special_word in enumerate(special_words):
            special_word.id = index
            if special_type in include_special_words_of_type:
                for func in update_functions:
                    func(special_word)
            special_word.attach_word_to_tree(self.page_tree)
def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
    """Write the page's data source to the 'data-source' node of the page tree.

    The first existing 'data-source' node is reused; if there is none, a new
    one is appended to the root of self.page_tree.

    Args:
        faksimile_svgFile: if given, replaces self.faksimile_svgFile before
            the node's 'file' attribute is written.
        xml_correction_file: if given, stored as 'xml-corrected-words'.
    """
    if faksimile_svgFile is not None:
        self.faksimile_svgFile = faksimile_svgFile
    existing_nodes = self.page_tree.xpath('.//data-source')
    if len(existing_nodes) > 0:
        source_node = existing_nodes[0]
    else:
        source_node = ET.SubElement(self.page_tree.getroot(), 'data-source')
    source_node.set('file', self.faksimile_svgFile)
    if xml_correction_file is not None:
        source_node.set('xml-corrected-words', xml_correction_file)
def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True):
    """Determine the width of the area where line numbers are written in the page.source file.

    A reference line number is looked up among the text nodes of the svg tree.
    On verso pages its x position is reported to transkription_field; otherwise
    the width of the glyph found at that position in self.svg_file is added.

    Args:
        transkription_field: receives the result via add_line_number_area_width.
        svg_tree: parsed source file; self.source is parsed if omitted.
        set_to_text_field_zero: if True, transkription_field.ymin is added to the
            bottom of the reference line number when looking for its glyph.
    """
    THRESHOLD = 0.4
    if svg_tree is None:
        svg_tree = ET.parse(self.source)
    if len(self.line_numbers) <= 1:
        return
    # Index 9 only exists with at least ten line numbers (the guard used to be "> 8",
    # which raised an IndexError for exactly nine line numbers).
    use_tenth_line_number = transkription_field.is_page_verso() and len(self.line_numbers) > 9
    line_number = self.line_numbers[9] if use_tenth_line_number else self.line_numbers[1]
    ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
            if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
            and LineNumber.IS_A_LINE_NUMBER(item)\
            and LineNumber(raw_text_node=item).id == line_number.id ]
    if len(ln_nodes) == 0:
        return
    matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
    if transkription_field.is_page_verso():
        transkription_field.add_line_number_area_width(matrix.getX())
    elif self.svg_file is not None and isfile(self.svg_file):
        svg_path_tree = ET.parse(self.svg_file)
        # xpath cannot use the default (None) namespace prefix, so it is mapped to 'ns'.
        namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
        svg_x = matrix.getX()
        svg_y = self.line_numbers[1].bottom + transkription_field.ymin\
                if set_to_text_field_zero\
                else self.line_numbers[1].bottom
        use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
                .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
        if len(use_nodes) > 0:
            symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
            d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
            if len(d_strings) > 0 and d_strings[0] != '':
                path = parse_path(d_strings[0])
                xmin, xmax, ymin, ymax = path.bbox()
                width = xmax - xmin
                transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
    """Set self.page_type to recto or verso and write it to the page tree.

    The type is taken from the last character of self.number ('r' or 'v').
    For any other number the transkription field decides; it is created from
    self.source if none is passed in.

    Args:
        transkription_field: optional field that is asked is_page_verso().

    Raises:
        FileNotFoundError: if a transkription field has to be created but
            self.source is missing or not a file.
    """
    if self.number.endswith('v'):
        self.page_type = Page.PAGE_VERSO
    elif self.number.endswith('r'):
        self.page_type = Page.PAGE_RECTO
    else:
        if transkription_field is None:
            if self.source is None or not isfile(self.source):
                raise FileNotFoundError('Page does not have a source!')
            transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index)
        if transkription_field.is_page_verso():
            self.page_type = Page.PAGE_VERSO
        else:
            self.page_type = Page.PAGE_RECTO
    self.page_tree.getroot().set('pageType', self.page_type)
- def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False):
+ def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False, parentsPWPs=None):
"""Update styles of words and add them to their transkription_positions.
Args:
add_to_parents: Add styles also to word (and if not None to manuscript).
partition_according_to_styles: Partition word if its transkription_positions have different styles.
"""
# Further arguments:
#   words: the words to process; self.words is used if None.
#   create_css: passed on to Style; if set, styles are additionally keyed by word.deleted
#               and receive a deletion color.
#   parentsPWPs (rev. 113): positional word parts used as fallback for transkription
#               positions that have none of their own.
# style_dictionary caches the Style created per key, so equal keys share one Style object.
style_dictionary = {}
if words is None:
words = self.words
for word in words:
# Word parts are processed recursively before the word itself.
if len(word.word_parts) > 0:
self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\
- add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles)
+ add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles, parentsPWPs=parentsPWPs)
# (rev. 113) The overwritten word and the earlier version are styled as well. If no fallback
# parts were handed in, they are collected from the first word part (or the word itself).
+ overwritten = [] if word.overwrites_word is None else [ word.overwrites_word ]
+ if word.earlier_version is not None:
+ overwritten.append(word.earlier_version)
+ if len(overwritten) > 0:
+ parentsPWPs = parentsPWPs if parentsPWPs is not None else []
+ if len(parentsPWPs) == 0:
+ cword = word.word_parts[0] if len(word.word_parts) > 0 else word
+ for tp in cword.transkription_positions:
+ parentsPWPs += tp.positional_word_parts
+ self.update_styles(words=overwritten, manuscript=manuscript, create_css=create_css,\
+ add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles, parentsPWPs=parentsPWPs)
for transkription_position in word.transkription_positions:
# NOTE(review): in the rev. 113 lines parentsPWPs may still be None here when a position has
# no positional word parts; len(None) would then raise a TypeError -- confirm with callers.
- if len(transkription_position.positional_word_parts) > 0:
- style_class = transkription_position.positional_word_parts[0].style_class
+ positional_word_parts = transkription_position.positional_word_parts\
+ if len(transkription_position.positional_word_parts) > 0\
+ else parentsPWPs
+ if len(positional_word_parts) > 0:
+ style_class = positional_word_parts[0].style_class
# The last style key found in fontsizekey2stage_mapping determines the writing process id;
# -1 is kept if no key matches.
writing_process_id = -1
for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]:
writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id)
if create_css:
if style_dictionary.get((style_class_key, word.deleted)) is None:
# The deletion color comes from the style of the first deletion path if that style is
# known in self.style_dict, otherwise a default Color() is used; it stays None for
# words without deletion paths.
color = None
if len(word.deletion_paths) > 0:
if word.deletion_paths[0].style_class is not None\
and word.deletion_paths[0].style_class != ''\
and self.style_dict.get(word.deletion_paths[0].style_class) is not None:
color = Color.create_cls_from_style_object(self.style_dict.get(word.deletion_paths[0].style_class))
else:
color = Color()
style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\
create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] )
transkription_position.style = style_dictionary[(style_class_key, word.deleted)]
#print(style_dictionary[(style_class_key, word.deleted)])
else:
if style_dictionary.get(style_class_key) is None:
style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css)
style_dictionary[style_class_key].writing_process_id = style_class_key[1]
transkription_position.style = style_dictionary[style_class_key]
if add_to_parents and transkription_position.style not in word.styles:
word.styles.append(transkription_position.style)
if partition_according_to_styles:
word.split_according_to_status('style', splits_are_parts=True)
# All styles created in this call are handed to the manuscript at the end.
if manuscript is not None\
and add_to_parents:
manuscript.update_styles(*style_dictionary.values())
def __eq__(self, other):
    """Compare two pages by the URL of their page trees.

    Pages without a page tree are compared by their number; a page with a
    tree never equals one without. None is never equal to a page.
    """
    if other is None:
        return False
    own_tree = self.page_tree
    other_tree = other.page_tree
    if own_tree is None or other_tree is None:
        if own_tree is None and other_tree is None:
            return self.number == other.number
        return False
    return own_tree.docinfo.URL == other_tree.docinfo.URL
def __hash__(self):
    """Hash the page by the URL of its page tree, or by its number if it has no tree.

    If an attribute is missing, self is printed and the number is hashed instead.
    """
    try:
        tree = self.page_tree
        if tree is None:
            return hash(self.number)
    except AttributeError:
        print(self)
        return hash(self.number)
    return hash(tree.docinfo.URL)
Index: svgscripts/datatypes/mark_foreign_hands.py
===================================================================
--- svgscripts/datatypes/mark_foreign_hands.py (revision 112)
+++ svgscripts/datatypes/mark_foreign_hands.py (revision 113)
@@ -1,160 +1,182 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the mark for text by some foreign hand.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from .matrix import Matrix
from .special_word import SpecialWord
from .standoff_tag import StandoffTag
from .text import Text
class MarkForeignHands(SpecialWord):
"""
This class represents the mark for text by some foreign hand.
"""
# Tag of the xml node that represents a mark.
XML_TAG = 'mark-foreign-hands'
# Tag of the child node that holds the text written by the foreign hand.
XML_SUB_TAG = 'text'
# (rev. 113) Tag of the child node that wraps a mark overwritten by this one.
+ XML_OVERWRITES = 'overwrites'
# Character that stands for a mark; also the default text of an instance.
CLASS_MARK = '$'
# Replacements applied by get_text_from_node to nodes with a Sonderzeichen style class.
REPLACE_DICT = { '+': 'x' }
# Initializes the mark: id, text, line_number and the position lists are passed to
# SpecialWord; foreign_hands_text, pen, resolution and (rev. 113) comment are stored here.
# NOTE(review): the list defaults of transkription_positions and faksimile_positions are
# created once and shared between calls; whether the super class copies them is not visible here.
- def __init__(self, id=0, line_number=-1, text=CLASS_MARK, foreign_hands_text=None, pen='', resolution=None, transkription_positions=[], faksimile_positions=[]):
+ def __init__(self, id=0, line_number=-1, text=CLASS_MARK, foreign_hands_text=None, pen='', resolution=None, comment=None, transkription_positions=[], faksimile_positions=[]):
super(MarkForeignHands, self).__init__(id=id, text=text, line_number=line_number,\
transkription_positions=transkription_positions, faksimile_positions=faksimile_positions)
self.foreign_hands_text = foreign_hands_text
self.pen = pen
+ self.comment = comment
self.resolution = resolution
# (rev. 113) Set by add_content when the node has an overwritten mark next to it.
+ self.overwrites_mark = None
def add_content(self, node):
"""Adds content to MarkForeignHands.
"""
# The attributes 'pen', 'resolution' and (rev. 113) 'comment' are read from node.
self.pen = node.get('pen')
self.resolution = node.get('resolution')
+ self.comment = node.get('comment')
# A node with direct text yields a plain Text; otherwise the text is taken from the first
# './content' child (None if absent) together with the standoff markup children of node.
if node.text is not None:
self.foreign_hands_text = Text(content=node.text)
else:
standoff_markups = [ StandoffTag.create_cls_from_node(stf) for stf in node.xpath('./' + '|./'.join(StandoffTag.MARKUP_STYLES))]
content = node.xpath('./content')[0].text if len(node.xpath('./content')) > 0 else None
self.foreign_hands_text = Text(content=content, standoff_markups=standoff_markups, tag='content')
# (rev. 113) The overwritten mark is searched below the parent of node
# ('./overwrites/mark-foreign-hands'); only the first hit is used.
+ self.overwrites_mark = [ self.create_cls(omf) for omf in node.getparent().xpath('./' + self.XML_OVERWRITES + '/' + self.XML_TAG) ][0]\
+ if len(node.getparent().xpath('./' + self.XML_OVERWRITES + '/' + self.XML_TAG)) > 0\
+ else None
def attach_word_to_tree(self, target_tree):
"""Attaches MarkForeignHands to tree target_tree.
"""
# The node of the mark itself is created by the super class.
# NOTE(review): no return statement is visible in this method, although the value returned by
# the super class is used as the node here -- confirm whether callers expect the node back.
node = super(MarkForeignHands,self).attach_word_to_tree(target_tree)
if self.foreign_hands_text is not None:
- content_node = ET.SubElement(node, MarkForeignHands.XML_SUB_TAG)
- content_node.text = self.foreign_hands_text if type(self.foreign_hands_text) == str else self.foreign_hands_text.content
# (rev. 113) A str is written as text of the sub node; a Text object gets the sub node's
# tag and id and attaches itself (content and markup) to node.
+ text_node = ET.SubElement(node, MarkForeignHands.XML_SUB_TAG)
+ if type(self.foreign_hands_text) == str:
+ text_node.text = self.foreign_hands_text
+ else:
+ text_node.set('id', str(self.foreign_hands_text.id))
+ self.foreign_hands_text.tag = MarkForeignHands.XML_SUB_TAG
+ self.foreign_hands_text.attach_object_to_tree(node)
# Empty or missing pen/comment values are not written.
if self.pen is not None and self.pen != '':
- content_node.set('pen', self.pen)
+ text_node.set('pen', self.pen)
+ if self.comment is not None and self.comment != '':
+ text_node.set('comment', self.comment)
# (rev. 113) An overwritten mark is attached below an 'overwrites' child of node.
+ if self.overwrites_mark is not None:
+ overwrites_node = ET.SubElement(node, MarkForeignHands.XML_OVERWRITES)
+ self.overwrites_mark.attach_word_to_tree(overwrites_node)
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
# Starts from the dictionary of the super class and adds one property entry per attribute:
# foreign_hands_text, pen, resolution and (rev. 113) comment and overwrites_mark.
dictionary = super(MarkForeignHands,cls).get_semantic_dictionary()
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('foreign_hands_text',\
Text, cardinality=1, name='textOfForeignHands', label='text traces of some foreign hand'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('pen',\
str, cardinality=1, cardinality_restriction='maxCardinality',\
name='penOfForeignHands', label='pen used to write text by some foreign hand'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('resolution',\
str, cardinality=1, cardinality_restriction='maxCardinality',\
name='resolutionOfAbbreviation', label='resolution of the abbreviation'))
+ dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('comment',\
+ str, cardinality=1, cardinality_restriction='maxCardinality',\
+ name='foreignHandHasCommentByEditors', label='there is a comment by the editors about this text of foreign hands'))
+ dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_mark', MarkForeignHands,\
+ name='foreignHandOverwritesForeignHand', label='text written by foreign hand overwrites another text written by foreign hand'))
return cls.return_dictionary_after_updating_super_classes(dictionary)
@classmethod
def get_special_char_list(cls):
    """Return the characters that identify this kind of special word.

    :return: a list that contains only cls.CLASS_MARK
    """
    special_chars = [ cls.CLASS_MARK ]
    return special_chars
@staticmethod
def find_content(list_of_special_words, transkription_field, svg_tree, style_dict=None, italic_classes=None, SonderzeichenList=None, marginals_extra=False, set_to_text_field_zero=True):
    """Find content for the MarkForeignHands.

    For every mark the text nodes in the margin field that lie on the same
    height are read from left to right. Everything up to the first node in an
    italic style becomes the foreign hands text; the italic node and all nodes
    after it become the pen.

    Args:
        list_of_special_words: marks that receive foreign_hands_text and pen.
        transkription_field: used to decide which nodes lie in the margin field.
        svg_tree: tree whose text nodes are searched.
        style_dict: maps style classes to style dictionaries; used to derive
            italic_classes and SonderzeichenList if these are empty.
        italic_classes: style classes that mark the pen.
        SonderzeichenList: style classes whose text may be replaced
            (see get_text_from_node).
        marginals_extra: passed on as marginals_on_extra_page.
        set_to_text_field_zero: if True, transkription_field is passed on to is_close.
    """
    if style_dict is None:
        style_dict = {}
    if italic_classes is None:
        italic_classes = []
    if SonderzeichenList is None:
        SonderzeichenList = []
    if len(style_dict) > 0:
        if len(italic_classes) == 0:
            italic_classes = [ key for key in style_dict\
                    if bool(style_dict[key].get('font-family')) and style_dict[key]['font-family'].endswith('Italic') ]
        if len(SonderzeichenList) == 0:
            SonderzeichenList = [ key for key in style_dict\
                    if bool(style_dict[key].get('font-family')) and style_dict[key]['font-family'].startswith('Sonderzeichen') ]
    svg_root = svg_tree.getroot()
    nodes_in_margin_field = [ item for item in svg_root.iterfind('.//text', svg_root.nsmap)\
            if Matrix.IS_IN_MARGIN_FIELD(item.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra) ]
    provide_tf = transkription_field if set_to_text_field_zero else None
    for mark_foreign_hands in list_of_special_words:
        first_position = mark_foreign_hands.transkription_positions[0]
        vertical_center = (first_position.bottom + first_position.top)/2
        relevant_nodes = [ node for node in nodes_in_margin_field\
                if is_close(vertical_center, node.get('transform'), transkription_field=provide_tf) ]
        relevant_nodes = sorted(relevant_nodes, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
        italic_found = False
        mark_foreign_hands_text = ''
        pen = ''
        for node in relevant_nodes:
            children = node.getchildren()
            # A node without children carries the text itself, otherwise its tspans do.
            text_carriers = children if len(children) > 0 else [ node ]
            for carrier in text_carriers:
                # Nodes without text or class attribute must not break the concatenation.
                carrier_text = carrier.text or ''
                if italic_found:
                    pen += carrier_text
                elif any(style in italic_classes for style in (carrier.get('class') or '').split(' ')):
                    italic_found = True
                    pen = carrier_text
                else:
                    mark_foreign_hands_text += get_text_from_node(carrier, SonderzeichenList) or ''
        mark_foreign_hands.foreign_hands_text = mark_foreign_hands_text
        mark_foreign_hands.pen = pen
def get_text_from_node(node, SonderzeichenList):
    """Return the text of node, replacing Sonderzeichen where a replacement is defined.

    The text is looked up in MarkForeignHands.REPLACE_DICT only if one of the
    node's style classes is in SonderzeichenList. A node without a 'class'
    attribute is treated as having no style class.

    :return: the replacement or node.text (may be None)
    """
    style_classes = (node.get('class') or '').split(' ')
    if any(style in SonderzeichenList for style in style_classes):
        replacement = MarkForeignHands.REPLACE_DICT.get(node.text)
        if bool(replacement):
            return replacement
    return node.text
def is_close(mark_foreign_hands_position, matrix_string, transkription_field=None):
    """Return true if the y value of the matrix differs from mark_foreign_hands_position by less than THRESHOLD_Y.

    The matrix is created from matrix_string and transkription_field.
    """
    THRESHOLD_Y = 4
    node_matrix = Matrix(transform_matrix_string=matrix_string, transkription_field=transkription_field)
    vertical_distance = abs(mark_foreign_hands_position - node_matrix.getY())
    return vertical_distance < THRESHOLD_Y
Index: svgscripts/datatypes/image.py
===================================================================
--- svgscripts/datatypes/image.py (revision 112)
+++ svgscripts/datatypes/image.py (revision 113)
@@ -1,157 +1,157 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent all image types.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
-from os.path import isfile
+from os.path import isfile, basename
import sys
from .attachable_object import AttachableObject
from .matrix import Matrix
from .text_field import TextField
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Image(AttachableObject,SemanticClass):
    """
    This super class represents all types of images.

    Args:
        node (lxml.etree.Element) node, containing information
        file_name (str): name of the image file.
        local_path (str): local path of the image file.
        URL (str): URL of image file.
        height (float): height of image
        width (float): width of image
        matrix (.matrix.Matrix) transformation of the image
        text_field (.text_field.TextField) text_field on image representation
        tag (str): tag of the xml node that represents the image
    """
    stringKeys = [ 'file_name', 'URL', 'local_path' ]
    floatKeys = [ 'height', 'width' ]
    XML_TAG = 'image'
    SECONDARY_URL = 'http://localhost:8000/'
    FAKSIMILE_DIR = 'faksimiles/'

    def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, matrix=None, text_field=None, tag=XML_TAG):
        """Initialize the image from node if given, otherwise from the keyword arguments.
        """
        self.text_field = text_field
        self.tag = tag
        if node is not None:
            self.file_name = node.get('file-name')
            self.local_path = node.get('local-path')
            self.URL = node.get('URL')
            # A missing size attribute falls back to the constructor default
            # instead of failing in float(None).
            self.height = float(node.get('height')) if bool(node.get('height')) else 0.0
            self.width = float(node.get('width')) if bool(node.get('width')) else 0.0
            transform = node.get('transform')
            self.transform = Matrix(transform) if bool(transform) and 'matrix(' in transform else None
            text_field_node = node.find(TextField.XML_TAG)
            if text_field_node is not None:
                self.text_field = TextField(node=text_field_node)
        else:
            self.file_name = file_name
            self.local_path = local_path
            self.URL = URL
            self.height = height
            self.width = width
            self.transform = matrix
        self.primaryURL = self.URL
        self.secondaryURL = None
        if self.file_name is not None:
            # svg files are served from the root, all other images from the faksimile directory.
            self.secondaryURL = self.SECONDARY_URL + self.file_name.replace('./','')\
                    if self.file_name.endswith('svg')\
                    else self.SECONDARY_URL + self.FAKSIMILE_DIR + self.file_name
        self.transform_string = self.transform.toString()\
                if self.transform is not None\
                else None

    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.

        The first node with self.tag below target_tree is reused; otherwise a new
        node is appended. Float keys are rounded to three digits, keys that are
        None are skipped and the transform is written only for rotation matrices.
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        obj_node = target_tree.find('.//' + self.tag)
        if obj_node is None:
            obj_node = ET.SubElement(target_tree, self.tag)
        for key in self.floatKeys:
            value = getattr(self, key)
            if value is not None:
                obj_node.set(key.replace('_','-'), str(round(value, 3)))
        for key in self.stringKeys:
            value = getattr(self, key)
            if value is not None:
                obj_node.set(key.replace('_','-'), value)
        if self.transform is not None and self.transform.isRotationMatrix():
            obj_node.set('transform', self.transform.toString())
        if self.text_field is not None:
            self.text_field.attach_object_to_tree(obj_node)

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        properties = {}
        for floatKey in Image.floatKeys:
            properties.update(cls.create_semantic_property_dictionary(floatKey, float, cardinality=1))
        properties.update(cls.create_semantic_property_dictionary('file_name', str, cardinality=1))
        properties.update(cls.create_semantic_property_dictionary('text_field', TextField))
        properties.update(cls.create_semantic_property_dictionary('transform_string', str, name='hasTransform'))
        properties.update(cls.create_semantic_property_dictionary('primaryURL', str, cardinality=1, subPropertyOf=cls.HAS_URL))
        properties.update(cls.create_semantic_property_dictionary('secondaryURL', str, cardinality=1, subPropertyOf=cls.HAS_URL))
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary
class SVGImage(Image):
"""This class represents a svg image.
"""
XML_TAG = 'svg-image'
# Prefix of the URL that points to the local copy of the file.
ASSETS_FOLDER = '/assets/'
# Prefix of the URL under which the svg file is published (changed in rev. 113).
- URL_PREFIX = 'http://existdb-test.dasch.swiss/exist/rest/db/storage/nietzsche/'
+ URL_PREFIX = 'https://nietzsche.philhist.unibas.ch/svg/'
def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
# A node with a different tag is not handed to Image: file name and size are read from its
# attributes 'file', 'height' and 'width' (0.0 if a size is missing) and node is dropped.
if node is not None and node.tag != self.XML_TAG:
file_name = node.get('file')
height = float(node.get('height')) if bool(node.get('height')) else 0.0
width = float(node.get('width')) if bool(node.get('width')) else 0.0
node = None
# The tag argument is not passed on; the node is always written with self.XML_TAG.
super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\
height=height, width=width, text_field=text_field, tag=self.XML_TAG)
# Rev. 113 swaps the roles: the primary URL is now URL_PREFIX plus the base name of the file,
# the secondary URL is the path below ASSETS_FOLDER.
# NOTE(review): self.file_name is used without a None check here -- confirm it is always set.
- self.primaryURL = self.ASSETS_FOLDER + self.file_name.replace('./', '')
- self.secondaryURL = self.URL_PREFIX + self.file_name.replace('./', '')
+ self.primaryURL = self.URL_PREFIX + basename(self.file_name.replace('./', ''))
+ self.secondaryURL= self.ASSETS_FOLDER + self.file_name.replace('./', '')
def decontextualize_file_name(self, update_url=None):
"""Decontextualize file name.
"""
# Removes './' from the file name; if update_url is given, self.URL becomes update_url
# followed by the new file name.
self.file_name = self.file_name.replace('./', '')
if update_url is not None:
self.URL = update_url + self.file_name
# @classmethod
# def get_semantic_dictionary(cls):
# """ Creates and returns a semantic dictionary as specified by SemanticClass.
# """
# dictionary = super(SVGImage,cls).get_semantic_dictionary()
# return cls.return_dictionary_after_updating_super_classes(dictionary)
Index: svgscripts/datatypes/text.py
===================================================================
--- svgscripts/datatypes/text.py (revision 112)
+++ svgscripts/datatypes/text.py (revision 113)
@@ -1,219 +1,223 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a text that may have standoff markup.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
from lxml import etree as ET
import re
import sys
from .attachable_object import AttachableObject
from .standoff_tag import StandoffTag
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Text(AttachableObject,SemanticClass):
"""
This class represents a text that may have standoff markup.
"""
# Matches text without '<' followed by a '<...>' span that contains no '/'.
TAG_PATTERN = re.compile(r'([^<]*)(<[^/]+>)')
#START_TAG_PATTERN = re.compile(r'.*<[a-z]+>')
# NOTE(review): '(?!)' is an empty negative lookahead that can never succeed, so this pattern
# matches nothing as written -- presumably part of the expression got lost; confirm.
START_TAG_PATTERN = re.compile(r'[^<]*(?!)[^<]*<[a-z]+>')
# Default tag of the node that represents a Text.
XML_TAG = 'text-with-markup'
# Tag of the child node that holds the plain content.
XML_SUB_TAG = 'text'
# (rev. 113) Tag of the content node if the Text itself is written with the tag XML_SUB_TAG.
+ XML_ALT_SUB_TAG = 'content'
def __init__(self, content=None, standoff_markups=None, id=0, tag=XML_TAG):
    """Create a text.

    Args:
        content: the plain text.
        standoff_markups: list of standoff tags; a new empty list if None.
        id: identifier, stored as str.
        tag: tag of the xml node that represents the text.
    """
    if standoff_markups is None:
        standoff_markups = []
    self.id = str(id)
    self.tag = tag
    self.content = content
    self.standoff_markups = standoff_markups
def append(self, content: str) -> int:
    """Extend text with content.

    A text without content (None, the constructor default) is treated as empty.

    [:return:] startIndex of appended content
    """
    if self.content is None:
        self.content = ''
    startIndex = len(self.content)
    self.content += content
    return startIndex
def attach_object_to_tree(self, target_tree):
"""Attach object to tree.
"""
# An _ElementTree is replaced by its root node.
if target_tree.__class__.__name__ == '_ElementTree':
target_tree = target_tree.getroot()
# A node with this tag and id is reused if there is one below target_tree, otherwise a new
# child is created.
obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \
if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \
else ET.SubElement(target_tree, self.tag)
obj_node.set('id', self.id)
# (rev. 113) The content node is named XML_ALT_SUB_TAG if the text node itself already uses
# XML_SUB_TAG as its tag.
# NOTE(review): a content node is appended on every call, also to a reused node -- confirm
# that repeated attachment is not expected.
- text_node = ET.SubElement(obj_node, self.XML_SUB_TAG)
+ text_tag = self.XML_SUB_TAG\
+ if self.tag != self.XML_SUB_TAG\
+ else self.XML_ALT_SUB_TAG
+ text_node = ET.SubElement(obj_node, text_tag)
text_node.text = self.content
# The markups are renumbered by their list position before they are attached.
for index, markup in enumerate(self.standoff_markups):
markup.id = str(index)
markup.attach_object_to_tree(obj_node)
def extract_part(self, text_part, css_filter=';'):
    """Return the first occurrence of text_part in content as a new Text.

    Only markups whose css_string ends with css_filter (a ';' is appended if
    missing) and that overlap the part are kept; their indices are shifted
    and clipped to the part.

    :return: datatypes.text.Text
    :raises Exception: if text_part is not contained in content.
    """
    if not css_filter.endswith(';'):
        css_filter += ';'
    if text_part not in self.content:
        msg = f'ERRROR {text_part} is not a part of {self.content}!'
        raise Exception(msg)
    part_start_index = self.content.find(text_part)
    part_end_index = part_start_index + len(text_part)
    new_markups = []
    for markup in self.standoff_markups:
        if not markup.css_string.endswith(css_filter):
            continue
        covers_start = markup.startIndex <= part_start_index\
                and markup.endIndex > part_start_index
        lies_inside = markup.startIndex >= part_start_index\
                and markup.startIndex < part_end_index\
                and markup.endIndex <= part_end_index
        covers_end = markup.startIndex < part_end_index\
                and markup.endIndex >= part_end_index
        if not (covers_start or lies_inside or covers_end):
            continue
        if markup.startIndex > part_start_index:
            startIndex = markup.startIndex - part_start_index
        else:
            startIndex = 0
        if markup.endIndex <= part_end_index:
            endIndex = markup.endIndex - part_start_index
        else:
            endIndex = len(text_part)
        new_markups.append(StandoffTag(markup.markup, startIndex, endIndex))
    return Text(content=text_part, standoff_markups=new_markups)
def join(self, other):
    """Append other to self, separated by a blank.

    The markups of other are shifted by the length of the old content plus
    the blank and are added to the markups of self (the markup objects of
    other are modified in place).
    """
    blank_index = self.append(' ' + other.content)
    correction = blank_index + 1
    for moved_markup in other.standoff_markups:
        moved_markup.startIndex += correction
        moved_markup.endIndex += correction
    self.standoff_markups += other.standoff_markups
    del other
def markup_contains_css_filter(self, css_filter) ->bool:
    """Tell whether the css_string of at least one markup ends with css_filter.

    A ';' is appended to css_filter if it does not end with one.
    """
    wanted_ending = css_filter if css_filter.endswith(';') else css_filter + ';'
    return any(markup.css_string.endswith(wanted_ending) for markup in self.standoff_markups)
@classmethod
def create_cls_from_node(cls, node):
    """Create a text from its xml node.

    The content is the first text of the XML_SUB_TAG child ('' if there is
    none), the markups are created from the markup children of node; id and
    tag are taken from node.

    [:return:] cls
    """
    markup_query = './' + '|./'.join(StandoffTag.MARKUP_STYLES)
    standoff_markups = []
    for markup_node in node.xpath(markup_query):
        standoff_markups.append(StandoffTag.create_cls_from_node(markup_node))
    text_fragments = node.xpath('./' + cls.XML_SUB_TAG + '/text()')
    if len(text_fragments) > 0:
        text = text_fragments[0]
    else:
        text = ''
    return cls(text, standoff_markups=standoff_markups, id=node.get('id'), tag=node.tag)
@classmethod
def create_cls_from_html(cls, html):
    """Creates a Text from a html string.

    Escaped angle brackets are unescaped first; the tags are then removed from
    the string and turned into standoff markups by extract_standoff_data.

    :return: a (datatypes.text) Text
    """
    # The replacements used to map '<' and '>' onto themselves, which had no effect.
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html, standoff_markups = extract_standoff_data(html)
    return cls(html, standoff_markups=standoff_markups)
@classmethod
def get_semantic_dictionary(cls):
    """ Creates a semantic dictionary as specified by SemanticClass.

    The dictionary describes the properties 'content' and 'standoff_markups'.
    """
    content_property = cls.create_semantic_property_dictionary('content', str, cardinality=1,\
            name='textHasContent', label='content of text', comment='Connects a text with its content.')
    markup_property = cls.create_semantic_property_dictionary('standoff_markups', StandoffTag,\
            name='textHasMarkup', label='standoff markup of text', comment='Connects a text with a list of standoff tags.')
    properties = {}
    properties.update(content_property)
    properties.update(markup_property)
    dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def extract_standoff_data(html) ->(str, list):
    """Extract standoff data and return the html string without tags and a list of standoff data.

    Tags are processed from left to right as long as the first '<' of the
    string opens a lower case start tag. Start and end tag are removed; tags
    known to StandoffTag.HTML_TAG_DICTIONARY are recorded with the indices
    their content has in the tag free string. Nested tags are extracted
    recursively before the end tag of the enclosing tag is located.

    :raises Exception: if a start tag has no matching end tag.
    """
    # The first '<' of the string has to open a start tag; group 1 is the tag name.
    start_tag_pattern = re.compile(r'[^<]*<([a-z]+)>')
    standoff_markups = []
    tag_matched = start_tag_pattern.match(html)
    while tag_matched:
        tag = tag_matched.group(1)
        start_tag = f'<{tag}>'
        end_tag = f'</{tag}>'
        startIndex = tag_matched.start(1) - 1
        html = html[:startIndex] + html[startIndex+len(start_tag):]
        endIndex = html.find(end_tag)
        if endIndex > -1 and re.search(r'<[a-z]+>', html[:endIndex]):
            # Another tag starts before the end tag: remove the nested tags first,
            # otherwise the end index would still count their characters.
            html, new_standoff_data = extract_standoff_data(html)
            standoff_markups += new_standoff_data
            endIndex = html.find(end_tag)
        if endIndex == -1:
            msg = f'HTML string contains no ending tag for {tag}!'
            raise Exception(msg)
        html = html[:endIndex] + html[endIndex+len(end_tag):]
        markup = StandoffTag.HTML_TAG_DICTIONARY.get(start_tag)
        if bool(markup):
            standoff_markups.append(StandoffTag(markup, startIndex, endIndex))
        tag_matched = start_tag_pattern.match(html)
    return html, standoff_markups
Index: svgscripts/datatypes/text_field.py
===================================================================
--- svgscripts/datatypes/text_field.py (revision 112)
+++ svgscripts/datatypes/text_field.py (revision 113)
@@ -1,52 +1,70 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class represents a text field on a faksimile image.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
import sys
import re
from os import path, sep
import lxml.etree as ET
from .positional_object import PositionalObject
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__version__ = "0.0.1"
class TextField(PositionalObject):
    """
    This class represents the text field of a faksimile image.

    Args:
        id (str): id from svg file.
        width (float)
        height (float)
        x (float)
        y (float)
    """
    XML_TAG = 'text-field'
    SVG_TAG = 'rect'

    def __init__(self, id=0, node=None, width=0.0, height=0.0, x=0.0, y=0.0, matrix=None):
        super(TextField, self).__init__(node=node, id=id, width=width, height=height, x=x, y=y, matrix=matrix, tag=self.XML_TAG)
        # Bounding box of the field, derived from left/top/width/height.
        self.xmin = self.left
        self.xmax = self.left + self.width
        self.ymin = self.top
        self.ymax = self.top + self.height

    def update_rect(self, rect, fill='green', opacity='0.3'):
        """Write position, size, fill and opacity of this text field to rect.

        The style attribute of rect is reset to an empty string.

        Args:
            rect: node offering set(key, value), e.g. a lxml.etree.Element.
            fill (str): value of the fill attribute.
            opacity (str): value of the opacity attribute.
        """
        rect.set('x', str(self.left))
        rect.set('y', str(self.top))
        rect.set('height', str(self.height))
        rect.set('width', str(self.width))
        rect.set('fill', fill)
        rect.set('opacity', str(opacity))
        rect.set('style', '')

    def attach_as_rect(self, node, id=None, fill='green', opacity='0.3'):
        """Append a new SVG_TAG element that shows this text field to node.

        Args:
            node: parent node the new element is appended to.
            id: id of the new element, defaults to the id of this text field.
            fill (str): value of the fill attribute.
            opacity (str): value of the opacity attribute.
        """
        rect_id = self.id if id is None else id
        rect = ET.Element(self.SVG_TAG)
        # Attribute values have to be strings, the default id of __init__ is the int 0.
        rect.set('id', str(rect_id))
        self.update_rect(rect, fill=fill, opacity=opacity)
        node.append(rect)
+
Index: svgscripts/datatypes/standoff_tag.py
===================================================================
--- svgscripts/datatypes/standoff_tag.py (revision 112)
+++ svgscripts/datatypes/standoff_tag.py (revision 113)
@@ -1,152 +1,152 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the standoff markup of a text.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
import re
import sys
from .attachable_object import AttachableObject
sys.path.append('py2ttl')
from class_spec import SemanticClass
class StandoffTag(AttachableObject,SemanticClass):
"""
This class represents the standoff markup of a text.
"""
MARKUP_STYLES = [ 'bold', 'italic', 'delete', 'underline' ]
RDFS_SUBCLASSOF_LIST = ['http://www.nie.org/ontology/standoff#StandoffMarkup']
# Style entries are relevant if their 'font-family' starts with 'Frutiger-' (see create_relevant_style_dictionary) ...
RELEVANT_STYLE_KEY = 'font-family'
RELEVANT_CONTENT_STARTSWITH = 'Frutiger-'
# ... and they produce a markup if the font family ends with 'Italic' or 'Bold' (see create_cls).
RELEVANT_PATTERN = re.compile('.*(Italic|Bold)$')
RELEVANT_SUB_PATTERN = re.compile('Frutiger-(Light)*')
STOFF_HAS_CSS_URL_STRING = 'http://www.nie.org/ontology/standoff#hasCSS'
STOFF_HAS_START_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasStartIndex'
STOFF_HAS_END_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasEndIndex'
# Maps a html start tag to its markup. The keys have to be distinct start tags
# of the form '<tag>': four identical empty keys collapse into a single entry.
# NOTE(review): the tag names were lost in this dump and are restored from the markup names — confirm.
HTML_TAG_DICTIONARY = { '<i>': 'italic', '<b>': 'bold', '<del>': 'delete', '<u>': 'underline' }
CSS_DICTIONARY = { 'bold': 'font-weight:bold;',
                   'italic': 'font-style: italic;',
                   'underline': 'text-decoration:underline;',
                   'delete': 'text-decoration:line-through;' }
def __init__(self, markup: str, startIndex: int, endIndex: int, id=0):
    """Create a standoff tag for markup that reaches from startIndex to endIndex.

    The id is stored as a string. css_string is looked up in CSS_DICTIONARY
    and is None if markup has no entry there.
    """
    self.markup = markup
    self.startIndex, self.endIndex = startIndex, endIndex
    self.css_string = self.CSS_DICTIONARY.get(markup)
    self.id = str(id)
def attach_object_to_tree(self, target_tree):
    """Attach object to tree.

    An existing node with the tag of this markup and the id of this object
    is updated, otherwise a new sub element is appended to target_tree.
    The node receives the attributes id, start and end.
    """
    if target_tree.__class__.__name__ == '_ElementTree':
        target_tree = target_tree.getroot()
    # Run the query once and reuse its result.
    existing_nodes = target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)
    obj_node = existing_nodes[0] if len(existing_nodes) > 0\
            else ET.SubElement(target_tree, self.markup)
    obj_node.set('id', self.id)
    obj_node.set('start', str(self.startIndex))
    obj_node.set('end', str(self.endIndex))
@classmethod
def create_cls(cls, start_index, end_index, style_string, page=None, style_dict=None):
    """Creates a StandoffTag from a style_string.

    Args:
        start_index (int): start index of the created tags.
        end_index (int): end index of the created tags.
        style_string (str): style keys separated by a single space.
        page: if given, the style dictionary is created from page.
        style_dict (dict): style dictionary that is used if page is None.

    :return: a list of (datatypes.standoff_tag) StandoffTag
    """
    if page is not None:
        style_dict = cls.create_relevant_style_dictionary(page)
    # Check for a missing dictionary before it is used for the key lookup.
    if not style_dict:
        return []
    standoff_tags = []
    # dict.fromkeys removes duplicates and keeps the order of style_string,
    # so the order of the returned tags is the same on every run.
    for style_key in dict.fromkeys(style_string.split(' ')):
        if style_key not in style_dict:
            continue
        font_family = style_dict[style_key][cls.RELEVANT_STYLE_KEY]
        if re.match(cls.RELEVANT_PATTERN, font_family):
            markup = re.sub(cls.RELEVANT_SUB_PATTERN, '', font_family).lower()
            standoff_tags.append(cls(markup, start_index, end_index))
    return standoff_tags
@classmethod
def create_cls_from_node(cls, node):
    """Creates a StandoffTag from a node.

    The tag of the node becomes the markup, its attributes start and end
    are converted to int and its attribute id is passed on unchanged.

    :return: (datatypes.standoff_tag) StandoffTag
    """
    start_index = int(node.get('start'))
    end_index = int(node.get('end'))
    return cls(node.tag, start_index, end_index, id=node.get('id'))
@classmethod
def create_relevant_style_dictionary(cls, page):
    """Return the entries of page.style_dict that are relevant for standoff tags.

    An entry is relevant if it contains RELEVANT_STYLE_KEY and the value
    of this key starts with RELEVANT_CONTENT_STARTSWITH.
    """
    style_key = cls.RELEVANT_STYLE_KEY
    relevant_prefix = cls.RELEVANT_CONTENT_STARTSWITH
    relevant_styles = {}
    for key, key_dict in page.style_dict.items():
        if style_key not in key_dict.keys():
            continue
        if key_dict[style_key].startswith(relevant_prefix):
            relevant_styles[key] = key_dict
    return relevant_styles
@classmethod
def get_semantic_dictionary(cls):
    """Build the semantic dictionary of this class as specified by SemanticClass.

    The properties startIndex, endIndex and css_string are described.
    The markup itself is not described as a property: the corresponding
    'standoffTagHasMarkup' entry is disabled.
    """
    property_specifications = [
        ('startIndex', int, dict(cardinality=1, subPropertyOf=cls.STOFF_HAS_START_INDEX,
                                 name='standoffTagHasStartIndex', label='standoff tag has a start index',
                                 comment='Connects a standoff tag with its start index.')),
        ('endIndex', int, dict(cardinality=1, subPropertyOf=cls.STOFF_HAS_END_INDEX,
                               name='standoffTagHasEndIndex', label='standoff tag has a end index',
                               comment='Connects a standoff tag with its end index.')),
        ('css_string', str, dict(subPropertyOf=cls.STOFF_HAS_CSS_URL_STRING,
                                 name='standoffTagHasCSS', label='standoff tag has css',
                                 comment='Connects a standoff tag with CSS style.')),
    ]
    properties = {}
    for attribute_name, attribute_type, options in property_specifications:
        properties.update(cls.create_semantic_property_dictionary(attribute_name, attribute_type, **options))
    semantic_dictionary = {
        cls.CLASS_KEY: cls.get_class_dictionary(),
        cls.PROPERTIES_KEY: properties,
    }
    return cls.return_dictionary_after_updating_super_classes(semantic_dictionary)
def is_joinable(self, other):
    """Return true if self and other have same markup and self.endIndex+1 == other.startIndex.

    This is the rule of revision 113: other has to start at the index that
    directly follows the end index of self.
    """
    return self.markup == other.markup and self.endIndex+1 == other.startIndex
def join(self, other):
    """Extend self to the end of other by taking over the end index of other.
    """
    joined_end_index = other.endIndex
    self.endIndex = joined_end_index
def join_list(self, others):
    """Join self with every joinable item of others and return the items that were not joined.

    The items are processed in their order: an item can become joinable
    because an earlier item was joined before.
    """
    remaining_others = []
    for candidate in others:
        if not self.is_joinable(candidate):
            remaining_others.append(candidate)
            continue
        self.join(candidate)
    return remaining_others
Index: svgscripts/datatypes/faksimile.py
===================================================================
--- svgscripts/datatypes/faksimile.py (revision 112)
+++ svgscripts/datatypes/faksimile.py (revision 113)
@@ -1,205 +1,209 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a faksimile page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import re
from lxml import etree as ET
from os import path
from os.path import isdir, isfile, sep, basename
from svgpathtools.parser import parse_path
from .faksimile_image import FaksimileImage
from .matrix import Matrix
from .text_field import TextField
from .word_position import WordPosition
class FaksimilePage:
"""
This class represents a faksimile page.
Args:
xml_target_file (str): name of the xml file to which page info will be written.
xml_source_file (str): name of the xml file that will be instantiated.
"""
XML_TAG = 'faksimile-page'
def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None):
# NOTE(review): the indentation of this method was lost in this dump, the nesting
# of the `if` statements below cannot be read from the text — confirm against the original file.
# The source file has precedence, the target file is used only if no source file is given.
xml_file = xml_source_file if xml_source_file is not None else xml_target_file
self.title = title
self.page_number = page_number
self.xml_file = xml_file
# An existing xml file is parsed; title, page number, width and height are read from its root.
if xml_file is not None and isfile(xml_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(xml_file, parser)
self.title = self.page_tree.getroot().get('title')
self.page_number = self.page_tree.getroot().get('page-number')
self.width = float(self.page_tree.getroot().get('width')) if bool(self.page_tree.getroot().get('width')) else 0.0
self.height = float(self.page_tree.getroot().get('height')) if bool(self.page_tree.getroot().get('height')) else 0.0
# Without an existing file a new tree is created that has XML_TAG as root.
else:
self.page_tree = ET.ElementTree(ET.Element(self.XML_TAG))
if title is not None:
self.page_tree.getroot().set('title', title)
if page_number is not None:
self.page_tree.getroot().set('page-number', str(page_number))
# With a target file the WordPosition.FAKSIMILE nodes are removed from the tree.
if xml_target_file is not None:
self.remove_tags_from_page_tree([WordPosition.FAKSIMILE])
if svg_source_file is not None:
self.page_tree.getroot().set('svg-source-file', svg_source_file)
if faksimile_image is not None:
faksimile_image.attach_object_to_tree(self.page_tree)
if text_field is not None:
text_field.attach_object_to_tree(self.page_tree)
# The following attributes are read from the tree, so they show what the tree contains.
self.svg_source_file = self.page_tree.getroot().get('svg-source-file')
self.faksimile_image = FaksimileImage(node=self.page_tree.getroot().find('.//' + FaksimileImage.XML_TAG))\
if len(self.page_tree.getroot().findall('.//' + FaksimileImage.XML_TAG)) > 0 else None
self.text_field = TextField(node=self.page_tree.getroot().find('.//' + TextField.XML_TAG))\
if len(self.page_tree.getroot().findall('.//' + TextField.XML_TAG)) > 0 else None
self.word_positions = [ WordPosition(node=node) for node in self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE) ]\
if len(self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE)) > 0 else []
def append_word_position(self, word_position):
    """Add word_position to the word positions of this page and attach it to the page tree.
    """
    known_word_positions = self.word_positions
    known_word_positions.append(word_position)
    word_position.attach_object_to_tree(self.page_tree)
@classmethod
def get_faksimile_pages(cls, svg_file, page_number='', isBlank=False) -> list:
    """Creates and returns the faksimile pages contained in a svg_file as a list.

    Args:
        svg_file (str): name of the svg file that is parsed.
        page_number: passed on to GET_FAKSIMILEPAGES.
        isBlank (bool): passed on to GET_FAKSIMILEPAGES.
    """
    svg_tree = ET.parse(svg_file)
    # The default namespace has the key None in nsmap, it is made available as 'ns'.
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    return cls.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces, page_number=page_number, isBlank=isBlank)
@staticmethod
- def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='') -> list:
+ def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='', isBlank=False) -> list:
"""Creates and returns text fields contained in a svg_tree as a list.
"""
# NOTE(review): this block is a diff hunk (revision 112 -> 113): lines that start with `-` are removed,
# lines that start with `+` are added. The indentation was lost in this dump, so the nesting of the
# added `if` statements cannot be read from the text — confirm against the original file.
# The returned list contains FaksimilePage objects, one for every rect that is used as text field.
# THRESHOLD_X is subtracted from the right border of the text field when words are collected.
THRESHOLD_X = 10
if namespaces is None:
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
source_file_name = svg_tree.docinfo.URL
image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap), source_file_name)
xml_dir = '.{}xml'.format(sep)
faksimile_pages = list()
# The title is derived from the name of the svg file without its page part and extension.
title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name))
- if re.match(r'.*-\d+[a-z]$', title_string):
- title_string = re.sub(r'-\d+[a-z]$', '', title_string)
title = title_string.replace('-', ' ')
+ if re.match(r'.*-\d+[a-z]*$', title_string):
+ title_string = re.sub(r'-\d+[a-z]*$', '', title_string)
# Text fields are the rects whose id starts with title_string and ends with page_number.
rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap)\
if rect.get('id', svg_tree.getroot().nsmap).startswith(title_string)\
and rect.get('id', svg_tree.getroot().nsmap).endswith(str(page_number)) ]
+ if isBlank:
+ rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap) ]
+ if len(rect_list) == 0 and page_number != '':
+ return FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces)
for text_field_rect in rect_list:
# Position of the text field relative to the position of the image.
tf_x = float(text_field_rect.get('x', svg_tree.getroot().nsmap)) - image.x
tf_y = float(text_field_rect.get('y', svg_tree.getroot().nsmap)) - image.y
tf_width = float(text_field_rect.get('width', svg_tree.getroot().nsmap))
tf_height = float(text_field_rect.get('height', svg_tree.getroot().nsmap))
tf_matrix = Matrix(transform_matrix_string=text_field_rect.get('transform'))\
if bool(text_field_rect.get('transform'))\
else None
id = text_field_rect.get('id', svg_tree.getroot().nsmap)
target_file_name = xml_dir + sep + id + '.xml' if isdir(xml_dir) else id + '.xml'
# The page number is the part of the id that follows the last separator, without leading zeros.
- page_number = re.sub(r'.*[,_]', '', id)
+ page_number = re.sub(r'.*[,_-]', '', id)
if page_number.startswith('0'):
page_number = page_number.lstrip('0')
text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y, matrix=tf_matrix)
faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\
title=title, page_number=page_number, faksimile_image=image, text_field=text_field)
# The position of the image is added again to compare with the coordinates of the svg file.
x_min = text_field.xmin + image.x
y_min = text_field.ymin + image.y
#rect_titles = svg_tree.getroot().xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}"]/ns:title'.format(\
# x_min, text_field.xmax + image.x - THRESHOLD_X, y_min, text_field.ymax + image.y, text_field.id), namespaces=namespaces)
# Words are the titled rects and paths inside the text field, the rect of the text field itself is excluded by its id.
rect_titles = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\
y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces)
rect_titles += get_paths_inside_rect(svg_tree, '//ns:path/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\
y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces)
for rect_title in rect_titles:
rect = rect_title.getparent()
x, y, height, width = 0.0, 0.0, 0.0, 0.0
# A path is positioned by its bounding box, a rect by its attributes.
if rect.tag.endswith('path'):
path = parse_path(rect.get('d'))
x, xmax, y, ymax = path.bbox()
width = xmax - x
height = ymax - y
else:
x = float(rect.get('x', svg_tree.getroot().nsmap))
y = float(rect.get('y', svg_tree.getroot().nsmap))
height = float(rect.get('height', svg_tree.getroot().nsmap))
width = width=float(rect.get('width', svg_tree.getroot().nsmap))
matrix = None
if bool(rect.get('transform')):
matrix = Matrix(transform_matrix_string=rect.get('transform'))
# Whitespace directly before or after one of the listed punctuation characters is removed from the title.
text = re.sub(r'(\s(?=[-;:.,…?!’–])|(?<=[-;:.,…?!’–])\s)', '', rect_title.text)
faksimile_page.append_word_position(\
WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=text, height=height,\
width=width, x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE))
faksimile_pages.append(faksimile_page)
return faksimile_pages
def remove_tags_from_page_tree(self, list_of_tags_to_remove):
    """Remove every node from the page tree that is selected by one of the given tags.

    Each tag is used as xpath '//tag', the selected nodes are removed from their parents.
    """
    for tag_to_remove in list_of_tags_to_remove:
        obsolete_nodes = self.page_tree.xpath('//' + tag_to_remove)
        for obsolete_node in obsolete_nodes:
            parent_node = obsolete_node.getparent()
            parent_node.remove(obsolete_node)
def get_paths_inside_rect(svg_tree, xpath, x_min, x_max, y_min, y_max, not_id, namespaces=None):
    """Returns a list of all nodes selected by xpath that are inside x_min, x_max, y_min, y_max and do not have id == not_id.

    A selected node that is neither a rect nor a path (e.g. a title) is
    positioned by its parent, but the selected node itself is returned.
    A node is inside if its x/y position (after applying its transform) lies
    strictly between the given minima and maxima. Nodes without a usable
    position are skipped.

    Args:
        svg_tree: tree offering xpath(xpath, namespaces=...).
        xpath (str): xpath that selects the candidate nodes.
        x_min, x_max, y_min, y_max (float): borders of the area.
        not_id (str): id of the rect/path that is excluded.
        namespaces (dict): namespaces for xpath; created from the nsmap of
            the root if empty or None.
    """
    if not namespaces:
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    paths = []
    for selected_node in svg_tree.xpath(xpath, namespaces=namespaces):
        shape_node = selected_node
        if not shape_node.tag.endswith('path') and not shape_node.tag.endswith('rect'):
            shape_node = shape_node.getparent()
        if shape_node.tag.endswith('rect'):
            # -1 marks a missing coordinate.
            x = float(shape_node.get('x')) if bool(shape_node.get('x')) else -1
            y = float(shape_node.get('y')) if bool(shape_node.get('y')) else -1
        elif shape_node.tag.endswith('path') and bool(shape_node.get('d')):
            x, _, y, _ = parse_path(shape_node.get('d')).bbox()
        else:
            continue
        if bool(shape_node.get('transform')):
            matrix = Matrix(transform_matrix_string=shape_node.get('transform'))
            # Both values are computed from the untransformed x and y: transforming
            # y with an already transformed x is wrong for a rotating matrix.
            x, y = matrix.get_new_x(x=x, y=y), matrix.get_new_y(x=x, y=y)
        if x_min < x < x_max and y_min < y < y_max\
                and shape_node.get('id') != not_id:
            paths.append(selected_node)
    return paths
Index: svgscripts/datatypes/faksimile_image.py
===================================================================
--- svgscripts/datatypes/faksimile_image.py (revision 112)
+++ svgscripts/datatypes/faksimile_image.py (revision 113)
@@ -1,122 +1,127 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent faksimile images.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import fnmatch
from lxml import etree as ET
import os
from os.path import basename, dirname, isfile, realpath, sep
import sys
from .image import Image
from .matrix import Matrix
from .text_field import TextField
sys.path.append('svgscripts')
from local_config import FAKSIMILE_LOCATION
class FaksimileImage(Image):
"""
This class represents a faksimile image.
Args:
file_name (str): name of the image file.
node (lxml.etree.Element) node, containing information
URL (str): URL of image file.
height (float): height of image
width (float): width of image
x (float): x
y (float): y
"""
XML_TAG = 'faksimile-image'
# Value of NIETZSCHE_SOURCES_URL before revision 113.
OLD_NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/'
# Base of the primary URL and of the thumb URL of an image.
NIETZSCHE_SOURCES_URL = 'https://nietzsche.philhist.unibas.ch/faksimiles/'
# Base of the api URL of an image.
NIETZSCHE_SOURCES_API_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/'
# Base of the medium URL of an image.
NIETZSCHE_SOURCES_IMAGE_API_URL = 'http://www.nietzschesource.org/DFGAapi/images/DFGA/'
def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, text_field=None):
    """Initialize the image and derive its URLs from its file name.

    If the image has a file name, primaryURL and thumbURL are built from
    NIETZSCHE_SOURCES_URL, apiURL from NIETZSCHE_SOURCES_API_URL and
    mediumURL from NIETZSCHE_SOURCES_IMAGE_API_URL. Without a file name
    apiURL, thumbURL and mediumURL are None.
    """
    super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL, local_path=local_path,\
            height=height, width=width, matrix=matrix, text_field=text_field, tag=self.XML_TAG)
    self.x = x
    self.y = y
    self.apiURL = None
    self.thumbURL = None
    self.mediumURL = None
    if self.file_name is not None:
        nsource_page_name = self.file_name.replace('.jpg','')
        # The manuscript name is the part of the page name in front of the first comma.
        nsource_manuscript_name = nsource_page_name.split(',')[0]
        self.primaryURL = self.NIETZSCHE_SOURCES_URL + self.file_name
        # Since revision 113 the thumb URL uses the suffix '_thumb.jpg', no longer the '/mini/' image api.
        self.thumbURL = self.NIETZSCHE_SOURCES_URL + self.file_name.replace('.jpg', '_thumb.jpg')
        self.apiURL = self.NIETZSCHE_SOURCES_API_URL + nsource_page_name
        self.mediumURL = self.NIETZSCHE_SOURCES_IMAGE_API_URL + nsource_manuscript_name + '/medium/' + self.file_name
    # Revision 113 disabled the conversion of a primaryURL that starts with
    # NIETZSCHE_SOURCES_API_URL into apiURL plus a NIETZSCHE_SOURCES_URL based primaryURL.
def get_image_joined_with_text_field(self, text_field):
    """Return a new FaksimileImage with the values of this image and the given text_field (text_field.TextField).

    File name, local path, URL, height, width, x and y are copied from this image.
    """
    copied_values = dict(file_name=self.file_name, local_path=self.local_path, URL=self.URL,
                         height=self.height, width=self.width, x=self.x, y=self.y)
    return FaksimileImage(text_field=text_field, **copied_values)
@classmethod
def get_semantic_dictionary(cls):
    """Create and return the semantic dictionary of this class as specified by SemanticClass.

    The dictionary of the super class is extended by the properties
    apiURL, thumbURL and mediumURL, each a sub property of cls.HAS_URL.
    """
    dictionary = super(FaksimileImage,cls).get_semantic_dictionary()
    url_properties = dictionary[cls.PROPERTIES_KEY]
    for url_attribute in ('apiURL', 'thumbURL', 'mediumURL'):
        url_properties.update(cls.create_semantic_property_dictionary(url_attribute, str, subPropertyOf=cls.HAS_URL))
    return cls.return_dictionary_after_updating_super_classes(dictionary)
@staticmethod
def CREATE_IMAGE(image_node, source_file=None):
"""Instantiates a FaksimileImage from a (lxml.etree.Element) image_node.
"""
# NOTE(review): the indentation of this method was lost in this dump, so it cannot be read from the
# text whether the `isfile` fallback below belongs to the preceding `if` — confirm against the original file.
namespaces = image_node.nsmap
# Without namespaces the href attribute is looked up with an empty xlink namespace.
if len(namespaces) == 0:
namespaces = { 'xlink': '' }
local_path = image_node.get('{%s}href' % namespaces['xlink'])
file_name = basename(local_path)
# A href that contains a directory is resolved relative to the directory of source_file.
if file_name != local_path and source_file is not None:
local_path = realpath(dirname(source_file)) + sep + local_path
local_path = realpath(local_path)
# If there is no file at local_path, a file with the same name is searched below FAKSIMILE_LOCATION.
if not isfile(local_path):
local_path = None
for path, dirs, files in os.walk(os.path.abspath(FAKSIMILE_LOCATION)):
for filename in fnmatch.filter(files, file_name):
local_path = os.path.join(path, filename)
break
# The URL is NIETZSCHE_SOURCES_URL followed by the file name without '.jpg'.
URL = FaksimileImage.NIETZSCHE_SOURCES_URL + file_name.replace('.jpg','')
# Missing or empty size and position attributes are read as 0.0.
height = float(image_node.get('height')) if bool(image_node.get('height')) else 0.0
width = float(image_node.get('width')) if bool(image_node.get('width')) else 0.0
x = float(image_node.get('x')) if bool(image_node.get('x')) else 0.0
y = float(image_node.get('y')) if bool(image_node.get('y')) else 0.0
matrix = Matrix(transform_matrix_string=image_node.get('transform'))\
if bool(image_node.get('transform'))\
else None
return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y, matrix=matrix)
Index: svgscripts/datatypes/transkriptionField.py
===================================================================
--- svgscripts/datatypes/transkriptionField.py (revision 112)
+++ svgscripts/datatypes/transkriptionField.py (revision 113)
@@ -1,207 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to transform a svg file according to the dimension of its transkription field.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__version__ = "0.0.1"
import sys
from os.path import exists
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET
from xml.parsers.expat import ExpatError
from .matrix import Matrix
from .text_field import TextField
# NOTE(review): MAX_SMALLER_PATH_WIDTH is not used in the visible part of this file — confirm that it is still needed.
MAX_SMALLER_PATH_WIDTH = 50.0
# sort_according_to_area_desc keeps a path only if its area exceeds MAX_SMALLER_PATH_HEIGHT*documentWidth/4.
MAX_SMALLER_PATH_HEIGHT = 50.0
# get_path_area returns 0.0 if documentWidth minus the width of the path is not greater than this value.
MAX_DIFF_DOC_SELF_WIDTH = 100.0
# get_path_area returns 0.0 if documentHeight minus the height of the path is not greater than this value.
MAX_DIFF_DOC_SELF_HEIGHT = 100.0
# NOTE(review): MIN_AREA is not used in the visible part of this file — confirm that it is still needed.
MIN_AREA = 2500.0
class TranskriptionField:
"""
A class containing the dimensions of the transkription field.
Args:
filename (str): name of the svg file
"""
def __init__(self, filename, multipage_index=-1):
# NOTE(review): the indentation of this method was lost in this dump, so the nesting of the
# statements at its end (e.g. of `self.ymin_without_title`) cannot be read from the text —
# confirm against the original file.
self.width = 0.0
self.height = 0.0
self.xmin = 0.0
self.xmax = 0.0
self.ymin = 0.0
self.ymin_without_title = 0.0
self.ymax = 0.0
self.documentWidth = 0.0
self.documentHeight = 0.0
self.path = None
self.second_field = None
self.filename = filename
self.line_number_area_width = 0.0
# An ExpatError of the svg parser is raised again with a message that names the file.
try:
paths, attributes, self.svg_attributes = svg_to_paths.svg2paths(filename, return_svg_attributes=True)
except ExpatError:
raise ExpatError('File {} is empty!'.format(filename))
# The viewBox is mandatory: its third and fourth value are the width and height of the document.
if len(self.svg_attributes) > 0 and bool(self.svg_attributes.get('viewBox')):
viewBox = (self.svg_attributes['viewBox'].split())
else:
raise Exception('File "{}" does not have an attribute "viewBox"'.format(filename))
self.documentWidth = float(viewBox[2])
self.documentHeight = float(viewBox[3])
# In a shrunk file the viewBox itself describes the transkription field.
if self.is_shrunk():
self.xmin = float(viewBox[0])
self.ymin = float(viewBox[1])
self.width = self.documentWidth
self.height = self.documentHeight
# Otherwise the field is searched among the paths, sorted by area in descending order.
else:
sorted_paths = self.sort_according_to_area_desc(paths, attributes)
# A negative multipage_index selects the largest path.
if multipage_index < 0 and len(sorted_paths) > 0:
self.path = sorted_paths[0]
# Otherwise the two largest paths are ordered by their xmin and multipage_index selects one of them.
elif len(sorted_paths) > 1:
self.path = sorted(sorted_paths[:2], key=lambda path: path.bbox()[0])[multipage_index]
# The field with index 0 also creates the field with index 1 as second_field.
if multipage_index == 0:
self.second_field = TranskriptionField(filename, multipage_index=1)
if self.path is not None:
self.xmin, self.xmax, self.ymin, self.ymax = self.path.bbox()
self.width = self.xmax - self.xmin
self.height = self.ymax - self.ymin
self.ymin_without_title = self.ymin - 10
def add_line_number_area_width(self, end_positionX_of_line_number_area):
    """Set the width of the line number area.

    On a verso page the width is measured from the given position to xmin,
    otherwise from xmax to the given position.
    """
    end_position = end_positionX_of_line_number_area
    verso = self.is_page_verso()
    self.line_number_area_width = (self.xmin - end_position) if verso else (end_position - self.xmax)
def convert_to_text_field(self) ->TextField:
    """Return a TextField that has the width, height, xmin (as x) and ymin (as y) of this field.
    """
    dimensions = { 'width': self.width, 'height': self.height, 'x': self.xmin, 'y': self.ymin }
    return TextField(**dimensions)
def is_page_verso(self) -> bool:
    """Return true if there is less space right of the TranskriptionField than left of it.
    """
    space_right = self.documentWidth-self.xmax
    space_left = self.xmin
    return space_right < space_left
def is_shrunk(self) -> bool:
    """Return True if the first and the second value of the viewBox are both != 0.

    Return False if there are no svg attributes or no viewBox.
    """
    svg_attributes = self.svg_attributes
    if len(svg_attributes) == 0:
        return False
    if not bool(svg_attributes.get('viewBox')):
        return False
    view_box_values = svg_attributes['viewBox'].split()
    return float(view_box_values[0]) != 0 and float(view_box_values[1]) != 0
def get_svg_attributes(self, attrib_key):
    """Return the svg attribute for attrib_key, or None if the attribute is missing or empty.
    """
    svg_attributes = self.svg_attributes
    if svg_attributes is not None and len(svg_attributes) > 0 and bool(svg_attributes.get(attrib_key)):
        return svg_attributes[attrib_key]
    return None
def shrink_svg_to_transkription_field(self, target_filename=None, redo=False):
    """ Changes the viewBox of the svg graphics to the size of the transkription field.
    If a target_filename is specified, the changes are saved to a new file,
    otherwise they are saved to the input file.

    Args:
        target_filename (str): name of the target svg file
        redo (bool): change the viewBox even if the file is already shrunk.

    :return: 0 if the file was written, 1 if the file is already shrunk
        and redo is False, 2 if the svg root has no viewBox.
    """
    # Keep the namespaces of the file, otherwise they are written with generated prefixes.
    if bool(self.svg_attributes.get('xmlns')):
        ET.register_namespace('', self.svg_attributes['xmlns'])
    if bool(self.svg_attributes.get('xmlns:xlink')):
        ET.register_namespace('xlink', self.svg_attributes['xmlns:xlink'])
    et = ET.parse(self.filename)
    root = et.getroot()
    if not bool(root.attrib.get('viewBox')):
        print('ERROR: file {} does not contain a svg/@viewBox!'.format(self.filename)) #TODO: throw error
        return 2
    if not redo and self.is_shrunk():
        return 1
    root.attrib['viewBox'] = '{} {} {} {}'.format(self.xmin, self.ymin, self.width, self.height)
    # width and height are changed only if the root already has them.
    if bool(root.attrib.get('width')):
        root.attrib['width'] = '{}pt'.format(self.width)
    if bool(root.attrib.get('height')):
        root.attrib['height'] = '{}pt'.format(self.height)
    if not bool(target_filename):
        target_filename = self.filename
    et.write(target_filename)
    return 0
def transkription_field_found(self) -> bool:
    """ Return whether width, height, xmin, xmax, ymin and ymax are all greater than 0.0.
    """
    dimensions = (self.width, self.height, self.xmin, self.xmax, self.ymin, self.ymax)
    return all(dimension > 0.0 for dimension in dimensions)
def getWidth(self):
    """Return the width of the document (documentWidth).
    """
    document_width = self.documentWidth
    return document_width
def getHeight(self):
    """Return the height of the document (documentHeight).
    """
    document_height = self.documentHeight
    return document_height
def get_path_area(self, path, attribute_dict, removal_dict=None) -> float:
    """Return the area of the bounding box of path, or 0.0 if the path is not a candidate.

    0.0 is returned for a path that is empty, not continuous or not closed,
    for a path that is nearly as wide or as high as the document, and if
    an AssertionError is raised while the path is examined.
    removal_dict is not used.
    """
    try:
        is_closed_shape = bool(path) and path.iscontinuous() and path.isclosed()
        if not is_closed_shape:
            return 0.0
        xmin, xmax, ymin, ymax = path.bbox()
        width, height = xmax - xmin, ymax - ymin
        if 'transform' in attribute_dict.keys():
            # NOTE(review): the result of get_transformed_positions is assigned to
            # xmin and ymax, get_new_x/get_new_y are called without arguments —
            # presumably they return the transformed opposite corner; confirm against Matrix.
            matrix = Matrix(attribute_dict['transform'])
            xmin, ymax, width, height = matrix.get_transformed_positions(xmin, ymin, width, height)
            xmax = matrix.get_new_x()
            ymin = matrix.get_new_y()
            width, height = xmax - xmin, ymax - ymin
        nearly_document_width = self.documentWidth - width <= MAX_DIFF_DOC_SELF_WIDTH
        if nearly_document_width:
            return 0.0
        nearly_document_height = self.documentHeight - height <= MAX_DIFF_DOC_SELF_HEIGHT
        if nearly_document_height:
            return 0.0
        return width * height
    except AssertionError:
        return 0.0
def sort_according_to_area_desc(self, paths, attributes, removal_dict=None) ->list:
    """Order paths by the area of their bounding box, biggest first.

    Paths whose area (see get_path_area) does not exceed
    MAX_SMALLER_PATH_HEIGHT*self.documentWidth/4 are dropped.

    :param paths: the paths to sort.
    :param attributes: attribute dictionaries, parallel to ``paths``.
    :param removal_dict: handed to get_path_area when computing the sort key.
    :return: the remaining paths, sorted by area in descending order.
    """
    candidates = []
    for index, path in enumerate(paths):
        attribute_dict = attributes[index]
        if self.get_path_area(path, attribute_dict) > MAX_SMALLER_PATH_HEIGHT*self.documentWidth/4:
            candidates.append((path, attribute_dict))

    def area_of(candidate):
        candidate_path, candidate_attributes = candidate
        return self.get_path_area(candidate_path, candidate_attributes, removal_dict=removal_dict)

    candidates.sort(key=area_of, reverse=True)
    return [ candidate_path for candidate_path, _ in candidates ]
Index: svgscripts/process_words_post_merging.py
===================================================================
--- svgscripts/process_words_post_merging.py (revision 112)
+++ svgscripts/process_words_post_merging.py (revision 113)
@@ -1,495 +1,501 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path as PathlibPath
from progress.bar import Bar
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.box import Box
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids
from extract_line_continuation import extract_line_continuations
from util import back_up, process_warnings4status
from process_files import update_svgposfile_status
from process_footnotes import categorize_footnotes
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import extract_paths_on_tf
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# When True, progress bars are not created and save_page skips writing files.
UNITTESTING = False
# Optional word; when set, mark_word_if_it_intersects_with_paths_as_deleted
# enters its debugging branch for words with the same text and line number.
DEBUG_WORD = None
# Name of the subfolder that keeps the first merged version of each svg_pos_file.
MERGED_DIR = 'merged'
# Warning messages that post_merging_processing_and_saving hands to process_warnings4status.
WARNING_FOOTNOTES_ERROR = 'footnotes not processed'
WARNING_LINE_CONTINUATION = 'line continuation fail'
def categorize_paths(page, transkription_field=None, skipProcessWordBoxes=False):
    """Categorize all paths that are part of the transkription field.

    Besides sorting the paths into categories, this marks words that
    intersect with deletion paths as deleted and, unless
    skipProcessWordBoxes is set, processes the word boxes.

    :param page: the page whose svg source is analysed.
    :param transkription_field: the transkription field of the page; created
        from page.source if omitted.
    :param skipProcessWordBoxes: if True, process_word_boxes is not called.
    :return: a dictionary containing a list for each category of path; an
        empty dictionary if the source is missing while UNITTESTING is set.
    :raises FileNotFoundError: if the svg source is missing and UNITTESTING
        is not set.
    """
    if page.source is None or not isfile(page.source):
        if not UNITTESTING:
            error_msg = 'Svg source file {} does not exist!'.format(page.source)\
                    if page.source is not None else 'Page does not contain a source file!'
            raise FileNotFoundError(error_msg)
        return {}
    MAX_HEIGHT_LINES = 1
    # Only line numbers with an even id contribute their height; fall back
    # to 17 when there is none (also when there are only odd ids).
    even_line_heights = [ line_number.bottom-line_number.top\
            for line_number in page.line_numbers if line_number.id % 2 == 0 ]
    max_line = max(even_line_heights) + 2 if even_line_heights else 17
    tr_xmin = 0.0
    tr_ymin = 0.0
    if (page.svg_image is None or page.svg_image.text_field is None)\
       and transkription_field is not None:
        tr_xmin = transkription_field.xmin
        tr_ymin = transkription_field.ymin
    allpaths_outside_tf = []
    attributes_outside_tf = []
    if transkription_field is None:
        transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index)
    allpaths_on_tf = extract_paths_on_tf(page, outsiders=allpaths_outside_tf,\
            outsider_attributes=attributes_outside_tf, transkription_field=transkription_field)
    path_dict = { 'text_area_deletion_paths': [],\
                  'deletion_or_underline_paths': [],\
                  'box_paths': [],\
                  'dots_paths': [],\
                  'word_connector_paths': [],\
                  'uncategorized_paths': [] }
    for mypath in allpaths_on_tf:
        xmin, xmax, ymin, ymax = mypath.path.bbox()
        width = abs(xmax-xmin)
        height = abs(ymax-ymin)
        start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin)
        if width < 1 and height < 1:
            path_dict['dots_paths'].append(mypath)
        elif height > MAX_HEIGHT_LINES and height < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
            path_dict['box_paths'].append(mypath)
        elif height > MAX_HEIGHT_LINES and height > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
            path_dict['word_connector_paths'].append(mypath)
        elif height < MAX_HEIGHT_LINES:
            mypath.start_line_number = start_line_number
            path_dict['deletion_or_underline_paths'].append(mypath)
        elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin):
            # Check for "ladder", i.e. a path with 3 segments (seg0 is horizontal on line x,
            # seg1 moves to line x+1, seg2 is horizontal on line x+1)
            if start_line_number + 1 == page.get_line_number(mypath.path.end.imag-tr_ymin)\
               and len(mypath.path._segments) == 3\
               and abs(mypath.path._segments[0].bbox()[3]-mypath.path._segments[0].bbox()[2]) < MAX_HEIGHT_LINES\
               and abs(mypath.path._segments[2].bbox()[3]-mypath.path._segments[2].bbox()[2]) < MAX_HEIGHT_LINES:
                # Both horizontal rungs become deletion/underline candidates of their own.
                for index in 0, 2:
                    new_path = Path(parent_path=mypath, path=SVGPath(mypath.path._segments[index]))
                    new_path.start_line_number = page.get_line_number(new_path.path.start.imag-tr_ymin)
                    path_dict['deletion_or_underline_paths'].append(new_path)
            else:
                path_dict['text_area_deletion_paths'].append(mypath)
        else:
            path_dict['uncategorized_paths'].append(mypath)
    underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict['deletion_or_underline_paths'], tr_xmin, tr_ymin)
    path_dict.update({'underline_path': underline_path})
    if not skipProcessWordBoxes:
        path_dict['uncategorized_paths'] += process_word_boxes(page, path_dict['box_paths'], transkription_field,\
                paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)
    return path_dict
def copy_page_to_merged_directory(page, manuscript_file=None):
    """Save a copy of the page into the MERGED_DIR subfolder next to its file.

    The subfolder holds the first version of every svg_pos_file that has
    been merged with the faksimile position data; it is created on demand.

    :param page: the page to copy; its file is taken from page.page_tree.docinfo.URL.
    :param manuscript_file: passed on to save_page.
    """
    source_file = PathlibPath(page.page_tree.docinfo.URL)
    merged_dir = source_file.parent / MERGED_DIR
    if not merged_dir.is_dir():
        merged_dir.mkdir()
    save_page(
        page,
        str(source_file),
        target_svg_pos_file=str(merged_dir / source_file.name),
        status=STATUS_MERGED_OK,
        manuscript_file=manuscript_file,
    )
def find_special_words(page, transkription_field=None):
    """Move special one-character words out of page.words and process their content.

    Marks of foreign hands and text connection marks are turned into their
    own objects, appended to page.mark_foreign_hands respectively
    page.text_connection_marks, and removed from page.words. Afterwards the
    page type and line number area are updated and the content of the marks
    is looked up.

    :param page: the page to process; needs an existing svg source.
    :param transkription_field: created from page.source if omitted.
    :raises FileNotFoundError: if page.source is missing.
    """
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page does not have a source!')
    if transkription_field is None:
        transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index)
    set_to_text_field_zero = (page.svg_image is None or page.svg_image.text_field is None)
    # NOTE(review): "+=" extends the list returned by MarkForeignHands in
    # place -- confirm that get_special_char_list returns a fresh list.
    special_char_list = MarkForeignHands.get_special_char_list()
    special_char_list += TextConnectionMark.get_special_char_list()
    candidates = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ]
    show_progress = not bool(UNITTESTING)
    if show_progress:
        bar = Bar('find special words', max=len(candidates))
    for candidate in candidates:
        if show_progress:
            bar.next()
        if candidate.text == MarkForeignHands.CLASS_MARK:
            mark_id = len(page.mark_foreign_hands)
            page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(candidate, id=mark_id))
            page.words.remove(candidate)
        elif candidate.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\
          or (candidate.text in TextConnectionMark.SPECIAL_CHAR_LIST\
             and any(style in page.sonderzeichen_list for style\
                in candidate.transkription_positions[0].positional_word_parts[0].style_class.split(' '))):
            mark_id = len(page.text_connection_marks)
            page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(candidate, id=mark_id))
            page.words.remove(candidate)
    if show_progress:
        bar.finish()
    svg_tree = ET.parse(page.source)
    page.update_page_type(transkription_field=transkription_field)
    page.update_line_number_area(transkription_field, svg_tree=svg_tree, set_to_text_field_zero=set_to_text_field_zero)
    # The content of the marks is searched in the marginals source if the page has one.
    if page.marginals_source is not None:
        svg_tree = ET.parse(page.marginals_source)
    italic_classes = [ key for key in page.style_dict\
            if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ]
    if len(page.mark_foreign_hands) > 0:
        MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\
                SonderzeichenList=page.sonderzeichen_list, set_to_text_field_zero=set_to_text_field_zero)
    if len(page.text_connection_marks) > 0:
        TextConnectionMark.find_content_in_footnotes(page, transkription_field, svg_tree)
def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Mark every word of the page (and its word parts) that a deletion path crosses.

    Each word is partitioned according to deletion afterwards; the paths
    that hit a word end up in page.word_deletion_paths.

    :param page: the page whose words are checked.
    :param deletion_paths: candidate paths (deletion or underline).
    :param tr_xmin: x offset used when creating the word paths.
    :param tr_ymin: y offset used when creating the word paths.
    [:return:] list of .path.Path that might be word_underline_paths, i.e.
        those deletion_paths that are not in page.word_deletion_paths
    """
    show_progress = not bool(UNITTESTING)
    if show_progress:
        bar = Bar('mark words that intersect with deletion paths', max=len(page.words))
    for current_word in page.words:
        if show_progress:
            bar.next()
        marked_word = mark_word_if_it_intersects_with_paths_as_deleted(current_word, page, deletion_paths,\
                tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        for part_word in marked_word.word_parts:
            mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths,\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        marked_word.partition_according_to_deletion()
    if show_progress:
        bar.finish()
    unused_paths = set(deletion_paths) - set(page.word_deletion_paths)
    return list(unused_paths)
def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Flag the transkription positions of a word that are crossed by deletion paths.

    word.deleted is reset to False; every crossed transkription position is
    set to deleted and remembers the crossing paths. Each crossing path (or
    its parent path, if it has one) is tagged as word deletion path,
    attached to the page tree and appended to page.word_deletion_paths,
    unless it is already listed there.

    :param word: the word (or word part) to check.
    :param page: the page that collects the word deletion paths.
    :param deletion_paths: candidate paths.
    :param tr_xmin: x offset used when creating the word path.
    :param tr_ymin: y offset used when creating the word path.
    [:return:] word
    """
    word.deleted = False
    for position in word.transkription_positions:
        word_path = Path.create_path_from_transkription_position(position,\
                tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        hits = [ candidate for candidate in deletion_paths\
                if do_paths_intersect_saveMode(candidate, word_path) ]
        if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number:
            # Debugging hook: the paths starting on the line of DEBUG_WORD.
            relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ]
        if not hits:
            continue
        position.deleted = True
        position._deletion_paths += hits
        for hit in hits:
            # A path created from a segment is represented by its parent path.
            owner = hit if hit.parent_path is None else hit.parent_path
            if owner not in page.word_deletion_paths:
                owner.tag = Path.WORD_DELETION_PATH_TAG
                owner.attach_object_to_tree(page.page_tree)
                page.word_deletion_paths.append(owner)
    return word
def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None, skipProcessWordBoxes=False):
    """Process words after merging with faksimile word positions.

    The page is copied to the merged directory, its faksimile line positions
    and styles are updated, its paths are categorized and its special words,
    footnotes and line continuations are processed. The page is saved after
    the individual steps.

    :param svg_pos_file: file of the page; derived from the page if omitted.
    :param new_words: if given, replaces the words of the page (sorted by id).
    :param page: the page to process; instantiated from svg_pos_file if omitted.
    :param manuscript_file: optional manuscript file, used for the styles and the status update.
    :param target_svg_pos_file: optional file to write to instead of svg_pos_file.
    :param skipProcessWordBoxes: if True, categorize_paths does not process word boxes.
    :raises Exception: if neither page nor svg_pos_file is given.
    :raises FileNotFoundError: if the page has no existing svg source.
    """
    if page is None and svg_pos_file is None:
        raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!')
    if page is None:
        page = Page(svg_pos_file)
    # Resolve the file name first so that the error message below can name it.
    if svg_pos_file is None:
        svg_pos_file = page.page_tree.docinfo.URL
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file))
    if new_words is not None:
        page.words = sorted(new_words, key=attrgetter('id'))
        # The old word nodes are removed from the tree.
        for word_node in page.page_tree.xpath('.//word'):
            word_node.getparent().remove(word_node)
    manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\
            if manuscript_file is not None\
            else None
    copy_page_to_merged_directory(page, manuscript_file=manuscript_file)
    transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index)
    update_faksimile_line_positions(page)
    status = STATUS_MERGED_OK
    page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
    save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file)
    categorize_paths(page, transkription_field=transkription_field, skipProcessWordBoxes=skipProcessWordBoxes)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('default')
        try:
            find_special_words(page, transkription_field=transkription_field)
            save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file)
            categorize_footnotes(page)
            save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file)
            extract_line_continuations(page, warning_message=WARNING_LINE_CONTINUATION)
        except Exception:
            # Best effort: a failure is recorded as warning and reflected in the status.
            warnings.warn(WARNING_FOOTNOTES_ERROR)
        status = process_warnings4status(w, [ WARNING_FOOTNOTES_ERROR, WARNING_LINE_CONTINUATION ], status, STATUS_POSTMERGED_OK)
    save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file)
def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list:
    """Process word boxes: partition words according to word boxes.

    Box paths are grouped by line; a box is created for each box path that
    has margin paths on the same line. The words of the page are then
    processed against the boxes in chunks; after each chunk the page is
    saved and loaded again.

    NOTE(review): after the first chunk ``page`` refers to the reloaded page,
    not to the object of the caller -- confirm that this is intended.

    Any exception is printed and ends the processing (best effort).

    :param page: the page whose words are processed.
    :param box_paths: paths that are candidates for word boxes.
    :param transkription_field: the transkription field of the page.
    :param paths: paths outside the transkription field; read from
        page.source if paths or attributes is omitted.
    :param attributes: attributes parallel to paths.
    :param max_line: maximal height of a path on the margin field.
    [:return:] a list of paths that are not boxes
    """
    not_boxes = []
    try:
        if not UNITTESTING:
            bar = Bar('process word boxes', max=len(page.words))
        svg_tree = ET.parse(page.source)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        allpaths_on_margin_field = []
        has_text_field = page.svg_image is not None and page.svg_image.text_field is not None
        tr_xmin = 0 if has_text_field else transkription_field.xmin
        tr_ymin = 0 if has_text_field else transkription_field.ymin
        if paths is None or attributes is None:
            paths = []
            raw_paths, attributes = svg_to_paths.svg2paths(page.source)
            for index, raw_path in enumerate(raw_paths):
                paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page))
        for mypath in paths:
            path = mypath.path
            xmin, xmax, ymin, ymax = path.bbox()
            # Margin paths lie left of the field on verso pages, right of it otherwise.
            if len(path) > 0\
               and path != transkription_field.path\
               and ((xmax < transkription_field.xmin and transkription_field.is_page_verso())\
                 or (xmin > transkription_field.xmax and not transkription_field.is_page_verso()))\
               and abs(ymax-ymin) < max_line:
                allpaths_on_margin_field.append(mypath)
        box_line_number_dict = {}
        for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
            line_number = page.get_line_number(box_path.get_median_y(tr_ymin=tr_ymin))
            if line_number > 0:
                box_line_number_dict.setdefault(line_number, []).append(box_path)
        boxes = []
        for line_number in box_line_number_dict.keys():
            box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
            margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
                    if page.get_line_number(margin_box.get_median_y(tr_ymin=tr_ymin)) == line_number ],\
                    key=lambda path: path.get_x())
            threshold = 3 if line_number % 2 == 0 else 1.5
            if len(margin_boxes_on_line) > 0:
                for box_path in box_paths_on_line:
                    box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
                            namespaces=namespaces, threshold=threshold)
                    if box is not None:
                        boxes.append(box)
            else:
                not_boxes += box_paths_on_line
        if len(boxes) > 0 and len(page.words) > 0:
            print(len(boxes))
            startIndex = 0
            # round() yields 0 for one or two words; a chunk size of 0 would
            # never advance startIndex, so use at least 1.
            steps = max(1, round(len(page.words)/4)) if not bool(UNITTESTING) else len(page.words)
            # "<" instead of "+steps <=": the last chunk may be shorter, no word is left out.
            while startIndex < len(page.words):
                for word in page.words[startIndex:startIndex+steps]:
                    word.process_boxes(boxes, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
                    word.create_correction_history(page)
                    if not bool(UNITTESTING):
                        bar.next()
                    elif word.earlier_version is not None:
                        if word.earlier_version.earlier_version is not None:
                            print(f'{word.earlier_version.earlier_version.text}')
                save_page(page, page.page_tree.docinfo.URL)
                page = Page.create_cls(page.page_tree.docinfo.URL)
                startIndex += steps
        not bool(UNITTESTING) and bar.finish()
    except Exception as e:
        # Deliberately best effort: report the problem and return what has been collected.
        print(e)
    return not_boxes
def reset_page(page):
    """Reset all words that have word_parts in order to run the script a second time.

    If a first merged version of the page exists in MERGED_DIR, nothing is
    changed on the given page. Otherwise the partitioning of words with
    word parts or an earlier version is undone, words without line number
    get one from their first transkription position (or a warning if they
    have none), and the words are attached to the tree if anything changed.

    :param page: the page to reset.
    """
    svg_pos_file = PathlibPath(page.page_tree.docinfo.URL)
    first_merge_version = svg_pos_file.parent / MERGED_DIR / svg_pos_file.name
    if first_merge_version.exists():
        # NOTE(review): this rebinds the local name only, the page of the
        # caller stays as it is -- confirm whether the merged version was
        # meant to be handed back.
        page = Page(str(first_merge_version))
        return
    partitioned_words = [ word for word in page.words if len(word.word_parts) > 0 ]
    partitioned_words += [ word for word in page.words if word.earlier_version is not None ]
    for word in partitioned_words:
        word.undo_partitioning()
        update_transkription_position_ids(word)
    page_changed = len(partitioned_words) > 0
    words_without_line = [ word for word in page.words if word.line_number == -1 ]
    for word in words_without_line:
        if len(word.transkription_positions) > 0:
            first_position = word.transkription_positions[0]
            word.line_number = page.get_line_number((first_position.top+first_position.bottom)/2)
        else:
            msg = f'Word {word.id} {word.text} has no transkription_position!'
            warnings.warn(msg)
    if len(words_without_line) > 0:
        page_changed = True
    if page_changed:
        page.update_and_attach_words2tree()
def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None):
    """Attach the words to the tree of the page and write the page to a file.

    Nothing is written while UNITTESTING is set.

    :param page: the page to save.
    :param svg_pos_file: file of the page; its status is updated if status is given.
    :param target_svg_pos_file: file to write to, svg_pos_file if omitted.
    :param status: new status of svg_pos_file, no status update if None.
    :param manuscript_file: passed on to the status update.
    """
    page.update_and_attach_words2tree()
    if UNITTESTING:
        return
    output_file = svg_pos_file if target_svg_pos_file is None else target_svg_pos_file
    if status is not None:
        update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status)
    write_pretty(xml_element_tree=page.page_tree, file_name=output_file,\
            script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def update_faksimile_line_positions(page):
    """Update faksimile_positions of the lines

    First pass: a line that has at least one word with fewer than two word
    parts and a faksimile position gets its inner top/bottom from the first
    faksimile positions of its words; lines with an even id also get their
    outer top/bottom (inner values minus the ymin of the text field).
    Second pass: a line whose inner bottom is 0.0 or above its inner top
    gets a bottom derived from its neighbours or the text field; every
    line is attached to the page tree.

    :param page: the page whose line numbers are updated.
    """
    line_count = len(page.line_numbers)
    field_top = 0.0 if page.text_field is None else page.text_field.ymin
    for line in page.line_numbers:
        unsplit_words = [ word for word in page.words\
                if len(word.word_parts) < 2 and len(word.faksimile_positions) > 0 and word.line_number == line.id ]
        if len(unsplit_words) == 0:
            continue
        # The extremes are taken over all words of the line, split ones included.
        first_positions = [ word.faksimile_positions[0] for word in page.words\
                if len(word.faksimile_positions) > 0 and word.line_number == line.id ]
        line.faksimile_inner_top = min([ position.top for position in first_positions ])
        line.faksimile_inner_bottom = max([ position.bottom for position in first_positions ])
        if line.id % 2 == 0:
            line.faksimile_outer_top = line.faksimile_inner_top - field_top
            line.faksimile_outer_bottom = line.faksimile_inner_bottom - field_top
    for index, line in enumerate(page.line_numbers):
        needs_bottom = line.faksimile_inner_bottom == 0.0\
                or line.faksimile_inner_bottom < line.faksimile_inner_top
        if needs_bottom:
            if index == 0 and line_count > 1:
                line.faksimile_inner_bottom = page.line_numbers[index+1].top
            elif index == line_count-1 and page.text_field is not None:
                line.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3)
            elif index > 0 and index < line_count-1:
                next_top = page.line_numbers[index+1].faksimile_inner_top
                previous_bottom = page.line_numbers[index-1].faksimile_inner_bottom
                if next_top > previous_bottom:
                    line.faksimile_inner_bottom = next_top
                else:
                    line.faksimile_inner_bottom = previous_bottom
        line.attach_object_to_tree(page.page_tree)
def update_writing_process_ids(page):
    """Set the writing process ids of all words and split the words accordingly.

    For each word of the page, the writing process id is first set on its
    transkription positions, then the word is partitioned according to
    these ids.

    :param page: the page whose words are updated.
    """
    for current_word in page.words:
        current_word.set_writing_process_id_to_transkription_positions(page)
        current_word.partition_according_to_writing_process_id()
def usage():
    """Print the docstring of main, which describes how to call the script.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to process words after they have been merged with faksimile data.

    svgscripts/process_words_post_merging.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

        <xmlManuscriptFile>     a xml file about a manuscript, containing information about its pages.
        <svg_pos_file>          a xml file about a page, containing information about svg word positions.

        OPTIONS:
        -h|--help                           show help
        -i|--include-missing-line-number    run script on files that contain words without line numbers
        -r|--rerun                          rerun script on a svg_pos_file that has already been processed
        -s|--skip-process-boxes             skip process word boxes

        :return: exit code (int)
    """
    status_not_contain = STATUS_POSTMERGED_OK
    include_missing_line_number = False
    skipProcessWordBoxes = False
    try:
        opts, args = getopt.getopt(argv, "hirs", ["help", "include-missing-line-number", "rerun", "skip-process-boxes" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-i', '--include-missing-line-number'):
            include_missing_line_number = True
        elif opt in ('-s', '--skip-process-boxes'):
            # Each option sets only its own flag: -s must not switch on -i,
            # and -r must not switch off the processing of word boxes.
            skipProcessWordBoxes = True
        elif opt in ('-r', '--rerun'):
            # An empty filter lets pages through that have been post-merged already.
            status_not_contain = ''

    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    manuscript_file = file_a\
            if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
            else None
    counter = 0
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain):
        reset_page(page)
        no_line_numbers = [ word for word in page.words if word.line_number == -1 ]
        if not include_missing_line_number and len(no_line_numbers) > 0:
            # Such pages are reported and skipped unless -i is given.
            not UNITTESTING and print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!')
            for word in no_line_numbers:
                not UNITTESTING and print(f'Word {word.id}: {word.text}')
        else:
            back_up(page, page.xml_file)
            not UNITTESTING and print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
            post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file, skipProcessWordBoxes=skipProcessWordBoxes)
            counter += 1
    not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]')
    return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: tests_svgscripts/test_page.py
===================================================================
--- tests_svgscripts/test_page.py (revision 112)
+++ tests_svgscripts/test_page.py (revision 113)
@@ -1,164 +1,176 @@
import unittest
from os import sep, path
from os.path import isdir, isfile, dirname, basename
import lxml.etree as ET
import sys
import sys
sys.path.append('svgscripts')
dir_changed = False
if not isdir('datatypes'):
sys.path.append(dirname(sys.path[0]))
dir_changed = True
from datatypes.lineNumber import LineNumber
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.style import Style
from datatypes.writing_process import WritingProcess
from datatypes.word import Word
class TestPage(unittest.TestCase):
def setUp(self):
DATADIR = dirname(__file__) + sep + 'test_data'
if not isdir(DATADIR):
DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
self.test_file = DATADIR + sep + 'test.xml'
self.test_svg_file = DATADIR + sep + 'test421.svg'
self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
self.xml_fileB = DATADIR + sep + 'N_VII_1_page006.xml'
self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page001.xml'
self.test_manuscript = DATADIR + sep + 'N_VII_1.xml'
self.test_styles_color = DATADIR + sep + 'N_VII_1_page013.xml'
def test_Page(self):
page = Page(self.test_file)
self.assertEqual(page.title, 'Mp XIV 1')
self.assertEqual(page.number, '421')
self.assertEqual(len(page.sonderzeichen_list), 2)
self.assertEqual('st21' in page.sonderzeichen_list, True)
self.assertEqual('st23' in page.sonderzeichen_list, True)
self.assertEqual(page.style_dict['st0']['fill'], '#F8F9F8')
stage0 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 0 ]
stage1 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 1 ]
stage2 = [ key for key, value in page.fontsizekey2stage_mapping.items() if value == 2 ]
fontStage0 = float(page.style_dict.get(stage0[0]).get('font-size').replace('px',''))
fontStage1 = float(page.style_dict.get(stage1[0]).get('font-size').replace('px',''))
fontStage2 = float(page.style_dict.get(stage2[0]).get('font-size').replace('px',''))
self.assertEqual(fontStage0 > fontStage1, True)
self.assertEqual(fontStage1 > fontStage2, True)
page = Page.create_cls(self.test_tcm_xml, create_dummy_page=True)
self.assertEqual(page.number, '1')
+ #:map :w:!python3 -m unittest tests_svgscripts.test_page.TestPage.test_Page
+ """
+ page = Page.create_cls('xml/Mp_XVI_page55r.xml', isBlank=True)
+ self.assertEqual(page.status, 'blank')
+ print(page)
+ """
def test_get_biggest_fontSize4styles(self):
page = Page(self.test_file)
style_set = { 'st12', 'st2', 'st14', 'st13' }
self.assertEqual(page.get_biggest_fontSize4styles(style_set=style_set), 10)
def test_get_words(self):
page = Page(self.test_file)
words = page.words
self.assertEqual(len(words), 440)
self.assertEqual(words[0].text, '$')
self.assertEqual(words[439].text, 'mußte!')
def test_get_word_deletion_path(self):
page = Page('xml/Mp_XIV_page417.xml')
dpath = page.get_word_deletion_path(d_attribute='M 273.343,251.451 L 276.479,251.451 L 276.479,251.751 L 273.343,251.751 L 273.343,251.451')
def test_update_page_type(self):
page = Page(self.pdf_xml)
tf = TranskriptionField(self.pdf_xml_source)
page.update_page_type(transkription_field=tf)
self.assertEqual(page.page_type, Page.PAGE_VERSO)
#page = Page(self.xml_fileB)
#page.update_page_type()
#self.assertEqual(page.page_type, Page.PAGE_RECTO)
def test_update_line_number_area(self):
page = Page(self.xml_file)
transkription_field = TranskriptionField(page.source)
page.update_line_number_area(transkription_field)
self.assertEqual(transkription_field.line_number_area_width > 0, True)
self.assertEqual(transkription_field.line_number_area_width < 15, True)
page = Page(self.xml_fileB)
transkription_field = TranskriptionField(page.source)
page.update_line_number_area(transkription_field)
self.assertEqual(transkription_field.line_number_area_width > 0, True)
self.assertEqual(transkription_field.line_number_area_width < 15, True)
def test_get_pages_from_xml_file(self):
pages = Page.get_pages_from_xml_file(self.test_manuscript)
self.assertEqual(len(pages), 4)
self.assertEqual(pages[0].number, '5')
self.assertEqual(pages[1].number, '6')
pages = Page.get_pages_from_xml_file(self.test_manuscript, status_contains=STATUS_MERGED_OK)
self.assertEqual(len(pages), 2)
self.assertEqual(pages[0].number, '5')
pages = Page.get_pages_from_xml_file(self.test_manuscript, status_contains=STATUS_MERGED_OK, status_not_contain=STATUS_POSTMERGED_OK)
self.assertEqual(len(pages), 1)
def test_get_semantic_dictionary(self):
dictionary = Page.get_semantic_dictionary()
#print(dictionary)
def test_update_styles(self):
#:map :w:!python3 -m unittest tests_svgscripts.test_page.TestPage.test_update_styles
+ page = Page('xml/Mp_XV_page100r.xml')
+ page.words = [ word for word in page.words if word.text == 'werden!' ]
+ page.update_styles(add_to_parents=True, create_css=True)
+ #print(page.words[0].earlier_version.word_parts[1].styles)
+ """
page = Page(self.pdf_xml)
page.words = [ word for word in page.words if word.text == 'Schopenhauer' ]
page.update_styles(add_to_parents=True)
self.assertEqual(len(page.words[0].styles), 1)
self.assertEqual(page.words[0].styles[0].color.name, 'black')
self.assertEqual(page.words[0].styles[0].font, Style.NIETSCHES_FONTS['latin'])
self.assertEqual(page.words[0].styles[0].writing_instrument, Style.WRITING_INSTRUMENTS[('black',False)])
page = Page(self.test_styles_color)
page.words = [ word for word in page.words if word.text == 'Versöhnlichkeit' ]
page.update_styles(add_to_parents=True)
self.assertEqual(len(page.words[0].styles), 1)
self.assertEqual(page.words[0].styles[0].color.name, 'green')
self.assertEqual(page.words[0].styles[0].font, Style.NIETSCHES_FONTS['german'])
self.assertEqual(page.words[0].styles[0].writing_instrument, Style.WRITING_INSTRUMENTS[('green',False)])
self.assertEqual(page.words[0].styles[0].writing_process_id, WritingProcess.INSERTION_AND_ADDITION)
page = Page(self.test_styles_color)
page.words = [ word for word in page.words if word.text == 'Versöhnlichkeit' or word.text == 'gewisse' ]
self.assertEqual(len(page.words), 2)
word = page.words[0]
word.transkription_positions += page.words[1].transkription_positions
page.words = [ word ]
page.update_styles(add_to_parents=True, partition_according_to_styles=True)
self.assertEqual(len(page.words[0].word_parts), 2)
page = Page(self.test_styles_color)
page.update_styles(add_to_parents=True, create_css=True)
for word in page.words:
self.assertTrue(len(word.styles) > 0)
self.assertTrue((not word.deleted and len([ style for style in word.styles if 'line-through' in style.css_string]) == 0)\
or (word.deleted and len([ style for style in word.styles if 'line-through' in style.css_string]) > 0))
+ """
def test_add_deletion_paths_to_words(self):
page = Page('xml/Mp_XIV_page416.xml')
word = [ word for word in page.words if word.deleted or True in [ part.deleted for part in word.word_parts ]][0]
page.add_deletion_paths_to_words()
self.assertTrue(len(word.deletion_paths) > 0)
page = Page('xml/Mp_XIV_page417.xml')
word = [ word for word in page.words if word.text == 'wird.)' ][0]
page.add_deletion_paths_to_words(add_paths_near_words=True)
self.assertTrue(len(word.deletion_paths_near_word) > 0)
def test_lock(self):
    """Verify that lock() records its reference file and unlock() releases the page again."""
    reference_file = 'asdf.txt'
    page = Page(self.test_tcm_xml)
    self.assertEqual(page.is_locked(), False)

    page.lock(reference_file)
    self.assertEqual(page.is_locked(), True)
    stored_references = page.page_tree.xpath('//lock/reference-file/text()')
    self.assertEqual(stored_references[0], reference_file)

    page.unlock()
    self.assertEqual(page.is_locked(), False)
# Run this module's test cases when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_mark_foreign_hands.py
===================================================================
--- tests_svgscripts/test_mark_foreign_hands.py (revision 112)
+++ tests_svgscripts/test_mark_foreign_hands.py (revision 113)
@@ -1,81 +1,85 @@
import unittest
from os import sep, path
from os.path import dirname, isdir
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.matrix import Matrix
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page
from datatypes.word import Word
class TestMarkForeignHands(unittest.TestCase):
def setUp(self):
    """Define the test-data paths and build a mark-foreign-hands node with one position attached."""
    DATADIR = dirname(__file__) + sep + 'test_data'
    data_prefix = DATADIR + sep
    self.xml_file = data_prefix + 'N_VII_1_page008.xml'
    self.test_content_svg = data_prefix + 'N_VII_1_xp5_4_page5.svg'
    self.test_content_xml = data_prefix + 'N_VII_1_page005.xml'
    self.test_contentB_svg = data_prefix + 'N_VII_1_xp5_4_page6.svg'
    self.test_contentB_xml = data_prefix + 'N_VII_1_page006.xml'

    # Attributes of the XML node that the tests turn back into a MarkForeignHands.
    node_attributes = {'text': '*', 'id': '0', 'line-number': '2'}
    self.node = ET.Element(MarkForeignHands.XML_TAG, attrib=node_attributes)

    position_matrix = Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)')
    word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=position_matrix)
    self.transkription_positions = [word_position]
    word_position.attach_object_to_tree(self.node)
def test_create_cls(self):
    """Check the fields that create_cls reads back from the node prepared in setUp."""
    mark_foreign_hands = MarkForeignHands.create_cls(self.node)
    self.assertEqual(mark_foreign_hands.id, 0)

    first_position = mark_foreign_hands.transkription_positions[0]
    expected_geometry = (
        ('bottom', 11),
        ('height', 10),
        ('top', 1),
        ('left', 0),
        ('width', 10),
    )
    for attribute, expected in expected_geometry:
        self.assertEqual(getattr(first_position, attribute), expected)

    self.assertEqual(mark_foreign_hands.text, '*')
    self.assertEqual(mark_foreign_hands.line_number, 2)
    self.assertEqual(first_position.transform.isRotationMatrix(), True)
def test_attach_word_to_tree(self):
+ empty_tree = ET.ElementTree(ET.Element('page'))
mark_foreign_hands = MarkForeignHands.create_cls(self.node)
mark_foreign_hands.foreign_hands_text = 'test'
mark_foreign_hands.pen= 'Rotstift'
- empty_tree = ET.ElementTree(ET.Element('page'))
+ mark_foreign_hands.overwrites_mark = MarkForeignHands.create_cls(self.node)
+ mark_foreign_hands.overwrites_mark.foreign_hands_text = 'test 2'
+ mark_foreign_hands.overwrites_mark.pen= 'Bleistift'
mark_foreign_hands.attach_word_to_tree(empty_tree)
#print(ET.dump(empty_tree.getroot()))
- for node in empty_tree.xpath('//' + MarkForeignHands.XML_TAG):
+ for node in empty_tree.xpath('./' + MarkForeignHands.XML_TAG):
mark = MarkForeignHands.create_cls(node)
self.assertEqual(mark.pen, 'Rotstift')
self.assertEqual(mark.foreign_hands_text.content, 'test')
+ self.assertTrue(mark.overwrites_mark is not None)
self.assertEqual(mark.id, 0)
self.assertEqual(mark.transkription_positions[0].bottom, 11)
self.assertEqual(mark.transkription_positions[0].height, 10)
self.assertEqual(mark.transkription_positions[0].top, 1)
self.assertEqual(mark.transkription_positions[0].left, 0)
self.assertEqual(mark.transkription_positions[0].width, 10)
self.assertEqual(mark.text, '*')
self.assertEqual(mark.line_number, 2)
self.assertEqual(mark.transkription_positions[0].transform.isRotationMatrix(), True)
#print(empty_tree.xpath('//mark-foreign-hands/content/text()'))
#print(empty_tree.xpath('//mark-foreign-hands/content/@pen'))
def test_get_semanticAndDataDict(self):
    """Smoke test: building the semantic dictionary must not raise."""
    semantic_dictionary = MarkForeignHands.get_semantic_dictionary()
    # No assertions on the content; uncomment to inspect it manually.
    #print(semantic_dictionary)
def test_find_content(self):
    """Check that find_content sets text and pen on a mark created from a page word."""
    page = Page(self.test_contentB_xml)
    svg_source = page.source
    transkription_field = TranskriptionField(svg_source)
    svg_tree = ET.parse(svg_source)
    page.update_line_number_area(transkription_field, svg_tree=svg_tree)

    # The first word whose text equals the class mark is the one to convert.
    mark_words = [candidate for candidate in page.words if candidate.text == MarkForeignHands.CLASS_MARK]
    mark_foreign_hands = MarkForeignHands.create_cls_from_word(mark_words[0])
    MarkForeignHands.find_content(
        [mark_foreign_hands],
        transkription_field,
        svg_tree,
        style_dict=page.style_dict,
    )

    self.assertEqual(mark_foreign_hands.foreign_hands_text, 'W III, 104. (MXXIX, 3)')
    self.assertEqual(mark_foreign_hands.pen, 'Bleistift')
# Run this module's test cases when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_archival_manuscript.py
===================================================================
--- tests_svgscripts/test_archival_manuscript.py (revision 112)
+++ tests_svgscripts/test_archival_manuscript.py (revision 113)
@@ -1,56 +1,62 @@
import unittest
from os import sep, path
from os.path import basename, dirname, isfile
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.color import Color
class TestArchivalManuscriptUnity(unittest.TestCase):
def setUp(self):
    """Set the UNITTESTING flag on the class and locate the manuscript test file."""
    ArchivalManuscriptUnity.UNITTESTING = True
    data_dir = sep.join([dirname(__file__), 'test_data'])
    self.test_manuscript = sep.join([data_dir, 'N_VII_1.xml'])
def test_init(self):
    """The constructor keeps the title it was given."""
    expected_title = 'Test I 1'
    manuscript = ArchivalManuscriptUnity(title=expected_title)
    self.assertEqual(manuscript.title, expected_title)
def test_get_semanticAndDataDict(self):
    """Smoke test: building the semantic dictionary must not raise."""
    semantic_dictionary = ArchivalManuscriptUnity.get_semantic_dictionary()
    # No assertions on the content; uncomment to inspect it manually.
    #print(semantic_dictionary)
def test_create_cls(self):
    """Check create_cls on the test manuscript, with and without page filters."""
    def count_pages_with_xml_file(pages):
        # Counts the pages whose instance dictionary carries an 'xml_file' entry.
        return len([page for page in pages if 'xml_file' in page.__dict__.keys()])

    source_file = self.test_manuscript

    manuscript = ArchivalManuscriptUnity.create_cls(source_file)
    self.assertTrue(manuscript.description is not None)
    self.assertEqual(len(manuscript.earlier_descriptions), 2)
    expected_title = basename(source_file).replace('.xml','').replace('_', ' ')
    self.assertEqual(manuscript.title, expected_title)
    self.assertEqual(manuscript.manuscript_type, 'Notizheft')
    self.assertEqual(len(manuscript.pages), 4)

    manuscript = ArchivalManuscriptUnity.create_cls(source_file, page_status_list=['faksimile merged'])
    self.assertEqual(count_pages_with_xml_file(manuscript.pages), 2)

    manuscript = ArchivalManuscriptUnity.create_cls(
        source_file,
        page_status_list=['faksimile merged', 'words processed'],
    )
    self.assertEqual(count_pages_with_xml_file(manuscript.pages), 1)

    manuscript = ArchivalManuscriptUnity.create_cls(source_file, page_xpath='//pages/page/@output')
    self.assertEqual(len(manuscript.pages), 4)
def test_get_color(self):
    """get_color finds the default color's hex value but returns None for '#F7F6F5'."""
    default_color = Color()
    manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript)
    known_color = manuscript.get_color(default_color.hex_color)
    self.assertIsNotNone(known_color)
    unknown_color = manuscript.get_color("#F7F6F5")
    self.assertIsNone(unknown_color)
def test_update_colors(self):
    """After update_colors with a default Color the manuscript holds two colors."""
    default_color = Color()
    manuscript = ArchivalManuscriptUnity.create_cls(self.test_manuscript)
    manuscript.update_colors(default_color)
    color_count = len(manuscript.colors)
    self.assertEqual(color_count, 2)
    # Uncomment to inspect the resulting manuscript tree manually.
    #print(ET.dump(manuscript.manuscript_tree.getroot()))
-
+ def test_aliases(self):
+ pass
+ """
+ xml_manuscript_file = 'xml/Mp_XVIII.xml'
+ alias_file_name = '/home/knister0/ownCloud/myNietzscheDE/KGW-IX_13/Mp-XVIII-Inhalt.txt'
+ ArchivalManuscriptUnity.ADD_ALIASES(xml_manuscript_file, alias_file_name)
+ """
# Run this module's test cases when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_faksimile.py
===================================================================
--- tests_svgscripts/test_faksimile.py (revision 112)
+++ tests_svgscripts/test_faksimile.py (revision 113)
@@ -1,92 +1,98 @@
import unittest
from os import sep, path
from os.path import isdir, dirname, basename
import lxml.etree as ET
import sys
import sys
sys.path.append('svgscripts')
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.faksimile_image import FaksimileImage
from datatypes.text_field import TextField
class TestFaksimilePage(unittest.TestCase):
def setUp(self):
    """Locate the test data directory and the SVG fixtures used by the tests."""
    module_dir = dirname(__file__)
    DATADIR = sep.join([module_dir, 'test_data'])
    if not isdir(DATADIR):
        # Fall back to a test_data directory one level above this module.
        DATADIR = sep.join([dirname(module_dir), 'test_data'])

    self.svg_file = sep.join([DATADIR, 'W-II-1,49et50.svg'])
    self.svg_testmatrix = sep.join([DATADIR, 'TESTMATRIX_1.svg'])
    self.faksimile_dir = sep.join([DATADIR, 'faksimile_svg'])
    self.faksimile_file = sep.join([self.faksimile_dir, 'N-VII-1,5et6.svg'])
    self.faksimile_rotate90 = sep.join([self.faksimile_dir, 'Mp-XV-2c,4.svg'])
def test_init(self):
    """A FaksimilePage built from keyword arguments exposes them on its tree and attributes."""
    faksimile_image = FaksimileImage(file_name='test.jpg', height=10, width=10)
    field = TextField(width=10, height=10, x=10, y=10)
    faksimile = FaksimilePage(
        title='test',
        page_number=1,
        faksimile_image=faksimile_image,
        text_field=field,
    )

    root = faksimile.page_tree.getroot()
    self.assertEqual(root.get('title'), 'test')
    self.assertEqual(root.get('page-number'), '1')
    self.assertEqual(faksimile.faksimile_image.width, 10)
    self.assertEqual(faksimile.text_field.width, 10)
def test_GET_TEXTFIELDS(self):
- svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Mp_XV/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/Mp-XV-2d,16et17.svg')
- pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
+ """
+ svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Blank/Mp_XVI/Fertig/Mp_XVI_page52v.svg')
+ pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, page_number='52v', isBlank=True)
+ print(pages[0].text_field)
+ """
+ svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Eric/Mp_XVI/Fertig/Mp-XVI-1,1.svg')
+ pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, page_number='1r')
+ self.assertEqual(len(pages), 1)
svg_tree = ET.parse(self.svg_file)
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 2)
text_field = pages[0].text_field
self.assertEqual(text_field.width, 663.333)
result_dir = '.{}xml{}'.format(sep, sep) if isdir('xml') else ''
self.assertEqual(pages[0].xml_file, result_dir + 'W-II-1_49.xml')
self.assertEqual(pages[0].title, 'W II 1')
self.assertEqual(pages[0].page_number, '49')
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, page_number='49')
self.assertEqual(len(pages), 1)
svg_tree = ET.parse(self.svg_testmatrix)
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 1)
self.assertEqual(len(pages[0].word_positions), 1)
self.assertEqual(pages[0].word_positions[0].transform.toCSSTransformString(), 'rotate(45deg)')
svg_tree = ET.parse(self.faksimile_file)
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 2)
textfield_id = pages[1].title.replace(' ', '-') + '_' + pages[1].page_number
#print([ position.id for position in pages[0].word_positions])
self.assertEqual(textfield_id not in [ position.id for position in pages[0].word_positions ], True)
self.assertEqual('path1237' in [ position.id for position in pages[0].word_positions ], True)
self.assertEqual('Vorgangs' in [ position.text for position in pages[0].word_positions ], False)
svg_tree = ET.parse(self.faksimile_file)
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 2)
self.assertEqual(pages[0].page_number, '5')
"""
svg_tree = ET.parse('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Eric/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/W-II-1,141et142.svg')
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 1)
"""
svg_tree = ET.parse(self.faksimile_rotate90)
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree)
self.assertEqual(len(pages), 1)
self.assertEqual(len(pages[0].word_positions), len(svg_tree.xpath('//ns:rect/ns:title', namespaces=namespaces)))
def test_get_paths_inside_rect(self):
    """Check get_paths_inside_rect against the bundled fixtures and an optional local file.

    The first two checks use SVG files from the test data directory. The last
    check reads a faksimile that only exists on the original developer's
    machine; when that file is missing the remainder of the test is skipped
    instead of failing with a file error.
    """
    svg_tree = ET.parse(self.faksimile_file)
    paths = get_paths_inside_rect(svg_tree, '//ns:path', 360, 786, 92, 765, 'N-VII-1_5')
    self.assertEqual(len(paths), 1)

    svg_tree = ET.parse(self.svg_testmatrix)
    paths = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', 0, 2038.72, 0, 974.08002, 'TESTMATRIX_1')
    self.assertEqual(len(paths), 1)

    # Machine-specific input: not part of the repository's test data.
    local_faksimile = '/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Mp_XIV/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/Mp-XIV-1,419a.svg'
    if not path.isfile(local_faksimile):
        self.skipTest('local faksimile file not available: ' + local_faksimile)

    svg_tree = ET.parse(local_faksimile)
    # XPath needs a prefix for the default namespace, so map None to 'ns'.
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    paths = get_paths_inside_rect(svg_tree, '//ns:rect', 52, 800, 58, 900, 'Mp-XIV-1_419a', namespaces=namespaces)
    titled_paths = [ found for found in paths if 'seinen' in found.xpath('./ns:title/text()', namespaces=namespaces) ]
    self.assertEqual(len(titled_paths), 1)
if __name__ == "__main__":
unittest.main()
Index: tests_svgscripts/test_data/Mp_XV_page79v.svg
===================================================================
--- tests_svgscripts/test_data/Mp_XV_page79v.svg (revision 0)
+++ tests_svgscripts/test_data/Mp_XV_page79v.svg (revision 113)
@@ -0,0 +1,3174 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Mp X V , 79v
+
+
+ 2
+
+
+ 4
+
+
+ 6
+
+
+ 8
+
+
+ 10
+
+
+ 78 r ,5 5
+
+
+ )
+
+
+
+
+ Den Sinn ni c ht in d en Dingen suchen : son d ern ihn
+
+
+ hineinstecke n
+
+
+ !
+
+
+ W ozu noch I d een , w enn ma n Ideale hat! S c höne
+
+
+ A ugen u
+
+
+
+
+ G efühle genügen .
+
+
+ W üns c hb ar keit s a ge ich , ni c ht Ide a l
+
+
+
+
+
+ m eh r
+
+
+ Ma n ißt eine Speise ni c h t
+
+
+ A
+
+
+ a us M o ra l ; so w i rd m a n ein ma l a u c h ni c ht m eh r a us M o ra l
+
+
+
+
+
+ sinnli c h e
+
+
+
+
+
+ „ G utes thun“ .
+
+
+ : in ih m t r iu m phi r t da s
+
+
+ A
+
+
+ Alle g r o
+
+
+ d es V olkes übe r d a s
+
+
+ le n t o
+
+
+ d e r vo r neh m en Geistigkeit . –
+
+
+
+
+
+ gegen d ie G eistli c hkei t
+
+
+
+
+
+
+
+
+
+
+ 2: KGW V III 6 [ 15 ]
+
+
+ 4: KGW V III 6 [ 16 ]
+
+
+ 6: KGW V III 6 [ 17 ]
+
+
+ 8- 1 0: KGW VIII 6 [ 1 8 ]
+
+
+ 4:
+
+
+ ge n ü g
+
+
+ e n
+
+
+ ]
+
+
+ V k
+
+
+
+
+
+
+
+
+
+
+
+ Mp XV a 74-84r Druck.indd 8
+
+
+
+ Mp XV a 74-84r Druck.indd 8
+
+
+
+
+
+
+
+
+
+ 19.07.20 22:03
+
+
+
+ 19.07.20 22:03
+
+
+
+
Index: tests_svgscripts/test_data/N_VII_1_page005.ttl
===================================================================
--- tests_svgscripts/test_data/N_VII_1_page005.ttl (revision 0)
+++ tests_svgscripts/test_data/N_VII_1_page005.ttl (revision 113)
@@ -0,0 +1,2507 @@
+@prefix data: .
+@prefix rdf: .
+@prefix tln: .
+@prefix xsd: .
+
+data:_N_VII_1_Page5 a tln:Page ;
+ tln:hasLines ( data:_N_VII_1_Page5_Line1 data:_N_VII_1_Page5_Line2 data:_N_VII_1_Page5_Line3 data:_N_VII_1_Page5_Line4 data:_N_VII_1_Page5_Line5 data:_N_VII_1_Page5_Line6 data:_N_VII_1_Page5_Line7 data:_N_VII_1_Page5_Line8 data:_N_VII_1_Page5_Line9 data:_N_VII_1_Page5_Line10 data:_N_VII_1_Page5_Line11 data:_N_VII_1_Page5_Line12 data:_N_VII_1_Page5_Line13 data:_N_VII_1_Page5_Line14 data:_N_VII_1_Page5_Line15 data:_N_VII_1_Page5_Line16 data:_N_VII_1_Page5_Line17 data:_N_VII_1_Page5_Line18 data:_N_VII_1_Page5_Line19 data:_N_VII_1_Page5_Line20 data:_N_VII_1_Page5_Line21 data:_N_VII_1_Page5_Line22 data:_N_VII_1_Page5_Line23 data:_N_VII_1_Page5_Line24 data:_N_VII_1_Page5_Line25 data:_N_VII_1_Page5_Line26 data:_N_VII_1_Page5_Line27 data:_N_VII_1_Page5_Line28 data:_N_VII_1_Page5_Line29 data:_N_VII_1_Page5_Line30 data:_N_VII_1_Page5_Line31 data:_N_VII_1_Page5_Line32 data:_N_VII_1_Page5_Line33 data:_N_VII_1_Page5_Line34 data:_N_VII_1_Page5_Line35 data:_N_VII_1_Page5_Line36 data:_N_VII_1_Page5_Line37 data:_N_VII_1_Page5_Line38 data:_N_VII_1_Page5_Line39 data:_N_VII_1_Page5_Line40 data:_N_VII_1_Page5_Line41 data:_N_VII_1_Page5_Line42 data:_N_VII_1_Page5_Line43 ) ;
+ tln:hasNumber "5"^^xsd:string ;
+ tln:hasOrientation "North"^^xsd:string ;
+ tln:hasSvgImage data:_N_VII_1_Page5_SVGImage0 ;
+ tln:hasWordInsertionMarks ( data:_N_VII_1_Page5_WordInsertionMark0 ) ;
+ tln:hasWords ( data:_N_VII_1_Page5_Word0 data:_N_VII_1_Page5_Word1 data:_N_VII_1_Page5_Word2 data:_N_VII_1_Page5_Word3 data:_N_VII_1_Page5_Word4 data:_N_VII_1_Page5_Word5 data:_N_VII_1_Page5_Word6 data:_N_VII_1_Page5_Word7 data:_N_VII_1_Page5_Word8 data:_N_VII_1_Page5_Word9 data:_N_VII_1_Page5_Word10 data:_N_VII_1_Page5_Word11 data:_N_VII_1_Page5_Word12 data:_N_VII_1_Page5_Word13 data:_N_VII_1_Page5_Word14 data:_N_VII_1_Page5_Word15 data:_N_VII_1_Page5_Word16 data:_N_VII_1_Page5_Word17 data:_N_VII_1_Page5_Word18 data:_N_VII_1_Page5_Word19 data:_N_VII_1_Page5_Word20 data:_N_VII_1_Page5_Word21 data:_N_VII_1_Page5_Word22 data:_N_VII_1_Page5_Word23 data:_N_VII_1_Page5_Word24 data:_N_VII_1_Page5_Word25 data:_N_VII_1_Page5_Word26 data:_N_VII_1_Page5_Word27 data:_N_VII_1_Page5_Word28 data:_N_VII_1_Page5_Word29 data:_N_VII_1_Page5_Word30 data:_N_VII_1_Page5_Word31 data:_N_VII_1_Page5_Word32 data:_N_VII_1_Page5_Word33 data:_N_VII_1_Page5_Word34 data:_N_VII_1_Page5_Word35 data:_N_VII_1_Page5_Word36 data:_N_VII_1_Page5_Word37 data:_N_VII_1_Page5_Word38 data:_N_VII_1_Page5_Word39 data:_N_VII_1_Page5_Word40 data:_N_VII_1_Page5_Word41 data:_N_VII_1_Page5_Word42 data:_N_VII_1_Page5_Word43 data:_N_VII_1_Page5_Word44 data:_N_VII_1_Page5_Word45 data:_N_VII_1_Page5_Word46 data:_N_VII_1_Page5_Word47 data:_N_VII_1_Page5_Word48 data:_N_VII_1_Page5_Word49 data:_N_VII_1_Page5_Word50 data:_N_VII_1_Page5_Word51 data:_N_VII_1_Page5_Word52 data:_N_VII_1_Page5_Word53 data:_N_VII_1_Page5_Word54 data:_N_VII_1_Page5_Word55 data:_N_VII_1_Page5_Word56 data:_N_VII_1_Page5_Word57 data:_N_VII_1_Page5_Word58 data:_N_VII_1_Page5_Word59 data:_N_VII_1_Page5_Word60 data:_N_VII_1_Page5_Word61 data:_N_VII_1_Page5_Word62 data:_N_VII_1_Page5_Word63 data:_N_VII_1_Page5_Word64 data:_N_VII_1_Page5_Word65 data:_N_VII_1_Page5_Word66 data:_N_VII_1_Page5_Word67 data:_N_VII_1_Page5_Word68 data:_N_VII_1_Page5_Word69 data:_N_VII_1_Page5_Word70 data:_N_VII_1_Page5_Word71 data:_N_VII_1_Page5_Word72 
data:_N_VII_1_Page5_Word73 data:_N_VII_1_Page5_Word74 data:_N_VII_1_Page5_Word75 data:_N_VII_1_Page5_Word76 data:_N_VII_1_Page5_Word77 data:_N_VII_1_Page5_Word78 data:_N_VII_1_Page5_Word79 data:_N_VII_1_Page5_Word80 data:_N_VII_1_Page5_Word81 data:_N_VII_1_Page5_Word82 data:_N_VII_1_Page5_Word83 data:_N_VII_1_Page5_Word84 data:_N_VII_1_Page5_Word85 data:_N_VII_1_Page5_Word86 data:_N_VII_1_Page5_Word87 data:_N_VII_1_Page5_Word88 data:_N_VII_1_Page5_Word89 data:_N_VII_1_Page5_Word90 data:_N_VII_1_Page5_Word91 data:_N_VII_1_Page5_Word92 data:_N_VII_1_Page5_Word93 data:_N_VII_1_Page5_Word94 data:_N_VII_1_Page5_Word95 data:_N_VII_1_Page5_Word96 data:_N_VII_1_Page5_Word97 data:_N_VII_1_Page5_Word98 data:_N_VII_1_Page5_Word99 data:_N_VII_1_Page5_Word100 data:_N_VII_1_Page5_Word101 data:_N_VII_1_Page5_Word102 data:_N_VII_1_Page5_Word103 data:_N_VII_1_Page5_Word104 data:_N_VII_1_Page5_Word105 data:_N_VII_1_Page5_Word106 data:_N_VII_1_Page5_Word107 data:_N_VII_1_Page5_Word108 data:_N_VII_1_Page5_Word109 data:_N_VII_1_Page5_Word110 data:_N_VII_1_Page5_Word111 data:_N_VII_1_Page5_Word112 data:_N_VII_1_Page5_Word113 data:_N_VII_1_Page5_Word114 data:_N_VII_1_Page5_Word115 data:_N_VII_1_Page5_Word116 data:_N_VII_1_Page5_Word117 data:_N_VII_1_Page5_Word118 data:_N_VII_1_Page5_Word119 data:_N_VII_1_Page5_Word120 data:_N_VII_1_Page5_Word121 data:_N_VII_1_Page5_Word122 data:_N_VII_1_Page5_Word123 data:_N_VII_1_Page5_Word124 data:_N_VII_1_Page5_Word125 data:_N_VII_1_Page5_Word126 data:_N_VII_1_Page5_Word127 data:_N_VII_1_Page5_Word128 data:_N_VII_1_Page5_Word129 data:_N_VII_1_Page5_Word130 data:_N_VII_1_Page5_Word131 data:_N_VII_1_Page5_Word132 data:_N_VII_1_Page5_Word133 data:_N_VII_1_Page5_Word134 data:_N_VII_1_Page5_Word135 data:_N_VII_1_Page5_Word136 data:_N_VII_1_Page5_Word137 data:_N_VII_1_Page5_Word138 data:_N_VII_1_Page5_Word139 ) .
+
+data:_N_VII_1_Page5_Line11 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "119.0"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 11 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "118.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line15 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "165.8"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 15 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "164.8"^^xsd:float .
+
+data:_N_VII_1_Page5_Line17 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "189.2"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 17 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "188.2"^^xsd:float .
+
+data:_N_VII_1_Page5_Line19 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "212.601"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 19 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "211.601"^^xsd:float .
+
+data:_N_VII_1_Page5_Line21 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "236.0"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 21 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "235.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line23 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "259.4"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 23 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "258.4"^^xsd:float .
+
+data:_N_VII_1_Page5_Line25 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "282.8"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 25 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "281.8"^^xsd:float .
+
+data:_N_VII_1_Page5_Line27 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "306.201"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 27 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "305.201"^^xsd:float .
+
+data:_N_VII_1_Page5_Line29 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "329.6"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 29 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "328.6"^^xsd:float .
+
+data:_N_VII_1_Page5_Line3 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "25.4"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 3 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "24.4"^^xsd:float .
+
+data:_N_VII_1_Page5_Line31 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "353.0"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 31 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "352.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line33 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "376.4"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 33 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "375.4"^^xsd:float .
+
+data:_N_VII_1_Page5_Line35 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "399.8"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 35 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "398.8"^^xsd:float .
+
+data:_N_VII_1_Page5_Line37 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "423.201"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 37 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "422.201"^^xsd:float .
+
+data:_N_VII_1_Page5_Line39 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "446.6"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 39 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "445.6"^^xsd:float .
+
+data:_N_VII_1_Page5_Line41 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "470.0"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 41 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "469.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line43 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "531.811"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 43 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "480.201"^^xsd:float .
+
+data:_N_VII_1_Page5_Line5 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "48.8"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 5 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "47.8"^^xsd:float .
+
+data:_N_VII_1_Page5_Line7 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "72.2"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 7 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "71.2"^^xsd:float .
+
+data:_N_VII_1_Page5_Line9 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "95.601"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 9 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "94.601"^^xsd:float .
+
+data:_N_VII_1_Page5_SVGImage0 a tln:SVGImage ;
+ tln:hasFileName "./svg/N_VII_1_page005_web.svg"^^xsd:string ;
+ tln:hasHeight "481.891"^^xsd:float ;
+ tln:hasPrimaryurl "/assets/svg/N_VII_1_page005_web.svg"^^xsd:string ;
+ tln:hasSecondaryurl "http://existdb-test.dasch.swiss/exist/rest/db/storage/nietzsche/svg/N_VII_1_page005_web.svg"^^xsd:string ;
+ tln:hasWidth "297.637"^^xsd:float .
+
+data:_N_VII_1_Page5_Word0 a tln:Word ;
+ tln:hasCleanText "$"^^xsd:string ;
+ tln:hasText "$"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word0_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line1 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word0_Style0 .
+
+data:_N_VII_1_Page5_Word0_Style0 a tln:Style ;
+ tln:styleHasCSS "color:#000000;"^^xsd:string ;
+ tln:styleHasColor data:_N_VII_1_Page5_Word0_Style0_Color0 ;
+ tln:styleHasFont "deutsche Schreibschrift"^^xsd:string ;
+ tln:styleHasWritingInstrument "schwarze Tinte"^^xsd:string .
+
+data:_N_VII_1_Page5_Word0_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "17.7"^^xsd:float ;
+ tln:hasHeight "7.672"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "10.028"^^xsd:float ;
+ tln:hasWidth "5.941"^^xsd:float .
+
+data:_N_VII_1_Page5_Word1 a tln:Word ;
+ tln:hasCleanText "Das"^^xsd:string ;
+ tln:hasText "Das"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word1_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line2 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word10 a tln:Word ;
+ tln:hasCleanText "wird"^^xsd:string ;
+ tln:hasText "wird"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word10_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line4 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word100 a tln:Word ;
+ tln:hasCleanText "ist"^^xsd:string ;
+ tln:hasText "ist"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word100_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line32 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word100_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "376.556"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "102.456"^^xsd:float ;
+ tln:hasTop "365.446"^^xsd:float ;
+ tln:hasWidth "9.309"^^xsd:float .
+
+data:_N_VII_1_Page5_Word101 a tln:Word ;
+ tln:hasCleanText "der"^^xsd:string ;
+ tln:hasText "der"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word101_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line32 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word101_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "376.197"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "114.942"^^xsd:float ;
+ tln:hasTop "365.087"^^xsd:float ;
+ tln:hasWidth "13.032"^^xsd:float .
+
+data:_N_VII_1_Page5_Word102 a tln:Word ;
+ tln:hasCleanText "Grundwille"^^xsd:string ;
+ tln:hasText "Grundwille."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word102_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line32 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word102_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "376.041"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "131.318"^^xsd:float ;
+ tln:hasTop "364.931"^^xsd:float ;
+ tln:hasWidth "47.718"^^xsd:float .
+
+data:_N_VII_1_Page5_Word103 a tln:Word ;
+ tln:hasCleanText "Wo"^^xsd:string ;
+ tln:hasText "Wo"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word103_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line34 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word103_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "399.613"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "388.503"^^xsd:float ;
+ tln:hasWidth "13.038"^^xsd:float .
+
+data:_N_VII_1_Page5_Word104 a tln:Word ;
+ tln:hasCleanText "es"^^xsd:string ;
+ tln:hasText "es"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word104_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line34 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word104_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "401.566"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "30.808"^^xsd:float ;
+ tln:hasTop "390.456"^^xsd:float ;
+ tln:hasWidth "7.832"^^xsd:float .
+
+data:_N_VII_1_Page5_Word105 a tln:Word ;
+ tln:hasCleanText "Gedächtniß"^^xsd:string ;
+ tln:hasText "„Gedächtniß“"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word105_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line34 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word105_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "399.441"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "42.22"^^xsd:float ;
+ tln:hasTop "388.331"^^xsd:float ;
+ tln:hasWidth "54.942"^^xsd:float .
+
+data:_N_VII_1_Page5_Word106 a tln:Word ;
+ tln:hasCleanText "giebt"^^xsd:string ;
+ tln:hasText "giebt,"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word106_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line34 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word106_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "399.519"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "101.662"^^xsd:float ;
+ tln:hasTop "388.409"^^xsd:float ;
+ tln:hasWidth "21.558"^^xsd:float .
+
+data:_N_VII_1_Page5_Word107 a tln:Word ;
+ tln:hasCleanText "hat"^^xsd:string ;
+ tln:hasText "hat"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word107_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line34 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word107_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "399.675"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "127.38"^^xsd:float ;
+ tln:hasTop "388.565"^^xsd:float ;
+ tln:hasWidth "12.849"^^xsd:float .
+
+data:_N_VII_1_Page5_Word108 a tln:Word ;
+ tln:hasCleanText "dieser"^^xsd:string ;
+ tln:hasText "dieser"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word108_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line34 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word108_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "399.597"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "143.406"^^xsd:float ;
+ tln:hasTop "388.487"^^xsd:float ;
+ tln:hasWidth "23.854"^^xsd:float .
+
+data:_N_VII_1_Page5_Word109 a tln:Word ;
+ tln:hasCleanText "Grund"^^xsd:string ;
+ tln:hasText "Grund-"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word109_TranskriptionPosition0,
+ data:_N_VII_1_Page5_Word109_TranskriptionPosition1 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line34 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word13_Style0,
+ data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word109_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "399.441"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "170.604"^^xsd:float ;
+ tln:hasTop "388.331"^^xsd:float ;
+ tln:hasWidth "25.777"^^xsd:float .
+
+data:_N_VII_1_Page5_Word109_TranskriptionPosition1 a tln:TranskriptionPosition ;
+ tln:hasBottom "406.238"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "196.968"^^xsd:float ;
+ tln:hasTop "395.128"^^xsd:float ;
+ tln:hasWidth "2.519"^^xsd:float .
+
+data:_N_VII_1_Page5_Word10_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "48.597"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "90.88"^^xsd:float ;
+ tln:hasTop "37.487"^^xsd:float ;
+ tln:hasWidth "18.186"^^xsd:float .
+
+data:_N_VII_1_Page5_Word11 a tln:Word ;
+ tln:hasCleanText "ein"^^xsd:string ;
+ tln:hasText "ein"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word11_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line4 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word110 a tln:Word ;
+ tln:hasCleanText "wille"^^xsd:string ;
+ tln:hasText "wille"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word110_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line36 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word110_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "423.075"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "411.965"^^xsd:float ;
+ tln:hasWidth "19.233"^^xsd:float .
+
+data:_N_VII_1_Page5_Word111 a tln:Word ;
+ tln:hasCleanText "gewaltet"^^xsd:string ;
+ tln:hasText "gewaltet."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word111_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line36 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word111_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "422.919"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "36.993"^^xsd:float ;
+ tln:hasTop "411.809"^^xsd:float ;
+ tln:hasWidth "36.084"^^xsd:float .
+
+data:_N_VII_1_Page5_Word112 a tln:Word ;
+ tln:hasCleanText "–"^^xsd:string ;
+ tln:hasText "–"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word112_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line36 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word112_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "429.763"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "77.442"^^xsd:float ;
+ tln:hasTop "418.653"^^xsd:float ;
+ tln:hasWidth "5.3"^^xsd:float .
+
+data:_N_VII_1_Page5_Word113 a tln:Word ;
+ tln:hasCleanText "In"^^xsd:string ;
+ tln:hasText "In"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word113_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line36 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word113_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "423.075"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "85.741"^^xsd:float ;
+ tln:hasTop "411.965"^^xsd:float ;
+ tln:hasWidth "8.306"^^xsd:float .
+
+data:_N_VII_1_Page5_Word114 a tln:Word ;
+ tln:hasCleanText "der"^^xsd:string ;
+ tln:hasText "der"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word114_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line36 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word114_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "422.997"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "97.355"^^xsd:float ;
+ tln:hasTop "411.887"^^xsd:float ;
+ tln:hasWidth "13.034"^^xsd:float .
+
+data:_N_VII_1_Page5_Word115 a tln:Word ;
+ tln:hasCleanText "Wirklichkeit"^^xsd:string ;
+ tln:hasText "Wirklichkeit"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word115_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line36 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word115_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "423.013"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "113.734"^^xsd:float ;
+ tln:hasTop "411.903"^^xsd:float ;
+ tln:hasWidth "50.13"^^xsd:float .
+
+data:_N_VII_1_Page5_Word116 a tln:Word ;
+ tln:hasCleanText "giebt"^^xsd:string ;
+ tln:hasText "giebt"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word116_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line36 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word116_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "422.919"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "167.042"^^xsd:float ;
+ tln:hasTop "411.809"^^xsd:float ;
+ tln:hasWidth "19.791"^^xsd:float .
+
+data:_N_VII_1_Page5_Word117 a tln:Word ;
+ tln:hasCleanText "es"^^xsd:string ;
+ tln:hasText "es"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word117_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line38 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word117_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "448.366"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "437.256"^^xsd:float ;
+ tln:hasWidth "7.833"^^xsd:float .
+
+data:_N_VII_1_Page5_Word118 a tln:Word ;
+ tln:hasCleanText "kein"^^xsd:string ;
+ tln:hasText "kein"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word118_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line38 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word118_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "446.475"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "25.439"^^xsd:float ;
+ tln:hasTop "435.365"^^xsd:float ;
+ tln:hasWidth "17.306"^^xsd:float .
+
+data:_N_VII_1_Page5_Word119 a tln:Word ;
+ tln:hasCleanText "logisches"^^xsd:string ;
+ tln:hasText "logisches"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word119_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line38 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word119_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "446.319"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "46.054"^^xsd:float ;
+ tln:hasTop "435.209"^^xsd:float ;
+ tln:hasWidth "35.948"^^xsd:float .
+
+data:_N_VII_1_Page5_Word11_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "48.957"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "112.784"^^xsd:float ;
+ tln:hasTop "37.847"^^xsd:float ;
+ tln:hasWidth "12.341"^^xsd:float .
+
+data:_N_VII_1_Page5_Word12 a tln:Word ;
+ tln:hasCleanText "Denken"^^xsd:string ;
+ tln:hasText "Denken"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word12_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line4 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word120 a tln:Word ;
+ tln:hasCleanText "Denken"^^xsd:string ;
+ tln:hasText "Denken,"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word120_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line38 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word120_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "446.475"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "85.584"^^xsd:float ;
+ tln:hasTop "435.365"^^xsd:float ;
+ tln:hasWidth "33.618"^^xsd:float .
+
+data:_N_VII_1_Page5_Word121 a tln:Word ;
+ tln:hasCleanText "u."^^xsd:string ;
+ tln:hasText "u."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word121_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line38 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word121_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "448.366"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "123.364"^^xsd:float ;
+ tln:hasTop "437.256"^^xsd:float ;
+ tln:hasWidth "6.889"^^xsd:float .
+
+data:_N_VII_1_Page5_Word122 a tln:Word ;
+ tln:hasCleanText "kein"^^xsd:string ;
+ tln:hasText "kein"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word122_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line38 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word122_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "446.475"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "134.619"^^xsd:float ;
+ tln:hasTop "435.365"^^xsd:float ;
+ tln:hasWidth "17.306"^^xsd:float .
+
+data:_N_VII_1_Page5_Word123 a tln:Word ;
+ tln:hasCleanText "Satz"^^xsd:string ;
+ tln:hasText "Satz"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word123_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line38 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word123_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "446.241"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "155.234"^^xsd:float ;
+ tln:hasTop "435.131"^^xsd:float ;
+ tln:hasWidth "16.75"^^xsd:float .
+
+data:_N_VII_1_Page5_Word124 a tln:Word ;
+ tln:hasCleanText "der"^^xsd:string ;
+ tln:hasText "der"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word124_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line38 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word124_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "446.397"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "175.299"^^xsd:float ;
+ tln:hasTop "435.287"^^xsd:float ;
+ tln:hasWidth "13.034"^^xsd:float .
+
+data:_N_VII_1_Page5_Word125 a tln:Word ;
+ tln:hasCleanText "Arithmetik"^^xsd:string ;
+ tln:hasText "Arithmetik"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word125_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line40 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word125_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "469.875"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "458.765"^^xsd:float ;
+ tln:hasWidth "44.056"^^xsd:float .
+
+data:_N_VII_1_Page5_Word126 a tln:Word ;
+ tln:hasCleanText "u."^^xsd:string ;
+ tln:hasText "u."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word126_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line40 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word126_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "471.766"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "61.109"^^xsd:float ;
+ tln:hasTop "460.656"^^xsd:float ;
+ tln:hasWidth "6.889"^^xsd:float .
+
+data:_N_VII_1_Page5_Word127 a tln:Word ;
+ tln:hasCleanText "Geometrie"^^xsd:string ;
+ tln:hasText "Geometrie"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word127_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line40 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word127_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "469.641"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "72.364"^^xsd:float ;
+ tln:hasTop "458.531"^^xsd:float ;
+ tln:hasWidth "42.253"^^xsd:float .
+
+data:_N_VII_1_Page5_Word128 a tln:Word ;
+ tln:hasCleanText "kann"^^xsd:string ;
+ tln:hasText "kann"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word128_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line40 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word128_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "469.875"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "118.354"^^xsd:float ;
+ tln:hasTop "458.765"^^xsd:float ;
+ tln:hasWidth "19.926"^^xsd:float .
+
+data:_N_VII_1_Page5_Word129 a tln:Word ;
+ tln:hasCleanText "aus"^^xsd:string ;
+ tln:hasText "aus"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word129_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line40 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word129_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "471.766"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "141.589"^^xsd:float ;
+ tln:hasTop "460.656"^^xsd:float ;
+ tln:hasWidth "13.008"^^xsd:float .
+
+data:_N_VII_1_Page5_Word12_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "48.675"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "128.433"^^xsd:float ;
+ tln:hasTop "37.565"^^xsd:float ;
+ tln:hasWidth "31.716"^^xsd:float .
+
+data:_N_VII_1_Page5_Word13 a tln:Word ;
+ tln:hasCleanText "er="^^xsd:string ;
+ tln:hasText "er="^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word13_TranskriptionPosition0,
+ data:_N_VII_1_Page5_Word13_TranskriptionPosition1 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line4 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word13_Style0,
+ data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word130 a tln:Word ;
+ tln:hasCleanText "ihr"^^xsd:string ;
+ tln:hasText "ihr"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word130_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line40 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word130_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "469.875"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "158.179"^^xsd:float ;
+ tln:hasTop "458.765"^^xsd:float ;
+ tln:hasWidth "11.194"^^xsd:float .
+
+data:_N_VII_1_Page5_Word131 a tln:Word ;
+ tln:hasCleanText "genommen"^^xsd:string ;
+ tln:hasText "genommen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word131_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line40 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word131_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "469.719"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "172.719"^^xsd:float ;
+ tln:hasTop "458.609"^^xsd:float ;
+ tln:hasWidth "45.166"^^xsd:float .
+
+data:_N_VII_1_Page5_Word132 a tln:Word ;
+ tln:hasCleanText "sein"^^xsd:string ;
+ tln:hasText "sein,"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word132_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line40 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word132_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "470.156"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "221.194"^^xsd:float ;
+ tln:hasTop "459.046"^^xsd:float ;
+ tln:hasWidth "17.918"^^xsd:float .
+
+data:_N_VII_1_Page5_Word133 a tln:Word ;
+ tln:hasCleanText "weil"^^xsd:string ;
+ tln:hasText "weil"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word133_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line42 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word133_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "481.075"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "64.315"^^xsd:float ;
+ tln:hasTop "469.965"^^xsd:float ;
+ tln:hasWidth "17.109"^^xsd:float .
+
+data:_N_VII_1_Page5_Word134 a tln:Word ;
+ tln:hasCleanText "er"^^xsd:string ;
+ tln:hasText "er"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word134_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line42 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word134_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "482.888"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "84.73"^^xsd:float ;
+ tln:hasTop "471.778"^^xsd:float ;
+ tln:hasWidth "7.879"^^xsd:float .
+
+data:_N_VII_1_Page5_Word135 a tln:Word ;
+ tln:hasCleanText "g"^^xsd:string ;
+ tln:hasText "g"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word135_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line42 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word135_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "480.919"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "96.105"^^xsd:float ;
+ tln:hasTop "469.809"^^xsd:float ;
+ tln:hasWidth "4.203"^^xsd:float .
+
+data:_N_VII_1_Page5_Word136 a tln:Word ;
+ tln:hasCleanText "a"^^xsd:string ;
+ tln:hasText "a"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word136_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line42 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word136_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "480.919"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "100.7"^^xsd:float ;
+ tln:hasTop "469.809"^^xsd:float ;
+ tln:hasWidth "3.812"^^xsd:float .
+
+data:_N_VII_1_Page5_Word137 a tln:Word ;
+ tln:hasCleanText "r"^^xsd:string ;
+ tln:hasText "r"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word137_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line42 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word137_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "480.919"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "105.115"^^xsd:float ;
+ tln:hasTop "469.809"^^xsd:float ;
+ tln:hasWidth "2.984"^^xsd:float .
+
+data:_N_VII_1_Page5_Word138 a tln:Word ;
+ tln:hasCleanText "nicht"^^xsd:string ;
+ tln:hasText "nicht"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word138_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line42 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word138_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "481.075"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "111.595"^^xsd:float ;
+ tln:hasTop "469.965"^^xsd:float ;
+ tln:hasWidth "20.761"^^xsd:float .
+
+data:_N_VII_1_Page5_Word139 a tln:Word ;
+ tln:hasCleanText "vorkommt"^^xsd:string ;
+ tln:hasText "vorkommt."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word139_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line42 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word139_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "481.075"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "135.535"^^xsd:float ;
+ tln:hasTop "469.965"^^xsd:float ;
+ tln:hasWidth "43.694"^^xsd:float .
+
+data:_N_VII_1_Page5_Word13_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "50.488"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "163.457"^^xsd:float ;
+ tln:hasTop "39.378"^^xsd:float ;
+ tln:hasWidth "7.879"^^xsd:float .
+
+data:_N_VII_1_Page5_Word13_TranskriptionPosition1 a tln:TranskriptionPosition ;
+ tln:hasBottom "53.847"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "171.524"^^xsd:float ;
+ tln:hasTop "42.737"^^xsd:float ;
+ tln:hasWidth "2.519"^^xsd:float .
+
+data:_N_VII_1_Page5_Word14 a tln:Word ;
+ tln:hasCleanText "dichtet"^^xsd:string ;
+ tln:hasText "dichtet,"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word14_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line6 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word14_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "71.997"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "60.887"^^xsd:float ;
+ tln:hasWidth "30.041"^^xsd:float .
+
+data:_N_VII_1_Page5_Word15 a tln:Word ;
+ tln:hasCleanText "wo"^^xsd:string ;
+ tln:hasText "wo"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word15_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line6 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word15_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "73.966"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "48.227"^^xsd:float ;
+ tln:hasTop "62.856"^^xsd:float ;
+ tln:hasWidth "11.927"^^xsd:float .
+
+data:_N_VII_1_Page5_Word16 a tln:Word ;
+ tln:hasCleanText "ein"^^xsd:string ;
+ tln:hasText "ein"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word16_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line6 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word16_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "72.357"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "63.902"^^xsd:float ;
+ tln:hasTop "61.247"^^xsd:float ;
+ tln:hasWidth "12.341"^^xsd:float .
+
+data:_N_VII_1_Page5_Word17 a tln:Word ;
+ tln:hasCleanText "Gedanke"^^xsd:string ;
+ tln:hasText "Gedanke"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word17_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line6 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word17_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "71.841"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "79.552"^^xsd:float ;
+ tln:hasTop "60.731"^^xsd:float ;
+ tln:hasWidth "35.493"^^xsd:float .
+
+data:_N_VII_1_Page5_Word18 a tln:Word ;
+ tln:hasCleanText "als"^^xsd:string ;
+ tln:hasText "als"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word18_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line6 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word18_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "72.075"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "118.782"^^xsd:float ;
+ tln:hasTop "60.965"^^xsd:float ;
+ tln:hasWidth "10.208"^^xsd:float .
+
+data:_N_VII_1_Page5_Word19 a tln:Word ;
+ tln:hasCleanText "Ursache"^^xsd:string ;
+ tln:hasText "Ursache"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word19_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line6 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word19_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "71.95"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "132.572"^^xsd:float ;
+ tln:hasTop "60.84"^^xsd:float ;
+ tln:hasWidth "32.173"^^xsd:float .
+
+data:_N_VII_1_Page5_Word1_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "24.4"^^xsd:float ;
+ tln:hasHeight "9.125"^^xsd:float ;
+ tln:hasLeft "31.032"^^xsd:float ;
+ tln:hasTop "15.275"^^xsd:float ;
+ tln:hasWidth "14.666"^^xsd:float .
+
+data:_N_VII_1_Page5_Word2 a tln:Word ;
+ tln:hasCleanText "Muster"^^xsd:string ;
+ tln:hasText "Muster"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word2_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line2 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word20 a tln:Word ;
+ tln:hasCleanText "eines"^^xsd:string ;
+ tln:hasText "eines"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word20_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line6 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word20_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "72.357"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "168.482"^^xsd:float ;
+ tln:hasTop "61.247"^^xsd:float ;
+ tln:hasWidth "20.338"^^xsd:float .
+
+data:_N_VII_1_Page5_Word21 a tln:Word ;
+ tln:hasCleanText "anderen"^^xsd:string ;
+ tln:hasText "anderen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word21_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line8 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word21_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "95.397"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "84.287"^^xsd:float ;
+ tln:hasWidth "32.791"^^xsd:float .
+
+data:_N_VII_1_Page5_Word22 a tln:Word ;
+ tln:hasCleanText "Gedankens"^^xsd:string ;
+ tln:hasText "Gedankens"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word22_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line8 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word22_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "95.241"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "50.124"^^xsd:float ;
+ tln:hasTop "84.131"^^xsd:float ;
+ tln:hasWidth "44.678"^^xsd:float .
+
+data:_N_VII_1_Page5_Word23 a tln:Word ;
+ tln:hasCleanText "gesetzt"^^xsd:string ;
+ tln:hasText "gesetzt"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word23_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line8 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word23_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "95.319"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "98.384"^^xsd:float ;
+ tln:hasTop "84.209"^^xsd:float ;
+ tln:hasWidth "28.251"^^xsd:float .
+
+data:_N_VII_1_Page5_Word24 a tln:Word ;
+ tln:hasCleanText "wird"^^xsd:string ;
+ tln:hasText "wird;"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word24_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line8 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word24_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "95.397"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "129.814"^^xsd:float ;
+ tln:hasTop "84.287"^^xsd:float ;
+ tln:hasWidth "20.498"^^xsd:float .
+
+data:_N_VII_1_Page5_Word25 a tln:Word ;
+ tln:hasCleanText "alle"^^xsd:string ;
+ tln:hasText "alle"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word25_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line8 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word25_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "95.475"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "154.474"^^xsd:float ;
+ tln:hasTop "84.365"^^xsd:float ;
+ tln:hasWidth "13.528"^^xsd:float .
+
+data:_N_VII_1_Page5_Word26 a tln:Word ;
+ tln:hasCleanText "Affekte"^^xsd:string ;
+ tln:hasText "Affekte,"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word26_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line8 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word26_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "95.475"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "171.739"^^xsd:float ;
+ tln:hasTop "84.365"^^xsd:float ;
+ tln:hasWidth "30.763"^^xsd:float .
+
+data:_N_VII_1_Page5_Word27 a tln:Word ;
+ tln:hasCleanText "alles"^^xsd:string ;
+ tln:hasText "alles"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word27_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line10 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word27_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "118.875"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "107.765"^^xsd:float ;
+ tln:hasWidth "17.358"^^xsd:float .
+
+data:_N_VII_1_Page5_Word28 a tln:Word ;
+ tln:hasCleanText "Fühlen"^^xsd:string ;
+ tln:hasText "Fühlen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word28_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line10 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word28_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "118.875"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "34.964"^^xsd:float ;
+ tln:hasTop "107.765"^^xsd:float ;
+ tln:hasWidth "28.026"^^xsd:float .
+
+data:_N_VII_1_Page5_Word29 a tln:Word ;
+ tln:hasCleanText "u."^^xsd:string ;
+ tln:hasText "u."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word29_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line10 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word29_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "120.766"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "66.299"^^xsd:float ;
+ tln:hasTop "109.656"^^xsd:float ;
+ tln:hasWidth "6.889"^^xsd:float .
+
+data:_N_VII_1_Page5_Word2_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "25.275"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "49.278"^^xsd:float ;
+ tln:hasTop "14.165"^^xsd:float ;
+ tln:hasWidth "28.874"^^xsd:float .
+
+data:_N_VII_1_Page5_Word3 a tln:Word ;
+ tln:hasCleanText "einer"^^xsd:string ;
+ tln:hasText "einer"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word3_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line2 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word30 a tln:Word ;
+ tln:hasCleanText "Wollen"^^xsd:string ;
+ tln:hasText "Wollen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word30_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line10 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word30_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "118.813"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "77.554"^^xsd:float ;
+ tln:hasTop "107.703"^^xsd:float ;
+ tln:hasWidth "28.539"^^xsd:float .
+
+data:_N_VII_1_Page5_Word31 a tln:Word ;
+ tln:hasCleanText "wird"^^xsd:string ;
+ tln:hasText "wird"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word31_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line10 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word31_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "118.797"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "109.402"^^xsd:float ;
+ tln:hasTop "107.687"^^xsd:float ;
+ tln:hasWidth "18.186"^^xsd:float .
+
+data:_N_VII_1_Page5_Word32 a tln:Word ;
+ tln:hasCleanText "hinweggedacht"^^xsd:string ;
+ tln:hasText "hinweggedacht."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word32_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line10 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word32_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "118.719"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "131.307"^^xsd:float ;
+ tln:hasTop "107.609"^^xsd:float ;
+ tln:hasWidth "63.199"^^xsd:float .
+
+data:_N_VII_1_Page5_Word33 a tln:Word ;
+ tln:hasCleanText "Es"^^xsd:string ;
+ tln:hasText "Es"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word33_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line10 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word33_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "118.875"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "198.872"^^xsd:float ;
+ tln:hasTop "107.765"^^xsd:float ;
+ tln:hasWidth "8.763"^^xsd:float .
+
+data:_N_VII_1_Page5_Word34 a tln:Word ;
+ tln:hasCleanText "kommt"^^xsd:string ;
+ tln:hasText "kommt"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word34_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line12 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word34_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "142.275"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "131.165"^^xsd:float ;
+ tln:hasWidth "28.881"^^xsd:float .
+
+data:_N_VII_1_Page5_Word35 a tln:Word ;
+ tln:hasCleanText "dergleichen"^^xsd:string ;
+ tln:hasText "dergleichen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word35_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line12 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word35_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "142.119"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "46.084"^^xsd:float ;
+ tln:hasTop "131.009"^^xsd:float ;
+ tln:hasWidth "47.091"^^xsd:float .
+
+data:_N_VII_1_Page5_Word36 a tln:Word ;
+ tln:hasCleanText "in"^^xsd:string ;
+ tln:hasText "in"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word36_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line12 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word36_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "142.557"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "96.484"^^xsd:float ;
+ tln:hasTop "131.447"^^xsd:float ;
+ tln:hasWidth "7.746"^^xsd:float .
+
+data:_N_VII_1_Page5_Word37 a tln:Word ;
+ tln:hasCleanText "der"^^xsd:string ;
+ tln:hasText "der"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word37_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line12 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word37_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "142.197"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "107.539"^^xsd:float ;
+ tln:hasTop "131.087"^^xsd:float ;
+ tln:hasWidth "13.034"^^xsd:float .
+
+data:_N_VII_1_Page5_Word38 a tln:Word ;
+ tln:hasCleanText "Wirklichkeit"^^xsd:string ;
+ tln:hasText "Wirklichkeit"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word38_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line12 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word38_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "142.213"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "123.919"^^xsd:float ;
+ tln:hasTop "131.103"^^xsd:float ;
+ tln:hasWidth "50.13"^^xsd:float .
+
+data:_N_VII_1_Page5_Word39 a tln:Word ;
+ tln:hasCleanText "nicht"^^xsd:string ;
+ tln:hasText "nicht"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word39_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line12 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word39_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "142.275"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "177.228"^^xsd:float ;
+ tln:hasTop "131.165"^^xsd:float ;
+ tln:hasWidth "20.761"^^xsd:float .
+
+data:_N_VII_1_Page5_Word3_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "25.557"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "81.496"^^xsd:float ;
+ tln:hasTop "14.447"^^xsd:float ;
+ tln:hasWidth "20.38"^^xsd:float .
+
+data:_N_VII_1_Page5_Word4 a tln:Word ;
+ tln:hasCleanText "vollständigen"^^xsd:string ;
+ tln:hasText "vollständigen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word4_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line2 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word40 a tln:Word ;
+ tln:hasCleanText "anders"^^xsd:string ;
+ tln:hasText "anders"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word40_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line13 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word40_Style0 .
+
+data:_N_VII_1_Page5_Word40_Style0 a tln:Style ;
+ tln:styleHasCSS "font-size:80%;color:#000000;"^^xsd:string ;
+ tln:styleHasColor data:_N_VII_1_Page5_Word0_Style0_Color0 ;
+ tln:styleHasFont "deutsche Schreibschrift"^^xsd:string ;
+ tln:styleHasWritingInstrument "schwarze Tinte"^^xsd:string .
+
+data:_N_VII_1_Page5_Word40_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "155.1"^^xsd:float ;
+ tln:hasHeight "7.047"^^xsd:float ;
+ tln:hasLeft "103.488"^^xsd:float ;
+ tln:hasTop "148.053"^^xsd:float ;
+ tln:hasWidth "18.644"^^xsd:float .
+
+data:_N_VII_1_Page5_Word41 a tln:Word ;
+ tln:hasCleanText "vor"^^xsd:string ;
+ tln:hasText "vor:"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word41_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line14 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word41_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "167.488"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "156.378"^^xsd:float ;
+ tln:hasWidth "14.776"^^xsd:float .
+
+data:_N_VII_1_Page5_Word42 a tln:Word ;
+ tln:hasCleanText "diese"^^xsd:string ;
+ tln:hasText "diese"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word42_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line14 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word42_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "165.597"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "33.164"^^xsd:float ;
+ tln:hasTop "154.487"^^xsd:float ;
+ tln:hasWidth "19.979"^^xsd:float .
+
+data:_N_VII_1_Page5_Word43 a tln:Word ;
+ tln:hasCleanText "ist"^^xsd:string ;
+ tln:hasText "ist"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word43_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line14 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word43_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "165.956"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "56.878"^^xsd:float ;
+ tln:hasTop "154.846"^^xsd:float ;
+ tln:hasWidth "9.309"^^xsd:float .
+
+data:_N_VII_1_Page5_Word44 a tln:Word ;
+ tln:hasCleanText "unsäglich"^^xsd:string ;
+ tln:hasText "unsäglich"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word44_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line14 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word44_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "165.519"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "69.364"^^xsd:float ;
+ tln:hasTop "154.409"^^xsd:float ;
+ tln:hasWidth "38.103"^^xsd:float .
+
+data:_N_VII_1_Page5_Word45 a tln:Word ;
+ tln:hasCleanText "complicirt"^^xsd:string ;
+ tln:hasText "complicirt."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word45_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line14 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word45_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "165.613"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "110.977"^^xsd:float ;
+ tln:hasTop "154.503"^^xsd:float ;
+ tln:hasWidth "42.484"^^xsd:float .
+
+data:_N_VII_1_Page5_Word46 a tln:Word ;
+ tln:hasCleanText "Dadurch"^^xsd:string ;
+ tln:hasText "Dadurch"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word46_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line14 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word46_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "165.597"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "157.824"^^xsd:float ;
+ tln:hasTop "154.487"^^xsd:float ;
+ tln:hasWidth "35.025"^^xsd:float .
+
+data:_N_VII_1_Page5_Word47 a tln:Word ;
+ tln:hasCleanText "daß"^^xsd:string ;
+ tln:hasText "daß"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word47_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line16 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word47_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "188.888"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "177.778"^^xsd:float ;
+ tln:hasWidth "14.384"^^xsd:float .
+
+data:_N_VII_1_Page5_Word48 a tln:Word ;
+ tln:hasCleanText "wir"^^xsd:string ;
+ tln:hasText "wir"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word48_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line16 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word48_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "189.356"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "31.89"^^xsd:float ;
+ tln:hasTop "178.246"^^xsd:float ;
+ tln:hasWidth "13.402"^^xsd:float .
+
+data:_N_VII_1_Page5_Word49 a tln:Word ;
+ tln:hasCleanText "jene"^^xsd:string ;
+ tln:hasText "jene"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word49_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line16 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word49_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "187.325"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "48.636"^^xsd:float ;
+ tln:hasTop "176.215"^^xsd:float ;
+ tln:hasWidth "16.325"^^xsd:float .
+
+data:_N_VII_1_Page5_Word4_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "25.119"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "105.22"^^xsd:float ;
+ tln:hasTop "14.009"^^xsd:float ;
+ tln:hasWidth "53.509"^^xsd:float .
+
+data:_N_VII_1_Page5_Word5 a tln:Word ;
+ tln:hasCleanText "Fiction"^^xsd:string ;
+ tln:hasText "Fiction"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word5_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line2 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word5_Style0 .
+
+data:_N_VII_1_Page5_Word50 a tln:Word ;
+ tln:hasCleanText "Fiction"^^xsd:string ;
+ tln:hasText "Fiction"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word50_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line16 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word5_Style0 .
+
+data:_N_VII_1_Page5_Word50_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "188.902"^^xsd:float ;
+ tln:hasHeight "10.892"^^xsd:float ;
+ tln:hasLeft "68.702"^^xsd:float ;
+ tln:hasTop "178.01"^^xsd:float ;
+ tln:hasWidth "27.497"^^xsd:float .
+
+data:_N_VII_1_Page5_Word51 a tln:Word ;
+ tln:hasCleanText "als"^^xsd:string ;
+ tln:hasText "als"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word51_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line16 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word51_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "189.075"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "100.785"^^xsd:float ;
+ tln:hasTop "177.965"^^xsd:float ;
+ tln:hasWidth "10.206"^^xsd:float .
+
+data:_N_VII_1_Page5_Word52 a tln:Word ;
+ tln:hasCleanText "Schema"^^xsd:string ;
+ tln:hasText "Schema"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word52_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line16 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word52_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "188.841"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "114.573"^^xsd:float ;
+ tln:hasTop "177.731"^^xsd:float ;
+ tln:hasWidth "31.202"^^xsd:float .
+
+data:_N_VII_1_Page5_Word53 a tln:Word ;
+ tln:hasCleanText "an="^^xsd:string ;
+ tln:hasText "an="^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word53_TranskriptionPosition0,
+ data:_N_VII_1_Page5_Word53_TranskriptionPosition1 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line16 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word13_Style0,
+ data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word53_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "190.888"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "149.219"^^xsd:float ;
+ tln:hasTop "179.778"^^xsd:float ;
+ tln:hasWidth "9.603"^^xsd:float .
+
+data:_N_VII_1_Page5_Word53_TranskriptionPosition1 a tln:TranskriptionPosition ;
+ tln:hasBottom "194.247"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "158.986"^^xsd:float ;
+ tln:hasTop "183.137"^^xsd:float ;
+ tln:hasWidth "2.519"^^xsd:float .
+
+data:_N_VII_1_Page5_Word54 a tln:Word ;
+ tln:hasCleanText "legen"^^xsd:string ;
+ tln:hasText "legen,"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word54_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line18 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word54_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "212.319"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.023"^^xsd:float ;
+ tln:hasTop "201.209"^^xsd:float ;
+ tln:hasWidth "23.428"^^xsd:float .
+
+data:_N_VII_1_Page5_Word55 a tln:Word ;
+ tln:hasCleanText "also"^^xsd:string ;
+ tln:hasText "also"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word55_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line18 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word55_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "212.475"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "41.611"^^xsd:float ;
+ tln:hasTop "201.365"^^xsd:float ;
+ tln:hasWidth "15.004"^^xsd:float .
+
+data:_N_VII_1_Page5_Word56 a tln:Word ;
+ tln:hasCleanText "das"^^xsd:string ;
+ tln:hasText "das"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word56_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line18 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word56_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "212.397"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "60.361"^^xsd:float ;
+ tln:hasTop "201.287"^^xsd:float ;
+ tln:hasWidth "12.806"^^xsd:float .
+
+data:_N_VII_1_Page5_Word57 a tln:Word ;
+ tln:hasCleanText "thatsächl"^^xsd:string ;
+ tln:hasText "thatsächl."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word57_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line18 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word57_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "212.475"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "76.747"^^xsd:float ;
+ tln:hasTop "201.365"^^xsd:float ;
+ tln:hasWidth "37.94"^^xsd:float .
+
+data:_N_VII_1_Page5_Word58 a tln:Word ;
+ tln:hasCleanText "Geschehen"^^xsd:string ;
+ tln:hasText "Geschehen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word58_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line18 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word58_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "212.241"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "119.051"^^xsd:float ;
+ tln:hasTop "201.131"^^xsd:float ;
+ tln:hasWidth "44.773"^^xsd:float .
+
+data:_N_VII_1_Page5_Word59 a tln:Word ;
+ tln:hasCleanText "beim"^^xsd:string ;
+ tln:hasText "beim"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word59_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line18 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word59_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "212.397"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "167.131"^^xsd:float ;
+ tln:hasTop "201.287"^^xsd:float ;
+ tln:hasWidth "19.787"^^xsd:float .
+
+data:_N_VII_1_Page5_Word5_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "25.102"^^xsd:float ;
+ tln:hasHeight "10.892"^^xsd:float ;
+ tln:hasLeft "162.046"^^xsd:float ;
+ tln:hasTop "14.21"^^xsd:float ;
+ tln:hasWidth "27.497"^^xsd:float .
+
+data:_N_VII_1_Page5_Word6 a tln:Word ;
+ tln:hasCleanText "ist"^^xsd:string ;
+ tln:hasText "ist"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word6_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line4 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word60 a tln:Word ;
+ tln:hasCleanText "Denken"^^xsd:string ;
+ tln:hasText "Denken"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word60_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line20 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word60_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "235.875"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.023"^^xsd:float ;
+ tln:hasTop "224.765"^^xsd:float ;
+ tln:hasWidth "31.711"^^xsd:float .
+
+data:_N_VII_1_Page5_Word61 a tln:Word ;
+ tln:hasCleanText "gleichsam"^^xsd:string ;
+ tln:hasText "gleichsam"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word61_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line20 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word61_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "235.719"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "49.04"^^xsd:float ;
+ tln:hasTop "224.609"^^xsd:float ;
+ tln:hasWidth "39.827"^^xsd:float .
+
+data:_N_VII_1_Page5_Word62 a tln:Word ;
+ tln:hasCleanText "durch"^^xsd:string ;
+ tln:hasText "durch"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word62_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line20 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word62_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "235.797"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "92.269"^^xsd:float ;
+ tln:hasTop "224.687"^^xsd:float ;
+ tln:hasWidth "23.597"^^xsd:float .
+
+data:_N_VII_1_Page5_Word63 a tln:Word ;
+ tln:hasCleanText "einen"^^xsd:string ;
+ tln:hasText "einen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word63_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line20 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word63_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "236.156"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "119.172"^^xsd:float ;
+ tln:hasTop "225.046"^^xsd:float ;
+ tln:hasWidth "22.287"^^xsd:float .
+
+data:_N_VII_1_Page5_Word64 a tln:Word ;
+ tln:hasCleanText "Simplifications"^^xsd:string ;
+ tln:hasText "Simplifications-"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word64_TranskriptionPosition0,
+ data:_N_VII_1_Page5_Word64_TranskriptionPosition1 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line20 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word13_Style0,
+ data:_N_VII_1_Page5_Word5_Style0 .
+
+data:_N_VII_1_Page5_Word64_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "235.452"^^xsd:float ;
+ tln:hasHeight "10.892"^^xsd:float ;
+ tln:hasLeft "144.775"^^xsd:float ;
+ tln:hasTop "224.56"^^xsd:float ;
+ tln:hasWidth "63.011"^^xsd:float .
+
+data:_N_VII_1_Page5_Word64_TranskriptionPosition1 a tln:TranskriptionPosition ;
+ tln:hasBottom "242.438"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "208.159"^^xsd:float ;
+ tln:hasTop "231.328"^^xsd:float ;
+ tln:hasWidth "2.519"^^xsd:float .
+
+data:_N_VII_1_Page5_Word65 a tln:Word ;
+ tln:hasCleanText "apparat"^^xsd:string ;
+ tln:hasText "apparat"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word65_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line22 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word65_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "259.213"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.023"^^xsd:float ;
+ tln:hasTop "248.103"^^xsd:float ;
+ tln:hasWidth "29.735"^^xsd:float .
+
+data:_N_VII_1_Page5_Word66 a tln:Word ;
+ tln:hasCleanText "filtriren"^^xsd:string ;
+ tln:hasText "filtriren:"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word66_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line22 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word66_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "259.275"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "46.937"^^xsd:float ;
+ tln:hasTop "248.165"^^xsd:float ;
+ tln:hasWidth "31.976"^^xsd:float .
+
+data:_N_VII_1_Page5_Word67 a tln:Word ;
+ tln:hasCleanText "bringen"^^xsd:string ;
+ tln:hasText "bringen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word67_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line22 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word67_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "259.119"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "83.277"^^xsd:float ;
+ tln:hasTop "248.009"^^xsd:float ;
+ tln:hasWidth "30.735"^^xsd:float .
+
+data:_N_VII_1_Page5_Word68 a tln:Word ;
+ tln:hasCleanText "wir"^^xsd:string ;
+ tln:hasText "wir"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word68_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line22 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word68_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "259.556"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "117.319"^^xsd:float ;
+ tln:hasTop "248.446"^^xsd:float ;
+ tln:hasWidth "13.402"^^xsd:float .
+
+data:_N_VII_1_Page5_Word69 a tln:Word ;
+ tln:hasCleanText "es"^^xsd:string ;
+ tln:hasText "es"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word69_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line22 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word69_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "261.166"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "134.065"^^xsd:float ;
+ tln:hasTop "250.056"^^xsd:float ;
+ tln:hasWidth "7.832"^^xsd:float .
+
+data:_N_VII_1_Page5_Word6_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "48.957"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "37.847"^^xsd:float ;
+ tln:hasWidth "9.311"^^xsd:float .
+
+data:_N_VII_1_Page5_Word7 a tln:Word ;
+ tln:hasCleanText "die"^^xsd:string ;
+ tln:hasText "die"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word7_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line4 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word70 a tln:Word ;
+ tln:hasCleanText "zu"^^xsd:string ;
+ tln:hasText "zu"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word70_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line22 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word70_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "261.166"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "145.477"^^xsd:float ;
+ tln:hasTop "250.056"^^xsd:float ;
+ tln:hasWidth "9.753"^^xsd:float .
+
+data:_N_VII_1_Page5_Word71 a tln:Word ;
+ tln:hasCleanText "einer"^^xsd:string ;
+ tln:hasText "einer"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word71_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line24 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word71_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "282.956"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "271.846"^^xsd:float ;
+ tln:hasWidth "20.38"^^xsd:float .
+
+data:_N_VII_1_Page5_Word72 a tln:Word ;
+ tln:hasCleanText "Zeichenschrift"^^xsd:string ;
+ tln:hasText "Zeichenschrift"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word72_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line24 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word72_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "282.675"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "37.75"^^xsd:float ;
+ tln:hasTop "271.565"^^xsd:float ;
+ tln:hasWidth "57.513"^^xsd:float .
+
+data:_N_VII_1_Page5_Word73 a tln:Word ;
+ tln:hasCleanText "u."^^xsd:string ;
+ tln:hasText "u."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word73_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line24 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word73_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "284.566"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "98.441"^^xsd:float ;
+ tln:hasTop "273.456"^^xsd:float ;
+ tln:hasWidth "6.888"^^xsd:float .
+
+data:_N_VII_1_Page5_Word74 a tln:Word ;
+ tln:hasCleanText "Mittheilbarkeit"^^xsd:string ;
+ tln:hasText "Mittheilbarkeit"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word74_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line24 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word74_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "282.597"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "109.693"^^xsd:float ;
+ tln:hasTop "271.487"^^xsd:float ;
+ tln:hasWidth "60.747"^^xsd:float .
+
+data:_N_VII_1_Page5_Word75 a tln:Word ;
+ tln:hasCleanText "u"^^xsd:string ;
+ tln:hasText "u"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word75_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line24 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word75_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "284.566"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "173.628"^^xsd:float ;
+ tln:hasTop "273.456"^^xsd:float ;
+ tln:hasWidth "5.159"^^xsd:float .
+
+data:_N_VII_1_Page5_Word76 a tln:Word ;
+ tln:hasCleanText "Merk="^^xsd:string ;
+ tln:hasText "Merk="^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word76_TranskriptionPosition0,
+ data:_N_VII_1_Page5_Word76_TranskriptionPosition1 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line24 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word13_Style0,
+ data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word76_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "282.675"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "182.13"^^xsd:float ;
+ tln:hasTop "271.565"^^xsd:float ;
+ tln:hasWidth "22.012"^^xsd:float .
+
+data:_N_VII_1_Page5_Word76_TranskriptionPosition1 a tln:TranskriptionPosition ;
+ tln:hasBottom "287.847"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "204.024"^^xsd:float ;
+ tln:hasTop "276.737"^^xsd:float ;
+ tln:hasWidth "2.519"^^xsd:float .
+
+data:_N_VII_1_Page5_Word77 a tln:Word ;
+ tln:hasCleanText "barkeit"^^xsd:string ;
+ tln:hasText "barkeit"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word77_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line26 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word77_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "305.997"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.023"^^xsd:float ;
+ tln:hasTop "294.887"^^xsd:float ;
+ tln:hasWidth "28.062"^^xsd:float .
+
+data:_N_VII_1_Page5_Word78 a tln:Word ;
+ tln:hasCleanText "der"^^xsd:string ;
+ tln:hasText "der"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word78_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line26 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word78_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "305.997"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "45.26"^^xsd:float ;
+ tln:hasTop "294.887"^^xsd:float ;
+ tln:hasWidth "13.034"^^xsd:float .
+
+data:_N_VII_1_Page5_Word79 a tln:Word ;
+ tln:hasCleanText "logischen"^^xsd:string ;
+ tln:hasText "logischen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word79_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line26 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word79_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "305.919"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "61.64"^^xsd:float ;
+ tln:hasTop "294.809"^^xsd:float ;
+ tln:hasWidth "37.901"^^xsd:float .
+
+data:_N_VII_1_Page5_Word7_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "48.597"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "26.513"^^xsd:float ;
+ tln:hasTop "37.487"^^xsd:float ;
+ tln:hasWidth "11.713"^^xsd:float .
+
+data:_N_VII_1_Page5_Word8 a tln:Word ;
+ tln:hasCleanText "Logik"^^xsd:string ;
+ tln:hasText "Logik."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word8_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line4 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word80 a tln:Word ;
+ tln:hasCleanText "V"^^xsd:string ;
+ tln:hasText "V"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word80_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line26 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word80_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "306.013"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "102.85"^^xsd:float ;
+ tln:hasTop "294.903"^^xsd:float ;
+ tln:hasWidth "6.675"^^xsd:float .
+
+data:_N_VII_1_Page5_Word81 a tln:Word ;
+ tln:hasCleanText "orgänge"^^xsd:string ;
+ tln:hasText "orgänge."^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word81_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line26 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word81_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "305.918"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "108.198"^^xsd:float ;
+ tln:hasTop "294.808"^^xsd:float ;
+ tln:hasWidth "33.537"^^xsd:float .
+
+data:_N_VII_1_Page5_Word82 a tln:Word ;
+ tln:hasCleanText "Also"^^xsd:string ;
+ tln:hasText "Also:"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word82_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line26 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word82_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "306.075"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "146.102"^^xsd:float ;
+ tln:hasTop "294.965"^^xsd:float ;
+ tln:hasWidth "18.994"^^xsd:float .
+
+data:_N_VII_1_Page5_Word83 a tln:Word ;
+ tln:hasCleanText "das"^^xsd:string ;
+ tln:hasText "das"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word83_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line28 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word83_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "329.397"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.023"^^xsd:float ;
+ tln:hasTop "318.287"^^xsd:float ;
+ tln:hasWidth "12.808"^^xsd:float .
+
+data:_N_VII_1_Page5_Word84 a tln:Word ;
+ tln:hasCleanText "geistige"^^xsd:string ;
+ tln:hasText "geistige"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word84_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line28 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word84_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "329.319"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "30.413"^^xsd:float ;
+ tln:hasTop "318.209"^^xsd:float ;
+ tln:hasWidth "29.688"^^xsd:float .
+
+data:_N_VII_1_Page5_Word85 a tln:Word ;
+ tln:hasCleanText "Geschehen"^^xsd:string ;
+ tln:hasText "Geschehen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word85_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line28 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word85_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "329.241"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "63.838"^^xsd:float ;
+ tln:hasTop "318.131"^^xsd:float ;
+ tln:hasWidth "44.781"^^xsd:float .
+
+data:_N_VII_1_Page5_Word86 a tln:Word ;
+ tln:hasCleanText "zu"^^xsd:string ;
+ tln:hasText "zu"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word86_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line28 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word86_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "331.366"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "111.928"^^xsd:float ;
+ tln:hasTop "320.256"^^xsd:float ;
+ tln:hasWidth "9.754"^^xsd:float .
+
+data:_N_VII_1_Page5_Word87 a tln:Word ;
+ tln:hasCleanText "betrachten"^^xsd:string ;
+ tln:hasText "betrachten,"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word87_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line28 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word87_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "329.397"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "125.023"^^xsd:float ;
+ tln:hasTop "318.287"^^xsd:float ;
+ tln:hasWidth "45.148"^^xsd:float .
+
+data:_N_VII_1_Page5_Word88 a tln:Word ;
+ tln:hasCleanText "wie"^^xsd:string ;
+ tln:hasText "wie"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word88_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line28 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word88_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "329.756"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "174.333"^^xsd:float ;
+ tln:hasTop "318.646"^^xsd:float ;
+ tln:hasWidth "14.123"^^xsd:float .
+
+data:_N_VII_1_Page5_Word89 a tln:Word ;
+ tln:hasCleanText "als"^^xsd:string ;
+ tln:hasText "als"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word89_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line28 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word89_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "329.475"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "192.193"^^xsd:float ;
+ tln:hasTop "318.365"^^xsd:float ;
+ tln:hasWidth "10.208"^^xsd:float .
+
+data:_N_VII_1_Page5_Word8_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "48.519"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "41.962"^^xsd:float ;
+ tln:hasTop "37.409"^^xsd:float ;
+ tln:hasWidth "23.769"^^xsd:float .
+
+data:_N_VII_1_Page5_Word9 a tln:Word ;
+ tln:hasCleanText "Hier"^^xsd:string ;
+ tln:hasText "Hier"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word9_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line4 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word90 a tln:Word ;
+ tln:hasCleanText "ob"^^xsd:string ;
+ tln:hasText "ob"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word90_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line30 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word90_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "352.797"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "341.687"^^xsd:float ;
+ tln:hasWidth "9.546"^^xsd:float .
+
+data:_N_VII_1_Page5_Word91 a tln:Word ;
+ tln:hasCleanText "es"^^xsd:string ;
+ tln:hasText "es"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word91_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line30 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word91_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "354.766"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "27.099"^^xsd:float ;
+ tln:hasTop "343.656"^^xsd:float ;
+ tln:hasWidth "7.833"^^xsd:float .
+
+data:_N_VII_1_Page5_Word92 a tln:Word ;
+ tln:hasCleanText "dem"^^xsd:string ;
+ tln:hasText "dem"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word92_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line30 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word92_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "352.797"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "38.514"^^xsd:float ;
+ tln:hasTop "341.687"^^xsd:float ;
+ tln:hasWidth "17.425"^^xsd:float .
+
+data:_N_VII_1_Page5_Word93 a tln:Word ;
+ tln:hasCleanText "Schema"^^xsd:string ;
+ tln:hasText "Schema"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word93_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line30 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word5_Style0 .
+
+data:_N_VII_1_Page5_Word93_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "352.452"^^xsd:float ;
+ tln:hasHeight "10.892"^^xsd:float ;
+ tln:hasLeft "59.337"^^xsd:float ;
+ tln:hasTop "341.56"^^xsd:float ;
+ tln:hasWidth "33.391"^^xsd:float .
+
+data:_N_VII_1_Page5_Word94 a tln:Word ;
+ tln:hasCleanText "jener"^^xsd:string ;
+ tln:hasText "jener"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word94_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line30 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word94_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "351.125"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "96.792"^^xsd:float ;
+ tln:hasTop "340.015"^^xsd:float ;
+ tln:hasWidth "20.2"^^xsd:float .
+
+data:_N_VII_1_Page5_Word95 a tln:Word ;
+ tln:hasCleanText "regulativen"^^xsd:string ;
+ tln:hasText "regulativen"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word95_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line30 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word95_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "352.719"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "120.335"^^xsd:float ;
+ tln:hasTop "341.609"^^xsd:float ;
+ tln:hasWidth "45.242"^^xsd:float .
+
+data:_N_VII_1_Page5_Word96 a tln:Word ;
+ tln:hasCleanText "Fic="^^xsd:string ;
+ tln:hasText "Fic="^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word96_TranskriptionPosition0,
+ data:_N_VII_1_Page5_Word96_TranskriptionPosition1 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line30 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word13_Style0,
+ data:_N_VII_1_Page5_Word5_Style0 .
+
+data:_N_VII_1_Page5_Word96_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "352.702"^^xsd:float ;
+ tln:hasHeight "10.892"^^xsd:float ;
+ tln:hasLeft "168.888"^^xsd:float ;
+ tln:hasTop "341.81"^^xsd:float ;
+ tln:hasWidth "12.174"^^xsd:float .
+
+data:_N_VII_1_Page5_Word96_TranskriptionPosition1 a tln:TranskriptionPosition ;
+ tln:hasBottom "358.047"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "181.519"^^xsd:float ;
+ tln:hasTop "346.937"^^xsd:float ;
+ tln:hasWidth "2.519"^^xsd:float .
+
+data:_N_VII_1_Page5_Word97 a tln:Word ;
+ tln:hasCleanText "tion"^^xsd:string ;
+ tln:hasText "tion"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word97_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line32 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word5_Style0 .
+
+data:_N_VII_1_Page5_Word97_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "376.102"^^xsd:float ;
+ tln:hasHeight "10.892"^^xsd:float ;
+ tln:hasLeft "14.024"^^xsd:float ;
+ tln:hasTop "365.21"^^xsd:float ;
+ tln:hasWidth "14.867"^^xsd:float .
+
+data:_N_VII_1_Page5_Word98 a tln:Word ;
+ tln:hasCleanText "entspräche"^^xsd:string ;
+ tln:hasText "entspräche:"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word98_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line32 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word98_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "376.213"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "33.477"^^xsd:float ;
+ tln:hasTop "365.103"^^xsd:float ;
+ tln:hasWidth "45.495"^^xsd:float .
+
+data:_N_VII_1_Page5_Word99 a tln:Word ;
+ tln:hasCleanText "dies"^^xsd:string ;
+ tln:hasText "dies"^^xsd:string ;
+ tln:hasTranskriptionPosition data:_N_VII_1_Page5_Word99_TranskriptionPosition0 ;
+ tln:wordBelongsToLine data:_N_VII_1_Page5_Line32 ;
+ tln:wordHasStyle data:_N_VII_1_Page5_Word1_Style0 .
+
+data:_N_VII_1_Page5_Word99_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "376.197"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "83.336"^^xsd:float ;
+ tln:hasTop "365.087"^^xsd:float ;
+ tln:hasWidth "15.54"^^xsd:float .
+
+data:_N_VII_1_Page5_Word9_TranskriptionPosition0 a tln:TranskriptionPosition ;
+ tln:hasBottom "48.675"^^xsd:float ;
+ tln:hasHeight "11.11"^^xsd:float ;
+ tln:hasLeft "70.096"^^xsd:float ;
+ tln:hasTop "37.565"^^xsd:float ;
+ tln:hasWidth "17.439"^^xsd:float .
+
+data:_N_VII_1_Page5_WordInsertionMark0 a tln:WordInsertionMark ;
+ tln:hasBottom "163.8"^^xsd:float ;
+ tln:hasHeight "5.578"^^xsd:float ;
+ tln:hasLeft "107.788"^^xsd:float ;
+ tln:hasMarkType "A"^^xsd:string ;
+ tln:hasTop "158.222"^^xsd:float ;
+ tln:hasWidth "2.047"^^xsd:float ;
+ tln:wordInsertionMarkBelongsToLine data:_N_VII_1_Page5_Line14 .
+
+data:_N_VII_1_Page5_Line1 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "18.7"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 1 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "0.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line13 a tln:Line ;
+ tln:isMainLine false ;
+ tln:lineHasBottomValueOnTranskription "156.1"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 13 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "141.4"^^xsd:float .
+
+data:_N_VII_1_Page5_Word0_Style0_Color0 a tln:Color ;
+ tln:colorHasName "black"^^xsd:string ;
+ tln:hasHexadecimalValue "#000000"^^xsd:string .
+
+data:_N_VII_1_Page5_Line2 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "23.4"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 2 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "19.7"^^xsd:float .
+
+data:_N_VII_1_Page5_Line20 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "234.0"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 20 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "213.601"^^xsd:float .
+
+data:_N_VII_1_Page5_Word13_Style0 a tln:Style ;
+ tln:styleHasCSS "color:#000000;"^^xsd:string ;
+ tln:styleHasColor data:_N_VII_1_Page5_Word0_Style0_Color0 ;
+ tln:styleHasFont "deutsche Schreibschrift"^^xsd:string ;
+ tln:styleHasWritingInstrument "schwarze Tinte"^^xsd:string .
+
+data:_N_VII_1_Page5_Word5_Style0 a tln:Style ;
+ tln:styleHasCSS "color:#000000;"^^xsd:string ;
+ tln:styleHasColor data:_N_VII_1_Page5_Word0_Style0_Color0 ;
+ tln:styleHasFont "lateinische Schreibschrift"^^xsd:string ;
+ tln:styleHasWritingInstrument "schwarze Tinte"^^xsd:string .
+
+data:_N_VII_1_Page5_Line12 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "140.4"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 12 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "120.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line18 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "210.601"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 18 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "190.2"^^xsd:float .
+
+data:_N_VII_1_Page5_Line22 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "257.4"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 22 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "237.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line24 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "280.8"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 24 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "260.4"^^xsd:float .
+
+data:_N_VII_1_Page5_Line26 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "304.201"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 26 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "283.8"^^xsd:float .
+
+data:_N_VII_1_Page5_Line32 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "374.4"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 32 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "354.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line8 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "93.601"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 8 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "73.2"^^xsd:float .
+
+data:_N_VII_1_Page5_Line10 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "117.0"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 10 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "96.601"^^xsd:float .
+
+data:_N_VII_1_Page5_Line14 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "163.8"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 14 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "157.1"^^xsd:float .
+
+data:_N_VII_1_Page5_Line16 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "187.2"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 16 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "166.8"^^xsd:float .
+
+data:_N_VII_1_Page5_Line28 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "327.6"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 28 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "307.201"^^xsd:float .
+
+data:_N_VII_1_Page5_Line30 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "351.0"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 30 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "330.6"^^xsd:float .
+
+data:_N_VII_1_Page5_Line34 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "397.8"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 34 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "377.4"^^xsd:float .
+
+data:_N_VII_1_Page5_Line36 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "421.201"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 36 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "400.8"^^xsd:float .
+
+data:_N_VII_1_Page5_Line42 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "479.201"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 42 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "471.0"^^xsd:float .
+
+data:_N_VII_1_Page5_Line6 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "70.2"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 6 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "49.8"^^xsd:float .
+
+data:_N_VII_1_Page5_Line38 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "444.6"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 38 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "424.201"^^xsd:float .
+
+data:_N_VII_1_Page5_Line4 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "46.8"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 4 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "26.4"^^xsd:float .
+
+data:_N_VII_1_Page5_Line40 a tln:Line ;
+ tln:isMainLine true ;
+ tln:lineHasBottomValueOnTranskription "468.0"^^xsd:float ;
+ tln:lineHasInnerBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasInnerTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasNumber 40 ;
+ tln:lineHasOuterBottomValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasOuterTopValueOnFaksimile "0.0"^^xsd:float ;
+ tln:lineHasTopValueOnTranskription "447.6"^^xsd:float .
+
+data:_N_VII_1_Page5_Word1_Style0 a tln:Style ;
+ tln:styleHasCSS "color:#000000;"^^xsd:string ;
+ tln:styleHasColor data:_N_VII_1_Page5_Word0_Style0_Color0 ;
+ tln:styleHasFont "deutsche Schreibschrift"^^xsd:string ;
+ tln:styleHasWritingInstrument "schwarze Tinte"^^xsd:string .
+
Index: tests_svgscripts/test_data/pdfsvg/W_II_1_page015.xml
===================================================================
--- tests_svgscripts/test_data/pdfsvg/W_II_1_page015.xml (revision 112)
+++ tests_svgscripts/test_data/pdfsvg/W_II_1_page015.xml (revision 113)
@@ -1,18 +1,20 @@
svgWordPosition
2019-06-17 22:47:39
2019-07-11 15:02:53
2019-07-04 11:13:33
2019-07-11 15:38:20
2019-08-02 09:46:39
- 2020-10-09 18:31:56
+ 2021-12-08 20:33:15
-
+
+
+
Index: tests_svgscripts/test_data/Mp_XV_page79v.xml
===================================================================
--- tests_svgscripts/test_data/Mp_XV_page79v.xml (revision 0)
+++ tests_svgscripts/test_data/Mp_XV_page79v.xml (revision 113)
@@ -0,0 +1,3616 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+ svgWordPosition
+
+
+ 2020-08-31 09:22:17
+
+ 2020-08-31 09:22:18
+ 2020-12-23 16:40:17
+ 2020-12-23 16:40:18
+ 2021-02-24 14:06:52
+ 2021-02-24 11:35:41
+ 2021-02-24 14:06:52
+ 2021-02-24 13:55:50
+ 2021-02-24 13:55:49
+ 2021-02-24 13:55:50
+
+ transkription positions
+ hyphenation
+ boxes/correction history
+ mark foreign hands
+ line assignement
+ deletion paths
+ faksimile/transkription word correspondance
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ genügen
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: tests_svgscripts/test_process_files.py
===================================================================
--- tests_svgscripts/test_process_files.py (revision 112)
+++ tests_svgscripts/test_process_files.py (revision 113)
@@ -1,55 +1,67 @@
import unittest
from os import sep, path, remove
from os.path import isfile, dirname
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
import process_files
from process_files import MyCSVHandler, MyErrorHandler
from datatypes.page_creator import PageCreator
class TestProcessFiles(unittest.TestCase):
+ # Tests for the module process_files; setUp points all fixtures to the directory
+ # 'test_data' next to this test file.
def setUp(self):
process_files.UNITTESTING = True
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.csv_file = DATADIR + sep + 'bd_12_Layout.csv'
self.dir = DATADIR + sep + 'pdfsvg'
self.csv_dir = self.dir + sep + 'csv'
self.manuscript = self.dir + sep + 'W_II_1.xml'
self.graphic_file = self.dir + sep + 'W_II_1_page001_web.svg'
self.multipdf = DATADIR + sep + 'Bd_12_Mp_XIV_-XVI_Druck.pdf'
@unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover')
def test_main(self):
#self.assertEqual(process_files.main([ self.manuscript ]), 1)
argv = [ '-g', '-x', self.dir, '-s', self.dir, self.dir ]
self.assertEqual(process_files.main(argv), 0)
self.assertEqual(isfile(self.graphic_file), True)
def test_csvhandler(self):
csv_handler = MyCSVHandler(self.csv_file, self.multipdf, self.csv_dir)
self.assertEqual(len([ entry for entry in csv_handler.csv_entries if entry[MyCSVHandler.ENTRY_KEY_FILE] is not None ]), 9)
csv_handler = MyCSVHandler(self.csv_file, self.multipdf, self.csv_dir, title="Mp XV")
self.assertEqual(len([ entry for entry in csv_handler.csv_entries if entry[MyCSVHandler.ENTRY_KEY_FILE] is not None ]), 2)
+ # NOTE(review): the two paths below are absolute paths outside of the repository;
+ # nothing is asserted on the resulting csv_handler.
+ csv_file = '/home/knister0/ownCloud/myNietzscheDE/KGW-IX_13/bd_13.csv'
+ multipdf = '/home/knister0/ownCloud/myNietzscheDE/KGW-IX_13/Bd_13_def_Verlag_0.pdf'
+ csv_handler = MyCSVHandler(csv_file, multipdf, createBlanks=True)
+ #csv_handler.process_files('./svg', './xml', None)
#print(csv_handler.csv_entries)
#self.assertEqual(csv_handler.process_files('asdf', 'asdf'), 0)
+
+ def test_get_extended_text_field(self):
+ # NOTE(review): reads svg files relative to the current working directory ('./svg')
+ # and only prints the dimensions of the text fields; there are no assertions.
+ svg_file = './svg/Mp_XVII_49r_web.svg'
+ text_field = process_files.get_extended_text_field(svg_file, multipage_index=0)
+ print(text_field.left, text_field.top, text_field.width, text_field.height)
+ svg_file = './svg/Mp_XVI_55r_web.svg'
+ text_field = process_files.get_extended_text_field(svg_file)
+ print(text_field.left, text_field.top, text_field.width, text_field.height)
def test_page_status(self):
self.assertEqual(process_files.is_page_ok(manuscript_file=self.manuscript, page_number=2), True)
#self.assertEqual(process_files.page_has_status(process_files.WARN_MISSING_USE_NODE, manuscript_file=self.manuscript, page_number='1'), True)
#self.assertEqual(process_files.get_page_output_file('2', manuscript_file=self.manuscript), dirname(self.manuscript) + sep + 'W_II_1_page001.xml')
def test_is_svg_ok(self):
self.assertEqual(process_files.is_svg_ok(manuscript_file=self.manuscript, page_number=1), True)
@unittest.skip('')
def test_run(self):
error_handler = MyErrorHandler()
error_handler.run(page_number='15')
if __name__ == "__main__":
unittest.main()
Index: fixes/get_text_field.py
===================================================================
--- fixes/get_text_field.py (revision 0)
+++ fixes/get_text_field.py (revision 113)
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This program can be used to create svg files with a rect for the text_field.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+from colorama import Fore, Style
+import getopt
+import json
+import lxml.etree as ET
+import shutil
+import subprocess
+import sys
+import os
+import wget
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+from os import listdir, sep, path, setpgrp, devnull, mkdir, remove
+from os.path import exists, isfile, isdir, dirname, basename
+from progress.bar import Bar
+import warnings
+
+if dirname(__file__) not in sys.path:
+ sys.path.append(dirname(__file__))
+from fix_old_data import save_page
+
+sys.path.append('svgscripts')
+from datatypes.faksimile_image import FaksimileImage
+from datatypes.faksimile import FaksimilePage
+from datatypes.archival_manuscript import ArchivalManuscriptUnity
+from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
+from datatypes.text_field import TextField
+from util import back_up, back_up_svg_file, copy_faksimile_update_image_location, copy_faksimile_svg_file
+from process_files import update_svgposfile_status
+from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
+
+sys.path.append('shared_util')
+from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+from main_util import create_function_dictionary
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+THRESHOLD = 60
+BLANK_STATUS = 'blank'
+
+def get_text_field_on_image(image_file: str, image_width: float, image_height: float, id=0) ->TextField:
+    """Find the area of the faksimile image where a manuscript page is displayed
+    and return it as a TextField.
+
+    The image is blurred, converted to grayscale and inversely thresholded. The bounds
+    that get_start_and_end_index finds on the rows and on the columns of the thresholded
+    image are scaled from pixel coordinates to the coordinate system that is given by
+    image_width and image_height.
+
+    :param image_file: path of the image that should be analysed
+    :param image_width: width of the image in the coordinate system of the result
+    :param image_height: height of the image in the coordinate system of the result
+    :param id: id that is passed on to the TextField
+    :raises OSError: if the image cannot be read
+    :return: datatypes.text_field.TextField
+    """
+    image = cv2.imread(image_file)
+    if image is None:
+        # cv2.imread does not raise on a missing or unreadable file, it returns None
+        raise OSError(f'Could not read image file {image_file}!')
+    blur = cv2.GaussianBlur(image, (3,3), 0)
+    gray = cv2.cvtColor(blur, cv2.COLOR_BGR2GRAY)
+    thresh = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)[1]
+    # only height and width are needed, the number of channels is irrelevant here
+    original_height, original_width = image.shape[:2]
+    # rows of the thresholded image -> vertical extent
+    startLine, lastLine = get_start_and_end_index(thresh)
+    top = round(startLine*image_height/original_height, 1)
+    bottom = round(lastLine*image_height/original_height, 1)
+    height = bottom-top
+    # columns of the thresholded image (rows of its transpose) -> horizontal extent
+    startLine, lastLine = get_start_and_end_index(thresh.T)
+    left = round(startLine*image_width/original_width, 1)
+    right = round(lastLine*image_width/original_width, 1)
+    width = right-left
+    return TextField(id=id, x=left, y=top, width=width, height=height)
+
+def get_start_and_end_index(thresh) ->(int, int):
+    """Return the indices of the two empty lines that enclose the first large block of content.
+
+    A line of thresh is empty if its sum is 0. The lines are scanned in order and the
+    index of the latest empty line is remembered. The scan stops at the first empty line
+    that lies THRESHOLD or more lines after the remembered one.
+
+    :return: (index of the remembered empty line, index of the empty line that stopped the scan);
+        if the scan never stops: (-1, index of the last empty line or -1)
+    """
+    previous_empty = -1
+    for index, line in enumerate(thresh):
+        if np.sum(line) != 0:
+            continue
+        if index - previous_empty >= THRESHOLD:
+            return previous_empty, index
+        previous_empty = index
+    return -1, previous_empty
+
+def usage():
+    """Print the docstring of main as usage information.
+    """
+    help_text = main.__doc__
+    print(help_text)
+
+def main(argv):
+    """This program can be used to find the text field of a manuscript page on a faksimile image.
+
+    fixes/get_text_field.py [OPTIONS] image_file width height
+
+        image_file      the faksimile image that should be analysed
+        width           the width of the image in the coordinate system of the result
+        height          the height of the image in the coordinate system of the result
+
+        OPTIONS:
+        -h|--help       show help
+
+        :return: exit code (int)
+    """
+    try:
+        opts, args = getopt.getopt(argv, "h", ["help"])
+    except getopt.GetoptError:
+        usage()
+        return 2
+    for opt, _ in opts:
+        if opt in ('-h', '--help'):
+            usage()
+            return 0
+    if len(args) < 3:
+        usage()
+        return 2
+    image_file = args[0]
+    try:
+        image_width = float(args[1])
+        image_height = float(args[2])
+    except ValueError:
+        # width and height have to be numbers, anything else is a usage error
+        usage()
+        return 2
+    if not isfile(image_file):
+        raise FileNotFoundError(f'File {image_file} does not exist!')
+    print(get_text_field_on_image(image_file, image_width, image_height))
+    return 0
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
Index: fixes/test_create_blank_svg_files.py
===================================================================
--- fixes/test_create_blank_svg_files.py (revision 0)
+++ fixes/test_create_blank_svg_files.py (revision 113)
@@ -0,0 +1,38 @@
+import lxml.etree as ET
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname, basename
+import shutil
+import sys
+import tempfile
+import unittest
+import warnings
+
+from create_blank_svg_files import SVGFileCreator, OldSVGFileCreator
+
+
+sys.path.append('shared_util')
+from myxmlwriter import copy_to_bak_dir
+
+
+
+class TestCreateSVGFILES(unittest.TestCase):
+    """Tests for SVGFileCreator and OldSVGFileCreator.
+
+    Both tests read faksimile data from absolute paths that are not part of the
+    repository; a test is skipped if its path does not exist.
+    """
+    FAKSIMILE_DIR = '/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Mp_XVI'
+    SVG_FILE = '/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Blank/Mp_XVI/Mp_XVI_page81v.svg'
+
+    def setUp(self):
+        SVGFileCreator.UNITTESTING = True
+        DATADIR = path.dirname(__file__) + sep + 'test_data'
+        self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml'
+        self.fix_transkription_positions = DATADIR + sep + 'Mp_XIV_page419a.xml'
+
+    def test_init(self):
+        if not isdir(self.FAKSIMILE_DIR):
+            self.skipTest(f'Directory {self.FAKSIMILE_DIR} does not exist!')
+        svg_creator = OldSVGFileCreator('Mp XVI', self.FAKSIMILE_DIR)
+        # -1 is the initial value, i.e. the averages have not been set
+        self.assertNotEqual(svg_creator.avg_left, -1)
+        self.assertNotEqual(svg_creator.avg_top, -1)
+
+    def test_update_svg_file(self):
+        if not isfile(self.SVG_FILE):
+            self.skipTest(f'File {self.SVG_FILE} does not exist!')
+        # the svg file is modified in place, so keep a copy in the bak dir
+        copy_to_bak_dir(self.SVG_FILE)
+        exit_status = SVGFileCreator.UPDATE_TEXTFIELD_OF_SVG_FILE(self.SVG_FILE)
+        self.assertEqual(exit_status, 0)
+
+if __name__ == "__main__":
+ unittest.main()
Index: fixes/create_blank_svg_files.py
===================================================================
--- fixes/create_blank_svg_files.py (revision 0)
+++ fixes/create_blank_svg_files.py (revision 113)
@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This program can be used to create svg files with a rect for the text_field.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+from colorama import Fore, Style
+import getopt
+import json
+import lxml.etree as ET
+import shutil
+import subprocess
+import sys
+import os
+import wget
+from os import listdir, sep, path, setpgrp, devnull, mkdir, remove
+from os.path import exists, isfile, isdir, dirname, basename
+from progress.bar import Bar
+import warnings
+
+from fix_old_data import save_page
+from get_text_field import get_text_field_on_image
+
+sys.path.append('svgscripts')
+from datatypes.faksimile_image import FaksimileImage
+from datatypes.faksimile import FaksimilePage
+from datatypes.archival_manuscript import ArchivalManuscriptUnity
+from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
+from util import back_up, back_up_svg_file, copy_faksimile_update_image_location, copy_faksimile_svg_file
+from process_files import update_svgposfile_status
+from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
+
+sys.path.append('shared_util')
+from myxmlwriter import copy_to_bak_dir, write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+from main_util import create_function_dictionary
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+MAX_SVG_XY_THRESHOLD = 10
+BLANK_STATUS = 'blank'
+
+class SPARQLQuery:
+ """Constants for a SPARQL request: endpoint url, url-encoded query and
+ the names/values of the command line options for the request method and the header.
+ """
+ HEADER_OPTION = '--header'
+ HEADER_VALUE = 'Accept: application/sparql-results+json,*/*;q=0.9'
+ ENDPOINT = 'https://nietzsche.fuseki.services.dasch.swiss/nietzsche'
+ # Url-encoded query: selects AVG(?left), AVG(?top), AVG(?width) and AVG(?height) as
+ # ?avg_left, ?avg_top, ?avg_width and ?avg_height over the text fields
+ # (tln:hasTextField) of all tln:FaksimileImage resources.
+ QUERY = 'query=PREFIX+tln%3A+%3Chttp%3A%2F%2Fwww.nie.org%2Fontology%2Fnietzsche%23%3E%0ASELECT+(AVG(%3Fleft)+as+%3Favg_left)+(AVG(%3Ftop)+as+%3Favg_top)+(AVG(%3Fwidth)+as+%3Favg_width)+(AVG(%3Fheight)+as+%3Favg_height)+WHERE+%7B%0A+%3Fimage+a+tln%3AFaksimileImage%3B+tln%3AhasTextField+%3Ftextfield.%0A+%3Ftextfield+tln%3AhasLeft+%3Fleft%3B+tln%3AhasTop+%3Ftop%3B+tln%3AhasWidth+%3Fwidth%3B+tln%3AhasHeight+%3Fheight.%7D+'
+ REQUEST_OPTION = '--request'
+ REQUEST_VALUE = 'POST'
+class OldSVGFileCreator:
+ """Create svg files with textfield rects; the average dimensions of the
+ textfields are retrieved from a SPARQL endpoint.
+ """
+ UNITTESTING = False
+ RESPONSE = 'response.json'
+
+ def __init__(self, title, faksimile_dir, endpoint=SPARQLQuery.ENDPOINT, target_dir='./tmp'):
+     """Store the parameters, locate curl and inkscape and initialize the averages.
+     """
+     self.title = title
+     self.faksimile_dir = faksimile_dir
+     self.target_dir = target_dir
+     self.endpoint = endpoint
+     self.namespaces = None
+     # -1 means: not set yet, see _init_averages
+     self.avg_left = self.avg_top = self.avg_height = self.avg_width = -1
+     self.curl = self._get_ext_program_path('curl')
+     self.inkscape = self._get_ext_program_path('inkscape')
+     self._init_averages()
+
+ def _init_averages(self):
+     """Set the average dimensions of the textfield from the response of the endpoint.
+
+     The endpoint is only queried (with curl) if the file RESPONSE does not exist yet.
+     Every variable of the response head becomes an attribute of this instance.
+     """
+     if not isfile(self.RESPONSE):
+         curl_command = [
+             self.curl, self.endpoint,
+             SPARQLQuery.REQUEST_OPTION, SPARQLQuery.REQUEST_VALUE,
+             '--data', SPARQLQuery.QUERY,
+             SPARQLQuery.HEADER_OPTION, SPARQLQuery.HEADER_VALUE,
+             '-o', self.RESPONSE,
+         ]
+         subprocess.run(curl_command, check=True)
+     with open(self.RESPONSE) as json_file:
+         response = json.load(json_file)
+     for variable in response['head']['vars']:
+         for binding in response['results']['bindings']:
+             self.__dict__[variable] = float(binding[variable]['value'])
+
+ def _get_ext_program_path(self, program_name) ->str:
+     """Return path to external program
+
+     The program is looked up on PATH with shutil.which, so no external
+     "which" command is needed.
+
+     :param program_name: name of the executable, e.g. 'inkscape'
+     :raises FileNotFoundError: if the program cannot be found
+     :return: path to the executable
+     """
+     program_path = shutil.which(program_name)
+     if not program_path or not isfile(program_path):
+         raise FileNotFoundError(f'External command "{program_name}" not found!\nPlease install "{program_name}", check the output of "which {program_name}" and retry.')
+     return program_path
+
+ def create_svg_file(self, page: ET.Element) -> int:
+ """ Create a svg file.
+ [return] exit_status
+ """
+ # page is a xml node; its attributes 'number', 'output' and 'alias' are read.
+ number = page.get('number')
+ page_file = page.get('output')
+ # The faksimile image is expected at <faksimile_dir>/<alias>.jpg; None if there is no alias.
+ faksimile_file = self.faksimile_dir + sep + page.get('alias') + '.jpg'\
+ if page.get('alias') is not None\
+ else None
+ page_id = self.title.replace(' ', '_') + '_' + number
+ # Download the faksimile image if it is not available locally.
+ if bool(page.get('alias')) and not isfile(faksimile_file):
+ wget.download(FaksimileImage.NIETZSCHE_SOURCES_URL + page.get('alias'), out=faksimile_file)
+ if bool(page.get('alias')) and isfile(faksimile_file) and page_file is not None and isfile(page_file):
+ # target_file is a bare file name (basename), i.e. relative to the current working directory.
+ target_file = basename(page_file).replace('.xml', '.svg')
+ if not isfile(self.target_dir + sep + target_file):
+ # NOTE(review): presumably '-z' (no GUI) and '-l' (export plain svg) of the old inkscape command line -- confirm for the installed version.
+ prog_list = [self.inkscape, '-z', '-l', target_file, faksimile_file]
+ subprocess.run(prog_list, check=True)
+ svg_tree = ET.parse(target_file)
+ # The default namespace (key None) gets the prefix 'ns' so that it can be used in xpath.
+ namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+ images = svg_tree.xpath('//ns:image', namespaces=namespaces)
+ if len(images) > 0:
+ # dirname(target_file) is '' for a bare file name, so image_file starts with sep.
+ image_file = dirname(target_file) + sep + images[0].get('{%s}href' % namespaces['xlink'])
+ image_width = float(images[0].get('width')) if bool(images[0].get('width')) else 0
+ image_height = float(images[0].get('height')) if bool(images[0].get('height')) else 0
+ text_filed = get_text_field_on_image(image_file, image_width, image_height, id=page_id)
+ text_filed.attach_as_rect(svg_tree.getroot())
+ copy_faksimile_update_image_location(faksimile_tree=svg_tree, target_directory=self.target_dir)
+ # The intermediate svg file in the working directory is no longer needed.
+ remove(target_file)
+ else:
+ print(f'There has been an error: could not find an image in {target_file}!')
+ return 2
+ return 0
+ else:
+ # No alias, no faksimile image or no page file: print what has been found and fail.
+ print(faksimile_file, page_file)
+ return 2
+ def update_textfield_of_svg_file(self, svg_file: str) -> int:
+     """ Update the textfield of the svg file by using image analysis.
+
+     The textfield is computed on the first image of the svg file and attached
+     to the first rect whose id does not contain "rect"; the svg file is then
+     written back.
+
+     [return] exit_status: 0 on success, 2 if no such rect or no image has been found
+     """
+     svg_tree = ET.parse(svg_file)
+     # the default namespace (key None) gets the prefix 'ns' so that it can be used in xpath
+     namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+     rects = svg_tree.xpath('//ns:rect[not(contains(@id, "rect"))]', namespaces=namespaces)
+     images = svg_tree.xpath('//ns:image', namespaces=namespaces)
+     if len(rects) == 0 or len(images) == 0:
+         print(f'There has been an error: could not find a rect and an image in {svg_file}!')
+         return 2
+     # the href of the image is resolved relative to the directory of the svg file
+     image_file = dirname(svg_file) + sep + images[0].get('{%s}href' % namespaces['xlink'])
+     image_width = float(images[0].get('width')) if bool(images[0].get('width')) else 0
+     image_height = float(images[0].get('height')) if bool(images[0].get('height')) else 0
+     text_filed = get_text_field_on_image(image_file, image_width, image_height)
+     text_filed.attach_as_rect(rects[0])
+     copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
+     return 0
+
+class SVGFileCreator:
+ """Create svg files with textfield rects; the textfields are found by image analysis.
+ """
+ UNITTESTING = False
+
+ def __init__(self, title, faksimile_dir, target_dir='./tmp'):
+     """Store the parameters and locate inkscape.
+     """
+     self.title = title
+     self.faksimile_dir = faksimile_dir
+     self.target_dir = target_dir
+     self.namespaces = None
+     self.inkscape = self._get_ext_program_path('inkscape')
+
+ def _get_ext_program_path(self, program_name) ->str:
+     """Return path to external program
+
+     The program is looked up on PATH with shutil.which, so no external
+     "which" command is needed.
+
+     :param program_name: name of the executable, e.g. 'inkscape'
+     :raises FileNotFoundError: if the program cannot be found
+     :return: path to the executable
+     """
+     program_path = shutil.which(program_name)
+     if not program_path or not isfile(program_path):
+         raise FileNotFoundError(f'External command "{program_name}" not found!\nPlease install "{program_name}", check the output of "which {program_name}" and retry.')
+     return program_path
+
+ def create_svg_file(self, page: ET.Element) -> int:
+ """ Create a svg file.
+ [return] exit_status
+ """
+ # page is a xml node; its attributes 'number', 'output' and 'alias' are read.
+ number = page.get('number')
+ page_file = page.get('output')
+ # The faksimile image is expected at <faksimile_dir>/<alias>.jpg; None if there is no alias.
+ faksimile_file = self.faksimile_dir + sep + page.get('alias') + '.jpg'\
+ if page.get('alias') is not None\
+ else None
+ page_id = self.title.replace(' ', '_') + '_' + number
+ # Download the faksimile image if it is not available locally.
+ if bool(page.get('alias')) and not isfile(faksimile_file):
+ wget.download(FaksimileImage.NIETZSCHE_SOURCES_URL + page.get('alias'), out=faksimile_file)
+ if bool(page.get('alias')) and isfile(faksimile_file) and page_file is not None and isfile(page_file):
+ # target_file is a bare file name (basename), i.e. relative to the current working directory.
+ target_file = basename(page_file).replace('.xml', '.svg')
+ if not isfile(self.target_dir + sep + target_file):
+ # NOTE(review): presumably '-z' (no GUI) and '-l' (export plain svg) of the old inkscape command line -- confirm for the installed version.
+ prog_list = [self.inkscape, '-z', '-l', target_file, faksimile_file]
+ subprocess.run(prog_list, check=True)
+ svg_tree = ET.parse(target_file)
+ # The default namespace (key None) gets the prefix 'ns' so that it can be used in xpath.
+ namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+ images = svg_tree.xpath('//ns:image', namespaces=namespaces)
+ if len(images) > 0:
+ # dirname(target_file) is '' for a bare file name; a 'file://' prefix of the href is removed.
+ image_file = dirname(target_file) + sep + images[0].get('{%s}href' % namespaces['xlink']).replace('file://', '')
+ image_width = float(images[0].get('width')) if bool(images[0].get('width')) else 0
+ image_height = float(images[0].get('height')) if bool(images[0].get('height')) else 0
+ print(image_file)
+ text_filed = get_text_field_on_image(image_file, image_width, image_height, id=page_id)
+ text_filed.attach_as_rect(svg_tree.getroot())
+ copy_faksimile_update_image_location(faksimile_tree=svg_tree, target_directory=self.target_dir)
+ # The intermediate svg file in the working directory is no longer needed.
+ remove(target_file)
+ else:
+ print(f'There has been an error: could not find an image in {target_file}!')
+ return 2
+ return 0
+ else:
+ # No alias, no faksimile image or no page file: print what has been found and fail.
+ print(faksimile_file, page_file)
+ return 2
+
+ @staticmethod
+ def UPDATE_TEXTFIELD_OF_SVG_FILE(svg_file: str) -> int:
+     """ Recompute the textfield of a svg file by image analysis and write the file back.
+
+     The first rect whose id does not contain "rect" is updated with the
+     textfield that is found on the first image of the svg file.
+
+     [return] exit_status: 0 on success, 2 if no such rect or no image has been found
+     """
+     svg_tree = ET.parse(svg_file)
+     # the default namespace (key None) is registered with the prefix 'ns'
+     namespaces = {}
+     for prefix, uri in svg_tree.getroot().nsmap.items():
+         namespaces['ns' if prefix is None else prefix] = uri
+     rects = svg_tree.xpath('//ns:rect[not(contains(@id, "rect"))]', namespaces=namespaces)
+     images = svg_tree.xpath('//ns:image', namespaces=namespaces)
+     if not rects or not images:
+         print(f'There has been an error: could not find a rect and an image in {svg_file}!')
+         return 2
+     image_node = images[0]
+     image_file = dirname(svg_file) + sep + image_node.get('{%s}href' % namespaces['xlink'])
+     width_value = image_node.get('width')
+     image_width = float(width_value) if width_value else 0
+     height_value = image_node.get('height')
+     image_height = float(height_value) if height_value else 0
+     new_text_field = get_text_field_on_image(image_file, image_width, image_height)
+     new_text_field.update_rect(rects[0])
+     copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
+     return 0
+
+def usage():
+    """Print the docstring of main as usage information.
+    """
+    help_text = main.__doc__
+    print(help_text)
+
+def main(argv):
+ """This program can be used to create svg files with a rect for the text_field.
+
+ fixes/create_blank_svg_files.py [OPTIONS]
+ fixes/create_blank_svg_files.py -u|--update
+
+ a xml file about a manuscript, containing information about its pages.
+ a xml file about a page, containing information about svg word positions.
+ a directory containing faksimile images
+ the directory where the files should be saved to
+
+ OPTIONS:
+ -h|--help show help
+ -u|--update update svg_files: use image analysis in order to update the textfield of the svg_files
+
+ :return: exit code (int)
+ """
+ # NOTE(review): the usage lines of the docstring name no positional arguments, although
+ # the code below reads args[0] in update mode and args[0], args[1], args[2] otherwise.
+ update = False
+ try:
+ opts, args = getopt.getopt(argv, "hu", ["help","update"])
+ except getopt.GetoptError:
+ usage()
+ return 2
+ for opt, arg in opts:
+ if opt in ('-h', '--help'):
+ usage()
+ return 0
+ elif opt in ('-u', '--update'):
+ update = True
+ exit_status = 0
+ # Update mode: args[0] is a directory; every '.svg' file in it is copied to the
+ # bak dir and its textfield is recomputed.
+ if update:
+ if len(args) == 0:
+ usage()
+ return 2
+ svg_dir = args[0]
+ if not isdir(svg_dir):
+ raise FileNotFoundError(f'Directory {svg_dir} does not exist!')
+ counter = 0
+ for svg_file in [ svg_dir + sep + svg_file for svg_file in listdir(svg_dir) if isfile(svg_dir + sep + svg_file) and svg_file.endswith('.svg') ]:
+ if not SVGFileCreator.UNITTESTING:
+ print(Fore.CYAN + f'Updating svg file {svg_file} ...' + Style.RESET_ALL)
+ copy_to_bak_dir(svg_file)
+ if SVGFileCreator.UPDATE_TEXTFIELD_OF_SVG_FILE(svg_file) == 0:
+ counter += 1
+ # NOTE(review): the message says "pages created" although files are updated in this mode.
+ if not SVGFileCreator.UNITTESTING:
+ print(Style.RESET_ALL + f'[{counter} pages created]')
+ return exit_status
+
+ # Creation mode: the arguments are xml_file, faksimile_dir and target_dir.
+ if len(args) < 3:
+ usage()
+ return 2
+ xml_file = args[0]
+ faksimile_dir = args[1]
+ target_dir = args[2]
+ # Create target_dir if it does not exist yet (short-circuit 'and' used as a statement).
+ not isdir(target_dir) and mkdir(target_dir)
+ if isfile(xml_file) and isdir(faksimile_dir):
+ counter = 0
+ # Default: select all pages of the manuscript file whose status contains BLANK_STATUS.
+ xpath = f'//page[contains(@status, "{BLANK_STATUS}")]'
+ # More than two '_'-separated parts: xml_file is treated as a svg_pos_file, the
+ # manuscript file name is built from its first two parts and only the page with
+ # this output file is selected.
+ # NOTE(review): the split is applied to the whole path, so underscores in directory names count as well.
+ if len(xml_file.split('_')) > 2: # svg_pos_file
+ manuscript_file = '_'.join(xml_file.split('_')[0:2]) + '.xml'
+ if isfile(manuscript_file):
+ source_tree = ET.parse(manuscript_file)
+ xpath = f'//page[contains(@output,"{xml_file}")]'
+ else:
+ raise FileNotFoundError(f'There is no manuscript file {manuscript_file} for svg_pos_file {xml_file}!')
+ else:
+ source_tree = ET.parse(xml_file)
+ title = source_tree.getroot().get('title')
+ svg_creator = SVGFileCreator(title, faksimile_dir, target_dir=target_dir)
+ # counter counts the pages for which create_svg_file returned 0.
+ for page in source_tree.xpath(xpath):
+ if not SVGFileCreator.UNITTESTING:
+ number = page.get('number')
+ print(Fore.CYAN + f'Creating a svg file for {title}, {number} ...' + Style.RESET_ALL)
+ if svg_creator.create_svg_file(page) == 0:
+ counter += 1
+ if not SVGFileCreator.UNITTESTING:
+ print(Style.RESET_ALL + f'[{counter} pages created]')
+ else:
+ # Report the missing directory first, otherwise the missing xml file.
+ if not isdir(faksimile_dir):
+ raise FileNotFoundError(f'Directory {faksimile_dir} does not exist!')
+ raise FileNotFoundError('File {} does not exist!'.format(xml_file))
+ return exit_status
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
Index: fixes/test_add_faksimile_image.py
===================================================================
--- fixes/test_add_faksimile_image.py (revision 0)
+++ fixes/test_add_faksimile_image.py (revision 113)
@@ -0,0 +1,32 @@
+import lxml.etree as ET
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname, basename
+import shutil
+import sys
+import tempfile
+import unittest
+import warnings
+
+import add_faksimile_image
+
+
+sys.path.append('shared_util')
+from myxmlwriter import copy_to_bak_dir
+
+
+sys.path.append('svgscripts')
+from datatypes.page import Page
+
+class TestAddFaksimileImage(unittest.TestCase):
+    """Test for add_faksimile_image.add_faksimile_image.
+
+    The test reads a page file relative to the working directory and a faksimile
+    directory from an absolute path outside of the repository; it is skipped
+    if one of them does not exist.
+    """
+    PAGE_FILE = 'xml/Mp_XVI_page52v.xml'
+    FAKSIMILE_DIR = '/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Myriam/Blank/Mp_XVI/Fertig'
+
+    def testAdd(self):
+        if not isfile(self.PAGE_FILE):
+            self.skipTest(f'File {self.PAGE_FILE} does not exist!')
+        if not isdir(self.FAKSIMILE_DIR):
+            self.skipTest(f'Directory {self.FAKSIMILE_DIR} does not exist!')
+        add_faksimile_image.UNITTESTING = True
+        page = Page.create_cls(self.PAGE_FILE)
+        exit_status = add_faksimile_image.add_faksimile_image(page, self.FAKSIMILE_DIR)
+        self.assertEqual(exit_status, 0)
+
+
+if __name__ == "__main__":
+ unittest.main()
Index: fixes/fix_old_data.py
===================================================================
--- fixes/fix_old_data.py (revision 112)
+++ fixes/fix_old_data.py (revision 113)
@@ -1,551 +1,575 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix old data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
sys.path.append('svgscripts')
from convert_wordPositions import HTMLConverter
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
from datatypes.imprint import Imprint
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.matrix import Matrix
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.positional_word_part import PositionalWordPart
from datatypes.path import Path
from datatypes.word import Word
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, update_transkription_position_ids
from join_faksimileAndTranskription import sort_words
from util import back_up, back_up_svg_file, copy_faksimile_svg_file, reset_tp_with_matrix
-from process_files import update_svgposfile_status
+from process_files import update_svgposfile_status, get_extended_text_field
from process_footnotes import save_imprints
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary, get_manuscript_files
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
MAX_SVG_XY_THRESHOLD = 10
#TODO: fix all svg graphical files: change xlink:href to href!!!!
def convert_old_matrix(tp, xmin, ymin) ->(Matrix, float, float):
"""Return new matrix, x and y for old transkription_position.
"""
# The matrix of tp itself is only read; all changes are made on the object
# returned by clone_transformation_matrix (presumably a copy -- confirm).
matrix = tp.transform.clone_transformation_matrix()
# Add xmin/ymin to the XINDEX/YINDEX entries (presumably the x/y translation) of the new matrix.
matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3)
matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3)
# x: tp.left minus the original XINDEX entry, or 0 if tp.left is not positive.
x = round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)\
if tp.left > 0\
else 0
# y only depends on the height of tp.
y = round((tp.height-1.5)*-1, 3)
return matrix, x, y
+def extend_text_field(page, redo=False) ->bool:
+    """Replace the text field of the svg image of page by its extended text field
+    and shrink the svg file accordingly.
+
+    [:return:] True if the page has been processed, False if it has been skipped
+    """
+    # page_already_changed and save_page inspect the frame of their caller,
+    # so both are called directly from this function.
+    if not redo and page_already_changed(page):
+        return False
+    extended_field = get_extended_text_field(page.svg_image.file_name, multipage_index=page.multipage_index)
+    page.svg_image.width = extended_field.width
+    page.svg_image.height = extended_field.height
+    page.svg_image.text_field = extended_field
+    page.svg_image.attach_object_to_tree(page.page_tree)
+    transkription_field = TranskriptionField(page.svg_image.file_name, multipage_index=page.multipage_index)
+    transkription_field.xmin = extended_field.left
+    transkription_field.ymin = extended_field.top
+    transkription_field.width = extended_field.width
+    transkription_field.height = extended_field.height
+    transkription_field.shrink_svg_to_transkription_field(redo=True)
+    if not UNITTESTING:
+        save_page(page)
+    return True
+
def save_page(page, attach_first=False, backup=False, script_name=None):
"""Write page to xml file
"""
# Optionally back up the xml file of the page before it is written.
if backup:
back_up(page, page.xml_file)
# Optionally attach the words of the page to its tree before writing.
if attach_first:
page.update_and_attach_words2tree()
# Default script name: '<this file>:<name of the calling function>' (f_back is the
# frame of the caller); page_already_changed queries for a value of the same form.
if script_name is None:
script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}'
# The tree is written to the file it has been parsed from (docinfo.URL).
write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
script_name=script_name, file_type=FILE_TYPE_SVG_WORD_POSITION)
def page_already_changed(page) -> bool:
"""Return whether page has alreadybeen changed by function
"""
# "function" is the direct caller of page_already_changed: its name is read from the
# caller's frame (f_back) and combined with this file name. The page counts as changed
# if its metadata contains a modifiedBy node with exactly this script attribute.
return len(\
page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}:{inspect.currentframe().f_back.f_code.co_name}"]')\
) > 0
def fix_faksimile_line_position(page, redo=False) -> bool:
"""Create a faksimile line position.
"""
# Skip pages that have already been processed by this function, unless redo is set.
if not redo and page_already_changed(page):
return False;
update_faksimile_line_positions(page)
# Nothing is written to disk while unit testing.
if not UNITTESTING:
save_page(page)
return True
def check_faksimile_positions(page, redo=False) -> bool:
"""Check faksimile line position.
"""
# NOTE(review): the parameter redo is not used in this function.
# The svg file is taken from the first //data-source/@file of the page tree.
# NOTE(review): page_changed is only assigned after this check; confirm what is
# returned for a page without data-source.
if len(page.page_tree.xpath('//data-source/@file')) > 0:
svg_file = page.page_tree.xpath('//data-source/@file')[0]
svg_tree = ET.parse(svg_file)
positions_are_equal_counter = 0
page_changed = False
# Only the faksimile page with the same title and page number is compared.
for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree):
if page.title == faksimile_page.title\
and page.number == faksimile_page.page_number:
#print([fp.id for fp in faksimile_page.word_positions ])
for word in page.words:
for fp in word.faksimile_positions:
# Find the position with the same id in the svg file.
rect_fps = [ rfp for rfp in faksimile_page.word_positions if rfp.id == fp.id ]
if len(rect_fps) > 0:
rfp = rect_fps[0]
# If left or top differ, the values of the svg file are taken over.
if fp.left != rfp.left or fp.top != rfp.top:
#print(f'{fp.id}: {fp.left}/{rfp.left} {fp.top}/{rfp.top}')
fp.left = rfp.left
fp.top = rfp.top
fp.bottom = fp.top + rfp.height
word.attach_word_to_tree(page.page_tree)
page_changed = True
else:
positions_are_equal_counter += 1
print(f'{positions_are_equal_counter}/{len(page.words)} are equal')
# The page is only written if something changed and we are not unit testing.
if page_changed and not UNITTESTING:
save_page(page)
return page_changed
def fix_faksimile_positions(page, redo=False) -> bool:
"""Set faksimile positions to absolute values.
[:return:] fixed
"""
# Skip pages whose metadata already contains a modifiedBy node for this file, unless
# redo is set. The script attribute is the bare file name here, without a function name.
if not redo and len(page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}"]')) > 0:
return False
x_min = page.text_field.xmin
y_min = page.text_field.ymin
# Shift every faksimile position by the offset of the text field.
for word in page.words:
for fp in word.faksimile_positions:
fp.left = fp.left + x_min
fp.top = fp.top + y_min
fp.bottom = fp.bottom + y_min
word.attach_word_to_tree(page.page_tree)
# Nothing is written to disk while unit testing.
if not UNITTESTING:
print(f'writing to {page.page_tree.docinfo.URL}')
write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
return True
def _fix_tp_of_word(page, word, text_field):
"""Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top
"""
# Shift every transkription position by the offset of the text field.
for tp in word.transkription_positions:
tp.left += text_field.left
tp.top += text_field.top
reset_tp_with_matrix(word.transkription_positions)
# For objects of the exact type Word the function recurses into the word parts.
# NOTE(review): __dict__.items() yields (name, value) tuples, so the test
# type(item) == Word never matches and the list comprehension is always empty;
# presumably __dict__.values() was intended -- confirm before changing.
if type(word) == Word:
words_in_word = word.word_parts + [ item for item in word.__dict__.items() if type(item) == Word ]
for wp in words_in_word:
_fix_tp_of_word(page, wp, text_field)
def fix_tp_with_matrix(page, redo=False) -> bool:
    """Fix transkription positions with rotation matrix ->set left to 0 and top to -5.

    The transkription positions of every word and of every word part are
    passed to reset_tp_with_matrix. The offsets tr_xmin/tr_ymin are left and
    top of page.svg_image.text_field, or 0 if the page has no svg_image or
    the svg_image has no text_field.

    :param page: the page to fix.
    :param redo: not used by this function.
    [:return:] fixed (always True)
    """
    svg_image = page.svg_image
    text_field = None if svg_image is None else svg_image.text_field
    if text_field is None:
        offsets = {'tr_xmin': 0, 'tr_ymin': 0}
    else:
        offsets = {'tr_xmin': text_field.left, 'tr_ymin': text_field.top}
    for word in page.words:
        reset_tp_with_matrix(word.transkription_positions, **offsets)
        for word_part in word.word_parts:
            reset_tp_with_matrix(word_part.transkription_positions, **offsets)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page, attach_first=True)
    return True
def _fix_old_transkription_positions(page, redo=False) -> bool:
    """Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top

    Only pages that have a svg_image without a text_field are processed: the
    text_field is created from the svg file of the svg_image, then the line
    numbers, words, marks of foreign hands and text connection marks are
    shifted by the offsets of that text_field.

    :param page: the page to fix.
    :param redo: not used by this function.
    [:return:] fixed (False if the page has no svg_image or already has a text_field)
    """
    svg_image = page.svg_image
    # Pages without a svg_image were never handled here: the former branch for
    # them sat behind the same guard, hence was unreachable, and referred to
    # undefined names. NOTE(review): add explicit support if such pages exist.
    if svg_image is None or svg_image.text_field is not None:
        return False
    svg_image.text_field = TranskriptionField(svg_image.file_name).convert_to_text_field()
    svg_image.attach_object_to_tree(page.page_tree)
    text_field = svg_image.text_field
    for line_number in page.line_numbers:
        line_number.top += text_field.top
        line_number.bottom += text_field.top
        line_number.attach_object_to_tree(page.page_tree)
    for word in page.words:
        _fix_tp_of_word(page, word, text_field)
    for mark in page.mark_foreign_hands:
        _fix_tp_of_word(page, mark, text_field)
    for tcm in page.text_connection_marks:
        _fix_tp_of_word(page, tcm, text_field)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page, attach_first=True)
    return True
def _fix_old_pwps(page, old_tps):
    """Adjust positional_word_parts to corrected transkription_positions.

    For every positional word part node below the given transkription
    position nodes, text_field.left is added to the attribute left and
    text_field.top is added to the attributes top and bottom.

    :param page: the page providing page.svg_image.text_field.
    :param old_tps: transkription position nodes whose children are updated.
    """
    for transkription_position in old_tps:
        for part in transkription_position.xpath(f'./{PositionalWordPart.XML_TAG}'):
            # read all three values before the first one is written
            left, top, bottom = (float(part.get(name)) for name in ('left', 'top', 'bottom'))
            shifted = (
                ('left', left + page.svg_image.text_field.left),
                ('top', top + page.svg_image.text_field.top),
                ('bottom', bottom + page.svg_image.text_field.top),
            )
            for name, number in shifted:
                part.set(name, str(number))
def _fix_quotation_mark_tps(page, old_tps):
    """Fix the height of transkription_positions of words with quotation marks.

    The height of each transkription position node is enlarged by the
    vertical distance between its highest positional word part (largest
    height) and its topmost positional word part (smallest top).

    :param page: not used by this function.
    :param old_tps: transkription position nodes to correct; nodes without
        positional word parts are skipped.
    """
    for tp in old_tps:
        pwps = tp.xpath(f'./{PositionalWordPart.XML_TAG}')
        if not pwps:
            continue
        # max/min return the first extreme element, exactly like taking the
        # first element of the corresponding stable sort.
        highest_pwp = max(pwps, key=lambda pwp: float(pwp.get('height')))
        topmost_pwp = min(pwps, key=lambda pwp: float(pwp.get('top')))
        new_height = float(tp.get('height')) + abs(float(highest_pwp.get('top')) - float(topmost_pwp.get('top')))
        tp.set('height', str(new_height))
def fix_transkription_positions(page, redo=False) -> bool:
"""Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top

Pages that have a svg_image without a text_field are passed to _fix_old_transkription_positions.
_fix_old_pwps receives the parents of those positional word parts with id "0" whose left value
differs by more than THRESHOLD from the left value of their parent node.
_fix_quotation_mark_tps receives those transkription positions whose positional word part with
the largest height is not the one with the smallest top value.

:param page: the page to fix.
:param redo: not used by this function.
[:return:] fixed
"""
# maximal tolerated difference between the left value of the first positional word part and its parent
THRESHOLD = 10
if page.svg_image is not None\
and page.svg_image.text_field is None:
if not _fix_old_transkription_positions(page):
return False
# select the transkription positions whose first positional word part has not been shifted yet
_fix_old_pwps(page, [ pwp.getparent() for pwp in page.page_tree.xpath(f'//{PositionalWordPart.XML_TAG}[@id="0"]')\
if abs(float(pwp.get('left')) - float(pwp.getparent().get('left'))) > THRESHOLD ])
# select the transkription positions where highest and topmost positional word part differ
_fix_quotation_mark_tps(page, [ tp for tp in page.page_tree.xpath(f'//{TranskriptionPosition.XML_TAG}')\
if len(tp.xpath(f'./{PositionalWordPart.XML_TAG}')) > 0\
and sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('height')), reverse=True)[0]\
!= sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('top')))[0] ])
# the page is not written to disk while unit tests are running
if not UNITTESTING:
print(f'writing to {page.page_tree.docinfo.URL}')
save_page(page)
return True
def fix_styles(page, redo=False):
    """Remove surplus style nodes from tree.

    The first style node of the page tree is kept, every further style node
    is removed from its parent.

    :param page: the page whose tree is cleaned.
    :param redo: not used by this function.
    [:return:] True if at least one style node has been removed, else False
    """
    surplus_styles = page.page_tree.xpath('//style')[1:]
    if not surplus_styles:
        return False
    for node in surplus_styles:
        node.getparent().remove(node)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page)
    return True
def fix_imprints(page, redo=False):
    """Add imprints to the page if its tree does not contain any imprint node.

    The imprints are created and stored by save_imprints.

    :param page: the page to check.
    :param redo: not used by this function.
    [:return:] True if save_imprints has been called, else False
    """
    if len(page.page_tree.xpath('//' + Imprint.XML_TAG)) > 0:
        return False
    save_imprints(page)
    return True
def merge_transkription_positions(page, redo=False) -> bool:
    """Fix transkription positions of merged words

    The words of the page file of the same name inside the MERGED_DIR
    directory next to the page file become the words of page; their
    transkription positions are taken from the corresponding words of page
    as determined by sync_words_linewise. If the text of the synced
    transkription positions differs from the text of the word, the user is
    asked whether the word should be synced interactively.

    :param page: the page to update.
    :param redo: not used by this function.
    [:return:] fixed (False if there is no merged file for the page)
    :raises Exception: if the interactive sync yields no entry for a word.
    """
    page_url = page.page_tree.docinfo.URL
    merged_dir = dirname(page_url) + sep + MERGED_DIR
    merged_file = merged_dir + sep + basename(page_url)
    if not isdir(merged_dir) or not isfile(merged_file):
        return False
    merged_page = Page(merged_file)
    sync_dictionary = sync_words_linewise(merged_page.words, page.words, merged_page.line_numbers)
    words = list(merged_page.words)
    for source_word in words:
        if not sync_dictionary.get(source_word):
            continue
        _sync_transkriptions_with_words(source_word, sync_dictionary)
        text = ''.join(t.get_text() for t in source_word.transkription_positions)
        if source_word.text == text:
            continue
        print(f'{source_word.line_number}: {source_word.text} has transkription_positions with text "{text}".')
        response = input('Change? [Y/n]>')
        if response.startswith('n'):
            continue
        # sync again, restricted to the line of the word, and let the user pick the target words
        lines_of_word = [line for line in merged_page.line_numbers if line.id == source_word.line_number]
        new_sync_dictionary = sync_words_linewise(merged_page.words, page.words,
                                                  lines_of_word, force_sync_on_word=source_word)
        if not new_sync_dictionary.get(source_word):
            raise Exception(f'Could not find source_word {source_word.text} in {new_sync_dictionary}!')
        _sync_transkriptions_with_words(source_word, new_sync_dictionary)
    page.words = words
    page.update_and_attach_words2tree()
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        save_page(page)
    return True
def fix_graphical_svg_file(page, redo=False) -> bool:
    """Fix glyphs of word for which there is a /changed-word in page.page_tree

    The use nodes of the glyphs of every 'deleted-word' get the attribute
    visibility="hidden"; the x value of the use nodes of every 'changed-word'
    is shifted by the difference between the left value of the current word
    and the left value of the changed word. The svg tree is passed to
    back_up_svg_file before it is changed and to copy_faksimile_svg_file
    afterwards, with page.svg_file as target.

    :param page: the page whose svg file is fixed.
    :param redo: not used by this function.
    [:return:] fixed (always True, the svg file is rewritten on every call)
    """
    svg_tree = ET.parse(page.svg_file)
    transkription_field = TranskriptionField(page.source)
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    back_up_svg_file(svg_tree, namespaces=namespaces)
    # positions of pages with a text_field need no offset, all others are
    # shifted by the minima of the transkription field
    has_text_field = page.svg_image is not None and page.svg_image.text_field is not None
    tr_xmin = 0 if has_text_field else transkription_field.xmin
    tr_ymin = 0 if has_text_field else transkription_field.ymin
    for deleted_word_node in page.page_tree.xpath('//deleted-word'):
        deleted_word = Word.create_cls(deleted_word_node)
        _run_function_on_nodes_for_word(svg_tree, namespaces, deleted_word, tr_xmin, tr_ymin, _set_node_attribute_to, 'visibility', 'hidden')
    for changed_word_node in page.page_tree.xpath('//changed-word'):
        changed_word = Word.create_cls(changed_word_node)
        try:
            word = [ word for word in page.words if word.id == changed_word.id and word.text == changed_word.text ][0]
            left_difference = word.transkription_positions[0].left - changed_word.transkription_positions[0].left
            _run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin, _add_value2attribute, 'x', left_difference)
        except IndexError:
            warnings.warn(f'There is no word for changed_word {changed_word.id}: "{changed_word.text}" in {page.page_tree.docinfo.URL}!')
    copy_faksimile_svg_file(target_file=page.svg_file, faksimile_tree=svg_tree, namespaces=namespaces)
    # report the page as changed; the declared bool was never returned before
    return True
def _add_value2attribute(node, attribute, value):
    """Add value to the numeric attribute of node and mark the node as changed.

    :param node: node providing get and set for its attributes.
    :param attribute: name of the attribute; its current value has to be convertible to float.
    :param value: number that is added to the current value.
    """
    shifted = float(node.get(attribute)) + value
    node.set(attribute, str(shifted))
    node.set('changed', 'true')
def _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=0.1) -> list:
    """Return the unchanged use nodes referring to symbol_id near x = svg_x and y = svg_y.

    Nodes match if their x and y lie within threshold of svg_x and svg_y and
    if they have no 'changed' attribute. As long as nothing is found and
    threshold is below MAX_SVG_XY_THRESHOLD, the search is repeated with
    threshold enlarged by 1.

    :param svg_tree: tree that is queried with xpath.
    :param namespaces: namespace mapping for the prefixes ns and xlink.
    :param symbol_id: id of the referenced symbol (without '#').
    :param svg_x: expected x value of the node.
    :param svg_y: expected y value of the node.
    :param threshold: initial tolerance for x and y.
    :return: list of matching nodes, possibly empty.
    """
    while True:
        candidates = svg_tree.xpath(\
                f'//ns:use[@xlink:href="#{symbol_id}" and @x > {svg_x-threshold} and @x < {svg_x+threshold} and @y > {svg_y-threshold} and @y < {svg_y+threshold} ]',\
                namespaces=namespaces)
        unchanged_nodes = [candidate for candidate in candidates if not bool(candidate.get('changed'))]
        if len(unchanged_nodes) > 0 or threshold >= MAX_SVG_XY_THRESHOLD:
            return unchanged_nodes
        threshold = threshold + 1
def _run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin, function_on_node, attribute, value):
    """Run function_on_node on the svg node of every positional word part of word.

    For each positional word part the node is looked up by its symbol_id at
    x = left + tr_xmin and y = bottom + tr_ymin; only the first node found is
    passed to function_on_node(node, attribute, value).

    :param svg_tree: svg tree containing the use nodes.
    :param namespaces: namespace mapping for the lookup.
    :param word: word whose transkription positions are processed.
    :param tr_xmin: offset added to the left value of the word parts.
    :param tr_ymin: offset added to the bottom value of the word parts.
    :param function_on_node: callable taking node, attribute and value.
    :param attribute: attribute name handed to function_on_node.
    :param value: value handed to function_on_node.
    """
    word_parts = (part for position in word.transkription_positions for part in position.positional_word_parts)
    for part in word_parts:
        matches = _get_nodes_with_symbol_id(svg_tree, namespaces, part.symbol_id,
                                            part.left + tr_xmin, part.bottom + tr_ymin)
        if matches:
            function_on_node(matches[0], attribute, value)
def _set_node_attribute_to(node, attribute, value):
    """Set attribute of node to the string form of value and mark the node as changed.

    :param node: node providing set for its attributes.
    :param attribute: name of the attribute to set.
    :param value: new value, stored as str(value).
    """
    for name, text in ((attribute, str(value)), ('changed', 'true')):
        node.set(name, text)
def sync_words_linewise(source_words, target_words, lines, force_sync_on_word=None) -> dict:
    """Sync words and create a dictionary with source_words as keys, referring to a list of corresponding words.

    The attribute processed of all words is reset to False first. The words
    are then compared line by line, ordered by the left value of their first
    transkription position. Lines with more source words than target words
    cannot be synced; a warning is issued for them.

    :param source_words: words that become the keys of the result.
    :param target_words: words that are assigned to the source words.
    :param lines: objects whose id is compared with the line_number of the words.
    :param force_sync_on_word: word that is passed on to the line sync functions.
    :return: dictionary mapping source words to lists of target words.
    """
    result_dict = {}
    for word in target_words + source_words:
        word.processed = False
    for line in lines:
        source_words_on_line = sorted([ word for word in source_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left)
        target_words_on_line = sorted([ word for word in target_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left)
        if len(target_words_on_line) == len(source_words_on_line):
            _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word)
        elif len(source_words_on_line) < len(target_words_on_line):
            _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word)
        else:
            # no strategy exists for this case; say so instead of printing a debug token
            warnings.warn(f'On line {line.id}: {len(source_words_on_line)} source words cannot be synced'
                          f' with only {len(target_words_on_line)} target words, line skipped.')
    return result_dict
def _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict):
    """Let the user choose the target words that are synced with force_sync_on_word.

    The unprocessed target words of the line are printed with their indices.
    The user answers with a range 'first-last', with blank separated indices
    or with nothing, which selects all unprocessed target words. The chosen
    words are stored in result_dict under force_sync_on_word.

    :param force_sync_on_word: the source word to sync.
    :param target_words_on_line: target words of the line of the word.
    :param result_dict: dictionary that receives the result.
    :raises Exception: if the line has no unprocessed target word.
    """
    unprocessed_target_words = [t_word for t_word in target_words_on_line if not t_word.processed]
    if not unprocessed_target_words:
        raise Exception(f'There are no unprocessed target_words for {force_sync_on_word.text} on line {force_sync_on_word.line_number}!')
    print([ (i, t_word.text) for i, t_word in enumerate(unprocessed_target_words)])
    response = input(f'Please specify indices of words to sync {force_sync_on_word.text} with: [default:0-{len(unprocessed_target_words)-1}]>')
    # tolerate surrounding and repeated blanks instead of failing on int('')
    response = response.strip()
    if re.match(r'\d+-\d+', response):
        index_strings = response.split('-')
        indices = range(int(index_strings[0]), int(index_strings[1])+1)
    elif response != '':
        indices = [ int(index) for index in response.split() ]
    else:
        indices = range(len(unprocessed_target_words))
    result_dict.update({ force_sync_on_word: [ unprocessed_target_words[i] for i in indices ] })
def _sync_transkriptions_with_words(word, sync_dictionary):
    """Replace the transkription_positions of word by those of its synchronized words.

    :param word: the word to update; it has to be a key of sync_dictionary.
    :param sync_dictionary: maps words to the list of words they are synced with.
    """
    collected = word.transkription_positions = []
    for synced_word in sync_dictionary[word]:
        collected.extend(synced_word.transkription_positions)
def _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None):
"""Sync if there are more target words.

Matches are stored in result_dict as source word -> list of target words; matched words get processed = True.
A target word either continues the source word that is currently assembled from several target words,
or equals the text of an unprocessed source word, or is the beginning of the text of an unprocessed source word.
A warning is issued for a target word without any match.
If force_sync_on_word is given, _force_sync_on_word is called with the target words of the line.

:param result_dict: dictionary that receives the matches.
:param source_words_on_line: source words of the line.
:param target_words_on_line: target words of the line.
:param force_sync_on_word: optional word that is synced interactively.
"""
# source word whose text is only partially covered by the target words seen so far
current_source_word = None
for target_word in target_words_on_line:
# case 1: the target word continues the text of the current source word
if current_source_word is not None\
and current_source_word.text.startswith(''.join([ w.text for w in result_dict[current_source_word]]) + target_word.text):
result_dict[current_source_word].append(target_word)
target_word.processed = True
# the source word is complete as soon as the joined target texts equal its text
if current_source_word.text == ''.join([ w.text for w in result_dict[current_source_word]]):
current_source_word = None
# case 2: an unprocessed source word has exactly the text of the target word
elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ]) > 0:
source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ][0]
target_word.processed = True
source_word.processed = True
result_dict.update({ source_word: [ target_word ] })
# case 3: the target word is the beginning of an unprocessed source word
elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ]) > 0:
current_source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ][0]
current_source_word.processed = True
target_word.processed = True
result_dict.update({ current_source_word: [ target_word ] })
else:
msg = f'On line {target_word.line_number}: target_word "{target_word.text}" does not have a sibling in {[ s.text for s in source_words_on_line if not s.processed ]}'
warnings.warn(msg)
if force_sync_on_word is not None:
_force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict)
def _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None):
"""Sync same length

Source and target words are compared pairwise by their index on the line. If the texts at the same index
differ, the first unprocessed target word with the text of the source word is used instead.
Matches are stored in result_dict as source word -> [ target word ]; matched words get processed = True.
A warning is issued for a source word without any match.
If force_sync_on_word is given, _force_sync_on_word is called with the target words of the line.

:param result_dict: dictionary that receives the matches.
:param source_words_on_line: source words of the line.
:param target_words_on_line: target words of the line; indexed with the indices of the source words.
:param force_sync_on_word: optional word that is synced interactively.
"""
for i, word in enumerate(source_words_on_line):
# same index, same text: direct match
if word.text == target_words_on_line[i].text:
word.processed = True
target_words_on_line[i].processed = True
result_dict.update({ word: [ target_words_on_line[i] ] })
# otherwise fall back to the first unprocessed target word with identical text
elif len([ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ]) > 0:
target_word = [ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ][0]
word.processed = True
target_word.processed = True
result_dict.update({ word: [ target_word ] })
else:
msg = f'On line {word.line_number}: source_word "{word.text}" does not have a sibling in {[ s.text for s in target_words_on_line]}'
warnings.warn(msg)
if force_sync_on_word is not None:
_force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict)
def usage():
    """Print the help text of the script, i.e. the docstring of main."""
    help_text = main.__doc__
    print(help_text)
def main(argv):
"""This program can be used to fix old data.
svgscripts/fix_old_data.py [OPTIONS]
a xml file about a manuscript, containing information about its pages.
a xml file about a page, containing information about svg word positions.
OPTIONS:
-h|--help show help
-c|--check-faksimile-positions check whether faksimile positions have been updated
+ -e|--update-extended-textfield update extended textfield to svg_image
-i|--fix-imprints add imprints to page
-l|--faksimile-line-position create faksimile line positions
-p|--faksimile-positions fix old faksimile positions
-r|--redo rerun
-s|--fix-graphical-svg fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file.
-S|--fix-styles fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file.
-t|--transkription-positions fix old transkription positions
-M|--matrix fix old transkription positions with transform matrix
:return: exit code (int)
"""
# NOTE(review): the docstring above is printed by usage(); '-m|--merge-positions' is registered below
# but not listed there, and the description of '-S|--fix-styles' repeats the one of '-s'.
# functions selected by the command line options, in the order of the options
function_list = []
# map every option to its fix function; the key 'default' marks the function that runs if no option selects one
function_dict = create_function_dictionary(['-c', '--check-faksimile-positions'], check_faksimile_positions)
function_dict = create_function_dictionary(['-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-m', '--merge-positions'], merge_transkription_positions, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-s', '--fix-graphical-svg'], fix_graphical_svg_file, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-M', '--matrix'], fix_tp_with_matrix, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-t', '--transkription-positions'], fix_transkription_positions, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-S', '--fix-styles'], fix_styles, function_dictionary=function_dict)
- function_dict = create_function_dictionary(['default', '-i', '--fix-imprints'], fix_imprints, function_dictionary=function_dict)
+ function_dict = create_function_dictionary(['-i', '--fix-imprints'], fix_imprints, function_dictionary=function_dict)
+ function_dict = create_function_dictionary(['default', '-e', '--update-extended-textfield'], extend_text_field, function_dictionary=function_dict)
redo = False;
try:
- opts, args = getopt.getopt(argv, "hcplrmsStMi", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position",\
- "redo", "merge-positions", "fix-graphical-svg", "fix-styles", "transkription-positions", 'matrix', 'fix-imprints' ])
+ opts, args = getopt.getopt(argv, "hcplrmsStMie", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position",\
+ "redo", "merge-positions", "fix-graphical-svg", "fix-styles", "transkription-positions", 'matrix', 'fix-imprints', 'update-extended-textfield' ])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
elif opt in ('-r', '--redo'):
redo = True;
elif opt in function_dict.keys():
function_list.append(function_dict[opt])
# no function option given: fall back to the function registered as 'default'
if len(function_list) == 0:
function_list.append(function_dict['default'])
# at least one file argument is required
if len(args) < 1:
usage()
return 2
# exit_status is not changed below, so 0 is returned unless an exception is raised
exit_status = 0
for xml_file in get_manuscript_files(args):
if isfile(xml_file):
# number of pages for which the function returned a true value, per function name
counters = { f.__name__: 0 for f in function_list }
for current_function in function_list:
# the status filter for the pages depends on the name of the function
status_contains = STATUS_MERGED_OK if 'faksimile' in current_function.__name__ else 'OK'
+ if 'extend_text_field' in current_function.__name__:
+ status_contains = 'blank'
for page in Page.get_pages_from_xml_file(xml_file, status_contains=status_contains):
if not UNITTESTING:
print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' + Style.RESET_ALL)
back_up(page, page.xml_file)
counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0
if not UNITTESTING:
for function_name, counter in counters.items():
print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]')
else:
raise FileNotFoundError('File {} does not exist!'.format(xml_file))
return exit_status
# run main with the command line arguments when the file is executed as a script
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: fixes/add_faksimile_image.py
===================================================================
--- fixes/add_faksimile_image.py (revision 0)
+++ fixes/add_faksimile_image.py (revision 113)
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""This program can be used to add a faksimile image to a page xml file.
+"""
+# Copyright (C) University of Basel 2021 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+from colorama import Fore, Style
+import getopt
+import json
+import lxml.etree as ET
+import shutil
+import subprocess
+import sys
+import os
+import wget
+from os import listdir, sep, path, setpgrp, devnull, mkdir, remove
+from os.path import exists, isfile, isdir, dirname, basename
+from progress.bar import Bar
+import warnings
+
+from fix_old_data import save_page
+from get_text_field import get_text_field_on_image
+
+sys.path.append('svgscripts')
+from datatypes.faksimile_image import FaksimileImage
+from datatypes.faksimile import FaksimilePage
+from datatypes.archival_manuscript import ArchivalManuscriptUnity
+from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
+from util import back_up, back_up_svg_file, copy_faksimile_update_image_location, copy_faksimile_svg_file
+from process_files import update_svgposfile_status
+from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
+
+sys.path.append('shared_util')
+from myxmlwriter import copy_to_bak_dir, write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+from main_util import create_function_dictionary
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+MAX_SVG_XY_THRESHOLD = 10  # NOTE(review): not referenced anywhere else in this file
+BLANK_STATUS = 'blank'  # status filter that main passes to Page.get_pages_from_xml_file
+UNITTESTING = False  # tests set this to True; saving, printing and backing up are then skipped
+
+
+def add_faksimile_image(page: Page, faksimile_dir: str) ->int:
+ """Add the faksimile image of the corresponding faksimile svg file to the page.
+
+ The svg file is looked up in faksimile_dir; its name is the basename of the xml file of the page with '.xml' replaced by '.svg'.
+ The text_field of the first faksimile page found is assigned to its faksimile_image, which is then attached to page.page_tree.
+
+ :param page: the page that receives the faksimile image.
+ :param faksimile_dir: directory containing the faksimile svg files.
+ [:return:] exit_code (the code returns either 0 or 2; 2 is the fall through value)
+ """
+ svg_file = faksimile_dir + sep + basename(page.page_tree.docinfo.URL).replace('.xml','.svg')  # same file name as the page file, but with svg suffix
+ if isfile(svg_file):
+ fp = FaksimilePage.get_faksimile_pages(svg_file, page_number=page.number, isBlank=True)  # fp is a list of faksimile pages, only fp[0] is used
+ if len(fp) > 0 and fp[0].faksimile_image is not None:
+ fp[0].faksimile_image.text_field = fp[0].text_field
+ fp[0].faksimile_image.attach_object_to_tree(page.page_tree)
+ if not UNITTESTING:
+ save_page(page)  # the page is not saved while unit tests are running
+ return 0
+ return 2
+
+def usage():
+ """Print the help text of the script, i.e. the docstring of main.
+ """
+ print(main.__doc__)
+
+def main(argv):
+ """This program can be used to add a faksimile image to a page xml file.
+
+ fixes/add_faksimile_image.py [OPTIONS]
+
+ a xml file about a manuscript, containing information about its pages.
+ a xml file about a page, containing information about svg word positions.
+ a directory containing the blank faksimile svg files
+
+ OPTIONS:
+ -h|--help show help
+
+ :return: exit code (int)
+ """
+ try:
+ opts, args = getopt.getopt(argv, "h", ["help"])  # only -h|--help is accepted as option
+ except getopt.GetoptError:
+ usage()
+ return 2
+ for opt, arg in opts:
+ if opt in ('-h', '--help'):
+ usage()
+ return 0
+ exit_status = 0  # not changed below: 0 is returned unless the function returns early or raises
+ if len(args) < 2:  # the xml file and the faksimile directory are both required
+ usage()
+ return 2
+ xml_file = args[0]  # first positional argument
+ manuscript_file = xml_file\
+ if len(xml_file.split('_')) < 3\
+ else '_'.join(xml_file.split('_')[0:2]) + '.xml'  # NOTE(review): manuscript_file is not used again in this function
+ faksimile_dir = args[1]  # second positional argument
+ if isfile(xml_file) and isdir(faksimile_dir):
+ counter = 0  # number of pages for which add_faksimile_image returned 0
+ for page in Page.get_pages_from_xml_file(xml_file, status_contains=BLANK_STATUS):
+ if not UNITTESTING:
+ print(Fore.CYAN + f'Adding a faksimile image to {page.title}, {page.number} ...' + Style.RESET_ALL)
+ back_up(page, page.xml_file)
+ if add_faksimile_image(page, faksimile_dir) == 0:
+ counter += 1
+ if not UNITTESTING:
+ print(Style.RESET_ALL + f'[{counter} pages created]')
+ else:
+ if not isdir(faksimile_dir):  # report the missing directory first, otherwise the missing file
+ raise FileNotFoundError(f'Directory {faksimile_dir} does not exist!')
+ raise FileNotFoundError('File {} does not exist!'.format(xml_file))
+ return exit_status
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))  # run main with the command line arguments when executed as a script
Index: fixes/test_interactive_editor.py
===================================================================
--- fixes/test_interactive_editor.py (revision 112)
+++ fixes/test_interactive_editor.py (revision 113)
@@ -1,171 +1,188 @@
import lxml.etree as ET
from os import sep, path, remove
from os.path import isdir, isfile, dirname, basename
import shutil
from svgpathtools.parser import parse_path
import sys
import tempfile
import unittest
import warnings
import interactive_editor
sys.path.append('svgscripts')
from datatypes.faksimile import FaksimilePage
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page
from datatypes.path import Path
from datatypes.positional_word_part import PositionalWordPart
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word
from datatypes.word_position import WordPosition
from datatypes.word_deletion_path import WordDeletionPath
from process_words_post_merging import MERGED_DIR
class TestInteractiveEditor(unittest.TestCase):
def setUp(self):
interactive_editor.UNITTESTING = True
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml'
self.fix_transkription_positions = DATADIR + sep + 'Mp_XIV_page419a.xml'
#@unittest.skip('interactive')
def test_run(self):
page = Page(self.xml_file)
#interactive_editor.InteractiveShell().run_interactive_editor(page)
+ @unittest.skip('interactive')
def test_json_dict(self):
ro = interactive_editor.ResponseOrganizer()
json_dict = ro.create_json_dict(self.xml_file)
#print(json_dict)
+ @unittest.skip('interactive')
def test_handle_json(self):
ro = interactive_editor.ResponseOrganizer()
json_dict = ro.handle_response({})
self.assertEqual(json_dict['actions']['result'], 'ERROR: there was no key "target_file" in json!')
json_dict = ro.handle_response({'target_file': self.xml_file})
self.assertTrue(json_dict['actions']['result'].startswith('ERROR: there was no key "date_stamp" in json'))
json_dict = ro.handle_response({'target_file': self.xml_file, 'date_stamp': path.getmtime(self.xml_file)})
self.assertTrue(json_dict['actions']['result'].startswith('Operation "unknown" failed'))
page = Page(self.xml_file)
json_dict = ro.handle_response({'target_file': self.xml_file, 'date_stamp': path.getmtime(self.xml_file),\
'response_handler': { 'action_name': 'join words'}, 'words': [ { 'id': w.id, 'tp_id': f'w{w.id}:t0' } for w in page.words[:2] ] })
self.assertTrue(json_dict['actions']['result'].startswith('Operation "join words" succeeded!'))
#self.assertEqual(json_dict['response'], 'ERROR: there was no key "target_file" in json!')
def test_update_word(self):
page = Page(self.xml_file)
word = page.words[0]
rh = interactive_editor.SaveChanges()
self.assertEqual(rh._update_word(word, { 'id': word.id, 'deleted': False, 'line': 99, 'tp_id': f'w{word.id}:tp0' }, page.words), 0)
self.assertEqual(word.deleted, False)
self.assertEqual(word.line_number, 99)
word = page.words[18]
self.assertEqual(rh._update_word(word, { 'id': word.id, 'deleted': True, 'line': 99, 'tp_id': f'w{word.id}:w0:tp0' }, page.words), 0)
self.assertEqual(word.word_parts[0].deleted, True)
self.assertEqual(word.word_parts[0].line_number, 99)
old_word = word
word = page.words[19]
self.assertEqual(rh._update_word(word, { 'id': word.id, 'old_id': old_word.id, 'fp_id': old_word.faksimile_positions[0].id }, page.words), 0)
self.assertEqual(len(word.faksimile_positions), 2)
self.assertEqual(len(old_word.faksimile_positions), 0)
def test_save_position(self):
page = Page(self.xml_file)
word = page.words[0]
rh = interactive_editor.SavePositions()
self.assertEqual(rh._update_word(word,\
[{ 'id': word.id, 'left': word.transkription_positions[0].left + 10, 'top': word.transkription_positions[0].top + 10, 'tp_id': f'w{word.id}:tp0' }]), 0)
word = page.words[18]
self.assertEqual(rh._update_word(word, [{ 'id': word.id, 'left': word.word_parts[0].transkription_positions[0].left + 10,\
'top': word.word_parts[0].transkription_positions[0].top + 10, 'tp_id': f'w{word.id}:w0:tp0' }]), 0)
def test_join_path(self):
+ page = Page('xml/Mp_XVI_page01v.xml')
+ words = [ word for word in page.words if word.id >= 298 and word.id <= 302]
+ jdp = interactive_editor.JoinDeletionPath()
+ jdp.join_word_paths(words)
+ print(words[0].deletion_paths[0].d_attribute)
+ """
page = Page(self.xml_file)
word = page.words[0]
paths_strings = [ "M 273.343,251.451 L 276.479,251.451 L 276.479,251.751 L 273.343,251.751 L 273.343,251.451",\
"M 276.479,251.451 L 278.579,251.451 L 278.579,251.751 L 276.479,251.751 L 276.479,251.451",\
"M 278.579,251.451 L 283.794,251.451 L 283.794,251.751 L 278.579,251.751 L 278.579,251.451",\
"M 283.794,251.451 L 286.804,251.451 L 286.804,251.751 L 283.794,251.751 L 283.794,251.451",\
"M 286.804,251.451 L 289.296,251.451 L 289.296,251.751 L 286.804,251.751 L 286.804,251.451" ]
word.deletion_paths = [ WordDeletionPath(Path(id=num+len(page.word_deletion_paths), path=parse_path(pstring)), None) for num, pstring in enumerate(paths_strings) ]
page.word_deletion_paths += word.deletion_paths
word_dict_list = []
for pstring in paths_strings: word_dict_list.append({'id': word.id, 'tp_id': f'w{word.id}:tp0', 'deletion_path': pstring })
jdp = interactive_editor.JoinDeletionPath()
old_length = word.deletion_paths[0].path.length()
self.assertEqual(jdp._join_deletion_path(page, word, word_dict_list), 0)
self.assertEqual(len(word.deletion_paths), 1)
self.assertTrue(word.deletion_paths[0].path.length() > old_length)
#print(word.deletion_paths[0].path.length())
+ """
+ def test_delete_last_char(self):
+ page = Page('xml/Mp_XVI_page01v.xml')
+ word = [ word for word in page.words if word.text == 'Himmels'][0]
+ word_dict = { 'words': [{'id': word.id, 'tp_id': f'w{word.id}:tp0' }] }
+ dlc = interactive_editor.DeleteLastChar()
+ dlc.handle_response(page, word_dict)
+ word = [ word for word in page.words if word.text == 'Himmels'][0]
+ print(word.__dict__)
def test_remove_path(self):
page = Page(self.xml_file)
word = page.words[0]
paths_strings = [ "M 273.343,251.451 L 276.479,251.451 L 276.479,251.751 L 273.343,251.751 L 273.343,251.451",\
"M 276.479,251.451 L 278.579,251.451 L 278.579,251.751 L 276.479,251.751 L 276.479,251.451",\
"M 278.579,251.451 L 283.794,251.451 L 283.794,251.751 L 278.579,251.751 L 278.579,251.451",\
"M 283.794,251.451 L 286.804,251.451 L 286.804,251.751 L 283.794,251.751 L 283.794,251.451",\
"M 286.804,251.451 L 289.296,251.451 L 289.296,251.751 L 286.804,251.751 L 286.804,251.451" ]
word.deletion_paths = [ WordDeletionPath(Path(id=num+len(page.word_deletion_paths), path=parse_path(pstring)), None) for num, pstring in enumerate(paths_strings) ]
page.word_deletion_paths += word.deletion_paths
p_wdp_length = len(page.word_deletion_paths)
w_dp_length = len(word.deletion_paths)
word_dict_list = [{'id': word.id, 'tp_id': f'w{word.id}:tp0', 'deletion_path': paths_strings[0] }]
rdp = interactive_editor.RemoveDeletionPath()
self.assertEqual(rdp._remove_deletion_path(page, word, word_dict_list), 0)
self.assertEqual(len(word.deletion_paths), w_dp_length-1)
self.assertTrue(len(page.page_tree.xpath(f'./{WordDeletionPath.XML_TAG}')) < p_wdp_length)
def test_get_transkription_words(self):
json_dict = { 'words': [{ 'id': 0, 'left': 10, 'top': 10, 'tp_id': 'w0:tp0' }, { 'id': 1, 'left': 10, 'top': 10, 'fp_id': 'rect10' } ] }
rh = interactive_editor.ResponseHandler()
self.assertEqual(len(rh.get_transkription_words(json_dict)), 1)
def test_dictcontains_keys(self):
a_dict = { 'a': { 'b': { 'c': { 'd': 0 }
}
}}
key_list = [ 'a', 'b', 'c', 'd' ]
self.assertTrue(interactive_editor.dict_contains_keys(a_dict, key_list))
def test_get_requirement(self):
rh = interactive_editor.ResponseHandler()
json_dict = { 'response_handler': { 'requirements' : [ { 'input': 'asdf', 'name': 'test' } ]}}
name, requirement = rh.get_requirement(json_dict)
self.assertEqual(name, 'test')
self.assertEqual(requirement, 'asdf')
self.assertEqual(rh.get_requirement(json_dict, index=1), (None,None))
def test_split_words_dict(self):
rh = interactive_editor.SplitWords(action_name='split words', description='asdf asdf')
self.assertTrue(interactive_editor.dict_contains_keys(rh.create_json_dict(), ['requirements']))
def test_handle_split_text(self):
page = Page(self.xml_file)
word = page.words[0]
json_dict = { 'words': [{ 'id': word.id, 'tp_id': f'w{word.id}:t0' }], 'response_handler': { 'requirements' : [ { 'input': 'h', 'name': 'split_text' } ]}}
rh = interactive_editor.SplitWords(action_name='split words', description='asdf asdf')
self.assertEqual(rh.handle_response(page, json_dict), 0)
self.assertEqual(page.words[0].text, 'h')
def test_handle_addbox(self):
    """AddBox sets an overwritten word and, when requested, an earlier version."""
    page = Page(self.xml_file)
    handler = interactive_editor.AddBox(action_name='add box', description='asdf asdf')

    # Plain box: only the text of the box is supplied.
    first_word = page.words[0]
    json_dict = {
        'words': [{ 'id': first_word.id, 'tp_id': f'w{first_word.id}:t0' }],
        'response_handler': { 'requirements' : [ { 'input': 'test', 'name': 'box_text' } ]},
    }
    self.assertEqual(handler.handle_response(page, json_dict), 0)
    self.assertTrue(page.words[0].overwrites_word is not None)
    self.assertEqual(page.words[0].overwrites_word.text, 'test')

    # Box that overwrites part of the word and is flagged as an earlier version.
    second_word = page.words[1]
    requirements = [
        { 'input': 'a', 'name': 'box_text' },
        { 'input': 'e', 'name': 'overwritten_by' },
        { 'input': True, 'name': 'is_earlier_version' },
    ]
    json_dict = {
        'words': [{ 'id': second_word.id, 'tp_id': f'w{second_word.id}:t0' }],
        'response_handler': { 'requirements' : requirements },
    }
    self.assertEqual(handler.handle_response(page, json_dict), 0)
    self.assertTrue(page.words[1].earlier_version is not None)
    self.assertEqual(page.words[1].earlier_version.text, 'fast')
# Run the unit tests of this module when the file is executed as a script.
if __name__ == "__main__":
unittest.main()
Index: fixes/interactive_editor.py
===================================================================
--- fixes/interactive_editor.py (revision 112)
+++ fixes/interactive_editor.py (revision 113)
@@ -1,1030 +1,1200 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from datetime import datetime
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
from svgpathtools.parser import parse_path
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
from checker_handler import CheckerHandler
from fix_old_data import save_page
from fix_boxes import attach_box, split_into_parts_and_attach_box
sys.path.append('svgscripts')
from convert_wordPositions import HTMLConverter, JSONConverter
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
+from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, update_transkription_position_ids
from datatypes.word_deletion_path import WordDeletionPath
from join_faksimileAndTranskription import sort_words, add_faksimile_image
-from util import back_up, back_up_svg_file, copy_faksimile_svg_file, change_title_of_svg
+from util import back_up, back_up_svg_file, copy_faksimile_svg_file, change_title_of_svg, change_id_of_textfield
from process_files import update_svgposfile_status
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# When True, the run_change methods of the handlers skip writing the page to disk.
UNITTESTING = False
# NOTE(review): presumably a tolerance for comparing svg x/y coordinates; its use is not visible here -- confirm.
MAX_SVG_XY_THRESHOLD = 10
class ResponseHandler:
    """Base class of all handlers that react to a response of the editor.

    Subclasses override `create_requirement_list`, `handle_interactive_response`
    and/or `run_change`; the implementations here change nothing.
    """

    def __init__(self, response_starts_with=None, dialog_string=None, action_name=None, description=None):
        self.response_starts_with = response_starts_with
        self.dialog_string = dialog_string
        self.action_name = action_name
        self.description = description

    def create_requirement_list(self) -> list:
        """Return the list of requirements of this handler (none by default).
        """
        return list()

    def create_json_dict(self) -> dict:
        """Return a json dictionary describing this handler.

        The key 'requirements' is only present if there are requirements.
        """
        json_dict = dict(action_name=self.action_name, description=self.description)
        requirement_list = self.create_requirement_list()
        if len(requirement_list) > 0:
            json_dict['requirements'] = requirement_list
        return json_dict

    def get_transkription_words(self, json_dict: dict) -> list:
        """Return those json words that have a transkription position id ('tp_id').
        """
        candidates = json_dict['words'] if json_dict.get('words') else []
        return [ candidate for candidate in candidates if candidate.get('tp_id') ]

    def get_requirement(self, json_dict: dict, index=0) -> tuple:
        """Return the requirement at index as tuple (name, input).

        Both values are None if the requirement does not exist or is incomplete.
        """
        nothing = (None, None)
        if not dict_contains_keys(json_dict, ['response_handler','requirements']):
            return nothing
        requirement_list = json_dict['response_handler']['requirements']
        if index >= len(requirement_list):
            return nothing
        entry = requirement_list[index]
        if dict_contains_keys(entry, ['name']) and dict_contains_keys(entry, ['input']):
            return entry['name'], entry['input']
        return nothing

    def match(self, response: str) -> bool:
        """Return whether response matches this handler.

        A handler without `response_starts_with` matches every response.
        """
        if self.response_starts_with is None:
            return True
        return response.startswith(self.response_starts_with)

    def print_dialog(self):
        """Print the dialog string in square brackets, if there is one.
        """
        if self.dialog_string is None:
            return
        print(f'[{self.dialog_string}]')

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle a json response and return exit code.

        The selected words of the page and one entry per requirement are
        collected in an action dictionary that is passed to `run_change`.
        """
        selected_ids = [ entry.get('id') for entry in self.get_transkription_words(json_dict) ]
        action_dictionary = { 'words': [ word for word in page.words if word.id in selected_ids ] }
        for position, _ in enumerate(self.create_requirement_list()):
            name, requirement = self.get_requirement(json_dict, index=position)
            action_dictionary[name] = requirement
        return self.run_change(page, action_dictionary)

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle an interactive response and return exit code.
        """
        return self.run_change(page, dict())

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code (always 0 here).
        """
        return 0
class JoinWords(ResponseHandler):
    """Join several words of a page to one word."""

    @staticmethod
    def _find_top_level_word(page, word):
        """Return word if it is in page.words, else the first page word that
        has word as word part, else None.
        """
        if word in page.words:
            return word
        return next((candidate for candidate in page.words if word in candidate.word_parts), None)

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response interactively and return exit code.

        A response that starts with non-digit characters followed by white
        space requests white space between the joined words; the rest of the
        response is resolved to words by the shell.
        """
        prefix_pattern = re.compile(r'^\D+\s')
        action_dictionary = {
            'words': shell._get_words_from_response(prefix_pattern.sub('', response), page.words),
            'add_white_space_between_words': prefix_pattern.match(response) is not None,
        }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Return 2 if the action dictionary contains no words, else 0.
        """
        add_white_space_between_words = bool(action_dictionary.get('add_white_space_between_words'))
        words = action_dictionary.get('words') or []
        if len(words) == 0:
            return 2
        if len({ word.line_number for word in words }) == 1\
        and len({ word.deleted for word in words }) == 1:
            # Same line and same deletion status: merge everything into the first word.
            new_word = words[0]
            for word2join in words[1:]:
                # A word part is not an item of page.words, removing it would raise a ValueError.
                if word2join in page.words:
                    page.words.remove(word2join)
                new_word.join(word2join, add_white_space_between_words=add_white_space_between_words)
        else:
            new_word = Word.join_words(words, add_white_space_between_words=add_white_space_between_words)
            # The new word takes the place of the first word (or of the word containing it).
            first_owner = self._find_top_level_word(page, words[0])
            index = page.words.index(first_owner)\
                    if first_owner is not None\
                    else len(page.words)
            for word2join in words:
                owner = self._find_top_level_word(page, word2join)
                if owner is not None:
                    page.words.remove(owner)
            page.words.insert(index, new_word)
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE: this only rebinds the local name, the caller keeps its page object.
            page = Page(page.page_tree.docinfo.URL)
        return 0
class SimpleJoinWords(JoinWords):
    """Join words for responses that consist of word ids only."""

    def match(self, response: str) -> bool:
        """Return whether response starts with a digit.
        """
        # re.match returns a Match object or None; convert it to the annotated bool.
        return re.match(r'\d+', response) is not None
class SaveChanges(ResponseHandler):
    """Update words according to a json dictionary and save the page."""
    # Indices into the tuples of RELEVANT_PROPERTIES.
    WORD_INDEX = 0
    WDICT_INDEX = 1
    # Pairs of (attribute name of the word, key of the json word dictionary).
    RELEVANT_PROPERTIES = [ ('deleted','deleted'), ('line_number','line') ]

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Save the page, restart the interactive editor and return its exit code.
        """
        self.run_change(page, {})
        return shell.run_interactive_editor(page)

    def _update_transkription_word(self, word, word_dict) ->int:
        """Update the relevant properties of word according to word_dict,
        return exit_code.

        If word has word parts, the part addressed by the 'tp_id'
        ("w<id>:w<part index>:tp<index>") is updated instead of word.
        Return 2 if the 'tp_id' does not address an existing word part.
        """
        target = word
        if len(word.word_parts) > 0:
            tp_id_parts = word_dict['tp_id'].split(':')
            if len(tp_id_parts) != 3:
                return 2
            wp_index = int(tp_id_parts[1].replace('w',''))
            if wp_index >= len(word.word_parts):
                return 2
            target = word.word_parts[wp_index]
        for relevant_property in self.RELEVANT_PROPERTIES:
            target.__dict__[relevant_property[self.WORD_INDEX]] = word_dict[relevant_property[self.WDICT_INDEX]]
        return 0

    def _update_faksimile_word(self, word, word_dict, words) ->int:
        """Move the faksimile position 'fp_id' from the word with id 'old_id' to word,
        return exit_code.

        Return 3 if there is no word with id 'old_id' and 2 if that word has
        no such faksimile position. Without 'old_id' nothing is changed.
        """
        fp_id = word_dict['fp_id']
        if word_dict.get('old_id') is None:
            print(word.id, fp_id)
            return 0
        old_id = int(word_dict['old_id'])
        old_word = next((w for w in words if w.id == old_id), None)
        if old_word is None:
            return 3
        faksimile_position = next((fp for fp in old_word.faksimile_positions if fp.id == fp_id), None)
        if faksimile_position is not None:
            old_word.faksimile_positions.remove(faksimile_position)
        else:
            # Search the word parts; the position is removed from every part that has it.
            for word_part in old_word.word_parts:
                part_position = next((fp for fp in word_part.faksimile_positions if fp.id == fp_id), None)
                if part_position is not None:
                    faksimile_position = part_position
                    word_part.faksimile_positions.remove(part_position)
        if faksimile_position is None:
            return 2
        word.faksimile_positions.append(faksimile_position)
        return 0

    def _update_word(self, word, word_dict, words) ->int:
        """Update word according to word_dict, return exit_code.

        Return 2 if word_dict has neither a 'tp_id' nor a 'fp_id'.
        """
        if bool(word_dict.get('tp_id')):
            return self._update_transkription_word(word, word_dict)
        if bool(word_dict.get('fp_id')):
            exit_code = self._update_faksimile_word(word, word_dict, words)
            if exit_code > 0:
                print(exit_code)
            return exit_code
        return 2

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code.

        Json words whose id starts with 'rect' refer to rects of the faksimile
        svg file: their titles are changed. All other json words update the
        page word with the same id. Return 2 if updating a word fails.
        """
        json_words = json_dict['words']
        svg_words = [ jw for jw in json_words if str(jw.get('id')).startswith('rect') ]
        if page.faksimile_svgFile is not None:
            for svg_word in svg_words:
                word_id = svg_word.get('id')
                word_text = svg_word.get('text')
                print(f'Changing rect {word_id} to {word_text}')
                change_title_of_svg(page.faksimile_svgFile, word_id, word_text)
        # Rect ids are not numeric and must never be passed to int();
        # for duplicate ids the first json word wins.
        word_dicts = {}
        for jw in json_words:
            if not str(jw.get('id')).startswith('rect'):
                word_dicts.setdefault(int(jw.get('id')), jw)
        for word in page.words:
            if word.id in word_dicts:
                print('updating word', word.id, word.text)
                if self._update_word(word, word_dicts[word.id], page.words) > 0:
                    return 2
        return self.run_change(page, {})

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Save the page (unless UNITTESTING) and return exit code 0.
        """
        exit_code = 0
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE: this only rebinds the local name, the caller keeps its page object.
            page = Page(page.page_tree.docinfo.URL)
        return exit_code
class SavePositions(SaveChanges):
    """Save the changed positions of words."""

    @staticmethod
    def _relocate(position, word_dict):
        """Set left and top of position from word_dict and recompute bottom as top + height.
        """
        position.left = float(word_dict['left'])
        position.top = float(word_dict['top'])
        position.bottom = position.top + position.height

    def _update_word(self, word, word_dict_list) ->int:
        """Update the positions of word according to every word_dict of the list,
        return the first exit code greater than 0, else 0.
        """
        for word_dict in word_dict_list:
            if word_dict.get('tp_id'):
                exit_code = self._update_transkription_position(word, word_dict)
            elif word_dict.get('fp_id'):
                exit_code = self._update_faksimile_position(word, word_dict)
            else:
                continue
            if exit_code > 0:
                return exit_code
        return 0

    def _update_transkription_position(self, word, word_dict) ->int:
        """Update the transkription position addressed by 'tp_id',
        return exit_code (2 if the position cannot be found).
        """
        tp_id_list = word_dict['tp_id'].split(':')
        if len(tp_id_list) == 3 and len(word.word_parts) > 0:
            wp_index = int(tp_id_list[1].replace('w',''))
            tp_index = int(tp_id_list[2].replace('tp',''))
            if wp_index >= len(word.word_parts)\
            or tp_index >= len(word.word_parts[wp_index].transkription_positions):
                return 2
            self._relocate(word.word_parts[wp_index].transkription_positions[tp_index], word_dict)
            return 0
        if len(tp_id_list) == 2:
            tp_index = int(tp_id_list[1].replace('tp',''))
            if tp_index >= len(word.transkription_positions):
                return 2
            position = word.transkription_positions[tp_index]
            # Punctuation that is moved down by more than half its height belongs to the next line.
            if word.text in (',', ':', ';')\
            and float(word_dict['top']) > position.top+position.height/2:
                word.line_number += 1
                print(f'Updating line number of "{word.text}" to {word.line_number}')
            self._relocate(position, word_dict)
            return 0
        return 2

    def _update_faksimile_position(self, word, word_dict) ->int:
        """Update the faksimile position with id 'fp_id',
        return exit_code (2 if word has no such position).
        """
        fp_id = word_dict['fp_id']
        own_positions = [ fp for fp in word.faksimile_positions if fp.id == fp_id ]
        part_positions = [ fp for w in word.word_parts for fp in w.faksimile_positions if fp.id == fp_id ]
        # A position of a word part takes precedence over a position of the word itself.
        candidates = part_positions if len(part_positions) > 0 else own_positions
        if len(candidates) == 0:
            return 2
        self._relocate(candidates[0], word_dict)
        return 0

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code (2 if updating a word fails).
        """
        json_words = json_dict['words']
        json_word_ids = [ jw.get('id') for jw in json_words ]
        for word in page.words:
            if word.id not in json_word_ids:
                continue
            word_dict_list = [ jw for jw in json_words if jw.get('id') == word.id ]
            if self._update_word(word, word_dict_list) > 0:
                return 2
        return self.run_change(page, {})
class AddDeletionPath(SaveChanges):
    """Add deletion paths to words."""

    def _add_deletion_path(self, page, word, word_dict_list) ->int:
        """Add the deletion paths named in word_dict_list to word,
        return exit_code (2 if the page does not know one of the paths).

        Entries without a 'deletion_path' and paths that word already has
        are skipped.
        """
        exit_code = 0
        for word_dict in word_dict_list:
            # The list contains every entry of the word, not only those with a path.
            d_attribute = word_dict.get('deletion_path')
            if not d_attribute:
                continue
            if any(known.d_attribute == d_attribute for known in word.deletion_paths):
                continue
            dpath = page.get_word_deletion_path(d_attribute=d_attribute)
            if dpath is not None:
                word.deletion_paths.append(dpath)
            else:
                exit_code = 2
        return exit_code

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code (2 if adding a path fails).
        """
        transkription_words = self.get_transkription_words(json_dict)
        json_word_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ]
        for word in page.words:
            if word.id in json_word_ids:
                word_dict_list = [ jw for jw in transkription_words if jw.get('id') == word.id ]
                if self._add_deletion_path(page, word, word_dict_list) > 0:
                    return 2
        return self.run_change(page, {})
class RemoveDeletionPath(SaveChanges):
    """Remove deletion paths from words and from the page tree."""

    def _remove_deletion_path(self, page, word, word_dict_list) ->int:
        """Remove the deletion paths named in word_dict_list from word and its word parts,
        return exit_code (0 if at least one path has been removed, else 2).
        """
        exit_code = 2
        for wpart in word.word_parts:
            if self._remove_deletion_path(page, wpart, word_dict_list) == 0:
                exit_code = 0
        # Build the requested path strings once; entries without a path are ignored.
        requested = [ word_dict.get('deletion_path') for word_dict in word_dict_list\
                        if word_dict.get('deletion_path') ]
        deletion_paths = [ dpath for dpath in word.deletion_paths if dpath.d_attribute in requested ]
        for dpath in deletion_paths:
            if dpath in word.deletion_paths:
                word.deletion_paths.remove(dpath)
            for node in page.page_tree.xpath(f'./{WordDeletionPath.XML_TAG}[@d="{dpath.d_attribute}"]'):
                node.getparent().remove(node)
        if len(deletion_paths) > 0:
            exit_code = 0
        return exit_code

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code (2 if no path could be removed from a word).
        """
        transkription_words = self.get_transkription_words(json_dict)
        json_word_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ]
        for word in page.words:
            if word.id in json_word_ids:
                word_dict_list = [ jw for jw in transkription_words if jw.get('id') == word.id ]
                if self._remove_deletion_path(page, word, word_dict_list) > 0:
                    return 2
        return self.run_change(page, {})
class UndeleteWords(ResponseHandler):
    """Mark word nodes of the page tree as undeleted."""

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code.

        For every json word whose id does not start with 'rect', the first
        word node with the same id and text gets the attribute undeleted="true".
        """
        for json_word in json_dict['words']:
            word_id = json_word.get('id')
            if str(word_id).startswith('rect'):
                continue
            # XPath variables keep the query valid for texts that contain quotes.
            nodes = page.page_tree.xpath('//word[@id=$word_id and @text=$word_text]',\
                    word_id=str(word_id), word_text=str(json_word.get('text')))
            if len(nodes) > 0:
                nodes[0].set('undeleted', 'true')
        return self.run_change(page, {})

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Save the page (unless UNITTESTING) and return exit code 0.
        """
        exit_code = 0
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE: this only rebinds the local name, the caller keeps its page object.
            page = Page(page.page_tree.docinfo.URL)
        return exit_code
+
class JoinDeletionPath(SaveChanges):
def _join_deletion_path(self, page, word, word_dict_list) ->int:
"""Join those deletion paths of word that are named in word_dict_list to one path,
replace them on word, on page and in the page tree,
return exit_code (0 if paths have been joined, 2 if less than two paths matched)
"""
deletion_paths = [ path for path in word.deletion_paths if path.d_attribute in\
[ word_dict['deletion_path'] for word_dict in word_dict_list ] ]
if len(deletion_paths) > 1:
path_string = ''
for p in deletion_paths:
path_string = path_string + ' ' + p.d_attribute.replace('M', 'L')\
if path_string != ''\
else p.d_attribute
word.deletion_paths.remove(p)
if p in page.word_deletion_paths:
page.word_deletion_paths.remove(p)
new_path = parse_path(path_string)
word.deletion_paths.append(WordDeletionPath(Path(id=deletion_paths[0].id, path=new_path), deletion_paths[0].style))
page.word_deletion_paths.append(word.deletion_paths[-1])
for node in page.page_tree.xpath(f'./{WordDeletionPath.XML_TAG}'): node.getparent().remove(node)
for p in page.word_deletion_paths: p.attach_object_to_tree(page.page_tree)
return 0
return 2
+ def _join_paths_as_one(self, deletion_paths, style) -> WordDeletionPath:
+ """Return one straight deletion path that runs from the smallest
+ (bbox()[0], bbox()[2]) pair to the largest (bbox()[1], bbox()[3]) pair
+ of all deletion_paths.
+ """
+ start_point = sorted([ (p.path.bbox()[0],p.path.bbox()[2]) for p in deletion_paths])[0]
+ end_point = sorted([ (p.path.bbox()[1],p.path.bbox()[3]) for p in deletion_paths], reverse=True)[0]
+ new_path = parse_path(f'M {start_point[0]},{start_point[1]} L {end_point[0]},{end_point[1]}')
+ return WordDeletionPath(Path(path=new_path), style)
+
+ def _join_paths(self, deletion_paths, style) -> WordDeletionPath:
+ """Return one deletion path built from the concatenated d attributes of deletion_paths;
+ every 'M' of the appended paths is replaced by 'L'.
+ """
+ path_string = ''
+ for path in deletion_paths:
+ path_string = path_string + ' ' + path.d_attribute.replace('M', 'L')\
+ if path_string != ''\
+ else path.d_attribute
+ new_path = parse_path(path_string)
+ return WordDeletionPath(Path(path=new_path), style)
+
+ def join_word_paths(self, words):
+ deletion_paths = []
+ for word in words:
+ if len(word.deletion_paths) > 1:
+ word.deletion_paths = [ self._join_paths_as_one(word.deletion_paths, word.deletion_paths[0].style) ]
+ deletion_paths.append(word.deletion_paths[-1])
+ for wp in word.word_parts:
+ if len(wp.deletion_paths) > 1:
+ wp.deletion_paths = [ self._join_paths_as_one(wp.deletion_paths, wp.deletion_paths[0].style) ]
+ deletion_paths.append(wp.deletion_paths[-1])
+ if len(deletion_paths) > 0:
+ deletion_path = self._join_paths(deletion_paths, deletion_paths[0].style)
+ for word in words:
+ if len(word.deletion_paths) > 0:
+ word.deletion_paths = [ deletion_path ]
+ for wp in word.word_parts:
+ if len(wp.deletion_paths) > 0:
+ wp.deletion_paths = [ deletion_path ]
+
+
def handle_response(self, page: Page, json_dict: dict) -> int:
"""Handle response and return exit code.
"""
transkription_words = self.get_transkription_words(json_dict)
json_word_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ]
- for word in page.words:
- if word.id in json_word_ids:
- word_dict_list = [ jw for jw in transkription_words if jw.get('id') == word.id ]
- if self._join_deletion_path(page, word, word_dict_list) > 0:
- return 2
+ words = [ word for word in page.words if word.id in json_word_ids ]
+ self.join_word_paths(words)
return self.run_change(page, {})
class RequestPathsNearWords(SaveChanges):
    """Flag words so that paths near them are added later."""

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Append the process flag 'add_paths_near_words' to every selected word
        that has a deletion path in the response, then save and return exit code.
        """
        flag = 'add_paths_near_words'
        selected_ids = [ json_word.get('id')\
                for json_word in self.get_transkription_words(json_dict)\
                if json_word.get('deletion_path') ]
        for word in page.words:
            if word.id not in selected_ids:
                continue
            if flag not in word.process_flags:
                word.process_flags.append(flag)
        return self.run_change(page, {})
class SetTaskDone(SaveChanges):
    """Set a task of the checker handler to done."""

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Set the task named by json_dict['task'] to done, save and return exit code.

        Return 2 if the response names no task.
        """
        task = json_dict.get('task')
        if not task:
            return 2
        CheckerHandler(page).set_task_done(task)
        return self.run_change(page, {})
class Reload(ResponseHandler):
    """Reload the page from its file."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Restart the interactive editor with a freshly loaded page and return its exit code.
        """
        reloaded_page = Page(page.page_tree.docinfo.URL)
        return shell.run_interactive_editor(reloaded_page)
class RestoreBackup(ResponseHandler):
    """Continue editing with the backup file of the page."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Restart the interactive editor with the backup page and return its exit code.

        Return 2 if the page has no backup file.
        """
        if page.bak_file is None:
            print('Could not restore backup file, please restore manually!')
            return 2
        backup_page = Page(page.bak_file)
        return shell.run_interactive_editor(backup_page)
class ChangeLine2Value(ResponseHandler):
    """Change the line number of words to a given value."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.

        "l:<line> <ids>" contains everything needed; for "l:<line>" the word
        ids are asked for, for anything else the line number is asked for too.
        """
        words = []
        line_number = -1
        complete_response = re.match(r'l:(\d+)\s\d+', response)
        if complete_response:
            line_number = int(complete_response.group(1))
            words = shell._get_words_from_response(re.sub(r'l:\d+\s', '', response), page.words)
        else:
            if re.match(r'l:\d+$', response):
                line_number = int(response.replace('l:', ''))
            else:
                new_response_line = input('Specify new line number>')
                if re.match(r'^\d+$', new_response_line):
                    line_number = int(new_response_line)
            new_response = input(f'Specify ids of words for which line number should be changed to {line_number}>')
            if re.match(r'\d+', new_response):
                # This used to call the undefined name shell_get_words_from_response.
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words, 'line_number' : line_number }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Return 2 if the action dictionary contains no usable line number.
        """
        line_number = action_dictionary.get('line_number') or -1
        words = action_dictionary.get('words') or []
        if line_number == -1:
            return 2
        for word in words:
            word.line_number = line_number
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE: this only rebinds the local name, the caller keeps its page object.
            page = Page(page.page_tree.docinfo.URL)
        return 0
class CreateCorrectionHistory(ResponseHandler):
    """Create a correction history for words."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.

        The word ids are taken from the response ("c<...> <ids>") or asked for.
        """
        # Stays empty if no ids are given, run_change then returns 2.
        words = []
        if re.match(r'c\w*\s\d+', response):
            words = shell._get_words_from_response(re.sub(r'c\w*\s', '', response), page.words)
        else:
            new_response = input(f'Specify ids of words to create a correction history. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def _create_extension_correction_history(self, word, word_part_ids):
        """Create a correction history for extension of earlier versions.

        The earlier version of word consists of copies of all word parts whose
        id is not in word_part_ids; the last word part whose id is in
        word_part_ids is marked as extension of that earlier version and
        appended to the corrections of word.
        """
        word_parts = []
        extendsPart = None
        for wp in word.word_parts:
            if str(wp.id) not in word_part_ids:
                transkription_positions = TranskriptionPosition.copy_list_of_cls(wp.transkription_positions)
                word_parts.append(Word(id=wp.id, text=wp.text, transkription_positions=transkription_positions))
            else:
                extendsPart = wp
        word.earlier_version = Word(text=''.join([ wp.text for wp in word_parts]), line_number=word.line_number, word_parts=word_parts)
        if extendsPart is not None:
            extendsPart.isExtensionOfWord = word.earlier_version
            word.corrections.append(extendsPart)

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Return 2 if the action dictionary contains no words.
        """
        words = action_dictionary.get('words') or []
        if len(words) == 0:
            return 2
        for word in words:
            extension_xpath = f'//word[@id="{word.id}"]/word[@extendsEarlierVersion="true"]'
            if len(word.word_parts) > 0\
            and word.earlier_version is None\
            and len(page.page_tree.xpath(extension_xpath)) > 0:
                word_part_ids = page.page_tree.xpath(f'{extension_xpath}/@id')
                self._create_extension_correction_history(word, word_part_ids)
            else:
                word.create_correction_history()
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE: this only rebinds the local name, the caller keeps its page object.
            page = Page(page.page_tree.docinfo.URL)
        return 0
class DeleteCorrectionHistory(ResponseHandler):
    """Delete the correction history of words."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response interactively and return exit code.

        The word ids are taken from the response ("D<...> <ids>") or asked for.
        """
        # Stays empty if no ids are given, run_change then returns 2.
        words = []
        if re.match(r'D\w*\s\d+', response):
            words = shell._get_words_from_response(re.sub(r'D\w*\s', '', response), page.words)
        else:
            new_response = input(f'Specify ids of words to delete their correction history. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words' : words }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Return 2 if the action dictionary contains no words.
        """
        words = action_dictionary.get('words') or []
        if len(words) == 0:
            return 2
        for word in words:
            print(word.text)
            word.earlier_version = None
            word.corrections = []
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE: this only rebinds the local name, the caller keeps its page object.
            page = Page(page.page_tree.docinfo.URL)
        return 0
class ChangeDeletionStatus(ResponseHandler):
    """Set words to deleted or undeleted."""

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.

        A response starting with 'd' deletes, any other response undeletes;
        the word ids are taken from the response or asked for.
        """
        # Stays empty if no ids are given, run_change then returns 2.
        words = []
        if re.match(r'[du]\w*\s\d+', response):
            words = shell._get_words_from_response(re.sub(r'[du]\w*\s', '', response), page.words)
        else:
            deletion_target = 'delete' if response.startswith('d') else 'undelete'
            new_response = input(f'Specify ids of words to {deletion_target}. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words, 'deleted': response.startswith('d') }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Return 2 if the action dictionary contains no words.
        """
        words = action_dictionary.get('words') or []
        word_should_be_deleted = bool(action_dictionary.get('deleted'))
        if len(words) == 0:
            return 2
        for word in words:
            word.deleted = word_should_be_deleted
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE: this only rebinds the local name, the caller keeps its page object.
            page = Page(page.page_tree.docinfo.URL)
        return 0
class SplitWords(ResponseHandler):
    """Split words at a given text."""

    def _split_word(self, page, word, split_text):
        """Replace word on page by the second and third result of word.split(split_text).

        Raise an Exception if one of these two words is None.
        """
        index = page.words.index(word)
        _, left, right = word.split(split_text)
        if left is None:
            raise Exception(f'ERROR left word of word.split with split_text {split_text} is None!')
        if right is None:
            raise Exception(f'ERROR right word of word.split with split_text {split_text} is None!')
        page.words[index] = left
        page.words.insert(index+1, right)

    def create_requirement_list(self) ->list:
        """Return the requirement list: the text at which the words are split.
        """
        return [{ 'name': 'split_text', 'type': 'string', 'input': None }]

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.

        "s <split text> <ids>" contains everything needed; otherwise the
        split text and the word ids are asked for.
        """
        # Stays empty if no ids are given, run_change then returns 2.
        words = []
        if re.match(r's\s\w+\s\d+', response):
            words = shell._get_words_from_response(re.sub(r's\s\w+\s', '', response), page.words)
            split_text = response.split(' ')[1]
        else:
            split_text = input('Input split text>')
            new_response = input(f'Specify ids of words to split. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words, 'split_text': split_text }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.

        Return 2 if the action dictionary contains no words or no split text.
        """
        words = action_dictionary.get('words') or []
        split_text = action_dictionary.get('split_text') or ''
        if len(words) == 0 or split_text == '':
            return 2
        for word in words:
            self._split_word(page, word, split_text)
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            # NOTE: this only rebinds the local name, the caller keeps its page object.
            page = Page(page.page_tree.docinfo.URL)
        return 0
class DeleteLastChar(ResponseHandler):
    """Response handler that marks the last character of words as deleted."""

    def _process_word(self, page, word) -> Word:
        """Split off the last character of word and mark it as deleted.

        The word is split at its text without the last character. The right
        part is flagged as deleted and both parts become the word parts of a
        new word that replaces the original in page.words.

        :return: the new word stored in page.words.
        :raises Exception: if word.split yields no left or no right word.
        """
        index = page.words.index(word)
        split_text = word.text[:-1]
        _, left, right = word.split(split_text)
        if left is None:
            raise Exception(f'ERROR left word of word.split with split_text {split_text} is None!')
        if right is None:
            raise Exception(f'ERROR right word of word.split with split_text {split_text} is None!')
        right.deleted = True
        # NOTE(review): both parts receive id 0 -- confirm that this is intended.
        left.id = 0
        right.id = 0
        page.words[index] = Word(id=index, line_number=left.line_number, text=left.text+right.text, word_parts=[left, right])
        return page.words[index]

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Delete the last character of the selected words and return an exit code.

        For every processed word a correction history is created.

        :param action_dictionary: may contain 'words'.
        :return: 0 on success, 2 if no words were supplied, otherwise the last
            non-zero exit code reported while creating a correction history.
        """
        words = action_dictionary.get('words') or []
        if len(words) == 0:
            return 2
        exit_code = 0
        for word in words:
            new_word = self._process_word(page, word)
            cch = CreateCorrectionHistory()
            result = cch.run_change(page, { 'words': [ new_word ] })
            # Keep a failure visible even if a later word succeeds.
            if result != 0:
                exit_code = result
        # Nothing is written to disk while unit tests are running.
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            page = Page(page.page_tree.docinfo.URL)
        return exit_code
class AddBox(ResponseHandler):
    """Response handler that attaches a box with overwritten text to words."""

    def create_requirement_list(self) -> list:
        """Return the inputs this handler requires from the user."""
        box_text = { 'name': 'box_text', 'type': 'string', 'input': None }
        overwritten_by = { 'name': 'overwritten_by', 'type': 'string', 'input': None }
        is_earlier_version = { 'name': 'is_earlier_version', 'type': 'boolean', 'input': False }
        return [box_text, overwritten_by, is_earlier_version]

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Attach boxes to the selected words and return an exit code.

        :param action_dictionary: may contain 'words', 'box_text',
            'overwritten_by' and 'is_earlier_version'.
        :return: 0 if boxes were attached, 2 if words or box text are missing.
        """
        selected_words = action_dictionary.get('words') or []
        box_text = action_dictionary.get('box_text')
        earlier_version = action_dictionary.get('is_earlier_version')
        overwriting_text = action_dictionary.get('overwritten_by')
        if len(selected_words) == 0 or box_text is None:
            return 2
        for selected_word in selected_words:
            if overwriting_text is None:
                attach_box(selected_word, 0, box_text, False)
            else:
                split_into_parts_and_attach_box(selected_word, 0, box_text, earlier_version, overwriting_text)
            selected_word.create_correction_history()
            if len(selected_word.corrections) > 0:
                for part in selected_word.word_parts:
                    part.overwrites_word = None
        # Nothing is written to disk while unit tests are running.
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            page = Page(page.page_tree.docinfo.URL)
        return 0
class AddDeCapitalizeBox(AddBox):
    """Response handler that records a change of case of a word's first letter."""

    def create_requirement_list(self) -> list:
        """Return the inputs this handler requires: none besides the words."""
        return []

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Attach a (de)capitalize box to the selected words and return an exit code.

        For every word the first character is taken as the overwriting text and
        its case-swapped form as the overwritten text of the box. Each word is
        expected to have a non-empty text, since its first character is read.

        :param action_dictionary: may contain 'words'.
        :return: 0 if boxes were attached, 2 if no words were supplied.
        """
        words = action_dictionary.get('words') or []
        if len(words) == 0:
            return 2
        is_earlier_version = True
        for word in words:
            overwritten_by = word.text[0]
            # str.isupper also recognizes non-ASCII capitals such as umlauts,
            # which an ASCII-only [A-Z] test would never lower-case.
            missing_text = overwritten_by.lower()\
                    if overwritten_by.isupper()\
                    else overwritten_by.upper()
            split_into_parts_and_attach_box(word, 0, missing_text, is_earlier_version, overwritten_by)
            word.create_correction_history()
            if len(word.corrections) > 0:
                for wp in word.word_parts:
                    wp.overwrites_word = None
        # Nothing is written to disk while unit tests are running.
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            page = Page(page.page_tree.docinfo.URL)
        return 0
class ResponseOrganizer:
    """Organize the response handlers and translate between pages and json dictionaries."""

    RESULT = 'result'
    TIMESTAMP_NOT_SET = -1

    def __init__(self, manuscript=None):
        """Register all response handlers.

        :param manuscript: optional path of the manuscript xml file.
        """
        self.manuscript = manuscript
        self.do_not_send = []
        self.after_faksimile_merged = []
        self.join_faksimile_positions = False
        self.response_handler_dictionary = {}
        self._add_response_handler(SaveChanges(action_name='save changes', description='save change to line number/deletion status for word(s)'))
        self._add_response_handler(JoinWords(action_name='join words', description='join words'))
        self._add_response_handler(SplitWords(action_name='split words', description='split word according to split text'))
        self._add_response_handler(CreateCorrectionHistory(action_name='create correction history', description='creates a correction history for selected words'),\
                is_after_faksimile_merged=True)
        self._add_response_handler(DeleteCorrectionHistory(action_name='delete correction history', description='deletes the correction history of selected words'),\
                is_after_faksimile_merged=True)
        self._add_response_handler(DeleteLastChar(action_name='delete last char', description='deletes last character and creates a correction history for the selected words'),\
                is_after_faksimile_merged=True)
        self._add_response_handler(AddBox(action_name='add box', description='add box with overwritten text'),\
                is_after_faksimile_merged=True)
        self._add_response_handler(AddDeCapitalizeBox(action_name='(de)capitalize history', description='create (de)capitalize correction history'),\
                is_after_faksimile_merged=True)
        self._add_response_handler(UndeleteWords(action_name='undelete', description='mark as undeleted' ))
        self._add_response_handler(AddDeletionPath(action_name='add deletion paths', description='add new deletion paths to word' ),\
                is_after_faksimile_merged=True)
        self._add_response_handler(JoinDeletionPath(action_name='join deletion paths', description='join deletion paths of selected words' ),\
                is_after_faksimile_merged=True)
        self._add_response_handler(RemoveDeletionPath(action_name='remove deletion paths', description='remove deletion paths of selected words' ),\
                is_after_faksimile_merged=True)
        self._add_response_handler(RequestPathsNearWords(action_name='request paths near words', description='request paths near selected words' ),\
                is_after_faksimile_merged=True)
        self._add_response_handler(Reload(action_name='reload', description='reload page from file' ))
        self._add_response_handler(SavePositions(action_name='save positions', description='save new transkription position(s)' ))
        # NOTE(review): the description below equals the one of 'reload' and looks copy-pasted.
        self._add_response_handler(SetTaskDone(action_name='set task done', description='reload page from file' ), add_to_do_not_send=True)

    def _add_faksimile_image(self, page, faksimile_page):
        """Add the faksimile image of faksimile_page to page and write the page."""
        if faksimile_page.faksimile_image.text_field is None\
                and faksimile_page.text_field is not None:
            faksimile_page.faksimile_image.text_field = faksimile_page.text_field
        page.faksimile_image = faksimile_page.faksimile_image
        page.faksimile_image.attach_object_to_tree(page.page_tree)
        page.update_data_source(faksimile_svgFile=faksimile_page.svg_source_file)
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')

    def _add_response_handler(self, response_handler: ResponseHandler, add_to_do_not_send=False, is_after_faksimile_merged=False):
        """Register response_handler under its action name.

        :param add_to_do_not_send: if True, the handler is never listed in json dictionaries.
        :param is_after_faksimile_merged: if True, the handler is not listed
            while faksimile positions still have to be joined.
        """
        if add_to_do_not_send:
            self.do_not_send.append(response_handler)
        if is_after_faksimile_merged:
            self.after_faksimile_merged.append(response_handler)
        self.response_handler_dictionary.update({response_handler.action_name: response_handler})

    def _get_response_handlers(self) -> list:
        """Return the response handlers that may be listed in the current state."""
        return [ response_handler for response_handler in self.response_handler_dictionary.values()\
                if response_handler not in self.do_not_send\
                and (not self.join_faksimile_positions or response_handler not in self.after_faksimile_merged) ]

    def create_json_dict(self, xml_file: str, svg_file=None, last_operation_result=None) -> dict:
        """Return a json dict of the page in xml_file with information about actions.

        :param xml_file: path of the page xml file.
        :param svg_file: optional faksimile svg file; defaults to the one stored on the page.
        :param last_operation_result: optional message about the previous operation.
        :return: the page as json dict, extended by the key 'actions'.
        """
        with warnings.catch_warnings(record=True) as w:
            # The manuscript is optional: parse it only if it is available and
            # reuse the parsed tree for the list of pages below.
            manuscript_tree = None
            manuscript_title = None
            if self.manuscript is not None and isfile(self.manuscript):
                manuscript_tree = ET.parse(self.manuscript)
                manuscript_title = manuscript_tree.getroot().get('title')
            warnings.simplefilter("always")
            page = Page(xml_file, add_paths_near_words=True, warn=True)
            checker = CheckerHandler(page)
            todos = checker.get_todos()
            replace_ligatures(page)
            faksimile_page = None
            faksimile_source_file = None
            if svg_file is None and page.faksimile_svgFile is not None:
                svg_file = page.faksimile_svgFile
            if svg_file is not None:
                fps = FaksimilePage.get_faksimile_pages(svg_file, page_number=page.number)
                if len(fps) > 0:
                    faksimile_page = fps[0]
                    if page.faksimile_image is None:
                        # Without a manuscript title there is nothing to compare against.
                        if manuscript_title is not None\
                                and (manuscript_title != faksimile_page.title\
                                or page.number != faksimile_page.page_number):
                            change_id_of_textfield(svg_file, manuscript_title, page.number, faksimile_page.page_number)
                        add_faksimile_image(page, faksimile_page)
                        if not UNITTESTING:
                            print(f'writing to {page.page_tree.docinfo.URL}')
                            save_page(page, backup=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
                # NOTE(review): assumed to apply only when a faksimile svg file is known -- confirm.
                if not isfile(xml_file.replace('xml/', 'xml/merged/'))\
                        and len([ word for word in page.words if len(word.faksimile_positions) == 0 ]) > 0:
                    self.join_faksimile_positions = True
                    faksimile_source_file = svg_file
                    todos = []
            converter = JSONConverter(page, faksimile_page=faksimile_page)
            json_dict = converter.create_json_dict()
            pages = []
            if manuscript_tree is not None:
                pages = [ p.replace('./', '') for p in manuscript_tree.xpath('//page/@output') if isfile(p) ]
            action_dict = { 'target_file': xml_file,\
                    'faksimile_source_file': faksimile_source_file,\
                    'pages': pages,\
                    'date_stamp': os.path.getmtime(xml_file),\
                    'join_faksimile_positions': str(self.join_faksimile_positions).lower(),\
                    'tasks': todos }
            if last_operation_result is not None:
                action_dict.update({self.RESULT: last_operation_result })
            if len(w) > 0:
                # Report the most recent warning together with the last result.
                msg = str(w[-1].message)\
                        if last_operation_result is None\
                        else last_operation_result + '\n' + str(w[-1].message)
                action_dict.update({self.RESULT: msg })
            response_handlers = [ response_handler.create_json_dict()\
                    for response_handler in self._get_response_handlers() ]
            action_dict.update({ 'response_handlers': response_handlers })
            json_dict.update({ 'actions': action_dict})
            return json_dict

    def handle_response(self, json_dict: dict) -> dict:
        """Handle the response in json_dict and return a new json dict of the page.

        The operation is only executed if the target file was not modified
        after the time stamp contained in json_dict.
        """
        if not bool(json_dict.get('target_file')):
            return { 'actions': { self.RESULT: 'ERROR: there was no key "target_file" in json!' }}
        target_file = json_dict['target_file']
        svg_file = json_dict.get('faksimile_source_file') or None
        if not bool(json_dict.get('date_stamp')):
            return self.create_json_dict(target_file,\
                    last_operation_result='ERROR: there was no key "date_stamp" in json')
        if json_dict['date_stamp'] != self.TIMESTAMP_NOT_SET\
                and os.path.getmtime(target_file) > json_dict['date_stamp']:
            return self.create_json_dict(target_file,\
                    last_operation_result=f'FAIL: file {target_file} was changed between operations!')
        exit_code = 2
        operation = 'unknown'
        requested_handler = json_dict.get('response_handler')
        if bool(requested_handler)\
                and bool(self.response_handler_dictionary.get(requested_handler.get('action_name'))):
            operation = requested_handler['action_name']
            response_handler = self.response_handler_dictionary[operation]
            exit_code = response_handler.handle_response(Page(target_file), json_dict)
        message = f'Operation "{operation}" succeeded!' if exit_code == 0 else f'Operation "{operation}" failed'
        return self.create_json_dict(target_file, svg_file=svg_file, last_operation_result=message)
class InteractiveShell:
    """Interactive command line editor for the words of a page."""

    def __init__(self):
        """Register the response handlers; the first matching handler wins."""
        self.response_handlers = []
        self.response_handlers.append(SimpleJoinWords(dialog_string='specify ids of words to join [default]'))
        self.response_handlers.append(RestoreBackup(response_starts_with='b', dialog_string='b=restore backup'))
        self.response_handlers.append(CreateCorrectionHistory(response_starts_with='c', dialog_string='c=create correction history [+ ids]'))
        self.response_handlers.append(DeleteCorrectionHistory(response_starts_with='D', dialog_string='D=delete correction history [+ ids]'))
        self.response_handlers.append(ChangeDeletionStatus(response_starts_with='d', dialog_string='d=mark deleted [+ ids]'))
        self.response_handlers.append(SaveChanges(response_starts_with='i', dialog_string='i=fix ids' ))
        self.response_handlers.append(ChangeLine2Value(response_starts_with='l', dialog_string='l[:value]=change line to value for ids' ))
        self.response_handlers.append(Reload(response_starts_with='r', dialog_string='r=reload xml file'))
        self.response_handlers.append(SplitWords(response_starts_with='s', dialog_string='s=split and join word ("s splittext id")'))
        self.response_handlers.append(ChangeDeletionStatus(response_starts_with='u', dialog_string='u=undelete [+ ids]'))
        self.response_handlers.append(JoinWords(response_starts_with='w', dialog_string='w=join words with whitespace between them [+ ids]'))
        self.response_handlers.append(ResponseHandler())

    def _get_words_from_response(self, response, words) -> list:
        """Return the words whose ids are specified by response.

        Supported forms:
            "3 7 9"  -> the listed ids
            "3+"     -> id 3 and its successor
            "3-5"    -> the range 3..5; "5-3" yields the ids in descending order
            "123-45" -> abbreviated upper bound, i.e. the range 123..145

        :param response: the id specification.
        :param words: the words to select from; each word must have an id.
        :return: the matching words in the order of the specified ids.
        """
        if re.match(r'\d+-\d+', response)\
                or re.match(r'\d+\+', response):
            if response[-1] == '+':
                first_index = int(response[:response.index('+')])
                index_boundaries = [ first_index, first_index+1 ]
            else:
                boundary_texts = response.split('-')
                index_boundaries = [ int(i) for i in boundary_texts ]
                missing_digits = len(boundary_texts[0]) - len(boundary_texts[1])
                if missing_digits > 0:
                    # A shorter upper bound abbreviates its leading digits:
                    # complete it with the leading digits of the lower bound.
                    index_boundaries[1] = int(boundary_texts[0][:missing_digits] + boundary_texts[1])
            if index_boundaries[0] > index_boundaries[1]:
                # Descending range, both bounds included.
                indices = list(range(index_boundaries[0], index_boundaries[1]-1, -1))
            else:
                indices = list(range(index_boundaries[0], index_boundaries[1]+1))
        else:
            # Split on any run of whitespace so that repeated blanks are tolerated.
            indices = [ int(i) for i in response.split() ]
        return [ word for index in indices for word in words if word.id == index ]

    def run_interactive_editor(self, page) -> int:
        """Show the page, ask for a response and dispatch it to the first matching handler.

        :return: the exit code of the handler that handled the response.
        """
        replace_ligatures(page)
        HTMLConverter(page).convert()
        for response_handler in self.response_handlers:
            response_handler.print_dialog()
        response = input('>')
        for response_handler in self.response_handlers:
            if response_handler.match(response):
                return response_handler.handle_interactive_response(page, response, self)
def replace_ligatures(page):
    """Strip a trailing space from word texts and replace ligatures by plain letters.

    The typographic ligatures U+FB01 and U+FB02 are replaced by 'fi' and 'fl'.
    The page is saved only if at least one word text was changed.

    :param page: the page whose words are normalized in place.
    """
    words_with_spaces = [ word for word in page.words if word.text.endswith(' ') ]
    if len(words_with_spaces) > 0:
        for word in words_with_spaces:
            word.text = word.text[:-1]
        save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
    # Escapes keep the ligature characters intact even if this source file is
    # normalized; matching plain 'f', 'l' or 'i' would save nearly every page
    # without changing any text.
    ligature_replacements = { '\ufb01': 'fi', '\ufb02': 'fl' }
    words_with_ligatures = [ word for word in page.words\
            if any(ligature in word.text for ligature in ligature_replacements) ]
    if len(words_with_ligatures) > 0:
        for word in words_with_ligatures:
            for ligature, plain_text in ligature_replacements.items():
                word.text = word.text.replace(ligature, plain_text)
        save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
def dict_contains_keys(a_dict, key_list) -> bool:
    """Return whether a_dict contains the key path given by key_list.

    :param a_dict: the (nested) dictionary to inspect.
    :param key_list: the keys to follow, outermost key first.
    :return: True if every key of key_list can be followed in turn; an empty
        key_list yields True. False if a key is missing or if the path runs
        into a value that has no keys.
    """
    current = a_dict
    for key in key_list:
        # A non-mapping value in the middle of the path ends the search
        # instead of raising an AttributeError.
        if not hasattr(current, 'keys') or key not in current.keys():
            return False
        current = current[key]
    return True
def usage():
    """Print the command line help, i.e. the docstring of main."""
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to fix faksimile position ->set them to their absolute value.
    fixes/interactive_editor.py [OPTIONS]
    a xml file about a manuscript, containing information about its pages.
    a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help show help
    :return: exit code (int)
    """
    try:
        options, positional_args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2
    if any(flag in ('-h', '--help') for flag, _ in options):
        usage()
        return 0
    if len(positional_args) < 1:
        usage()
        return 2
    exit_status = 0
    xml_file = positional_args[0]
    if not isfile(xml_file):
        raise FileNotFoundError('File {} does not exist!'.format(xml_file))
    changed_pages = 0
    shell = InteractiveShell()
    for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK):
        if not UNITTESTING:
            print(Fore.CYAN + f'Processing {page.title}, {page.number} with interactive editor ...' + Style.RESET_ALL)
            back_up(page, page.xml_file)
        # A page counts as changed if the editor round ended with exit code 0.
        if shell.run_interactive_editor(page) == 0:
            changed_pages += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{changed_pages} pages changed by interactive shell]')
    return exit_status
# Run the editor only when this file is executed as a script.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Index: fixes/test_get_text_field.py
===================================================================
--- fixes/test_get_text_field.py (revision 0)
+++ fixes/test_get_text_field.py (revision 113)
@@ -0,0 +1,25 @@
+import lxml.etree as ET
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname, basename
+import shutil
+import sys
+import tempfile
+import unittest
+import warnings
+
+import get_text_field
+
+
+
+
class TestGETIMAGEINFO(unittest.TestCase):
    """Smoke test for get_text_field.main."""

    # Image on the original author's machine; the test is skipped wherever
    # this file is not available.
    IMAGE_FILE = '/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Mp_XVI/Mp-XVI-4,13.jpg'

    def setUp(self):
        """Set the paths of the test data files."""
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.xml_file = DATADIR + sep + 'N_VII_1_page138.xml'
        self.fix_transkription_positions = DATADIR + sep + 'Mp_XIV_page419a.xml'

    def test_main(self):
        """Run get_text_field.main on the image, if the image is available."""
        if not isfile(self.IMAGE_FILE):
            self.skipTest(f'image file {self.IMAGE_FILE} is not available on this machine')
        get_text_field.main([self.IMAGE_FILE, '1260.3199', '1339.36'])
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Index: py2ttl/convert.py
===================================================================
--- py2ttl/convert.py (revision 112)
+++ py2ttl/convert.py (revision 113)
@@ -1,131 +1,135 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert py objects to ontology and data in turtle format.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
from os import sep, path, listdir
from os.path import isfile, isdir, dirname, basename
from progress.bar import Bar
import re
import sys
sys.path.append('svgscripts')
from datatypes.archival_manuscript import ArchivalManuscriptUnity
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from class_spec import SemanticClass
from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL
from py2ttl_data import Py2TTLDataConverter
from py2ttl_ontology import Py2TTLOntologyConverter
sys.path.append('shared_util')
from myxmlwriter import xml2dict
from main_util import get_manuscript_files_and_include_status
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
FILE_TYPE_XML_PROJECT = "xmlProjectFile"
def usage():
    """Print the command line help, i.e. the docstring of main."""
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to convert py objects to a owl:Ontology and rdf data in turtle format.
    py2ttl/convert.py [OPTIONS] [ ...]
    xml file of type shared_util.myxmlwriter.FILE_TYPE_XML_MANUSCRIPT.
    OPTIONS:
    -h|--help: show help
    -c|--create-or-update-pages create or update pages as seperate ttl files in dir 'ttl'
    -i|--include-status=STATUS include pages with status = STATUS. STATUS is a ':' seperated string of status, e.g. 'OK:faksimile merged'.
    -I|--Include-files-only create include files only with suffix INCLUDE_DATA.ttl.
    :return: exit code (int)
    """
    check_config_files_exist()
    datatypes_dir = get_datatypes_dir()
    include_only = False
    create_or_update_pages = False
    containsAttr = 'status'
    source_ontology_file = PROJECT_ONTOLOGY_FILE
    target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME)
    manuscript_file = None
    page_status_list = [ 'OK', 'faksimile merged' ]
    try:
        opts, args = getopt.getopt(argv, "hci:I", ["help", "create-or-update-pages", "include-status=", "Include-files-only"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-i', '--include-status'):
            page_status_list = arg.split(':')
        elif opt in ('-c', '--create-or-update-pages'):
            create_or_update_pages = True
        # getopt reports long options with exactly two leading dashes.
        elif opt in ('-I', '--Include-files-only'):
            include_only = True
            containsAttr = 'include'
    if len(args) < 1:
        usage()
        return 2
    ontology_created = False
    ontology_converter = Py2TTLOntologyConverter(project_ontology_file=source_ontology_file)
    output = 2
    status = ':'.join(page_status_list)\
            if not include_only\
            else 'OK'
    for arg in get_manuscript_files_and_include_status(args, containsAttr, status):
        # Items are either a plain file name or a (file name, include status) pair.
        if isinstance(arg, str):
            manuscript_file, include_status = arg, None
        else:
            manuscript_file, include_status = arg[0], arg[1]
        if not isfile(manuscript_file):
            usage()
            return 2
        # The ontology is created once, before the first manuscript is converted.
        if not ontology_created:
            print(Fore.CYAN + 'Create ontology from "{}" ...'.format(manuscript_file))
            if ontology_converter.create_ontology(datatypes_dir, target_ontology_file) == 0:
                print(Fore.GREEN + '[Ontology file {0} created]'.format(target_ontology_file))
                ontology_created = True
            else:
                return 2
        current_page_status_list = page_status_list\
                if include_status is None\
                else include_status.split(':')
        print(Fore.CYAN + f'Create data from "{manuscript_file}" with status "{current_page_status_list}" ...')
        data_converter = Py2TTLDataConverter(manuscript_file, mapping_dictionary=ontology_converter.uri_mapping4cls_and_properties)
        output = data_converter.convert(page_status_list=current_page_status_list, create_or_update_changed_pages=create_or_update_pages)
    return output
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Index: py2ttl/data_handler.py
===================================================================
--- py2ttl/data_handler.py (revision 112)
+++ py2ttl/data_handler.py (revision 113)
@@ -1,196 +1,197 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to add data to a rdf graph.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD
from rdflib import RDF as ns_rdf
from os.path import isfile
import random
import warnings
from class_spec import SemanticClass
from config import DATA_URL
class RDFDataHandler:
"""
This class can be used to add data to a rdf graph.
"""
UNITTESTING = False
SIMPLE_DATA_TYPE_MAPPING = { int: XSD.integer, float: XSD.float, str: XSD.string, bool: XSD.boolean, list: RDF.List }
def __init__(self, target_file, mapping_dictionary):
self.target_file = target_file
self.mapping_dictionary = mapping_dictionary
self.ontology_graph = Graph()
self.data_graph = Graph()
self.data_identifier_mapping = {}
if bool(self.mapping_dictionary.get('ontology')):
self.project_name = self.mapping_dictionary['ontology'].get('project_name')
self.project_uri = URIRef(self.mapping_dictionary['ontology'].get('project_uri'))
ontology_file = self.mapping_dictionary['ontology'].get('ontology_file')
if bool(ontology_file) and isfile(ontology_file):
self.ontology_graph.parse(ontology_file, format="turtle")
self.ns = { uriref: ns for ns, uriref in self.data_graph.namespace_manager.namespaces() }
self.data_graph.bind(self.project_name, self.project_uri)
self.data_graph.bind('data', DATA_URL + '#')
else:
raise Exception('Error: mapping_dictionary does not contain key "ontology"!')
- def add_data(self, data_instance, identifier_prefix, parent_data_instance=None):
+ def add_data(self, data_instance, identifier_prefix, parent_data_instance=None, skip_data_instance_list=None):
"""Add a data rdf instance of data_instance to the data_graph.
:return: (rdflib.URIRef) subject_uri of data instance
"""
+ skip_data_instance_list = [] if skip_data_instance_list is None else skip_data_instance_list
identifier_uri = self.create_identifier_uri(data_instance, identifier_prefix)
- if bool(self.mapping_dictionary['classes'].get(type(data_instance).__name__)):
+ if type(data_instance).__name__ not in skip_data_instance_list and bool(self.mapping_dictionary['classes'].get(type(data_instance).__name__)):
class_uri = self.mapping_dictionary['classes'][type(data_instance).__name__]['class_uri']
self.data_identifier_mapping.update({data_instance: identifier_uri})
self.data_graph_add((identifier_uri, RDF.type, class_uri))
semantic_dict = data_instance.get_semantic_dictionary()
for key, content in semantic_dict['properties'].items():
if bool(self.mapping_dictionary['classes'][type(data_instance).__name__]['properties'].get(key)):
datatype = content.get('class')
cardinality = content.get('cardinality')\
if bool(content.get('cardinality')) else 0
if data_instance.__dict__.get(key) is not None\
and (type(data_instance.__dict__.get(key)) != int or data_instance.__dict__.get(key) != -1):
predicate_uri = self.mapping_dictionary['classes'][type(data_instance).__name__]['properties'][key]
child_data_instance = data_instance.__dict__.get(key)
new_identifier_prefix = identifier_uri[identifier_uri.index('#')+1:]
if datatype is list:
self.add_ordered_list(child_data_instance, identifier_uri, predicate_uri,\
- new_identifier_prefix, data_instance)
+ new_identifier_prefix, data_instance, skip_data_instance_list=skip_data_instance_list)
elif issubclass(datatype, SemanticClass):
if type(child_data_instance) is not list:
if type(child_data_instance) != datatype\
and not issubclass(type(child_data_instance), datatype):
child_id = child_data_instance
child_data_instance = parent_data_instance.get_object_from_list_with_id(datatype,\
child_id)
if child_data_instance is None:
print(key, content)# parent_data_instance.number, child_id, type(child_id), datatype)
msg = 'No child_data_instance found for data_instance {0}: looking for {1} with id {2}'.format(\
type(parent_data_instance), datatype, child_id)
raise Exception(msg)
else:
new_list_name = 'list_of_' + datatype.__name__ + 's'
if new_list_name in data_instance.__dict__.keys():
data_instance.__dict__[new_list_name].append(child_data_instance)
else:
data_instance.__dict__.update({ new_list_name: [ child_data_instance ]})
if child_data_instance not in self.data_identifier_mapping.keys():
child_identifier_uri = self.add_data(child_data_instance, new_identifier_prefix,\
- parent_data_instance=data_instance)
+ parent_data_instance=data_instance, skip_data_instance_list=skip_data_instance_list)
else:
child_identifier_uri = self.data_identifier_mapping[child_data_instance]
self.data_graph_add((identifier_uri, predicate_uri, child_identifier_uri))
else:
for child_item in child_data_instance:
if child_item not in self.data_identifier_mapping.keys():
child_identifier_uri = self.add_data(child_item, new_identifier_prefix,\
- parent_data_instance=data_instance)
+ parent_data_instance=data_instance, skip_data_instance_list=skip_data_instance_list)
else:
child_identifier_uri = self.data_identifier_mapping[child_item]
self.data_graph_add((identifier_uri, predicate_uri, child_identifier_uri))
else:
literal_datatype = RDFDataHandler.SIMPLE_DATA_TYPE_MAPPING[datatype]
ontology_datatypes = [ o for o in self.ontology_graph.objects(subject=predicate_uri, predicate=RDFS.range) ]
if len(ontology_datatypes) > 0:
literal_datatype = ontology_datatypes[0]
if type(child_data_instance) is list:
for child_item in child_data_instance:
object_literal = Literal(str(child_item), datatype=literal_datatype)
self.data_graph_add((identifier_uri, predicate_uri, object_literal))
else:
object_literal = Literal(str(child_data_instance), datatype=literal_datatype)
self.data_graph_add((identifier_uri, predicate_uri, object_literal))
else:
msg = 'Mapping dictionary for {0} does not contain a entry for {1}!'.format(type(data_instance).__name__, key)
raise Exception(msg)
- else:
+ elif type(data_instance).__name__ not in skip_data_instance_list:
msg = 'Mapping dictionary does not contain a entry for {}!'.format(type(data_instance).__name__)
raise Exception(msg)
return identifier_uri
- def add_ordered_list(self, data_instance_list, identifier_uri, predicate_uri, identifier_prefix, data_instance):
+ def add_ordered_list(self, data_instance_list, identifier_uri, predicate_uri, identifier_prefix, data_instance, skip_data_instance_list=None):
"""Add a data rdf instance of data_instance to the data_graph.
"""
if len(data_instance_list) > 0:
child_identifiers = []
for item in data_instance_list:
if item not in self.data_identifier_mapping.keys():
- child_identifiers.append(self.add_data(item, identifier_prefix, data_instance))
+ child_identifiers.append(self.add_data(item, identifier_prefix, data_instance, skip_data_instance_list=skip_data_instance_list))
else:
child_identifiers.append(self.data_identifier_mapping[item])
list_node = self.generate_RDF_collection(child_identifiers)
self.data_graph_add((identifier_uri, predicate_uri, list_node))
def create_identifier_uri(self, data_instance, identifier_prefix):
"""Return a data identifier uri.
:return: (rdflib.URIRef) subject_uri of data instance
"""
data_type, id = data_instance.get_name_and_id()
identifier_uri = URIRef(DATA_URL + '#' + identifier_prefix + '_' + data_type + str(id))
randombit_length = 5
while (identifier_uri, None, None) in self.data_graph:
identifier_uri = URIRef(DATA_URL + '#' + identifier_prefix + '_' + data_type + str(random.getrandbits(randombit_length)))
randombit_length += 1
return identifier_uri
def data_graph_add(self, rdf_triple):
    """Insert rdf_triple into self.data_graph.
    """
    # Debugging hook (disabled): print rdf_triple here unless RDFDataHandler.UNITTESTING is set.
    graph = self.data_graph
    graph.add(rdf_triple)
def generate_RDF_collection(self, vals ) -> BNode:
    """
    Write vals as a linked RDF list to the data graph and return the head of that list.

    One blank node is created per entry of vals. Each node gets a ns_rdf["first"] triple
    pointing to its entry and a ns_rdf["rest"] triple pointing to the next node; the last
    node points to ns_rdf["nil"]. If vals is empty, no triple is added and ns_rdf["nil"]
    itself is returned.

    @organization: U{World Wide Web Consortium}
    @author: U{Ivan Herman}
    @license: U{W3C® SOFTWARE NOTICE AND LICENSE}
    @param vals: array of RDF Resources
    @return: head of the List (an RDF Resource)
    """
    nodes = [BNode() for _ in vals]
    nodes.append(ns_rdf["nil"])
    # nodes has one more entry than vals, so zip pairs every list node with its successor
    for node, successor, member in zip(nodes, nodes[1:], vals):
        self.data_graph_add((node, ns_rdf["first"], member))
        self.data_graph_add((node, ns_rdf["rest"], successor))
    return nodes[0]
def write(self, output_format="turtle"):
    """Serialize self.data_graph and write the result to self.target_file.

    The graph is serialized before the target file is opened, so a failing
    serialization does not leave a truncated file behind. The file is written in
    binary mode and closed even if writing raises.

    :param output_format: value passed as format to the serialize method of the graph.
    """
    serialized_graph = self.data_graph.serialize(format=output_format)
    if isinstance(serialized_graph, str):
        # newer rdflib versions return str instead of bytes if no destination is given
        serialized_graph = serialized_graph.encode('utf-8')
    with open(self.target_file, 'wb') as target:
        target.write(serialized_graph)
Index: py2ttl/py2ttl_data.py
===================================================================
--- py2ttl/py2ttl_data.py (revision 112)
+++ py2ttl/py2ttl_data.py (revision 113)
@@ -1,146 +1,168 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert py objects to data in turtle format.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
from os import sep, path, listdir
-from os.path import isfile, isdir, dirname, basename
+from os.path import isfile, isdir, dirname, basename, getmtime
from progress.bar import Bar
import re
import sys
sys.path.append('svgscripts')
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.super_page import SuperPage
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from class_spec import SemanticClass
from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL
from data_handler import RDFDataHandler
sys.path.append('shared_util')
from myxmlwriter import xml2dict
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Py2TTLDataConverter:
"""This class can be used convert py objects to rdf data in turtle format.
"""
UNITTESTING = False
def __init__(self, manuscript_file, xml_dictionary_file=None, mapping_dictionary=None):
    """Store the manuscript file and set up the mapping dictionary.

    A mapping_dictionary that is passed in is used as is. Otherwise the dictionary is
    loaded from xml_dictionary_file with xml2dict; progress is printed unless
    Py2TTLDataConverter.UNITTESTING is set.

    :raises Exception: if neither xml_dictionary_file nor mapping_dictionary is given.
    """
    if mapping_dictionary is None and xml_dictionary_file is None:
        raise Exception('Error: Py2TTLDataConverter init expects either a xml_dictionary_file or a mapping_dictionary!')
    if mapping_dictionary is not None:
        self.mapping_dictionary = mapping_dictionary
    else:
        not Py2TTLDataConverter.UNITTESTING and print(Fore.CYAN + 'initializing mapping dictionary from file "{}" ...'.format(xml_dictionary_file))
        self.mapping_dictionary = xml2dict(xml_dictionary_file)
        not Py2TTLDataConverter.UNITTESTING and print(Fore.GREEN + '[{} classes added]'.format(str(len(self.mapping_dictionary['classes']))))
    self.manuscript_file = manuscript_file
- def convert(self, page_status_list=None):
+ def convert(self, page_status_list=None, create_or_update_changed_pages=False):
"""Convert manuscript instantiated with manuscript_file to rdf data and write to target_file.
"""
if page_status_list is None or len(page_status_list) < 1:
page_status_list = ['OK', SuperPage.STATUS_MERGED_OK]
not Py2TTLDataConverter.UNITTESTING and print(Fore.CYAN + 'initializing python objects with file "{}" ...'.format(self.manuscript_file))
manuscript = ArchivalManuscriptUnity.create_cls(self.manuscript_file, page_status_list=page_status_list, update_page_styles=True)
include_tag = '_INCLUDE'\
if 'OK' in page_status_list and len(page_status_list) == 1\
else ''
target_data_file = manuscript.title.replace(' ', '_') + include_tag + '_DATA.ttl'
+ skip_list = None\
+ if create_or_update_changed_pages is False\
+ else [ 'Page' ]
data_handler = RDFDataHandler(target_data_file, self.mapping_dictionary)
- if not Py2TTLDataConverter.UNITTESTING:
+ identifier_uri = data_handler.add_data(manuscript, '', skip_data_instance_list=skip_list)
+ if create_or_update_changed_pages:
+ counter = 0
+ for page in [ page for page in manuscript.pages ]:
+ if 'xml_file' not in page.__dict__.keys():
+ #TODO: change xml_file to @output in manuscrit_tree
+ page.xml_file = manuscript.manuscript_tree.docinfo.URL.replace('.xml', '_') + 'page' + page.number + '.xml'
+ target_page_file = page.xml_file.replace('xml', 'ttl')
+ if isfile(page.xml_file) and (not isfile(target_page_file) or getmtime(page.xml_file) > getmtime(target_page_file)):
+ counter += 1
+ page_data_handler = RDFDataHandler(target_page_file, self.mapping_dictionary)
+ page_data_handler.add_data(page, identifier_uri.split('#')[1], parent_data_instance=manuscript)
+ page_data_handler.write()
+ if not Py2TTLDataConverter.UNITTESTING:
+ print(Fore.GREEN + f'[{counter} pages created/updated]')
+ elif not Py2TTLDataConverter.UNITTESTING:
print(Fore.GREEN + '[{} pages added]'.format(str(len([ page for page in manuscript.pages if 'xml_file' in page.__dict__.keys()]))))
+ if not Py2TTLDataConverter.UNITTESTING:
print(Fore.CYAN + 'adding triples to rdf graph ... ')
- data_handler.add_data(manuscript, '')
if not Py2TTLDataConverter.UNITTESTING:
print(Fore.GREEN + '[{} statements added]'.format(str(len(data_handler.data_graph))))
print(Fore.CYAN + 'writing graph to file "{}" ...'.format(target_data_file))
data_handler.write()
if not Py2TTLDataConverter.UNITTESTING:
print(Fore.GREEN + '[OK]')
print(Style.RESET_ALL)
def usage():
    """Print the docstring of main, which describes how to call this script.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
"""This program can be used to convert py objects to rdf data in turtle format.
py2ttl/py2ttl_data.py [OPTIONS]
xml file of type shared_util.myxmlwriter.FILE_TYPE_XML_MANUSCRIPT.
OPTIONS:
-h|--help: show help
+ -c|--create-or-update-pages create or update pages as seperate ttl files in dir 'ttl'
-i|--include-status=STATUS include pages with status = STATUS. STATUS is a ':' seperated string of status, e.g. 'OK:faksimile merged'.
-m|--mapping=mapping_dict.xml xml file generated by py2ttl/py2ttl.py containing mapping information for each property of a class.
:return: exit code (int)
"""
check_config_files_exist()
datatypes_dir = get_datatypes_dir()
target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME)
xml_dictionary_file = 'mapping_file4' + datatypes_dir.replace(sep, '.') + '2' + target_ontology_file.replace('.' + sep, '').replace(sep, '.').replace('.ttl', '.xml')
+ create_or_update_pages = False
manuscript_file = None
page_status_list = None
try:
- opts, args = getopt.getopt(argv, "hi:m:", ["help", "include-status=", "mapping="])
+ opts, args = getopt.getopt(argv, "hci:Im:", ["help", "create-or-update-pages", "include-status=", "Include-files-only", "mapping="])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
elif opt in ('-i', '--include-status'):
page_status_list = arg.split(':')
+ elif opt in ('-c', '--create-or-update-pages'):
+ create_or_update_pages = True
elif opt in ('-m', '--mapping'):
xml_dictionary_file = arg
if len(args) < 1 :
usage()
return 2
manuscript_file = args[0]
if not isfile(xml_dictionary_file) or not isfile(manuscript_file):
usage()
return 2
converter = Py2TTLDataConverter(manuscript_file, xml_dictionary_file=xml_dictionary_file)
- converter.convert(page_status_list=page_status_list)
+ converter.convert(page_status_list=page_status_list, create_or_update_changed_pages=create_or_update_pages)
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: tests_py2ttl/test_data/mapping_dict.xml
===================================================================
--- tests_py2ttl/test_data/mapping_dict.xml (revision 112)
+++ tests_py2ttl/test_data/mapping_dict.xml (revision 113)
@@ -1,408 +1,417 @@
tln
http://www.nie.org/ontology/nietzsche#
./tln-ontology_autogenerated.ttl
http://www.nie.org/ontology/nietzsche#ManuscriptUnity
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasManuscriptType
http://www.nie.org/ontology/nietzsche#hasPages
http://www.nie.org/ontology/nietzsche#hasDescription
http://www.nie.org/ontology/nietzsche#ArchivalManuscriptUnity
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasManuscriptType
http://www.nie.org/ontology/nietzsche#hasPages
http://www.nie.org/ontology/nietzsche#hasStyles
http://www.nie.org/ontology/nietzsche#hasGsaSignature
http://www.nie.org/ontology/nietzsche#hasDescription
http://www.nie.org/ontology/nietzsche#partsBelongToReconstructedKonvolut
http://www.nie.org/ontology/nietzsche#hasEarlierDescriptions
http://www.nie.org/ontology/nietzsche#EditorComment
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
+ http://www.nie.org/ontology/nietzsche#commentHasText
http://www.nie.org/ontology/nietzsche#AtypicalWriting
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
- http://www.nie.org/ontology/nietzsche#atypicalWritingHasText
+ http://www.nie.org/ontology/nietzsche#commentHasText
http://www.nie.org/ontology/nietzsche#Path
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#Box
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#hasEarlierText
http://www.nie.org/ontology/nietzsche#Clarification
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
- http://www.nie.org/ontology/nietzsche#clarificationHasText
+ http://www.nie.org/ontology/nietzsche#commentHasText
http://www.nie.org/ontology/nietzsche#Color
http://www.nie.org/ontology/nietzsche#colorHasName
http://www.nie.org/ontology/nietzsche#hasHexadecimalValue
http://www.nie.org/ontology/nietzsche#Text
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#Description
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#EarlierDescription
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#hasAuthor
http://www.nie.org/ontology/nietzsche#hasCitation
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#EditorCorrection
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#hasCorrectionText
+ http://www.nie.org/ontology/nietzsche#commentHasText
http://www.nie.org/ontology/nietzsche#Image
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#FaksimileImage
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasApiurl
http://www.nie.org/ontology/nietzsche#hasThumburl
http://www.nie.org/ontology/nietzsche#hasMediumurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#PositionalObject
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#WordPosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#FaksimilePosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#Imprint
http://www.nie.org/ontology/nietzsche#imprintHasReference
http://www.nie.org/ontology/nietzsche#imprintRefersToLines
http://www.nie.org/ontology/nietzsche#Line
http://www.nie.org/ontology/nietzsche#lineHasNumber
http://www.nie.org/ontology/nietzsche#lineHasBottomValueOnTranskription
http://www.nie.org/ontology/nietzsche#lineHasTopValueOnTranskription
http://www.nie.org/ontology/nietzsche#lineHasInnerBottomValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasInnerTopValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasOuterBottomValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasOuterTopValueOnFaksimile
http://www.nie.org/ontology/nietzsche#isMainLine
http://www.nie.org/ontology/nietzsche#lineHasEditorComment
http://www.nie.org/ontology/nietzsche#LineContinuation
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#isLineAContinuationTo
+ http://www.nie.org/ontology/nietzsche#commentHasText
http://www.nie.org/ontology/nietzsche#lineContinuationHasReference
http://www.nie.org/ontology/nietzsche#SimpleWord
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#SpecialWord
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#MarkForeignHands
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#penOfForeignHands
http://www.nie.org/ontology/nietzsche#resolutionOfAbbreviation
+ http://www.nie.org/ontology/nietzsche#foreignHandHasCommentByEditors
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#textOfForeignHands
http://www.nie.org/ontology/nietzsche#Page
http://www.nie.org/ontology/nietzsche#hasNumber
http://www.nie.org/ontology/nietzsche#hasOrientation
- http://www.nie.org/ontology/nietzsche#hasImprints
http://www.nie.org/ontology/nietzsche#hasLines
+ http://www.nie.org/ontology/nietzsche#hasImprints
http://www.nie.org/ontology/nietzsche#hasMarkForeignHands
http://www.nie.org/ontology/nietzsche#hasWords
http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
+ http://www.nie.org/ontology/nietzsche#hasEditorComments
http://www.nie.org/ontology/nietzsche#hasFaksimileImage
http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField
http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField
http://www.nie.org/ontology/nietzsche#hasSvgImage
http://www.nie.org/ontology/nietzsche#NonExistentPage
http://www.nie.org/ontology/nietzsche#hasNumber
http://www.nie.org/ontology/nietzsche#hasOrientation
- http://www.nie.org/ontology/nietzsche#hasImprints
http://www.nie.org/ontology/nietzsche#hasLines
+ http://www.nie.org/ontology/nietzsche#hasImprints
http://www.nie.org/ontology/nietzsche#hasMarkForeignHands
http://www.nie.org/ontology/nietzsche#hasWords
http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
+ http://www.nie.org/ontology/nietzsche#hasEditorComments
http://www.nie.org/ontology/nietzsche#hasStatus
http://www.nie.org/ontology/nietzsche#hasFaksimileImage
http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField
http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField
http://www.nie.org/ontology/nietzsche#hasSvgImage
http://www.nie.org/ontology/nietzsche#ReconstructedKonvolut
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasManuscriptType
http://www.nie.org/ontology/nietzsche#hasPages
http://www.nie.org/ontology/nietzsche#hasDescription
http://www.nie.org/ontology/nietzsche#Reference
http://www.nie.org/ontology/nietzsche#firstLineOfReference
http://www.nie.org/ontology/nietzsche#lastLineOfReference
http://www.nie.org/ontology/nietzsche#wordReference
http://www.nie.org/ontology/nietzsche#IsUncertain
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasPageNumber
http://www.nie.org/ontology/nietzsche#SVGImage
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#StandoffTag
http://www.nie.org/ontology/nietzsche#standoffTagHasStartIndex
http://www.nie.org/ontology/nietzsche#standoffTagHasEndIndex
http://www.nie.org/ontology/nietzsche#standoffTagHasCSS
http://www.nie.org/ontology/nietzsche#TextConnectionMark
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#textConnectionMarkHasTextSource
http://www.nie.org/ontology/nietzsche#TextField
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#TranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#UncertainDecipherment
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
+ http://www.nie.org/ontology/nietzsche#commentHasText
http://www.nie.org/ontology/nietzsche#Word
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#hasEditedText
http://www.nie.org/ontology/nietzsche#hasCleanEditedText
http://www.nie.org/ontology/nietzsche#wordHasWordParts
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#wordHasStyle
http://www.nie.org/ontology/nietzsche#overwritesWord
http://www.nie.org/ontology/nietzsche#isTransformationOfWord
http://www.nie.org/ontology/nietzsche#isExtensionOfWord
http://www.nie.org/ontology/nietzsche#isDeletionOfWord
http://www.nie.org/ontology/nietzsche#isClarificationOfWord
http://www.nie.org/ontology/nietzsche#wordHasEarlierVersion
http://www.nie.org/ontology/nietzsche#wordHasCorrection
+ http://www.nie.org/ontology/nietzsche#wordIsUndeletedFromPath
http://www.nie.org/ontology/nietzsche#wordIsDeletedByPath
- http://www.nie.org/ontology/nietzsche#wordHasEditorComment
+ http://www.nie.org/ontology/nietzsche#wordHasEditorComment
+ http://www.nie.org/ontology/nietzsche#hasStandoffMarkup4PartThatOverwritesWord
http://www.nie.org/ontology/nietzsche#WordDeletionPath
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#WordInsertionMark
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasMarkType
http://www.nie.org/ontology/nietzsche#hasSymbolId
http://www.nie.org/ontology/nietzsche#hasNextWord
http://www.nie.org/ontology/nietzsche#hasPreviousWord
http://www.nie.org/ontology/nietzsche#wordInsertionMarkBelongsToLine
xml-dictionary
- 2021-08-23 09:52:15
+ 2021-09-30 15:34:35
Index: tests_py2ttl/test_data_handler.py
===================================================================
--- tests_py2ttl/test_data_handler.py (revision 112)
+++ tests_py2ttl/test_data_handler.py (revision 113)
@@ -1,51 +1,55 @@
import unittest
from os import sep, path
from os.path import dirname, isfile
import inspect
from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD
import rdflib
import sys
sys.path.append('shared_util')
from myxmlwriter import xml2dict
sys.path.append('svgscripts')
from datatypes.image import Image
from datatypes.page import Page
+from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.text_field import TextField
sys.path.append('py2ttl')
from data_handler import RDFDataHandler
class TestRDFDataHandler(unittest.TestCase):
def setUp(self):
RDFDataHandler.UNITTESTING = True
DATADIR = dirname(__file__) + sep + 'test_data'
self.mapping_dictionary = xml2dict(DATADIR + sep + 'mapping_dict.xml')
self.xml_file = DATADIR + sep + 'N_VII_1_page001.xml'
+ self.manuscript_file = DATADIR + sep + 'N_VII_1.xml'
def test_add_data(self):
data_handler = RDFDataHandler('test.ttl', self.mapping_dictionary)
page = Page(self.xml_file)
data_handler.add_data(page, page.title.replace(' ', '_'))
+ manuscript = ArchivalManuscriptUnity.create_cls(self.manuscript_file)
+ data_handler.add_data(page, '', skip_data_instance_list=['Page'])
#print(data_handler.data_graph.serialize(format="turtle"))
#data_handler.write()
def test_init(self):
    """RDFDataHandler raises without a target file and an empty dictionary; otherwise it exposes the project name."""
    self.assertRaises(Exception, RDFDataHandler, None, {})
    ontology_info = { 'project_name': 'test', 'project_uri': 'test' }
    data_handler = RDFDataHandler('test.ttl', { 'ontology': ontology_info })
    self.assertEqual(data_handler.project_name, 'test')
def test_create_identifier_uri(self):
    """A second identifier created for the same instance and prefix must differ once the first one is used in the graph."""
    tf = TextField()
    mapping_dictionary = { 'ontology': { 'project_name': 'test', 'project_uri': 'test' }}
    data_handler = RDFDataHandler('test.ttl', mapping_dictionary)
    identifier_uri = data_handler.create_identifier_uri(tf, 'asdf')
    # register the first uri as subject so that the next call has to avoid it
    data_handler.data_graph.add((identifier_uri, RDF.type, OWL.Class))
    next_identifier_uri = data_handler.create_identifier_uri(tf, 'asdf')
    # assertNotEqual reports both uris on failure instead of "False != True"
    self.assertNotEqual(identifier_uri, next_identifier_uri)
if __name__ == "__main__":
unittest.main()
Index: tests_py2ttl/test_py2ttl_data.py
===================================================================
--- tests_py2ttl/test_py2ttl_data.py (revision 112)
+++ tests_py2ttl/test_py2ttl_data.py (revision 113)
@@ -1,38 +1,43 @@
import unittest
import lxml.etree as ET
from os import sep, path, remove
from os.path import isfile, dirname
from rdflib import Graph, URIRef, Literal
import sys
sys.path.append('py2ttl')
import py2ttl_data
from py2ttl_data import Py2TTLDataConverter
from config import PROJECT_NAME, PROJECT_ONTOLOGY_FILE
from knora_base import KNORA_BASE
if dirname(dirname(__file__)) not in sys.path:
sys.path.append(dirname(dirname(__file__)))
from svgscripts.datatypes.word import Word
from svgscripts.datatypes.word_position import WordPosition
class TestPy2TTL(unittest.TestCase):
"""This is the unittest for py2ttl.py2ttl.
@label unittest
"""
def setUp(self):
DATADIR = dirname(__file__) + sep + 'test_data'
self.manuscript_file = DATADIR + sep + 'N_VII_1.xml'
self.ttl_target = __file__ + 'test.ttl'
+ self.dictionary_file = DATADIR + sep + 'mapping_dict.xml'
def test_main(self):
    """main returns exit code 0 when called with the manuscript file only."""
    Py2TTLDataConverter.UNITTESTING = True
    exit_code = py2ttl_data.main([ self.manuscript_file ])
    self.assertEqual(exit_code, 0)
+ def test_convert(self):
+ converter = Py2TTLDataConverter(self.manuscript_file, xml_dictionary_file=self.dictionary_file)
+ converter.convert(create_or_update_changed_pages=True)
+
def tearDown(self):
    """Remove the ttl target file if a test created it."""
    if isfile(self.ttl_target):
        remove(self.ttl_target)
if __name__ == "__main__":
unittest.main()