Index: svgscripts/process_footnotes.py
===================================================================
--- svgscripts/process_footnotes.py (revision 109)
+++ svgscripts/process_footnotes.py (revision 110)
@@ -1,282 +1,294 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path as PathlibPath
from progress.bar import Bar
+import inspect
import re
import shutil
import sys
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.atypical_writing import AtypicalWriting
from datatypes.clarification import Clarification
from datatypes.editor_comment import EditorComment
from datatypes.editor_correction import EditorCorrection
from datatypes.footnotes import extract_footnotes
+from datatypes.imprint import extract_imprints
from datatypes.line_continuation import LineContinuation
from datatypes.standoff_tag import StandoffTag
from datatypes.text import Text
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.uncertain_decipherment import UncertainDecipherment
from util import back_up
from process_files import update_svgposfile_status
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
# Regexes that classify editor footnotes. A typical footnote looks like
# "<line ref>: <word>] <marker> <comment>"; the marker after the closing
# bracket selects the editor-comment type (see _process_word_match below).
ATYPICAL_GROUP = re.compile(r'(.*:.*]\s*)(¿)(.*)')             # "¿"  -> AtypicalWriting
CLARIFICATION_GROUP = re.compile(r'(.*:.*]\s*)(Vk)(.*)')       # "Vk" -> Clarification
CONTINUATION_GROUP = re.compile(r'(.*:\s*)(Fortsetzung\s*)')   # "Fortsetzung" -> LineContinuation
COMMENT_GROUP = re.compile(r'(.*:.*])')                        # generic comment after "]"
EDITOR_CORRECTION_GROUP = re.compile(r'(.*:.*]\s*)(>[?]*)(.*)')  # ">"  -> EditorCorrection
# Line reference: "m-n:" (range), "a/b/.../n:" (list) or plain "n:".
LINE_REFERENCE_GROUP = re.compile(r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)')
LINE_REFERENCE_GROUP_START_INDEX = 1
LINE_REFERENCE_GROUP_MID_INDEX = 2
LINE_REFERENCE_GROUP_END_INDEX = 3
LINE_COMMENT_GROUP = re.compile(r'(.*\d+:)')                   # comment addressing a whole line
UNCERTAINTY_WORD_GROUP = re.compile(r'(.*:.*]\s*)([>]*\?)(.*)')  # "?" -> UncertainDecipherment
UNCERTAINTY_EDITOR_GROUP = re.compile(r'(.*)(\?)')               # trailing "?" marks uncertainty
WORD_REFERENCE_GROUP = re.compile(r'(.*[0-9]+:\s*)(.*)(].*)')    # "<line>: <word>]"
DEBUG = False
def categorize_footnotes(page, footnotes=None, debug=False, skip_after=-1.0, find_content=False):
    """Categorize the footnotes of a page and attach the results to its XML tree.

    :param page: the Page whose footnotes should be processed.
    :param footnotes: footnotes to process; extracted from the page if None.
    :param debug: turn the module-wide DEBUG flag on while processing.
    :param skip_after: forwarded to extract_footnotes.
    :param find_content: also resolve the content of text connection marks.
    """
    # BUG FIX: the original did "DEBUG = debug" without a global declaration,
    # creating a local that silently shadowed the module flag (a no-op).
    global DEBUG
    DEBUG = debug
    if footnotes is None:
        footnotes = extract_footnotes(page, skip_after=skip_after)
    for footnote in footnotes:
        line_match = re.match(LINE_REFERENCE_GROUP, footnote.content)
        if line_match is not None:
            _process_line_match(page, footnote, line_match)
        else:
            warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>')
    if find_content and len(page.text_connection_marks) > 0:
        TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes)
    page.update_and_attach_words2tree()
    for line in page.lines:
        line.attach_object_to_tree(page.page_tree)
    DEBUG = False
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
                script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def save_imprints(page):
    """Extract the imprints of a page and attach them to the page's XML tree.

    Writes the updated tree back to the page's XML file unless running
    under unit tests.
    """
    for imprint in extract_imprints(page):
        imprint.attach_object_to_tree(page.page_tree)
    if not UNITTESTING:
        # Record "<this file>:<name of the calling function>" as the script name
        # in the output file header for provenance.
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
                script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}', file_type=FILE_TYPE_SVG_WORD_POSITION)
+
def _is_uncertain(footnote) -> bool:
    """Return True if the footnote carries an uncertainty sign ("?") that
    falls inside an italic standoff markup range.
    """
    uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content)
    if uncertain_match is None:
        return False
    match_end = uncertain_match.end()
    for markup in footnote.standoff_markups:
        if markup.css_string.endswith('italic;')\
                and markup.startIndex <= match_end <= markup.endIndex:
            return True
    return False
def _process_line_match(page, footnote, line_match):
    """Process a footnote whose content starts with a line reference.

    Resolves the referenced line(s) and dispatches to word- or line-level
    processing.
    """
    word_match = re.match(WORD_REFERENCE_GROUP, footnote.content)
    end_line_number = int(line_match.group(LINE_REFERENCE_GROUP_END_INDEX))
    lines = []
    if line_match.group(LINE_REFERENCE_GROUP_START_INDEX) is not None:
        if line_match.group(LINE_REFERENCE_GROUP_MID_INDEX) is not None:
            # "a/b/.../n:" style reference: an explicit list of line ids.
            line_ids = [ int(line_id) for line_id in\
                    line_match.group(LINE_REFERENCE_GROUP_START_INDEX).split('/')\
                    if line_id != '' ] + [ end_line_number ]
            lines = [ line for line in page.lines if line.id in line_ids ]
        else:
            # "m-n:" style reference: an inclusive range of lines.
            start_line_number = int(line_match.group(1)[0:-1])
            lines = [ line for line in page.lines if start_line_number <= line.id <= end_line_number ]
    else:
        # Plain "n:" reference.
        lines = [ line for line in page.lines if line.id == end_line_number ]
    if word_match is not None:
        _process_word_match(page.words, footnote, line_match, word_match.group(2), end_line_number)
    elif len(lines) > 0:
        for line in lines:
            _process_line_reference(page, footnote, line, _is_uncertain(footnote))
    else:
        # BUG FIX: the original referenced the undefined name "line_number"
        # here, raising a NameError whenever this warning should fire.
        warnings.warn(f'Footnote refers to missing line {end_line_number}: {footnote}')
def _process_line_reference(page, footnote, line, is_uncertain):
    """Attach the editor comment encoded in a line-referencing footnote to line."""
    continuation_match = re.match(CONTINUATION_GROUP, footnote.content)
    if continuation_match is not None:
        # "Fortsetzung" footnote: the rest of the content is the reference.
        reference_string = footnote.content[continuation_match.end():]
        if is_uncertain:
            reference_string = reference_string[:-1]
        line.editor_comments.append(LineContinuation.create_cls(reference_string=reference_string, is_uncertain=is_uncertain))
        return
    comment_match = re.match(LINE_COMMENT_GROUP, footnote.content)
    if comment_match is None:
        warnings.warn(f'Unknown editor comment for line "{line.id}": <{footnote}>')
        return
    # Generic line comment; drop a trailing uncertainty sign if present.
    is_uncertain = _is_uncertain(footnote)
    tail = footnote.content[comment_match.end():]
    comment = tail[:-1].strip() if is_uncertain else tail.strip()
    line.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain))
def _process_word_match(words, footnote, line_match, word_text, line_number, parent_word_composition=None):
    """Process a footnote that references a word.

    Find the word word_text on line line_number among words (also searching
    word parts and overwritten words) and attach the editor comment encoded
    by the footnote's marker to it.
    """
    # Words on the referenced line whose text (or edited text) matches.
    referred_words = [ word for word in words\
            if word.line_number == line_number\
            and (word.text == word_text\
                 or re.match(rf'\W*{word_text}\W', word.text)\
                 or word.edited_text == word_text) ]
    # Word-part lists of words that contain word_text as one of their parts.
    referred_word_parts = [ word.word_parts for word in words\
            if word.line_number == line_number\
            and len(word.word_parts) > 0\
            and word_text in [ wp.text for wp in word.word_parts ] ]
    # Words with a part that overwrites a word whose text is word_text.
    overwritten_word_matches = [ word for word in words\
            if word.line_number == line_number\
            and len(word.word_parts) > 0\
            and len([word_part for word_part in word.word_parts\
                     if word_part.overwrites_word is not None\
                     and word_part.overwrites_word.text == word_text]) > 0]
    if len(referred_words) > 0\
       or len(overwritten_word_matches) > 0\
       or len(referred_word_parts) > 0:
        word = None
        if len(referred_words) == 1:
            word = referred_words[0]
        elif len(overwritten_word_matches) > 0:
            word = [ word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts\
                     if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text][0]
        elif len(referred_word_parts) > 0:
            word = [ word_part for word_part in referred_word_parts[0] if word_part.text == word_text ][0]
        else:
            # NOTE(review): reached when several referred_words matched; if none
            # of them equals word_text exactly (only regex/edited_text matches),
            # this [0] raises IndexError — confirm whether that can occur.
            word = [ better_word for better_word in referred_words if better_word.text == word_text][0]
        atypical_match = re.match(ATYPICAL_GROUP, footnote.content)
        correction_match = re.match(EDITOR_CORRECTION_GROUP, footnote.content)
        clarification_match = re.match(CLARIFICATION_GROUP, footnote.content)
        is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, footnote.content) is not None
        if correction_match is not None:
            # ">" marker: editor correction; applied to edited_text unless uncertain.
            correction = correction_match.group(3).strip()
            word.editor_comment = EditorCorrection(correction_text=correction, is_uncertain=is_uncertain)
            if not is_uncertain:
                word.edited_text = correction
        elif clarification_match is not None:
            # "Vk" marker: clarification.
            word.editor_comment = Clarification(text=footnote.extract_part(word_text, css_filter='bold;'))
        elif atypical_match is not None:
            # "¿" marker: atypical writing; bold part of the footnote is its text.
            text = footnote.extract_part(word_text, css_filter='bold;')\
                    if footnote.markup_contains_css_filter('bold;')\
                    else None
            word.editor_comment = AtypicalWriting(text=text)
        elif is_uncertain:
            # "?" marker: uncertain decipherment.
            word.editor_comment = UncertainDecipherment()
        else:
            # Fallback: a generic editor comment after the closing bracket.
            comment_match = re.match(COMMENT_GROUP, footnote.content)
            if comment_match is not None:
                is_uncertain = _is_uncertain(footnote)
                comment = footnote.content[comment_match.end():-1].strip()\
                        if is_uncertain\
                        else footnote.content[comment_match.end():].strip()
                word.editor_comment = EditorComment(comment=comment, is_uncertain=is_uncertain)
            else:
                warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>')
    elif re.match(r'.*\s.*', word_text):
        # Multi-word reference: recurse on each whitespace-separated part.
        for word_part in word_text.split(' '):
            _process_word_match(words, footnote, line_match, word_part, line_number, parent_word_composition=word_text)
    elif len([word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]) > 0:
        # Retry against the parts of words without a line number.
        new_words = []
        for word in [word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]:
            new_words += word.word_parts
        _process_word_match(new_words, footnote, line_match, word_text, line_number)
    else:
        warnings.warn(f'No word found with text "{word_text}" on line {line_number}: <{footnote}>')
def usage():
    """Print usage information for this script (main's docstring)."""
    print(main.__doc__)
def main(argv):
    """This program can be used to process the footnotes of a page.
    svgscripts/process_footnotes.py [OPTIONS] <file>

        <file>  a xml file about a manuscript, containing information about its pages,
                or a xml file about a page, containing information about svg word positions.

    OPTIONS:
        -h|--help            show help
        -s|--skip-until=left skip all nodes.get('X') < left

    :return: exit code (int)
    """
    skip_after = -1.0
    try:
        opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until=" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-s', '--skip-until'):
            skip_after = float(arg)
    if len(args) < 1:
        usage()
        return 2
    file_a = args[0]
    # Guard clause instead of a trailing else; message unchanged.
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # (The original computed an unused local "manuscript_file" here; removed.)
    exit_status = 0
    counter = 0
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
        if not UNITTESTING:
            print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
            back_up(page, page.xml_file)
        categorize_footnotes(page, skip_after=skip_after, find_content=True)
        save_imprints(page)
        counter += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages processed]')
    return exit_status
if __name__ == "__main__":
    # Script entry point: forward CLI arguments and exit with main's return code.
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py (revision 109)
+++ svgscripts/datatypes/word.py (revision 110)
@@ -1,907 +1,913 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
import inspect
from lxml import etree as ET
from operator import attrgetter
import re
import string
import sys
import warnings
from .box import Box
from .editor_comment import EditorComment
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .style import Style
from .word_deletion_path import WordDeletionPath
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)
def execute_function_on_parts(word_parts, func_name):
    """Execute the no-argument method func_name on each word in word_parts and
    replace each word that was split into parts by its parts.

    :param word_parts: list of words to process (not mutated; a copy is returned).
    :param func_name: name of the method to call on each word.
    :return: (new word_parts, output of the last call — None for empty input)
    """
    copy_parts = word_parts[:]
    # Robustness: the original left "output" undefined for empty input,
    # raising NameError at the return below.
    output = None
    for word in word_parts:
        # getattr instead of eval: same dynamic dispatch without executing
        # arbitrary code built from a string.
        output = getattr(word, func_name)()
        if len(word.word_parts) > 0:
            # Splice the parts in at the word's position and drop the word.
            for part_word in word.word_parts:
                copy_parts.insert(copy_parts.index(word), part_word)
            copy_parts.remove(word)
            word.word_parts = []
    return copy_parts, output
def update_transkription_position_ids(word):
    """Renumber a word's transkription positions by horizontal order.

    Word-part ids are renumbered first if duplicates exist; every
    transkription position is then re-indexed left-to-right and its
    box/deleted state reset.
    """
    part_ids = [part.id for part in word.word_parts]
    if len(set(part_ids)) != len(part_ids):
        for new_id, part in enumerate(word.word_parts):
            part.id = new_id
    ordered_positions = sorted(word.transkription_positions, key=lambda tp: tp.left)
    for new_index, position in enumerate(ordered_positions):
        position.id = new_index
        position.has_box = None
        position.deleted = False
class Word(SimpleWord):
"""
This class represents a word.
"""
COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ]
APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' }
DATA = 'debug-data'
- RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText']
+ RDFS_SUBCLASSOF_LIST = ['https://www.e-editiones.ch/ontology/text#HandwrittenText']
XML_TAG = 'word'
XML_EARLIER_VERSION = 'earlier-version'
XML_OVERWRITES = 'overwrites'
XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\
'isDeletionOfWord': 'deletesEarlierPart',\
'isExtensionOfWord': 'extendsEarlierVersion',\
'isTransformationOfWord': 'transformsEarlierPart' }
def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None):
    """Initialize a Word with its text, layout positions and correction state.

    :param id: word id within its page.
    :param text: the transcribed text of the word.
    :param line_number: number of the line the word belongs to (-1 = unknown).
    :param deleted: whether the word is marked as deleted.
    :param transkription_positions: positions of the word on the transcription.
    :param faksimile_positions: positions of the word on the faksimile.
    :param word_part_objs: raw svg part dictionaries the word was built from.
    :param word_parts: sub-words this word is composed of.
    :param writing_process_id: id of the writing process that produced the word.
    :param earlier_version: an earlier Word version of this word.
    :param box_paths: not stored here; kept for signature compatibility.
    :param styles: styles characterizing the word's appearance.
    """
    super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
            faksimile_positions=faksimile_positions)
    self.corrections = []                  # word parts that correct this word
    self.clean_edited_text = None          # edited_text without punctuation (set by create_cls)
    self.deleted = deleted
    self.deletion_paths = []               # paths by which this word was deleted
    self.deletion_paths_near_word = []
    self.debug_container = {}
    self.debug_msg = None
    self.earlier_version = earlier_version
    self.edited_text = None                # text edited automatically (deleted parts/hyphens removed)
    self.editor_comment = None
    # Correction relations; restored from XML attributes by create_cls.
    self.isClarificationOfWord = None
    self.isDeletionOfWord = None
    self.isExtensionOfWord = None
    self.isTransformationOfWord = None
    # Derive the text from the transkription positions if none was given.
    # NOTE(review): the guard filters on type(tp) == TranskriptionPosition but
    # the join below does not — presumably all entries are TranskriptionPosition
    # here; confirm.
    if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0:
        self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ])
    self.overwrites_word = None
    self.process_flags = []
    self.styles = styles\
            if styles is not None\
            else []
    self.verified = None
    self.writing_process_id = writing_process_id
    self.writing_processes = []
    self.word_insertion_mark = None
    self.word_box = None
    self.word_parts = word_parts if word_parts is not None else []
    self.word_part_objs = word_part_objs if word_part_objs is not None else []
def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Add the deletion paths that intersect this word to self.deletion_paths.

    :param deletion_paths: candidate deletion paths of the page.
    :param tr_xmin: x offset of the transcription field.
    :param tr_ymin: y offset of the transcription field.
    """
    if len(self.word_parts) > 0:
        # A composed word delegates to its parts.
        for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
    elif self.deleted:
        # Try successive transkription positions until an intersecting path is found.
        index = 0
        while len(self.deletion_paths) == 0 and index < len(self.transkription_positions):
            # Include positional word parts only when the first one starts
            # close (< 10 units) to the position's left edge.
            include_pwps = (len(self.transkription_positions[index].positional_word_parts) > 0
                    and abs(self.transkription_positions[index].left-self.transkription_positions[index].positional_word_parts[0].left) < 10)
            word_path = Path.create_path_from_transkription_position(self.transkription_positions[index],\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin, include_pwps=include_pwps)
            # Keep intersecting paths that are not already contained.
            self.deletion_paths += [ deletion_path for deletion_path in deletion_paths\
                    if not Path.is_path_contained(self.deletion_paths, deletion_path)\
                    and deletion_path.do_paths_intersect(word_path) ]
            index += 1
def attach_word_to_tree(self, target_tree):
    """Attach this word and all of its sub-structures to target_tree.

    :param target_tree: the XML tree (or element) to attach to.
    :return: the created word node (lxml element)
    """
    word_node = super(Word,self).attach_word_to_tree(target_tree)
    if self.deleted is not None:
        word_node.set('deleted', str(self.deleted).lower())
    if self.verified is not None:
        word_node.set('verified', str(self.verified).lower())
    if self.edited_text is not None:
        word_node.set('edited-text', self.edited_text)
    if self.editor_comment is not None:
        self.editor_comment.attach_object_to_tree(word_node)
    if self.writing_process_id > -1:
        word_node.set('writing-process-id', str(self.writing_process_id))
    if len(self.process_flags) > 0:
        word_node.set('process-flags', ' '.join(self.process_flags))
    # Word parts are renumbered in their current order before serialization.
    for index, word_part in enumerate(self.word_parts):
        word_part.id = index
        word_part.attach_word_to_tree(word_node)
    if self.earlier_version is not None:
        earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
        self.earlier_version.attach_word_to_tree(earlier_node)
    # Only overwritten words that have transkription positions are serialized.
    if self.overwrites_word is not None\
            and len(self.overwrites_word.transkription_positions) > 0:
        overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES)
        self.overwrites_word.attach_word_to_tree(overwrite_node)
    if self.word_box is not None:
        self.word_box.attach_object_to_tree(word_node)
    if len(self.corrections) > 0:
        word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ])))
    # Deletion paths are renumbered and serialized under their own tag.
    for deletion_id, deletion_path in enumerate(self.deletion_paths):
        deletion_path.id = deletion_id
        deletion_path.tag = WordDeletionPath.XML_TAG
        deletion_path.attach_object_to_tree(word_node)
    # Boolean correction relations are stored under their XML attribute names.
    for key in self.XML_CORRECTION_DICT.keys():
        if self.__dict__[key] is not None:
            word_node.set(self.XML_CORRECTION_DICT[key], 'true')
    return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
    """Return True if this word stems from more than one writing process.

    With include_parts=True (and existing word parts) the parts' writing
    process ids are compared, otherwise those of the transkription positions.
    """
    if include_parts and len(self.word_parts) > 0:
        process_ids = set(part.writing_process_id for part in self.word_parts)
    else:
        process_ids = set(tp.writing_process_id for tp in self.transkription_positions)
    return len(process_ids) > 1
def set_parent_word_writing_process_id(self):
    """Set writing_process_id for a parent word from its parts' styles.

    When the parts carry more than one style, the maximum writing process id
    of those styles is used; if the styles still differ once the writing
    process id is ignored, the id is incremented by one.
    """
    # Distinct styles of the parts' first transkription positions.
    ids = set(word.transkription_positions[0].style for word in self.word_parts\
            if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None)
    if len(ids) > 1:
        self.writing_process_id = max([style.writing_process_id for style in ids])
        # Styles that differ in more than the writing process id indicate a
        # further writing stage.
        if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\
                for word in self.word_parts\
                if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\
                > 1:
            self.writing_process_id += 1
@classmethod
def create_cls(cls, word_node):
    """Create a word from a (lxml.Element) node.

    Restores text, status flags, word parts, earlier version, overwritten
    word, box, deletion paths and the correction relations from the node.
    NOTE(review): this method rebinds the name "cls" to the created instance
    (inherited pattern from SimpleWord.create_cls); "cls" below is the word.
    [:return:] Word
    """
    cls = super(Word,cls).create_cls(word_node)
    cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
    cls.split_strings = None
    cls.join_string = word_node.get('join')
    if bool(word_node.get('split')):
        cls.split_strings = word_node.get('split').split(' ')
        # The split attribute must reproduce the text attribute exactly.
        if ''.join(cls.split_strings) != cls.text:
            error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
                    format(word_node.getroottree().docinfo.URL, str(cls.id))\
                    + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
                    + 'Text attribute: "{0}".\n'.format(cls.text)
            raise Exception(error_msg)
    cls.verified = word_node.get('verified') == 'true'\
            if bool(word_node.get('verified')) else None
    cls.deleted = word_node.get('deleted') == 'true'\
            if bool(word_node.get('deleted')) else None
    cls.edited_text = word_node.get('edited-text')
    if cls.edited_text is not None:
        # Punctuation-free variant of the edited text.
        cls.clean_edited_text = cls._create_clean_text(cls.edited_text)
    cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\
            if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None
    cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ]
    # 'corrections' holds space-separated indices into word_parts.
    if bool(word_node.get('corrections')):
        for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
            if index < len(cls.word_parts):
                cls.corrections.append(cls.word_parts[index])
    cls.earlier_version = None
    if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0:
        cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0]
    # Restore boolean correction relations from their XML attribute names.
    for key_value in cls.XML_CORRECTION_DICT.values():
        if word_node.get(key_value) == 'true':
            cls.__dict__[key_value] = True
    if cls.earlier_version is not None:
        # Re-link each word part's relations to the earlier version objects.
        for word_part in cls.word_parts:
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\
                        and len(cls.word_parts) <= len(cls.earlier_version.word_parts):
                    try:
                        word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id]
                    except Exception:
                        msg = f'{cls.id} {cls.text}: {word_part.id}'
                        raise Exception(msg)
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                    word_part.__dict__[key] = cls.earlier_version
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                    word_part.__dict__[key] = cls
    cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\
            if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\
            else None
    cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\
            if len(word_node.xpath('./' + Box.XML_TAG)) > 0\
            else None
    cls.deletion_paths = [ Path(node=node) for node in word_node.xpath(f'./{WordDeletionPath.XML_TAG}') ]
    cls.process_flags = word_node.get('process-flags').split(' ')\
            if bool(word_node.get('process-flags'))\
            else []
    return cls
@classmethod
def join_words(cls, list_of_words, add_white_space_between_words=False):
    """Create a single word from a list of words.

    Words composed of word parts are replaced by their parts before joining.
    If a non-final part ends with '-' or '=', an edited_text without that
    hyphen is set on the new word.

    :param list_of_words: words to join (mutated: parts are expanded in place).
    :param add_white_space_between_words: join texts with ' ' instead of ''.
    [:return:] Word — or the single element / None when the list has <= 1 entry.
    """
    if len(list_of_words) > 1:
        # Deleted only if every word agrees on being deleted.
        deleted = True in [ word.deleted for word in list_of_words ]\
                and len(set([ word.deleted for word in list_of_words ])) == 1
        line_number = list_of_words[0].line_number\
                if len(set([ word.line_number for word in list_of_words ])) == 1\
                else -1
        faksimile_positions = []
        # BUG FIX: iterate over a snapshot — the original mutated
        # list_of_words while iterating it, which skips the element that
        # follows each replaced word (same copy idiom as
        # execute_function_on_parts).
        for word in list_of_words[:]:
            if len(word.word_parts) > 0:
                faksimile_positions += word.faksimile_positions
                index = list_of_words.index(word)
                list_of_words.remove(word)
                for part_word in reversed(word.word_parts):
                    list_of_words.insert(index, part_word)
        new_word_text = ''.join([word.text for word in list_of_words])\
                if not add_white_space_between_words\
                else ' '.join([word.text for word in list_of_words])
        new_word = cls(id=list_of_words[0].id, text=new_word_text, faksimile_positions=faksimile_positions,\
                line_number=line_number, deleted=deleted, word_parts=list_of_words)
        # Remove a line-break hyphen ('-'/'=') from the joined text.
        if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]:
            change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0]
            new_word.edited_text = new_word.text.replace(change_text, change_text[:-1])
        for id, word in enumerate(new_word.word_parts): word.id = id
        return new_word
    if len(list_of_words) > 0:
        return list_of_words[0]
    else:
        return None
def create_earlier_version(self, root_word=None, id=0):
    """Create an earlier version of this word from its parts' correction state.

    :param root_word: word whose writing process id governs extensions
                      (defaults to self).
    :param id: id for the created earlier-version word.
    [:return:] Word
    """
    if root_word is None:
        root_word = self
        root_word.set_parent_word_writing_process_id()
    word_parts = []
    # Parts that are not mere punctuation decide whether the word as a whole
    # counts as deleted.
    non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\
            if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ]
    non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts)
    if non_single_punctuation_word_parts_length > 0\
            and len([ word_part for word_part in non_single_punctuation_word_parts\
                      if word_part.deleted ])\
            == non_single_punctuation_word_parts_length:
        # All substantive parts are deleted -> mark the word deleted and
        # clear the per-part flags.
        self.deleted = True
        for word_part in non_single_punctuation_word_parts: word_part.deleted = False
    for id, word_part in enumerate(self.word_parts):
        earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id)
        if word_part.deleted:
            word_part.isDeletionOfWord = earlierWordPart
            word_parts.append(earlierWordPart)
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        elif word_part.overwrites_word is not None\
                and ((len(word_part.transkription_positions) > 0\
                      and word_part.overwrites_word.transkription_positions[0].style is not None\
                      and word_part.transkription_positions[0].style is not None\
                      and word_part.transkription_positions[0].style\
                      != word_part.overwrites_word.transkription_positions[0].style)
                     or word_part.word_box.earlier_version):
            # The overwritten word belongs to the earlier version.
            word_part.overwrites_word.id = word_part.id
            word_parts.append(word_part.overwrites_word)
            word_part.isTransformationOfWord = word_part.overwrites_word
            #print(f'transform: {self.text}')
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        elif root_word.writing_process_id > -1\
                and (len(word_part.transkription_positions) > 0\
                     and word_part.transkription_positions[0].style is not None\
                     and word_part.transkription_positions[0].style.writing_process_id\
                     == root_word.writing_process_id):
            # Part written in the current writing process extends the earlier version.
            word_part.extendsEarlierVersion = True
            #print('extends')
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        else:
            # NOTE(review): this inner "if word_part.deleted" can never be True —
            # the first branch of the chain above already handles deleted parts;
            # dead code kept as-is.
            if word_part.deleted:
                word_part.isDeletionOfWord = earlierWordPart
                word_parts.append(earlierWordPart)
                if word_part not in self.corrections:
                    self.corrections.append(word_part)
            else:
                #print(f'default: {self.text}')
                word_parts.append(earlierWordPart)
    text = ''.join([ word.text for word in word_parts ])\
            if len(word_parts) > 0\
            else self.text
    if len(word_parts) == 1:
        # A single part collapses into the word itself.
        self.transkription_positions += word_parts[0].transkription_positions
        self.faksimile_positions += word_parts[0].faksimile_positions
        word_parts = []
    new_transkription_positions = copy.deepcopy(self.transkription_positions)
    if len(self.transkription_positions) > 0\
            and self.transkription_positions[0].style is not None:
        # Propagate the first position's writing process id to all copies.
        writing_process_id = self.transkription_positions[0].style.writing_process_id
        for new_tp in new_transkription_positions:
            new_tp.style.writing_process_id = writing_process_id
    return Word(id=id, text=text, transkription_positions=new_transkription_positions,\
            faksimile_positions=self.faksimile_positions, line_number=self.line_number,\
            word_parts=word_parts)
def create_correction_history(self, page=None, box_style=None):
    """Create the correction history of this word from its box and its parts.

    :param page: page used to resolve the box text style (optional).
    :param box_style: style to use for the overwritten word (optional).
    """
    if self.word_box is not None:
        manuscript = self.transkription_positions[0].style.manuscript\
                if len(self.transkription_positions) > 0\
                and self.transkription_positions[0].style is not None\
                else None
        # Style precedence: page-derived style > box_style > default Style().
        style = Style()
        if box_style is not None:
            style = box_style
        if page is not None:
            style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript)
            # Derive the writing stage from the box's font-size keys.
            for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]:
                style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
        transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions)
        for transkription_position in transkription_positions:
            transkription_position.style = style
        # The box records the text this word overwrote.
        self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\
                line_number=self.line_number)
    for word_part in self.word_parts:
        word_part.create_correction_history(page=page, box_style=box_style)
    if len(self.word_parts) > 0:
        earlier_version = self.create_earlier_version()
        extending_words = self._get_parts_with_property_key('extendsEarlierVersion')
        if len(extending_words) > 0:
            for word in extending_words:
                word.isExtensionOfWord = earlier_version
        # If only some parts are deleted, the edited text is the undeleted rest.
        if self.has_mixed_status('deleted', include_parts=True):
            self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ])
        if len(self.corrections) > 0:
            self.earlier_version = earlier_version
@staticmethod
def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
    """Creates a word from a (lxml.Element) node or from word_part_objs.

    NOTE(review): the mutable default word_part_objs=[] is shared between
    calls; it is only read here, but confirm before anyone mutates it.
    [:return:] Word
    """
    if word_node is not None: # init word from xml node
        id = int(word_node.get('id'))
        line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
        text = word_node.get('text')
        deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
        transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
        faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
        word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
                if len(word_node.findall('.//' + Word.DATA)) > 0\
                else [ item.attrib for item in word_node.findall('.//part')]
        return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
                faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
    elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
        WIDTH = 5               # fallback width added to endX
        TOPCORRECTION = 2.0
        FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
        height = height       # no-op kept as-is; height falls back to the parameter
        x = round(float(word_part_objs[0]['x']), 3)
        if(page is not None and bool(page.style_dict)):
            HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
            style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
            biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
            height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
            TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
            if endSign is not None and '%' in endSign:
                # Extend endX by an estimated width of the last character.
                lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
                        for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
                        if bool(page.style_dict[key].get('font-size'))]
                lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
                endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
            elif endSign is not None and '%' in endSign:
                # NOTE(review): this condition duplicates the branch above and is
                # therefore unreachable; it was probably meant to be
                # "'%' not in endSign" — confirm before changing behavior.
                endX = float(endX) + WIDTH
        bottom = round(float(word_part_objs[0]['y']), 3)
        y = round(bottom - height + TOPCORRECTION, 3)
        width = round(float(endX) - x, 3)
        transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
        text = ''.join([ dict['text'] for dict in word_part_objs])
        line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
        word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
        word.debug_msg = debug_msg
        return word
    else:
        error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
        raise Exception('Error: {}'.format(error_msg))
@classmethod
def get_semantic_dictionary(cls):
    """ Creates and returns a semantic dictionary as specified by SemanticClass.

    Extends the superclass dictionary with Word-specific properties
    (styles, corrections, deletion paths, editor comments, earlier
    versions, edited texts and part-of relations), then marks every
    XML correction property as a subproperty of 'isCorrectionOfWord'.
    """
    dictionary = super(Word,cls).get_semantic_dictionary()
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\
            cardinality=1, cardinality_restriction='minCardinality',\
            name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\
            name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\
            name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\
            comment='Word has been deleted by the author using a deletion path.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comment', EditorComment,\
            name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\
            name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\
            name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.'))
    # Added in revision 110: edited text with punctuation stripped as well.
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('clean_edited_text', str,\
            name='hasCleanEditedText', label='word has an edited text without punctuation',\
            comment='Word has a text without punctuation that is edited automatically by removing deleted parts or hyphens.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\
            name='isClarificationOfWord', label='word is a clarification of word',\
            comment='The author has used this part of the word in order to clarify the appearance of that word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\
            name='isDeletionOfWord', label='word is a deletion of word',\
            comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\
            name='isExtensionOfWord', label='word is a extension of word',\
            comment='The author has used this part of a word in order to extend an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\
            name='isTransformationOfWord', label='word is a transformation of word',\
            comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\
            name='overwritesWord', label='word overwrites word',\
            comment='The author has used this word in order to overwrite that word.'))
    # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING,
    # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class.
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\
            name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\
            subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING))
    super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\
            name='isCorrectionOfWord', label='word is a correction of word',\
            comment='The author has used this word in order to correct that word.')
    # Every XML correction relation inherits from the generic isCorrectionOfWord property.
    for key in cls.XML_CORRECTION_DICT.keys():
        correction_dict = dictionary[cls.PROPERTIES_KEY].get(key)
        correction_dict.update(super_property_dictionary)
        dictionary[cls.PROPERTIES_KEY].update({key: correction_dict})
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
"""Returns true if transkription_positions have mixed status concerning the property_key in their __dict__.
"""
if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions):
return False
if len(self.word_parts) > 0 and include_parts:
if concerns_word:
if False in set(property_key in word.__dict__.keys() for word in self.word_parts):
return False
return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1
else:
return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\
if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1
return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1
def init_word(self, page):
    """Initialize word with objects from page.

    Resolves this word's writing processes, recursively initializes its
    parts (collecting their lines and writing processes, deduplicated),
    initializes the overwritten word and the earlier version, and maps
    raw deletion paths to the page's canonical WordDeletionPath objects.

    :param page: the Page this word belongs to
    """
    super(Word,self).init_word(page)
    if self.writing_process_id > -1:
        self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ]
    # (fix) removed dead local alias 'writing_processes = self.writing_processes' — it was never used.
    for word_part in self.word_parts:
        word_part.init_word(page)
        self.lines += word_part.lines
        self.writing_processes += word_part.writing_processes
    # Deduplicate; note that set() does not preserve order.
    self.lines = list(set(self.lines))
    self.writing_processes = list(set(self.writing_processes))
    if self.overwrites_word is not None:
        self.overwrites_word.init_word(page)
    if self.earlier_version is not None:
        # An earlier version without explicit metadata inherits a plausible
        # writing process (one before ours) and our line number.
        if self.earlier_version.writing_process_id == -1:
            self.earlier_version.writing_process_id = self.writing_process_id-1
        if self.earlier_version.line_number == -1:
            self.earlier_version.line_number = self.line_number
        self.earlier_version.init_word(page)
    self.deletion_paths = [ page.get_word_deletion_path(path) for path in self.deletion_paths if path.path is not None ]
def join(self, other_word, append_at_end_of_new_word=True, add_white_space_between_words=False):
"""Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions.
"""
if append_at_end_of_new_word:
self.text = self.text + other_word.text\
if not add_white_space_between_words\
else self.text + ' ' + other_word.text
for position in other_word.transkription_positions:
position.id = str(len(self.transkription_positions))
self.transkription_positions.append(position)
for position in other_word.faksimile_positions:
position.id = str(len(self.faksimile_positions))
self.faksimile_positions.append(position)
else:
self.text = other_word.text + self.text
index = 0
for position in other_word.transkription_positions:
self.transkription_positions.insert(index, position)
index += 1
while index < len(self.transkription_positions):
self.transkription_positions[index].id = str(index)
index += 1
index = 0
for position in other_word.faksimile_positions:
self.faksimile_positions.insert(indexposition)
index += 1
while index < len(self.faksimile_positions):
self.faksimile_positions[index].id = str(index)
index += 1
self.simplify_transkription_positions()
def partition_according_to_deletion(self):
    """Partition a word according to its transkription_positions' deletion status
    ->split word and add partial words as its parts.
    """
    if self.has_mixed_status('deleted'):
        # Group consecutive transkription positions sharing a deletion status;
        # each group becomes one partial word.
        group = []
        group_status = None
        for tp in self.transkription_positions:
            if tp.deleted != group_status and len(group) > 0:
                part = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=group, deleted=group_status, writing_process_id=self.writing_process_id)
                for grouped_tp in group:
                    part.deletion_paths += grouped_tp._deletion_paths
                self.word_parts.append(part)
                group = []
            group.append(tp)
            group_status = tp.deleted
        if len(group) > 0:
            # Flush the final group.
            part = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=group, deleted=group_status, writing_process_id=self.writing_process_id)
            for grouped_tp in group:
                part.deletion_paths += grouped_tp._deletion_paths
            self.word_parts.append(part)
        # The word itself becomes a pure container for its parts.
        self.transkription_positions = []
        self.line_number = -1
        self.deleted = False
    elif len(self.word_parts) > 0:
        self.word_parts, _ = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
    elif not self.deleted\
    and len(self.transkription_positions) > 0\
    and self.transkription_positions[0].deleted:
        # All positions uniformly deleted: mark the whole word deleted.
        self.deleted = True
        for tp in self.transkription_positions:
            self.deletion_paths += tp._deletion_paths
def partition_according_to_writing_process_id(self):
    """Partition a word according to its transkription_positions' writing_process_ids
    ->split word and add partial words as its parts.
    """
    if self.belongs_to_multiple_writing_processes():
        # Group consecutive positions by writing process id.
        group = []
        group_wpid = -1
        for tp in self.transkription_positions:
            if tp.writing_process_id != group_wpid and len(group) > 0:
                self.word_parts.append(Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=group, writing_process_id=group_wpid))
                group = []
            group.append(tp)
            group_wpid = tp.writing_process_id
        if len(group) > 0:
            self.word_parts.append(Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=group, writing_process_id=group_wpid))
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        self.word_parts, _ = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
    if self.belongs_to_multiple_writing_processes(include_parts=True):
        # Word spans several writing processes: adopt the latest (highest) id.
        self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0]
    elif len(self.transkription_positions) > 0:
        self.writing_process_id = self.transkription_positions[0].writing_process_id
def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False):
    """Determines whether word is over a word box.

    Consumes matching paths from box_paths (they are removed in place) and
    may split transkription positions that only partially overlap a box.
    :param box_paths: list of candidate box paths; matched boxes are removed
    :param tr_xmin/tr_ymin: transkription field offsets
    :param previous_word_has_box: nudge the first position right to avoid
           re-matching the previous word's box
    :return: the (partial) word lying over a box, or None
    """
    word_over_box = None
    if len(self.word_parts) > 0:
        # Delegate to parts; remember the last part that ended up over a box.
        for word in self.word_parts:
            current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None))
            if current_word is not None and current_word.word_box is not None:
                word_over_box = current_word
    else:
        new_tp_dict = {}
        for index, transkription_position in enumerate(self.transkription_positions):
            if previous_word_has_box and index == 0:
                # Shift the leading edge so the previous word's box is not matched again.
                if len(transkription_position.positional_word_parts) > 0:
                    transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2
                    #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}')
                else:
                    transkription_position.left += 1
            word_path = Path.create_path_from_transkription_position(transkription_position,\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            containing_boxes = [ box_path for box_path in box_paths\
                    if word_path.is_partially_contained_by(box_path)\
                    or box_path.do_paths_intersect(word_path) ]
            if len(containing_boxes) > 0:
                if previous_word_has_box:
                    print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}')
                # May register split replacements for this position in new_tp_dict.
                self._set_box_to_transkription_position(containing_boxes[0], word_path,\
                        transkription_position, new_tp_dict, tr_xmin)
                box_paths.remove(containing_boxes[0])
        # Replace split positions by their fragments, preserving order.
        for replace_tp in new_tp_dict.keys():
            for tp in new_tp_dict.get(replace_tp):
                self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
            self.transkription_positions.remove(replace_tp)
        word_over_box = self._get_partial_word_over_box()
        update_transkription_position_ids(self)
    return word_over_box
def set_word_insertion_mark(self, word_insertion_mark):
"""Sets word_insertion_mark
"""
self.word_insertion_mark = word_insertion_mark
def set_writing_process_id_to_transkription_positions(self, page):
"""Determines the writing process id of the transkription_positions.
"""
for transkription_position in self.transkription_positions:
if len(transkription_position.positional_word_parts) > 0:
for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
if font_key in page.fontsizekey2stage_mapping.keys():
transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
def simplify_transkription_positions(self):
    """Merge transkription_positions if possible.

    Walks the list backwards and merges each pair of adjacent mergeable
    positions into one; only runs while every position carries
    positional_word_parts.
    """
    index = len(self.transkription_positions)-1
    while index > 0\
    and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
        current_tp = self.transkription_positions[index]
        index -= 1
        previous_tp = self.transkription_positions[index]
        if previous_tp.is_mergebale_with(current_tp):
            # BUG FIX: build a NEW list by concatenation. The previous
            # in-place '+=' extended previous_tp.positional_word_parts even
            # when the merge below failed (len != 1), leaving previous_tp
            # with the other position's word parts appended.
            positional_word_parts = previous_tp.positional_word_parts + current_tp.positional_word_parts
            transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                    positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
            if len(transkription_positions) == 1:
                transkription_positions[0].writing_process_id = previous_tp.writing_process_id\
                        if previous_tp.writing_process_id != -1\
                        else current_tp.writing_process_id
                self.transkription_positions.pop(index+1)
                self.transkription_positions[index] = transkription_positions[0]
    #print(self.text, len(self.transkription_positions))
def split(self, split_string, start_id=0):
    """Splits the word and returns an 3-tuple of new words.

    Partitions self.text around split_string and distributes the word's
    positional word parts to the resulting previous/current/next words by
    matching their concatenated texts.
    :param split_string: the substring to split around
    :param start_id: id assigned to the first resulting word
    :return: (previousWord or None, currentWord, nextWord or None)
    """
    previousString, currentString, nextString = self.text.partition(split_string)
    currentWord = None
    previousWord = None
    nextWord = None
    previousIndex = 0
    current_id = start_id
    # Flatten the positional word parts of all transkription positions.
    all_positional_word_parts = []
    for position in self.transkription_positions:
        all_positional_word_parts += position.positional_word_parts
    if len(all_positional_word_parts) == 0:
        warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
    if len(previousString) > 0:
        # Consume parts from the front until their text equals previousString.
        previous_pwps = []
        while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            previous_pwps.append(all_positional_word_parts[previousIndex])
            previousIndex += 1
        if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
        else:
            previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
            previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
            previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
            previousWord.faksimile_positions = self.faksimile_positions
            current_id += 1
            all_positional_word_parts = all_positional_word_parts[previousIndex:]
    if len(nextString) > 0:
        # Consume parts until their text equals currentString; the remainder is nextWord.
        tmp_pwps = []
        index = 0
        while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            tmp_pwps.append(all_positional_word_parts[index])
            index += 1
        if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
        else:
            next_pwps = all_positional_word_parts[index:]
            next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
            next_text = ''.join([ pwp.text for pwp in next_pwps ])
            nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
            nextWord.faksimile_positions = self.faksimile_positions
            all_positional_word_parts = all_positional_word_parts[:index]
    # Whatever remains constitutes the current (matched) word.
    current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
    current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
    currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
    currentWord.faksimile_positions = self.faksimile_positions
    return previousWord, currentWord, nextWord
def split_according_to_status(self, status, splits_are_parts=False):
    """Split a word according to its transkription_positions' text.

    Groups consecutive transkription positions with equal values for the
    attribute named by status; each group becomes a new Word (via
    _create_new_word).
    :param status: attribute name on the transkription positions to group by
    :param splits_are_parts: if True, the new words also become self.word_parts
    :return: a list of new word.Word
    """
    new_words = []
    if self.has_mixed_status(status):
        last_status = None
        transkription_positions = []
        for transkription_position in self.transkription_positions:
            if transkription_position.__dict__[status] != last_status\
            and len(transkription_positions) > 0:
                new_words.append(\
                        self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_status = transkription_position.__dict__[status]
        # Flush the final group.
        if len(transkription_positions) > 0:
            new_words.append(\
                    self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
        if splits_are_parts:
            self.word_parts += new_words
            if len(self.word_parts) > 0:
                self.transkription_positions = []
    return new_words
def undo_partitioning(self):
"""Undo partitioning.
"""
if len(self.word_parts) > 0:
for word_part in self.word_parts:
word_part.undo_partitioning()
if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]):
self.transkription_positions += word_part.transkription_positions
self.earlier_version = None
self.edited_text = None
self.word_box = None
self.word_parts = []
self.corrections = []
self.earlier_versions = []
self.box_paths = []
def _create_new_word(self, transkription_positions, status, new_id=0):
    """Create a new word from self and transkription_positions.

    Copies all copyable properties (except the grouping status) from self
    and sets/appends the status value taken from the first position.
    """
    new_word = Word(id=new_id, transkription_positions=transkription_positions)
    status_value = transkription_positions[0].__dict__[status]
    for key in self.COPY_PROPERTY_KEY:
        if key != status and key in self.__dict__.keys():
            new_word.__dict__[key] = self.__dict__[key]
    if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys():
        target_key = self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]
        new_word.__dict__[target_key].append(status_value)
    else:
        new_word.__dict__[status] = status_value
    return new_word
def _get_parts_with_property_key(self, property_key):
"""Return a list of word_parts with property == property_key.
"""
word_parts = []
for word_part in self.word_parts:
if property_key in word_part.__dict__.keys():
word_parts.append(word_part)
else:
word_parts += word_part._get_parts_with_property_key(property_key)
return word_parts
def _get_partial_word_over_box(self):
    """Partition a word according to its transkription_positions' has_box
    ->split word and add partial words as its parts.
    :return: word over box or self
    """
    word_over_box = None
    if self.has_mixed_status('has_box'):
        # Group consecutive positions by has_box; each group becomes a part,
        # and a group whose box is set yields the word_over_box.
        transkription_positions = []
        last_word_box = None
        for transkription_position in self.transkription_positions:
            if transkription_position.has_box != last_word_box\
            and len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_word_box is not None:
                    word_over_box = newWord
                    word_over_box.word_box = last_word_box
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_word_box = transkription_position.has_box
        # Flush the final group.
        if len(transkription_positions) > 0:
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
            self.word_parts.append(newWord)
            if last_word_box is not None:
                word_over_box = newWord
                word_over_box.word_box = last_word_box
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box')
        # Recurse into parts; stop after the first part that yields a boxed word.
        for word_part in self.word_parts:
            if word_over_box is None:
                word_over_box = word_part._get_partial_word_over_box()
            else:
                break
    elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
        # Exactly one boxed position and no mixed status: the word itself is over the box.
        word_over_box = self
        word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
    return word_over_box
def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
"""Set box_path to transkription_position that is contained by box_path.
Create new transkription_positions by splitting old ones if necessaryand add them to new_transkription_positions_dictionary.
"""
if box_path.contains_path(word_path):
transkription_position.has_box = box_path
elif box_path.contains_start_of_path(word_path):
split_position = box_path.path.bbox()[1] - tr_xmin
new_tps = transkription_position.split(split_position)
if len(new_tps) == 2:
new_tps[0].has_box = box_path
new_transkription_positions_dictionary.update({ transkription_position: new_tps })
else:
transkription_position.has_box = box_path
elif box_path.contains_end_of_path(word_path):
split_position = box_path.path.bbox()[0] - tr_xmin
new_tps = transkription_position.split(split_position)
if len(new_tps) == 2:
new_tps[1].has_box = box_path
new_transkription_positions_dictionary.update({ transkription_position: new_tps })
else:
transkription_position.has_box = box_path
else: # box_path in the middle of word_pathz
split_position1 = box_path.path.bbox()[0] - tr_xmin
split_position2 = box_path.path.bbox()[1] - tr_xmin
new_tps = transkription_position.split(split_position1, split_position2)
if len(new_tps) >= 2:
new_tps[1].has_box = box_path
new_transkription_positions_dictionary.update({ transkription_position: new_tps })
else:
transkription_position.has_box = box_path
def do_paths_intersect_saveMode(mypath1, mypath2):
    """Returns true if paths intersect, false if not or if there was an exception.

    Any AssertionError raised by the underlying geometry code is swallowed
    and treated as 'no intersection'.
    """
    try:
        intersecting = mypath1.path.intersect(mypath2.path, justonemode=True)
        return intersecting or mypath1.is_partially_contained_by(mypath2)
    except AssertionError:
        return False
Index: svgscripts/datatypes/page.py
===================================================================
--- svgscripts/datatypes/page.py (revision 109)
+++ svgscripts/datatypes/page.py (revision 110)
@@ -1,428 +1,430 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import re
import sys
import warnings
from .box import Box
from .color import Color
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
+from .imprint import Imprint
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .super_page import SuperPage
from .style import Style
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_deletion_path import WordDeletionPath
from .word_insertion_mark import WordInsertionMark
sys.path.append('py2ttl')
from class_spec import SemanticClass
sys.path.append('shared_util')
from main_util import extract_paths_on_tf, get_paths_near_position
# Re-export SuperPage's file-type and status constants at module level so
# callers can import them from this module directly.
FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT
STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK
STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK
class Page(SemanticClass,SuperPage):
"""
This class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
faksimile_image: FaksimileImage.
faksimile_svgFile: svg file containing information about word positions.
"""
UNITTESTING = False
def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_paths_near_words=False, warn=False, number=None):
    """Initialize a Page either fully from an xml source file, or as a
    dummy page that only carries a number.

    :param xml_source_file: xml file to instantiate from (full init if given)
    :param faksimile_image: optional FaksimileImage
    :param faksimile_svgFile: optional svg file with word positions
    :param add_paths_near_words: forwarded to add_deletion_paths_to_words
    :param warn: emit warnings for deleted words without deletion paths
    :param number: page number for the dummy-page case
    """
    if xml_source_file is not None:
        super(Page,self).__init__(xml_source_file)
        self.update_property_dictionary('faksimile_image', faksimile_image)
        self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile)
        self.init_all_properties()
        self.add_style(style_node=self.page_tree.getroot().find('.//style'))
        self.faksimile_text_field = None
        self.svg_text_field = None
        self.init_node_objects()
        # NOTE: self.warn must be assigned before add_deletion_paths_to_words,
        # which reads it.
        self.warn = warn
        self.add_deletion_paths_to_words(add_paths_near_words)
    else:
        # Dummy page: no xml tree, only a number.
        self.page_tree = None
        self.number = number
def add_deletion_paths_to_words(self, add_paths_near_words=False):
    """Add deletion paths to words.

    Collects words that are deleted but have no deletion paths yet (or are
    flagged 'add_paths_near_words'), matches deletion paths from the svg
    transkription field to them, and registers newly found paths on the page.
    :param add_paths_near_words: also record candidate paths *near* words
           that still ended up without deletion paths
    """
    # Candidate words: undivided deleted words without paths, flagged words,
    # and words with at least one deleted part lacking paths.
    words = [ word for word in self.words if (len(word.word_parts) == 0 and word.deleted and len(word.deletion_paths) == 0)\
                or 'add_paths_near_words' in word.process_flags ]
    words += [ word for word in self.words\
                if len(word.word_parts) > 0 and True in\
                [ (wp.deleted and len(wp.deletion_paths) == 0) for wp in word.word_parts ]]
    if len(words) > 0 and ((self.svg_file is not None and isfile(self.svg_file))\
        or (self.source is not None and isfile(self.source))):
        svg_file = self.svg_file if self.svg_file is not None else self.source
        transkription_field = TranskriptionField(svg_file)
        # Offsets are zero when the svg image carries its own text field.
        tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0
        tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0
        word_deletion_paths = self.word_deletion_paths
        index = 0
        # dp_updated: deletion paths were (re-)extracted from the svg once.
        dp_updated = False
        while index < len(words):
            word = words[index]
            word.add_deletion_paths(word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            if len(word.deletion_paths) > 0 or True in [ len(w.deletion_paths) > 0 for w in word.word_parts ]:
                deletion_paths = word.deletion_paths
                for wp in word.word_parts: deletion_paths += wp.deletion_paths
                # Register any new deletion paths on the page.
                for deletion_path in deletion_paths:
                    if deletion_path not in self.word_deletion_paths:
                        self.word_deletion_paths.append(deletion_path)
            elif not dp_updated:
                # Nothing matched: re-extract paths once and retry this word
                # (index is decremented, then incremented again below).
                word_deletion_paths = extract_paths_on_tf(self)
                dp_updated = True
                index -= 1
            if add_paths_near_words\
            and ('add_paths_near_words' in word.process_flags\
                or ((word.deleted and len(word.deletion_paths) == 0)\
                    or True in [ (w.deleted and len(w.deletion_paths) == 0) for w in word.word_parts ])):
                if not dp_updated\
                and 'add_paths_near_words' in word.process_flags:
                    word_deletion_paths = extract_paths_on_tf(self)
                    dp_updated = True
                transform = None
                tp = None
                target_word = word
                # NOTE(review): paths_near_word appears unused below — confirm.
                paths_near_word = []
                if word.deleted and len(word.transkription_positions) > 0:
                    transform = word.transkription_positions[0].transform
                    for tp in word.transkription_positions:
                        word.deletion_paths_near_word += get_paths_near_position(tp, word_deletion_paths)
                elif len(word.word_parts) > 0:
                    for wp in word.word_parts:
                        if wp.deleted and len(wp.transkription_positions) > 0:
                            target_word = wp
                            for tp in wp.transkription_positions:
                                wp.deletion_paths_near_word = get_paths_near_position(tp, word_deletion_paths)
                if self.warn and (word.deleted and len(word.deletion_paths) == 0):
                    warnings.warn(\
                        f'WARNING: {self.title} {self.number}: {word.id} on {word.line_number}, {word.text} has no deletion paths! {target_word.deletion_paths_near_word}, {transform}')
            index += 1
@classmethod
def create_cls(cls, xml_source_file=None, create_dummy_page=False, page_node=None):
    """Create a Page.

    Either a fully initialized page from xml_source_file, or — for a dummy
    page — one that only carries the page number parsed from the file name.
    """
    if not create_dummy_page:
        return cls(xml_source_file)
    match = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file)
    if match is not None and len(match.groups()) > 3:
        number = match.group(3)
    else:
        # Fall back to the bare file name without its extension.
        number = basename(xml_source_file).replace('.xml','')
    return cls(number=number)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
    """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
    or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
    [optional: instantiation depends on the fulfilment of a status_contains
    and/or on the selection of some words by a word_selection_function].
    """
    source_tree = ET.parse(xml_file)
    file_type = source_tree.getroot().find('metadata/type').text
    if file_type == cls.FILE_TYPE_SVG_WORD_POSITION:
        page = cls(xml_file)
        if word_selection_function is None or len(word_selection_function(page.words)) > 0:
            return [ page ]
        return []
    if file_type == FILE_TYPE_XML_MANUSCRIPT:
        # Build an xpath filtering the manuscript's page entries by status.
        if status_contains != '' and status_not_contain != '':
            xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
        elif status_contains != '':
            xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
        elif status_not_contain != '':
            xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
        else:
            xpath = '//page/@output'
        pages = []
        for output_file in source_tree.xpath(xpath):
            if isfile(output_file):
                pages += cls.get_pages_from_xml_file(output_file, word_selection_function=word_selection_function)
        return pages
    return []
@classmethod
def get_semantic_dictionary(cls):
    """ Creates a semantic dictionary as specified by SemanticClass.

    Declares the page number, images/text fields, orientation, and the
    list-valued page properties (imprints, lines, words, ...).
    """
    dictionary = {}
    class_dict = cls.get_class_dictionary()
    properties = { 'number': { 'class': str, 'cardinality': 1}}
    properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE))
    properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\
            name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\
            comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
    properties.update(cls.create_semantic_property_dictionary('orientation', str))
    properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE))
    properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\
            name='pageIsOnSVGTextField', label='page is on svg text field',\
            comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
    # 'imprints' added in revision 110 alongside the other list-valued properties.
    for key in [ 'imprints', 'lines', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']:
        properties.update(cls.create_semantic_property_dictionary(key, list))
    dictionary.update({cls.CLASS_KEY: class_dict})
    dictionary.update({cls.PROPERTIES_KEY: properties})
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def get_word_deletion_path(self, path=None, d_attribute=None) ->WordDeletionPath:
    """Return a word deletion path that belongs to page.

    The path is looked up by its svg d-attribute; when none is registered
    yet, a new WordDeletionPath is created, given the next id, appended to
    self.word_deletion_paths and attached to the page tree.

    Raises: Exception when neither path nor d_attribute is given.
    """
    if path is None and d_attribute is None:
        raise Exception('ERROR: get_word_deletion_path needs a path or a d_attribute!')
    key = d_attribute if d_attribute is not None else path.d_attribute
    # reuse an already registered deletion path with a matching d-attribute
    for known_path in self.word_deletion_paths:
        if known_path.d_attribute == key:
            return known_path
    new_path = WordDeletionPath.create_cls(self, path=path, d_attribute=key)
    if new_path is not None:
        new_path.id = len(self.word_deletion_paths)
        self.word_deletion_paths.append(new_path)
        new_path.attach_object_to_tree(self.page_tree)
    return new_path
def init_node_objects(self):
    """Initialize all node objects.

    Instantiates the python objects (words, marks, lines, imprints, writing
    processes, deletion paths) from their xml nodes in self.page_tree and
    wires them to the page.
    """
    self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
    self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ]
    self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ]
    self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ]
    self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
    self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
    # diff residue: revision 110 adds imprint extraction (must run after self.lines is set)
    + self.imprints = [ Imprint.create_cls_from_node(imprint_node, self.lines) for imprint_node in self.page_tree.getroot().xpath('//' + Imprint.XML_TAG) ]
    self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
    self.word_deletion_paths = [ WordDeletionPath.create_cls(self, node=node) for node in self.page_tree.xpath('./' + WordDeletionPath.XML_TAG) ]
    if self.faksimile_image is not None and self.faksimile_image.text_field is not None:
        self.faksimile_text_field = self.faksimile_image.text_field
    if self.svg_image is not None and self.svg_image.text_field is not None:
        self.svg_text_field = self.svg_image.text_field
    for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks:
        simple_word.init_word(self)
    for wim in self.word_insertion_marks:
        if wim.line_number > -1:
            # assumes exactly one line with this id exists — TODO confirm
            wim.line = [ line for line in self.lines if line.id == wim.line_number ][0]
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
    """Update word ids and attach them to page.page_tree.

    Args:
        update_function_on_word: a callable or a list of callables applied
            to each word before it is attached to the tree.
        include_special_words_of_type: list of special word classes
            (MarkForeignHands, TextConnectionMark) whose instances should
            also be passed through the update functions.

    Does nothing but print 'locked' when the page is locked.
    """
    if not self.is_locked():
        # normalize to a list so a single callable may be passed as well
        update_function_on_word = [ update_function_on_word ]\
                if type(update_function_on_word) != list\
                else update_function_on_word
        # remove stale word nodes before re-attaching the current objects
        for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
            node.getparent().remove(node)
        for index, word in enumerate(self.words):
            word.id = index
            for func in update_function_on_word:
                if callable(func):
                    func(word)
            word.attach_word_to_tree(self.page_tree)
        for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
            mark_foreign_hands.id = index
            if MarkForeignHands in include_special_words_of_type:
                for func in update_function_on_word:
                    # BUGFIX: test each func itself — the old code tested the
                    # list (callable(update_function_on_word)), so the update
                    # functions were never applied to special words.
                    if callable(func):
                        func(mark_foreign_hands)
            mark_foreign_hands.attach_word_to_tree(self.page_tree)
        for index, text_connection_mark in enumerate(self.text_connection_marks):
            text_connection_mark.id = index
            if TextConnectionMark in include_special_words_of_type:
                for func in update_function_on_word:
                    if callable(func):
                        func(text_connection_mark)
            text_connection_mark.attach_word_to_tree(self.page_tree)
    else:
        print('locked')
def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
    """Update the data source of page.

    Records the faksimile svg file (and optionally the xml correction file)
    on the page's <data-source> node, creating that node when missing.
    """
    if faksimile_svgFile is not None:
        self.faksimile_svgFile = faksimile_svgFile
        existing_nodes = self.page_tree.xpath('.//data-source')
        if len(existing_nodes) > 0:
            data_node = existing_nodes[0]
        else:
            data_node = ET.SubElement(self.page_tree.getroot(), 'data-source')
        data_node.set('file', self.faksimile_svgFile)
        if xml_correction_file is not None:
            data_node.set('xml-corrected-words', xml_correction_file)
def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True):
    """Determines the width of the area where the line numbers are written in the page.source file.

    Picks a reference line number, finds its text node near the
    transkription field, and extends the field's line number area width by
    the node's x position (plus the glyph width on recto pages with an svg
    path file).
    """
    THRESHOLD = 0.4  # tolerance in svg units when matching glyph positions
    if svg_tree is None:
        svg_tree = ET.parse(self.source)
    if len(self.line_numbers) > 1:
        # NOTE(review): index 9 needs len(self.line_numbers) > 9; with exactly
        # 9 entries the `> 8` guard still allows an IndexError — confirm bound.
        line_number = self.line_numbers[9]\
                if transkription_field.is_page_verso() and len(self.line_numbers) > 8\
                else self.line_numbers[1]
        # text nodes near the transkription field that represent this line number
        ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
                if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
                and LineNumber.IS_A_LINE_NUMBER(item)\
                and LineNumber(raw_text_node=item).id == line_number.id ]
        if len(ln_nodes) > 0:
            matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
            if transkription_field.is_page_verso():
                # verso: line numbers sit left of the field, x alone suffices
                transkription_field.add_line_number_area_width(matrix.getX())
            elif self.svg_file is not None and isfile(self.svg_file):
                # recto: also add the width of the rendered glyph, looked up
                # via the <use>/<symbol> path in the svg path file
                svg_path_tree = ET.parse(self.svg_file)
                namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
                svg_x = matrix.getX()
                svg_y = self.line_numbers[1].bottom + transkription_field.ymin\
                        if set_to_text_field_zero\
                        else self.line_numbers[1].bottom
                use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
                        .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
                if len(use_nodes) > 0:
                    symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
                    d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
                    if len(d_strings) > 0 and d_strings[0] != '':
                        path = parse_path(d_strings[0])
                        xmin, xmax, ymin, ymax = path.bbox()
                        width = xmax - xmin
                        transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
    """Adds a source to page and attaches it to page_tree.

    The page type (recto/verso) is derived from the page number suffix
    ('r'/'v') or, failing that, from the transkription field of the source.

    Raises: FileNotFoundError when no transkription_field is given and the
        page has no readable source file.
    """
    if self.number.endswith('r') or self.number.endswith('v'):
        if self.number.endswith('v'):
            self.page_type = Page.PAGE_VERSO
        else:
            self.page_type = Page.PAGE_RECTO
    else:
        if transkription_field is None:
            if self.source is None or not isfile(self.source):
                raise FileNotFoundError('Page does not have a source!')
            transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index)
        if transkription_field.is_page_verso():
            self.page_type = Page.PAGE_VERSO
        else:
            self.page_type = Page.PAGE_RECTO
    self.page_tree.getroot().set('pageType', self.page_type)
def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False):
    """Update styles of words and add them to their transkription_positions.

    Args:
        words: words to process (defaults to self.words); word parts are
            processed recursively.
        manuscript: when given together with add_to_parents, all created
            styles are propagated to the manuscript as well.
        add_to_parents: Add styles also to word (and if not None to manuscript).
        partition_according_to_styles: Partition word if its transkription_positions have different styles.
        create_css: key the style cache additionally by word.deleted and
            derive a deletion color from the word's deletion paths.
    """
    style_dictionary = {}
    if words is None:
        words = self.words
    for word in words:
        if len(word.word_parts) > 0:
            # recurse into compound words
            self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\
                    add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles)
        for transkription_position in word.transkription_positions:
            if len(transkription_position.positional_word_parts) > 0:
                style_class = transkription_position.positional_word_parts[0].style_class
                writing_process_id = -1
                # map a font-size key in the style class to a writing stage
                for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]:
                    writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
                style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id)
                if create_css:
                    # cache per (style_class_key, deleted) so each combination is built once
                    if style_dictionary.get((style_class_key, word.deleted)) is None:
                        color = None
                        if len(word.deletion_paths) > 0:
                            if word.deletion_paths[0].style_class is not None\
                            and word.deletion_paths[0].style_class != ''\
                            and self.style_dict.get(word.deletion_paths[0].style_class) is not None:
                                color = Color.create_cls_from_style_object(self.style_dict.get(word.deletion_paths[0].style_class))
                            else:
                                color = Color()
                        style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\
                                create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] )
                    transkription_position.style = style_dictionary[(style_class_key, word.deleted)]
                    #print(style_dictionary[(style_class_key, word.deleted)])
                else:
                    if style_dictionary.get(style_class_key) is None:
                        style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css)
                        style_dictionary[style_class_key].writing_process_id = style_class_key[1]
                    transkription_position.style = style_dictionary[style_class_key]
                if add_to_parents and transkription_position.style not in word.styles:
                    word.styles.append(transkription_position.style)
        # NOTE(review): indentation reconstructed from a whitespace-mangled
        # source — partitioning is assumed to happen per word; confirm.
        if partition_according_to_styles:
            word.split_according_to_status('style', splits_are_parts=True)
    if manuscript is not None\
    and add_to_parents:
        manuscript.update_styles(*style_dictionary.values())
def __eq__(self, other):
    """Returns true if self is qualitatively identical to other.

    Two pages are equal when both lack a page_tree and share the same
    number, or when both page trees were parsed from the same URL.
    """
    if other is None:
        return False
    self_has_tree = self.page_tree is not None
    other_has_tree = other.page_tree is not None
    if not self_has_tree and not other_has_tree:
        return self.number == other.number
    if self_has_tree and other_has_tree:
        return self.page_tree.docinfo.URL == other.page_tree.docinfo.URL
    return False
def __hash__(self):
    """Return a hash value for self.

    Hashes the docinfo URL of the page tree; falls back to the page number
    when the tree is None or the attribute is missing entirely.
    """
    try:
        tree = self.page_tree
    except AttributeError:
        # debug aid: a page without a page_tree attribute is unexpected
        print(self)
        return hash(self.number)
    if tree is None:
        return hash(self.number)
    return hash(tree.docinfo.URL)
Index: svgscripts/datatypes/imprint.py
===================================================================
--- svgscripts/datatypes/imprint.py (revision 0)
+++ svgscripts/datatypes/imprint.py (revision 110)
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This program can be used to extract imprints from a svg file.
+"""
+# Copyright (C) University of Basel 2021 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+import re
+import sys
+from os import listdir, sep, path
+from os.path import isfile, isdir, dirname
+import lxml.etree as ET
+import warnings
+
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+from .attachable_object import AttachableObject
+from .atypical_writing import AtypicalWriting
+from .clarification import Clarification
+from .editor_correction import EditorCorrection
+from .line_continuation import LineContinuation
+from .matrix import Matrix
+from .standoff_tag import StandoffTag
+from .text import Text
+from .transkriptionField import TranskriptionField
+from .uncertain_decipherment import UncertainDecipherment
+from .footnotes import FootnoteColumns
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+from xml_conform_dictionary import XMLConformDictionary
+
+UNITTESTING = False
+DEBUG = False
+
+class Imprint(SemanticClass,AttachableObject):
+    """This class represents an imprint, i.e. the reference to the printed version of the corresponding manuscript.
+    """
+    # NOTE(review): patterns are not raw strings — consider r'...' to avoid invalid-escape warnings.
+    START_END_LINE_PATTERN = re.compile('^(\d+)(-)(\d+)(:\s*)(.*)')
+    COMMA_LINE_PATTERN = re.compile('^(\d+)(,)(\d+)(-)(\d+)(:\s*)(.*)')
+    LINE_PATTERN = re.compile('^(((\d+,)*\d+-)*\d+)(:\s)(.*)')
+    XML_TAG = 'imprint'
+    DEBUG = False
+
+    def __init__(self, reference=None, lines=None, line_list_string='', id=0):
+        # id: ordinal of the imprint on the page
+        self.id = id
+        # reference: signature of the printed version this imprint points to
+        self.reference = reference
+        self.lines = lines if lines is not None else []
+        # raw line list (e.g. "1,3-5") kept for xml serialization
+        self.line_list_string = line_list_string
+
+    def attach_object_to_tree(self, target_tree):
+        """Attach object to tree.
+        """
+        obj_node = self.get_or_create_node_with_id(target_tree)
+        # NOTE(review): self.reference may be None here; lxml's set() needs a
+        # string — confirm callers always provide a reference.
+        obj_node.set('reference', self.reference)
+        if self.line_list_string != '':
+            obj_node.set('line-list-string', self.line_list_string)
+
+    @classmethod
+    def create_cls_from_node(cls, node, lines):
+        """Initialize a cls from node.
+
+        [:return:] cls
+        """
+        reference = node.get('reference')
+        line_list_string = node.get('line-list-string')\
+                if bool(node.get('line-list-string')) else ''
+        # NOTE(review): the node's id is not restored (new instance keeps the
+        # default id=0), and line_list_string is not passed on — confirm.
+        return cls(reference=reference, lines=get_lines(lines, line_list_string))
+
+    @classmethod
+    def extract_cls(cls, lines, raw_node, namespaces, id=0):
+        """Return an instance of cls by extractign information from raw svg node.
+        """
+        raw_string = ''.join(raw_node.xpath('./ns:tspan/text()', namespaces=namespaces))
+        line_match = re.match(cls.LINE_PATTERN, raw_string)
+        if line_match is not None:
+            # groups()[-1] = text after the colon (the reference),
+            # groups()[0] = the line list, e.g. "1,3-5"
+            return cls(line_match.groups()[-1], line_list_string=line_match.groups()[0], id=id)
+        else:
+            return cls(raw_string, id=id)
+
+    @classmethod
+    def get_semantic_dictionary(cls):
+        """ Creates a semantic dictionary as specified by SemanticClass.
+        """
+        properties = {}
+        properties.update(cls.create_semantic_property_dictionary('reference', str,\
+            name='imprintHasReference', label='imprint refers to the signature of the printed version of the manuscript'))
+        properties.update(cls.create_semantic_property_dictionary('lines', list,\
+            name='imprintRefersToLines', label='the printed version of the manuscript concerns this list of lines'))
+        dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
+        return cls.return_dictionary_after_updating_super_classes(dictionary)
+
+def get_lines(lines, line_list_string='') ->list:
+    """Returns a list of lines that correspond to the lines that are imprinted
+
+    Args:
+        lines: all Line objects of the page.
+        line_list_string: e.g. "3", "3-5" or "1,3-5" (comma lists recurse).
+    """
+    # TODO create tln:partOfPageTextUnit for each line range
+    if line_list_string == '':
+        return []
+    relevant_lines = []
+    # comma-separated list: resolve each sub-range recursively
+    if re.match(r'(.*\d+)(,)(\d+.*)', line_list_string):
+        for line_list_sub_string in line_list_string.split(','):
+            relevant_lines += get_lines(lines, line_list_string=line_list_sub_string)
+        return relevant_lines
+    multi_line_match = re.match(r'(\d+)(-)(\d+)', line_list_string)
+    single_line_match = re.match(r'^\d+$', line_list_string)
+    if multi_line_match is not None:
+        # inclusive range "start-end"
+        start_segment = int(multi_line_match.groups()[0])
+        end_segment = int(multi_line_match.groups()[2])
+        return [ line for line in lines if line.id >= start_segment and line.id <= end_segment ]
+    elif single_line_match is not None:
+        return [ line for line in lines if line.id == int(single_line_match.group()) ]
+    # unparseable line list: empty result
+    return relevant_lines
+
+def extract_imprints(page, transkription_field=None, svg_tree=None) ->list:
+    """Returns a list of imprints.
+
+    Looks for text nodes beneath the transkription field and turns every
+    node sharing the x position of the first such node into an Imprint.
+    """
+    # prefer the separate marginals file when the page has one
+    if page.marginals_source is not None:
+        svg_tree = ET.parse(page.marginals_source)
+        if transkription_field is None:
+            transkription_field = TranskriptionField(page.source)
+    if svg_tree is None and page.source is not None:
+        svg_tree = ET.parse(page.source)
+    if transkription_field is None:
+        transkription_field = TranskriptionField(svg_tree.docinfo.URL)
+    nodes_beneath_tf = [ item for item in filter(lambda node: Matrix.IS_BENEATH_TF(Matrix(transform_matrix_string=node.get('transform')), transkription_field),\
+        svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
+    if len(nodes_beneath_tf) == 0:
+        return []
+    x = Matrix(transform_matrix_string=nodes_beneath_tf[0].get('transform')).getX()
+    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+    # NOTE(review): exact float equality `.getX() == x` is fragile for svg
+    # coordinates — consider a tolerance; confirm inputs are exact copies.
+    imprints = [ Imprint.extract_cls(page.lines, node, namespaces, id=i) for (i, node) in enumerate([ node for node in svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap)\
+        if Matrix(transform_matrix_string=node.get('transform')).getX() == x ]) ]
+    return imprints
+
+
+
+if __name__ == "__main__":
+    # NOTE(review): no main() is defined in this module — running it as a
+    # script raises NameError; confirm the intended entry point.
+    sys.exit(main(sys.argv[1:]))
Index: svgscripts/datatypes/footnotes.py
===================================================================
--- svgscripts/datatypes/footnotes.py (revision 109)
+++ svgscripts/datatypes/footnotes.py (revision 110)
@@ -1,347 +1,347 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract footnotes from a svg file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
import warnings
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from .atypical_writing import AtypicalWriting
from .clarification import Clarification
from .editor_correction import EditorCorrection
from .line_continuation import LineContinuation
from .matrix import Matrix
from .standoff_tag import StandoffTag
from .text import Text
from .transkriptionField import TranskriptionField
from .uncertain_decipherment import UncertainDecipherment
UNITTESTING = False
DEBUG = False
class FootnoteColumns:
    """This class represents footnote columns.
    """
    # a footnote starting with a line reference like "7:" or "12-13:"
    REFERENCE_PATTERN = re.compile('.*(\d+-)*[0-9]+:')
    # also allows "12/13:" style references
    EXTENDED_REFERENCE_PATTERN = re.compile('.*(\d+(-|/))*[0-9]+:')
    # group(1): text before the reference, group(2): the reference itself
    REFERENCE_GROUP = re.compile('(.*\D)((\d+-)*[0-9]+:)')
    EXCEPTION = re.compile('((\d+/)+[0-9]+:)')

    def __init__(self, nsmap, nodes, bottom_values, style_dict, debug=False, skip_after=-1.0):
        """
        Args:
            nsmap: namespace map of the svg tree.
            nodes: text nodes located in the footnote area.
            bottom_values: sorted unique y values of those nodes.
            style_dict: style dictionary used for standoff markup.
            skip_after: ignore nodes with x position <= this value.
        """
        self.bottom_values = bottom_values
        self.footnote_columns = []
        self.footnote_keys = {}
        self.index = 0
        self.nodes = nodes
        self.nsmap = nsmap
        self.skip_after = skip_after
        self.style_dict = style_dict
        self.debug = debug
        self._init_columns()
def _init_columns(self):
    """Initialize footnote column positions
    by creating lists in self.footnote_columns and adding the positions a keys
    to self.footnote_keys while the index of self.footnote_columns are their values.

    Raises: Exception when no column key could be derived from the first line.
    """
    # nodes of the first footnote line, left to right, beyond skip_after
    first_line_fn_nodes = sorted([ item for item in self.nodes\
            if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == round(self.bottom_values[0], 1)\
            and Matrix(transform_matrix_string=item.get('transform')).getX() > self.skip_after],\
            key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX())
    current_nodes = []
    for node in first_line_fn_nodes:
        matrix = Matrix(transform_matrix_string=node.get('transform'))
        # collect (x, text) pairs from tspan children or the node itself
        if len(node.getchildren()) > 0:
            for tspan in node.findall('tspan', self.nsmap):
                x = matrix.add2X(float(tspan.get('x')))
                current_nodes.append({ 'x': x, 'text': tspan.text })
        elif node.text is not None:
            x = matrix.getX()
            current_nodes.append({ 'x': x, 'text': node.text })
        # once the collected text forms a line reference, a new column starts
        if re.match(self.EXTENDED_REFERENCE_PATTERN,\
                ''.join([ item.get('text') for item in current_nodes])):
            current_nodes = self._remove_unused_texts(current_nodes)
            self.footnote_columns.append([])
            # key: rounded x of the column start -> column index
            self.footnote_keys.update({ round(current_nodes[0].get('x')): len(self.footnote_columns)-1 })
            current_nodes = []
    if len(self.footnote_keys) == 0:
        raise Exception(f'ERROR: there are no footnote_keys')
def _remove_unused_texts(self, nodes):
    """Remove tspan that contain text that is not a line reference.

    Drops leading items that belong to the text before the reference
    (REFERENCE_GROUP group(1)); when a large horizontal gap (> threshold)
    follows, everything up to the gap is dropped as well.
    """
    threshold = 100  # svg units; gap larger than this separates unrelated text
    node_text = ''.join([ item.get('text') for item in nodes])
    match = re.match(self.REFERENCE_GROUP, node_text)
    if match is not None and match.group(1) is not None\
    and not re.match(self.EXCEPTION, node_text):
        unused_text = ''
        index = 0
        # advance index past the items that only contain pre-reference text
        for item in nodes:
            unused_text += item.get('text')
            if match.group(1).startswith(unused_text):
                index += 1
            else:
                break
        if len(nodes) > index+1:
            counter = 0
            has_gap = False
            # look for a big x gap after the pre-reference text
            for item in nodes[index:]:
                if len(nodes) > index+counter+1\
                and nodes[index+counter+1].get('x')-nodes[index+counter].get('x') > threshold:
                    index += counter+1
                    has_gap = True
                    break
                counter += 1
            if has_gap:
                return nodes[index+1:]
        return nodes[index:]
    return nodes
def append(self, footnote):
    """Append footnote to a column

    The footnote goes into the column last selected via register_index
    (self.index).
    """
    self.footnote_columns[self.index].append(footnote)
@classmethod
def create_cls(cls, style_dict=None, page=None, transkription_field=None, svg_tree=None, svg_file=None, marginals_on_extra_page=False, skip_after=-1.0):
    """Returns all footnotes as a list of Text.

    Builds a FootnoteColumns instance from a page and/or svg source.
    Returns None when no node with a bottom value exists in the footnote
    area.
    """
    # prefer the separate marginals file over the page source when present
    if page is not None and page.source is not None and svg_file is None:
        svg_file = page.source\
                if page.marginals_source is None\
                else page.marginals_source
    if transkription_field is None and svg_file is not None:
        multipage_index = -1\
                if page is None\
                else page.multipage_index
        transkription_field = TranskriptionField(svg_file, multipage_index=multipage_index)
    if svg_tree is None and svg_file is not None:
        svg_tree = ET.parse(svg_file)
    if style_dict is None and page is not None:
        style_dict = StandoffTag.create_relevant_style_dictionary(page)
    # marginals on an extra page: parse that file (overrides svg_tree)
    if page is not None and page.marginals_source is not None:
        marginals_on_extra_page = True
        svg_tree = ET.parse(page.marginals_source)
    nodes_in_footnote_area = cls.EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field, marginals_on_extra_page=marginals_on_extra_page)
    bottom_values = cls.GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area)
    if len(bottom_values) == 0:
        return None
    else:
        return cls(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, style_dict, skip_after=skip_after)
def extract_footnotes(self, contains_string='', contains_strings=None) -> list:
    """Returns all footnotes as a list of Text.

    Processes the nodes line by line (by bottom value, left to right),
    accumulating them into footnotes, then optionally filters the result by
    substring(s).
    """
    left_value = -1
    for bottom_value in self.bottom_values:
        nodes_on_line = sorted([ item for item in self.nodes\
                if round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1) == bottom_value\
                and Matrix(transform_matrix_string=item.get('transform')).getX() > self.skip_after\
                ],\
                key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
        footnote = None
        matrix = None
        for node in nodes_on_line:
            matrix = Matrix(transform_matrix_string=node.get('transform'))
            footnote, left_value = self._process_content_and_markup(node, footnote, matrix)
        # flush the footnote still open at the end of the line
        if footnote is not None:
            self.append(footnote)
    footnotes = self.toList()
    if contains_strings is not None:
        footnotes = [ footnote for footnote in footnotes if True in [ contains_string in footnote.content for contains_string in contains_strings] ]
    if contains_string != '':
        footnotes = [ footnote for footnote in footnotes if contains_string in footnote.content ]
    return footnotes
def get_index(self, left_value) -> int:
"""Return index of column for left value.
"""
index = -1
if round(left_value) in self.footnote_keys.keys():
index = self.footnote_keys[round(left_value)]
else:
for key, value in self.footnote_keys.items():
if abs(key - round(left_value)) < 2:
index = value
break
return index
def register_index(self, left_value):
"""Register index for next column to be used.
"""
index = self.get_index(left_value)
if index > -1:
self.index = index
else:
error_value = round(left_value)
msg = f'Left value not part of columns: {error_value} -> {self.footnote_keys}'
raise Exception(msg)
def toList(self):
    """Return footnotes as a list of Text.

    Footnotes starting with a line reference are kept as-is; continuation
    texts are joined onto the previous footnote.

    Raises: Exception when a continuation appears before any footnote.
    """
    footnotes = []
    for footnote_list in self.footnote_columns:
        for footnote in footnote_list:
            if re.match(self.REFERENCE_PATTERN, footnote.content):
                footnotes.append(footnote)
            elif len(footnotes) > 0:
                # no reference: treat as continuation of the last footnote
                footnotes[-1].join(footnote)
            else:
                # debug output before failing on an orphaned continuation
                print([ footnote.content for footnote in self.footnote_columns[1]])
                print(self.footnote_keys)
                raise Exception(f'List of footnotes empty and footnote "{footnote.content}" does not match {self.REFERENCE_PATTERN.pattern}!')
    return footnotes
def _process_content_and_markup(self, node, footnote, matrix):
    """Process content and markup of node.
    [:return:] (footnote: Text, left_value: float)

    Accumulates node (or tspan) text into the current footnote, closing the
    footnote and opening a new one when a new line reference or a column
    change is detected; standoff markup is derived from the node's class.
    """
    startIndex = 0
    next_text = node.text
    left_value = matrix.getX()
    items = [ item for item in node.findall('tspan', self.nsmap)]
    if len(items) > 0:
        next_text = ''.join([ item.text for item in items])
        left_value = matrix.add2X(float(items[0].get('x')))
    elif bool(node.get('x')):
        left_value = matrix.add2X(float(node.get('x')))
    # close the current footnote when a new reference starts or the column changes
    if footnote != None and\
    ((re.match(r'.*[0-9]+:', next_text)\
        and re.match(r'.*[0-9]+:', footnote.content)\
        and not re.match(r'.*\d-', footnote.content))\
    or (self.get_index(left_value) > -1\
        and self.get_index(left_value) != self.index)):
        if DEBUG and re.match(r'.*[0-9]+:', next_text)\
        and not re.match(r'.*[0-9]+:', footnote.content):
            print(footnote, next_text)
        self.append(footnote)
        footnote = None
    if len(items) > 0:
        # recurse into the tspan children
        for item in items:
            footnote, left_value = self._process_content_and_markup(item, footnote, matrix)
    else:
        if footnote is None:
            footnote = Text(content=next_text)
            try:
                self.register_index(left_value)
            except Exception:
                print(self.footnote_columns)
                raise Exception(f'{footnote}')
        else:
            startIndex = footnote.append(next_text)
        if bool(node.get('class')):
            # diff residue: revision 110 changes the end index to len(footnote.content)-1
            - standoff_markups = StandoffTag.create_cls(startIndex, len(footnote.content), node.get('class'), style_dict=self.style_dict)
            + standoff_markups = StandoffTag.create_cls(startIndex, len(footnote.content)-1, node.get('class'), style_dict=self.style_dict)
            if len(standoff_markups) > 0:
                if len(footnote.standoff_markups) > 0:
                    # merge adjacent markup of the same kind
                    standoff_markups = footnote.standoff_markups[-1].join_list(standoff_markups)
                if len(standoff_markups) > 0:
                    footnote.standoff_markups += standoff_markups
    return footnote, left_value
@staticmethod
def EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field=None, marginals_on_extra_page=False) ->list:
    """Return a list of nodes that are in footnote area.

    Nodes that only partially reach into the footnote area keep just the
    children whose x position lies inside it.
    """
    if transkription_field is None and svg_tree is not None:
        transkription_field = TranskriptionField(svg_tree.docinfo.URL)
    nodes_in_footnote_area = [ item for item in filter(lambda node: Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, transkription_field,\
            marginals_on_extra_page=marginals_on_extra_page),\
            svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
    for node in nodes_in_footnote_area:
        if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, marginals_on_extra_page=marginals_on_extra_page):
            # NOTE(review): children are removed while iterating getchildren();
            # safe if getchildren() returns a list copy — confirm for lxml.
            for child in node.getchildren():
                if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, x=float(child.get('x')), marginals_on_extra_page=marginals_on_extra_page):
                    node.remove(child)
    return nodes_in_footnote_area
@staticmethod
def GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area) ->list:
    """Return sorted list of unique bottom values.

    A node's bottom value is the y coordinate of its transform matrix,
    rounded to one decimal place.
    """
    unique_values = { round(Matrix(transform_matrix_string=node.get('transform')).getY(), 1)\
            for node in nodes_in_footnote_area }
    return sorted(unique_values)
def extract_footnotes_as_strings(transkription_field=None, svg_tree=None, svg_file=None, contains_string='', marginals_extra=False):
    """Returns all footnotes as a list of strings.

    Nodes in the footnote area are grouped by their bottom (y) value and
    read left to right; a new footnote begins whenever a text starts with a
    line reference like "12:".
    """
    if transkription_field is None and svg_file is not None:
        transkription_field = TranskriptionField(svg_file)
    if svg_tree is None and svg_file is not None:
        svg_tree = ET.parse(svg_file)
    footnotes = []
    nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra),\
            svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
    bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ])
    for bottom_value in bottom_values:
        nodes_on_line = [ item for item in nodes_in_footnote_area if Matrix(transform_matrix_string=item.get('transform')).getY() == bottom_value ]
        nodes_on_line = sorted(nodes_on_line, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
        footnote_string = ''
        for node in nodes_on_line:
            if len(node.getchildren()) == 0:
                # NOTE(review): node.text may be None — would raise TypeError
                # here; confirm the footnote area never contains empty nodes.
                if footnote_string != '' and re.match(r'.*[0-9]+:', node.text):
                    footnotes.append(footnote_string)
                    footnote_string = node.text
                else:
                    footnote_string += node.text
            else:
                next_string = ''.join([ item.text for item in node.findall('tspan', svg_tree.getroot().nsmap)])
                if footnote_string != '' and re.match(r'.*[0-9]+:', next_string):
                    footnotes.append(footnote_string)
                    footnote_string = next_string
                else:
                    footnote_string += next_string
        # flush the last footnote of the line
        footnotes.append(footnote_string)
    if contains_string != '':
        footnotes = [ footnote_string for footnote_string in footnotes if contains_string in footnote_string ]
    return footnotes
def extract_footnotes(page, transkription_field=None, svg_tree=None, svg_file=None, contains_string='', contains_strings=None, skip_after=-1.0) ->list:
    """Returns all footnotes as a list of Text.

    Delegates to FootnoteColumns; when the page keeps its marginals in a
    separate file, that file is parsed instead of svg_tree.
    """
    marginals_on_extra_page = False
    if page.marginals_source is not None:
        marginals_on_extra_page = True
        svg_tree = ET.parse(page.marginals_source)
        if transkription_field is None:
            transkription_field = TranskriptionField(page.source)
    footnote_columns = FootnoteColumns.create_cls(page=page, transkription_field=transkription_field,\
            svg_tree=svg_tree, svg_file=svg_file, marginals_on_extra_page=marginals_on_extra_page, skip_after=skip_after)
    if footnote_columns is None:
        return []
    return footnote_columns.extract_footnotes(contains_string=contains_string, contains_strings=contains_strings)

if __name__ == "__main__":
    # NOTE(review): no main() is defined in this module — running it as a
    # script raises NameError; confirm the intended entry point.
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/datatypes/simple_word.py
===================================================================
--- svgscripts/datatypes/simple_word.py (revision 109)
+++ svgscripts/datatypes/simple_word.py (revision 110)
@@ -1,124 +1,139 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent a simple word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import abc
from lxml import etree as ET
+import re
import sys
from .line import Line
from .faksimile_position import FaksimilePosition
from .transkription_position import TranskriptionPosition
from .word_position import WordPosition
sys.path.append('py2ttl')
from class_spec import SemanticClass
class SimpleWord(SemanticClass, metaclass=abc.ABCMeta):
    """
    This class represents a simple word.
    """
    # diff residue: revision 110 adds punctuation patterns for clean_text
    # PUNCTUATION_PATTERN matches a single leading OR trailing punctuation char;
    # FIND_PUNCTUATION_PATTERN tests whether any such char is present at either end.
    + PUNCTUATION_PATTERN = re.compile('(^[\.\?,\!;:\-_–()“„]|[\.\?,\!;:\-_–()“„]$)')
    + FIND_PUNCTUATION_PATTERN = re.compile('(^[\.\?,\!;:\-_–()“„]|.*[\.\?,\!;:\-_–()“„]$)')
    XML_TAG = 'simple-word'
    XML_SUB_TAG = 'content'
def __init__(self, id=0, line_number=-1, line=None, text='', deleted=False, transkription_positions=None, faksimile_positions=None):
self.id = id
self.text = text
+ self.clean_text = self._create_clean_text(text)
self.line_number = line_number
self.lines = []
if line is not None:
self.lines.append(line)
self.transkription_positions = transkription_positions if transkription_positions is not None else []
self.faksimile_positions = faksimile_positions if faksimile_positions is not None else []
def attach_word_to_tree(self, target_tree):
"""Attaches word to tree target_tree.
"""
if target_tree.__class__.__name__ == '_ElementTree':
target_tree = target_tree.getroot()
if len(target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)) > 0:
word_node = target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)[0]
word_node.getparent().remove(word_node)
word_node = ET.SubElement(target_tree, self.XML_TAG, attrib={'id': str(self.id)})
word_node.set('text', self.text)
if self.line_number > -1:
word_node.set('line-number', str(self.line_number))
for id, transkription_position in enumerate(self.transkription_positions):
transkription_position.id = id
transkription_position.attach_object_to_tree(word_node)
for faksimile_position in self.faksimile_positions:
faksimile_position.attach_object_to_tree(word_node)
return word_node
+ def _create_clean_text(self, text: str) ->str:
+ """Creates a text without any punctuation chars.
+ """
+ if len(text) < 2\
+ or (len(text) < 3 and re.match(self.FIND_PUNCTUATION_PATTERN, text[0]) is None)\
+ or re.match(self.FIND_PUNCTUATION_PATTERN, text) is None:
+ return text
+ return self._create_clean_text(re.sub(self.PUNCTUATION_PATTERN, '', text))
+
@classmethod
def create_cls(cls, word_node):
"""Creates a cls from a (lxml.Element) node.
[:return:] cls
"""
if word_node is not None: # init word from xml node
id = int(word_node.get('id'))
line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else -1
text = word_node.get('text')
transkription_positions = [ TranskriptionPosition(id=id, node=node) for id, node in enumerate(word_node.findall('./' + WordPosition.TRANSKRIPTION)) ]
faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('./' + WordPosition.FAKSIMILE) ]
return cls(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
faksimile_positions=faksimile_positions)
else:
error_msg = 'word_node has not been defined'
raise Exception('Error: {}'.format(error_msg))
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = { 'lines': {cls.CLASS_KEY: Line,\
cls.CARDINALITY: 1,\
cls.CARDINALITY_RESTRICTION: 'minCardinality',\
cls.PROPERTY_NAME: 'wordBelongsToLine',\
cls.PROPERTY_LABEL: 'word belongs to a line',\
cls.PROPERTY_COMMENT: 'Relating a word to a line.'}}
properties.update(cls.create_semantic_property_dictionary('transkription_positions', TranskriptionPosition,\
name='hasTranskriptionPosition', cardinality=1, cardinality_restriction='minCardinality'))
properties.update(cls.create_semantic_property_dictionary('faksimile_positions', FaksimilePosition,\
name='hasFaksimilePosition', cardinality=1, cardinality_restriction='minCardinality'))
properties.update(cls.create_semantic_property_dictionary('text', str, cardinality=1,\
subPropertyOf=cls.HOMOTYPIC_HAS_TEXT_URL_STRING))
+ properties.update(cls.create_semantic_property_dictionary('clean_text', str, cardinality=1,\
+ name='hasCleanText', label='text without punctuation', comment='text of word without punctuation except abbreviations'))
dictionary.update({cls.CLASS_KEY: class_dict})
dictionary.update({cls.PROPERTIES_KEY: properties})
return cls.return_dictionary_after_updating_super_classes(dictionary)
def init_word(self, page):
"""Initialize word with objects from page.
"""
if self.line_number > -1:
self.lines += [ line for line in page.lines if line.id == self.line_number ]
elif 'word_parts' in self.__dict__.keys() and len(self.word_parts) > 0:
self.lines += [ line for line in page.lines if line.id in [ wp.line_number for wp in self.word_parts ] ]
Index: svgscripts/datatypes/super_page.py
===================================================================
--- svgscripts/datatypes/super_page.py (revision 109)
+++ svgscripts/datatypes/super_page.py (revision 110)
@@ -1,295 +1,296 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a super page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename, dirname
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import sys
import warnings
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .mark_foreign_hands import MarkForeignHands
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .writing_process import WritingProcess
class SuperPage:
"""
This super class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
xml_target_file (str): name of the xml file to which page info will be written.
"""
FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
ADD2Y = 7
PAGE_RECTO = 'recto'
PAGE_VERSO = 'verso'
STATUS_MERGED_OK = 'faksimile merged'
STATUS_POSTMERGED_OK = 'words processed'
UNITTESTING = False
XML_TAG = 'page'
def __init__(self, xml_file, title=None, page_number='', orientation='North', multipage_index=-1, page_type=PAGE_VERSO, should_xml_file_exist=False):
self.properties_dictionary = {\
'faksimile_image': (FaksimileImage.XML_TAG, None, FaksimileImage),\
'faksimile_svgFile': ('data-source/@file', None, str),\
'multipage_index': ('page/@multipage-index', multipage_index, int),\
'marginals_source': ('page/@marginals-source', None, str),\
'number': ('page/@number', str(page_number), str),\
'orientation': ('page/@orientation', orientation, str),\
'page_type': ('page/@pageType', page_type, str),\
'pdfFile': ('pdf/@file', None, str),\
'source': ('page/@source', None, str),\
'svg_file': ('svg/@file', None, str),\
'svg_image': (SVGImage.XML_TAG, None, SVGImage),\
'text_field': (FaksimileImage.XML_TAG + '/' + TextField.XML_TAG, None, TextField),\
'title': ('page/@title', title, str),\
}
self.bak_file = None
self.online_properties = []
+ self.imprints = []
self.line_numbers = []
self.lines = []
self.mark_foreign_hands = []
self.page_tree = None
self.sonderzeichen_list = []
self.style_dict = {}
self.text_connection_marks = []
self.word_deletion_paths = []
self.word_insertion_marks = []
self.words = []
self.writing_processes = []
self.xml_file = xml_file
if not self.is_page_source_xml_file():
msg = f'ERROR: xml_source_file {self.xml_file} is not of type "{FILE_TYPE_SVG_WORD_POSITION}"'
raise Exception(msg)
self._init_tree(should_xml_file_exist=should_xml_file_exist)
def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
"""Adds a list of classes that are sonderzeichen and a style dictionary to page.
"""
self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else []
self.letterspacing_list = letterspacing_list if letterspacing_list is not None else []
self.style_dict = style_dict if style_dict is not None else {}
if style_node is not None:
self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') }
self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\
if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ]
self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\
if bool(item.get('letterspacing-list')) ]
elif bool(self.style_dict):
for node in self.page_tree.xpath('//style'): node.getparent().remove(node)
style_node = ET.SubElement(self.page_tree.getroot(), 'style')
if len(self.sonderzeichen_list) > 0:
style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
if len(self.letterspacing_list) > 0:
style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
for key in self.style_dict.keys():
self.style_dict[key]['name'] = key
ET.SubElement(style_node, 'class', attrib=self.style_dict[key])
fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value }
fontsizes = sorted(fontsize_dict.values(), reverse=True)
# create a mapping between fontsizes and word stages
self.fontsizekey2stage_mapping = {}
for fontsize_key, value in fontsize_dict.items():
if value >= fontsizes[0]-1:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION })
elif value <= fontsizes[-1]+1:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION })
else:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION })
def get_biggest_fontSize4styles(self, style_set={}):
"""Returns biggest font size from style_dict for a set of style class names.
[:returns:] (float) biggest font size OR 1 if style_dict is empty
"""
if bool(self.style_dict):
sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True)
return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1
else:
return 1
def get_line_number(self, y):
"""Returns line number id for element at y.
[:return:] (int) line number id or -1
"""
if len(self.line_numbers) > 0:
result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ]
return result_list[0] if len(result_list) > 0 else -1
else:
return -1
def init_all_properties(self, overwrite=False):
"""Initialize all properties.
"""
for property_key in self.properties_dictionary.keys():
if property_key not in self.online_properties:
self.init_property(property_key, overwrite=overwrite)
def init_property(self, property_key, value=None, overwrite=False):
"""Initialize all properties.
Args:
property_key: key of property in self.__dict__
value: new value to set to property
overwrite: whether or not to update values from xml_file (default: read only)
"""
if value is None:
if property_key not in self.online_properties:
xpath, value, cls = self.properties_dictionary.get(property_key)
if len(self.page_tree.xpath('//' + xpath)) > 0:
value = self.page_tree.xpath('//' + xpath)[0]
if value is not None:
if cls.__module__ == 'builtins':
self.update_tree(value, xpath)
self.__dict__.update({property_key: cls(value)})
else:
value = cls(node=value)\
if type(value) != cls\
else value
self.__dict__.update({property_key: value})
self.__dict__.get(property_key).attach_object_to_tree(self.page_tree)
else:
self.__dict__.update({property_key: value})
self.online_properties.append(property_key)
elif overwrite or property_key not in self.online_properties:
xpath, default_value, cls = self.properties_dictionary.get(property_key)
if cls.__module__ == 'builtins':
self.__dict__.update({property_key: cls(value)})
self.update_tree(value, xpath)
else:
self.__dict__.update({property_key: value})
self.__dict__.get(property_key).attach_object_to_tree(self.page_tree)
self.online_properties.append(property_key)
def is_locked(self):
"""Return true if page is locked.
"""
return len(self.page_tree.xpath('//metadata/lock')) > 0
def is_page_source_xml_file(self, source_tree=None):
"""Return true if xml_file is of type FILE_TYPE_SVG_WORD_POSITION.
"""
if not isfile(self.xml_file):
return True
if source_tree is None:
source_tree = ET.parse(self.xml_file)
return source_tree.getroot().find('metadata/type').text == self.FILE_TYPE_SVG_WORD_POSITION
def lock(self, reference_file, message=''):
"""Lock tree such that ids of words etc. correspond to ids
in reference_file, optionally add a message that will be shown.
"""
if not self.is_locked():
metadata = self.page_tree.xpath('./metadata')[0]\
if len(self.page_tree.xpath('./metadata')) > 0\
else ET.SubElement(self.page_tree.getroot(), 'metadata')
lock = ET.SubElement(metadata, 'lock')
ET.SubElement(lock, 'reference-file').text = reference_file
if message != '':
ET.SubElement(lock, 'message').text = message
def unlock(self):
"""Lock tree such that ids of words etc. correspond to ids
in reference_file, optionally add a message that will be shown.
"""
if self.is_locked():
lock = self.page_tree.xpath('//metadata/lock')[0]
lock.getparent().remove(lock)
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
"""Update word ids and attach them to page.page_tree.
"""
if not self.is_locked():
update_function_on_word = [ update_function_on_word ]\
if type(update_function_on_word) != list\
else update_function_on_word
for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
node.getparent().remove(node)
for index, word in enumerate(self.words):
word.id = index
for func in update_function_on_word:
if callable(func):
func(word)
word.attach_word_to_tree(self.page_tree)
for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
mark_foreign_hands.id = index
if MarkForeignHands in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(mark_foreign_hands)
mark_foreign_hands.attach_word_to_tree(self.page_tree)
for index, text_connection_mark in enumerate(self.text_connection_marks):
text_connection_mark.id = index
if TextConnectionMark in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(text_connection_mark)
text_connection_mark.attach_word_to_tree(self.page_tree)
else:
print('locked')
def update_property_dictionary(self, property_key, default_value):
"""Update properties_dictionary.
"""
content = self.properties_dictionary.get(property_key)
if content is not None:
self.properties_dictionary.update({property_key: (content[0], default_value, content[2])})
else:
msg = f'ERROR: properties_dictionary does not contain a key {property_key}!'
raise Exception(msg)
def update_tree(self, value, xpath):
"""Update tree.
"""
node_name = dirname(xpath)
node = self.page_tree.xpath('//' + node_name)[0]\
if len(self.page_tree.xpath('//' + node_name)) > 0\
else ET.SubElement(self.page_tree.getroot(), node_name)
node.set(basename(xpath).replace('@', ''), str(value))
def _init_tree(self, should_xml_file_exist=False):
"""Initialize page_tree from xml_file if it exists.
"""
if isfile(self.xml_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(self.xml_file, parser)
elif not should_xml_file_exist:
self.page_tree = ET.ElementTree(ET.Element('page'))
self.page_tree.docinfo.URL = self.xml_file
else:
msg = f'ERROR: xml_source_file {self.xml_file} does not exist!'
raise FileNotFoundError(msg)
Index: tests_svgscripts/test_util.py
===================================================================
--- tests_svgscripts/test_util.py (revision 109)
+++ tests_svgscripts/test_util.py (revision 110)
@@ -1,256 +1,260 @@
import unittest
from os import sep, path, remove, listdir
from os.path import isdir, isfile, dirname, basename
import shutil
import sys
import lxml.etree as ET
import sys
import tempfile
import warnings
sys.path.append('svgscripts')
import util
from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT
from datatypes.faksimile import FaksimilePage
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.positional_word_part import PositionalWordPart
from datatypes.text_field import TextField
from datatypes.transkriptionField import TranskriptionField
from datatypes.word_position import WordPosition
from datatypes.word import Word
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
sys.path.append('fixes')
from fix_old_data import save_page
class TestCopy(unittest.TestCase):
def setUp(self):
util.UNITTESTING = True
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.test_dir = DATADIR
self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
self.image = DATADIR + sep + 'image.jpg'
self.svg_testrecord = DATADIR + sep + 'TESTRECORD.svg'
self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
self.Mp_XIV_page420 = DATADIR + sep + 'Mp_XIV_page420.xml'
self.tmp_dir = tempfile.mkdtemp()
def test_copy(self):
tmp_image = self.tmp_dir + sep + basename(self.image)
target_file = 'asdf.svg'
shutil.copy(self.image, self.tmp_dir)
util.copy_faksimile_svg_file(target_file, faksimile_source_file=self.faksimile_file,\
target_directory=self.tmp_dir, local_image_path=tmp_image)
self.assertEqual(isfile(self.tmp_dir + sep + target_file), True)
util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_file,\
target_directory=self.tmp_dir, local_image_path=tmp_image)
self.assertEqual(isfile(self.tmp_dir + sep + basename(self.faksimile_file)), True)
with self.assertRaises(Exception):
util.copy_faksimile_svg_file()
with self.assertRaises(Exception):
util.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_source_file)
def test_copy_xml(self):
old_page = Page(self.xml_file)
xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
self.assertEqual(isfile(xml_file), True)
page = Page(xml_file)
self.assertEqual(len(page.words), len(old_page.words))
self.assertEqual(len(page.line_numbers), 0)
def test_create_highlighted_svg_file(self):
target_file = self.tmp_dir + sep + basename(self.faksimile_file)
tmp_image = self.tmp_dir + sep + basename(self.image)
faksimile_tree = ET.parse(self.faksimile_file)
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
node_ids = ['rect947', 'rect951', 'rect953', 'rect955', 'rect959', 'rect961', 'rect963']
highlight_color = 'blue'
util.create_highlighted_svg_file(faksimile_tree, node_ids, target_directory=self.tmp_dir, highlight_color=highlight_color, namespaces=namespaces)
self.assertEqual(isfile(target_file), True)
new_tree = ET.parse(target_file)
for node in new_tree.xpath('//ns:rect[@fill="{0}"]|//ns:path[@fill="{0}"]'.format(highlight_color), namespaces=namespaces):
node_ids.remove(node.get('id'))
self.assertEqual(len(node_ids), 0)
def test_get_empty_node_ids(self):
faksimile_tree = ET.parse(self.faksimile_file)
faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0]
empty_node_ids = util.get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page)
self.assertEqual('rect1085' in empty_node_ids, True)
def test_record_changes(self):
new_tree = ET.parse(self.faksimile_file)
old_tree = ET.parse(self.faksimile_file)
empty_node_id = 'rect1085'
title_node_id = 'test001'
namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() }
node = new_tree.xpath('//ns:rect[@id="{0}"]'.format(empty_node_id), namespaces=namespaces)[0]
title = ET.SubElement(node, 'title', attrib={ 'id': title_node_id })
title.text = 'test'
new_file = self.tmp_dir + sep + 'new.svg'
old_file = self.tmp_dir + sep + 'old.svg'
util.copy_faksimile_svg_file(target_file=new_file, faksimile_tree=new_tree)
util.copy_faksimile_svg_file(target_file=old_file, faksimile_tree=old_tree)
util.record_changes(old_file, new_file, [ empty_node_id ], namespaces=namespaces)
test_tree = ET.parse(old_file)
self.assertEqual(len(test_tree.xpath('//ns:rect[@id="{0}"]/ns:title[@id="{1}"]'.format(empty_node_id, title_node_id), namespaces=namespaces)), 1)
def test_replace_chars(self):
page = Page(self.xml_file)
faksimile_tree = ET.parse(self.faksimile_file)
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
word_position = WordPosition(id='rect1159', text='„Gedächtniß"')
wps, texts = util.replace_chars(page.words, [ word_position ])
self.assertEqual(texts[0].endswith('“'), True)
self.assertEqual(wps[0].text.endswith('“'), True)
word_position = WordPosition(id='rect1173', text='-')
wps, texts = util.replace_chars(page.words, [ word_position ])
self.assertEqual(wps[0].text.endswith('–'), True)
def test_mismatch_words(self):
page = Page(self.xml_file)
faksimile_tree = ET.parse(self.faksimile_file)
faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0]
page = Page('xml/N_VII_1_page174.xml')
faksimile_tree = ET.parse('faksimile_svg/N-VII-1,173et174.svg')
faksimile_page = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)[0]
self.assertEqual('-' in [ tp.text for tp in faksimile_page.word_positions], True)
wps, texts = util.replace_chars(page.words,faksimile_page.word_positions)
self.assertEqual('–' in texts, True)
self.assertEqual(len([ faksimile_position for faksimile_position in wps\
if faksimile_position.text == '–' ]), 4)
mismatching_words, mismatching_faksimile_positions = util.get_mismatching_ids(page.words, faksimile_page.word_positions)
self.assertEqual(len([word for word in mismatching_words if word.text.endswith('“') ]), 0)
self.assertEqual(len([word for word in mismatching_words if word.text.endswith('–') ]), 0)
def test_process_warnings(self):
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('default')
warnings.warn('Test1: asdf')
warnings.warn('Test2: asdf')
status = util.process_warnings4status(w, ['Test1', 'Test2' ], 'asdf', 'OK', status_prefix='with warnings')
#print(status)
self.assertTrue('Test1' in status.split(':'))
self.assertTrue('Test2' in status.split(':'))
@unittest.skip('test uses external program, has been tested')
def test_show_files(self):
list_of_files = [ self.test_dir + sep + file for file in listdir(self.test_dir) if file.endswith('pdf') ][0:2]
util.ExternalViewer.show_files(single_file=self.faksimile_file, list_of_files=list_of_files)
def test_record_changes_to_page(self):
page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 1 ])
old_length = len(page.words)
self.assertEqual(page.words[1].text, 'asdf')
self.assertEqual(page.words[1].transkription_positions[0].width, 353)
page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 13 ])
self.assertEqual(page.words[13].text, 'er')
self.assertEqual(page.words[14].text, '=')
self.assertEqual(len(page.words), old_length+1)
page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord, [ 64 ])
self.assertEqual(page.words[64].text, 'Simplifications-apparat')
self.assertEqual(len(page.words[64].transkription_positions), 3)
self.assertEqual(len(page.words), old_length-1)
@unittest.skipUnless(__name__ == "__main__", 'tests all words')
def test_extended__record_changes_to_page(self):
page = Page(self.xml_file)
old_length = len(page.words)
page = util.record_changes_on_svg_file_to_page(self.xml_file, self.svg_testrecord)
self.assertEqual(page.words[1].text, 'asdf')
self.assertEqual(page.words[13].text, 'er')
self.assertEqual(page.words[14].text, '=')
self.assertEqual(page.words[65].text, 'Simplifications-apparat')
self.assertEqual(len(page.words), old_length)
def test_copy_faksimile_update_image_location(self):
test_dir = self.tmp_dir #FAKSIMILE_LOCATION + '/Myriam/Fertig/'
util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir)
with self.assertWarns(UserWarning):
util.copy_faksimile_update_image_location(self.faksimile_file, target_directory=test_dir)
def test_record_changes_on_xml(self):
old_page = Page(self.xml_file)
xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
tree = ET.parse(xml_file)
node = tree.xpath('//word[@id="135"]')[0]
counter =0
while node.get('text') != 'gar' or counter > 5:
counter += 1
nextnode = node.getnext()
node.set('text', node.get('text') + nextnode.get('text'))
for element in nextnode.getchildren():
node.append(element)
nextnode.getparent().remove(nextnode)
write_pretty(xml_element_tree=tree, file_name=xml_file,\
script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
self.assertEqual(len(new_page.words), len(old_page.words)-2)
self.assertEqual(len([ word for word in new_page.words if word.text == 'gar']), 1)
old_page = Page(self.xml_file)
xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
tree = ET.parse(xml_file)
node = tree.xpath('//word[@id="138"]')[0]
counter =0
while node.get('text') != 'nichtvorkommt.' or counter > 5:
counter += 1
nextnode = node.getnext()
node.set('text', node.get('text') + nextnode.get('text'))
for element in nextnode.getchildren():
node.append(element)
nextnode.getparent().remove(nextnode)
node.set('split', 'nicht vorkommt.')
write_pretty(xml_element_tree=tree, file_name=xml_file,\
script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
joined_page = Page(xml_file)
self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.']), 1)
self.assertEqual(len([word for word in joined_page.words if word.text == 'nichtvorkommt.'][0].split_strings), 2)
self.assertEqual(len(joined_page.words), len(old_page.words)-1)
new_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
self.assertEqual(len(new_page.words), len(old_page.words))
self.assertEqual(len([word for word in new_page.words if word.text == 'vorkommt.']), 1)
self.assertEqual(len([word for word in old_page.words if word.text == 'nicht']),\
len([word for word in new_page.words if word.text == 'nicht']))
xml_file = util.copy_xml_file_word_pos_only(self.xml_file, self.tmp_dir)
tree = ET.parse(xml_file)
old_page = Page(xml_file)
nodes = tree.xpath('//word[@id>="85" and @id<="87"]')
self.assertEqual(len(nodes), 3)
prevWordText = nodes[0].get('text')
nodes[0].set('join', prevWordText + 'z')
nodes[1].set('split', 'z u')
lastWordText = nodes[2].get('text')
nodes[2].set('join', 'u' + lastWordText)
write_pretty(xml_element_tree=tree, file_name=xml_file,\
script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
joined_page = util.record_changes_on_xml_file_to_page(self.xml_file, xml_file)
self.assertEqual(len(joined_page.words), len(old_page.words)-1)
def test_reset_tp_with_matrix(self):
page = Page(self.Mp_XIV_page420)
util.reset_tp_with_matrix(page.words[0].transkription_positions)
self.assertTrue(page.words[0].transkription_positions[0].left > 0 and page.words[0].transkription_positions[0].top > -5)
transformed_words = [w for w in page.words if (len(w.transkription_positions) > 0 and w.transkription_positions[0].transform is not None) ]
util.reset_tp_with_matrix(transformed_words[0].transkription_positions)
self.assertEqual(transformed_words[0].transkription_positions[0].left, 0)
self.assertTrue(transformed_words[0].transkription_positions[0].top < 0)
def test_back_up(self):
test_dir = self.tmp_dir
page = Page(self.xml_file)
target_file_name = util.back_up(page, self.xml_file, bak_dir=test_dir)
self.assertEqual(isfile(target_file_name), True)
svg_tree = ET.parse(page.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
util.back_up_svg_file(svg_tree, namespaces)
+ """
+ page = Page('xml/Mp_XV_page79r.xml')
+ util.back_up(page, page.xml_file)
+ """
def tearDown(self):
shutil.rmtree(self.tmp_dir, ignore_errors=True)
pass
if __name__ == "__main__":
unittest.main()
Index: tests_svgscripts/test_simple_word.py
===================================================================
--- tests_svgscripts/test_simple_word.py (revision 109)
+++ tests_svgscripts/test_simple_word.py (revision 110)
@@ -1,36 +1,44 @@
import unittest
from os import sep, path
from os.path import dirname, isdir
import lxml.etree as ET
+import re
import sys
sys.path.append('svgscripts')
from datatypes.matrix import Matrix
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.simple_word import SimpleWord
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.word import Word
class TestSimpleWord(unittest.TestCase):
def test_get_semanticAndDataDict(self):
dictionary = SimpleWord.get_semantic_dictionary()
#print(dictionary)
def test_create_cls_from_word(self):
word = Word(text='test')
mark = MarkForeignHands.create_cls_from_word(word)
self.assertEqual(mark.text, word.text)
self.assertEqual(type(mark), MarkForeignHands)
def test_attach(self):
word = SimpleWord()
word.transkription_positions.append(TranskriptionPosition(id=0))
word.transkription_positions.append(TranskriptionPosition(id=0))
tree = ET.Element('page')
word.attach_word_to_tree(tree)
self.assertEqual(len(tree.xpath('//' + TranskriptionPosition.XML_TAG)), 2)
+ def test_clean_text(self):
+ word = SimpleWord()
+ self.assertEqual(word._create_clean_text('-asdf'), 'asdf')
+ self.assertEqual(word._create_clean_text('(-asdf)'), 'asdf')
+ self.assertEqual(word._create_clean_text('(a.)'), 'a.')
+ self.assertEqual(word._create_clean_text('.verhehlen'), 'verhehlen')
+
if __name__ == "__main__":
unittest.main()
Index: tests_svgscripts/test_imprint.py
===================================================================
--- tests_svgscripts/test_imprint.py (revision 0)
+++ tests_svgscripts/test_imprint.py (revision 110)
@@ -0,0 +1,57 @@
+import unittest
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname
+import shutil
+import sys
+import lxml.etree as ET
+import warnings
+import sys
+
+sys.path.append('svgscripts')
+
+import datatypes.imprint
+from datatypes.imprint import Imprint, extract_imprints, get_lines, UNITTESTING, DEBUG
+from datatypes.matrix import Matrix
+from datatypes.page import Page
+from datatypes.positional_word_part import PositionalWordPart
+from datatypes.transkriptionField import TranskriptionField
+
+class TestExtractImprint(unittest.TestCase):
+ def setUp(self):
+ datatypes.imprint.UNITTESTING = True
+ DATADIR = path.dirname(__file__) + sep + 'test_data'
+ self.test_page = Page.create_cls(DATADIR + sep + 'Mp_XV_page79v.xml')
+ self.test_page.source = self.test_page.page_tree.docinfo.URL.replace('.xml', '.svg')
+
+ def test_extract_footnotes(self):
+ imprints = extract_imprints(self.test_page)
+ self.assertEqual(len(imprints), 4)
+ """
+ for imprint in imprints:
+ print(imprint.reference, imprint.start_line, imprint.end_line)
+ """
+
+ def test_attach(self):
+ imprints = extract_imprints(self.test_page)
+ tree = ET.ElementTree(ET.Element('asdf'))
+ for imprint in imprints:
+ imprint.attach_object_to_tree(tree)
+ tree.xpath('//asdf')[0].set('test', 'This is a Test.')
+ #print(ET.dump(tree.getroot()))
+
+ def test_init_from_node(self):
+ for imprint in extract_imprints(self.test_page):
+ imprint.attach_object_to_tree(self.test_page.page_tree)
+ imprints = [ Imprint.create_cls_from_node(node, self.test_page.lines) for node in self.test_page.page_tree.xpath('//' + Imprint.XML_TAG) ]
+ self.assertEqual(len(imprints), 4)
+ """
+ Imprint.DEBUG = True
+ page = Page('xml/Mp_XV_page81v.xml')
+ line_list_string = '21-24,30-36,65-68'
+ for imprint in page.imprints:
+ print(imprint.reference)
+ for line in imprint.lines: print(line.id)
+ """
+
+if __name__ == "__main__":
+ unittest.main()
Index: tests_svgscripts/test_process_footnotes.py
===================================================================
--- tests_svgscripts/test_process_footnotes.py (revision 109)
+++ tests_svgscripts/test_process_footnotes.py (revision 110)
@@ -1,46 +1,54 @@
import unittest
from os import sep, path, remove
from os.path import isdir, isfile, dirname
import shutil
import sys
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
from datatypes.footnotes import extract_footnotes
+from datatypes.imprint import Imprint
from datatypes.page import Page
import process_footnotes
-from process_footnotes import categorize_footnotes, main
+from process_footnotes import categorize_footnotes, main, save_imprints
class TestExtractFootnotes(unittest.TestCase):
def setUp(self):
process_footnotes.UNITTESTING = True
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.test_footnote = DATADIR + sep + 'W_I_8_neu_125-01.svg'
self.test_footnote_verso = DATADIR + sep + 'N_VII_1_xp5_4_page5.svg'
self.test_footnote_recto = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg'
self.test_footnote_multi = DATADIR + sep + 'N_VII_1_xp5_4_page13.svg'
self.test_footnote_multi_xml = DATADIR + sep + 'N_VII_1_page013.xml'
self.test_categorize_footnote = DATADIR + sep + 'N_VII_1_page006.xml'
def test_categorize_footnotes(self):
page = Page(self.test_categorize_footnote)
footnotes = extract_footnotes(page, svg_file=self.test_footnote_recto)
categorize_footnotes(page, footnotes)
words_with_comments = [ word for word in page.words if word.editor_comment is not None ]
self.assertEqual(len(words_with_comments), 4)
lines_with_comments = [ line for line in page.lines if len(line.editor_comments) > 0 ]
self.assertEqual(len(lines_with_comments), 1)
page = Page('xml/W_II_1_page141.xml')
footnotes = extract_footnotes(page)
categorize_footnotes(page, footnotes, debug=True)
words_with_comments = [ word for word in page.words if word.editor_comment is not None ]
+ def test_save_imprints(self):
+ page = Page(self.test_categorize_footnote)
+ save_imprints(page)
+ self.assertEqual(len(page.page_tree.xpath('//' + Imprint.XML_TAG)), 2)
+ #print(ET.dump(page.page_tree.getroot()))
+
+
def test_main(self):
self.assertEqual(main(['xml/N_VII_1_page005.xml']), 0)
if __name__ == "__main__":
unittest.main()
Index: fixes/fix_old_data.py
===================================================================
--- fixes/fix_old_data.py (revision 109)
+++ fixes/fix_old_data.py (revision 110)
@@ -1,540 +1,551 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix old data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
sys.path.append('svgscripts')
from convert_wordPositions import HTMLConverter
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
+from datatypes.imprint import Imprint
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.matrix import Matrix
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.positional_word_part import PositionalWordPart
from datatypes.path import Path
from datatypes.word import Word
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, update_transkription_position_ids
from join_faksimileAndTranskription import sort_words
from util import back_up, back_up_svg_file, copy_faksimile_svg_file, reset_tp_with_matrix
from process_files import update_svgposfile_status
+from process_footnotes import save_imprints
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary, get_manuscript_files
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
MAX_SVG_XY_THRESHOLD = 10
#TODO: fix all svg graphical files: change xlink:href to href!!!!
def convert_old_matrix(tp, xmin, ymin) ->(Matrix, float, float):
"""Return new matrix, x and y for old transkription_position.
"""
matrix = tp.transform.clone_transformation_matrix()
matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3)
matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3)
x = round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)\
if tp.left > 0\
else 0
y = round((tp.height-1.5)*-1, 3)
return matrix, x, y
def save_page(page, attach_first=False, backup=False, script_name=None):
"""Write page to xml file
"""
if backup:
back_up(page, page.xml_file)
if attach_first:
page.update_and_attach_words2tree()
if script_name is None:
script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}'
write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
script_name=script_name, file_type=FILE_TYPE_SVG_WORD_POSITION)
def page_already_changed(page) -> bool:
"""Return whether page has alreadybeen changed by function
"""
return len(\
page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}:{inspect.currentframe().f_back.f_code.co_name}"]')\
) > 0
def fix_faksimile_line_position(page, redo=False) -> bool:
"""Create a faksimile line position.
"""
if not redo and page_already_changed(page):
return False;
update_faksimile_line_positions(page)
if not UNITTESTING:
save_page(page)
return True
def check_faksimile_positions(page, redo=False) -> bool:
"""Check faksimile line position.
"""
if len(page.page_tree.xpath('//data-source/@file')) > 0:
svg_file = page.page_tree.xpath('//data-source/@file')[0]
svg_tree = ET.parse(svg_file)
positions_are_equal_counter = 0
page_changed = False
for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree):
if page.title == faksimile_page.title\
and page.number == faksimile_page.page_number:
#print([fp.id for fp in faksimile_page.word_positions ])
for word in page.words:
for fp in word.faksimile_positions:
rect_fps = [ rfp for rfp in faksimile_page.word_positions if rfp.id == fp.id ]
if len(rect_fps) > 0:
rfp = rect_fps[0]
if fp.left != rfp.left or fp.top != rfp.top:
#print(f'{fp.id}: {fp.left}/{rfp.left} {fp.top}/{rfp.top}')
fp.left = rfp.left
fp.top = rfp.top
fp.bottom = fp.top + rfp.height
word.attach_word_to_tree(page.page_tree)
page_changed = True
else:
positions_are_equal_counter += 1
print(f'{positions_are_equal_counter}/{len(page.words)} are equal')
if page_changed and not UNITTESTING:
save_page(page)
return page_changed
def fix_faksimile_positions(page, redo=False) -> bool:
"""Set faksimile positions to absolute values.
[:return:] fixed
"""
if not redo and len(page.page_tree.xpath(f'//metadata/modifiedBy[@script="{__file__}"]')) > 0:
return False
x_min = page.text_field.xmin
y_min = page.text_field.ymin
for word in page.words:
for fp in word.faksimile_positions:
fp.left = fp.left + x_min
fp.top = fp.top + y_min
fp.bottom = fp.bottom + y_min
word.attach_word_to_tree(page.page_tree)
if not UNITTESTING:
print(f'writing to {page.page_tree.docinfo.URL}')
write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\
script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
return True
def _fix_tp_of_word(page, word, text_field):
"""Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top
"""
for tp in word.transkription_positions:
tp.left += text_field.left
tp.top += text_field.top
reset_tp_with_matrix(word.transkription_positions)
if type(word) == Word:
words_in_word = word.word_parts + [ item for item in word.__dict__.items() if type(item) == Word ]
for wp in words_in_word:
_fix_tp_of_word(page, wp, text_field)
def fix_tp_with_matrix(page, redo=False) -> bool:
"""Fix transkription positions with rotation matrix ->set left to 0 and top to -5.
[:return:] fixed
"""
xmin = 0 if page.svg_image is None or page.svg_image.text_field is None else page.svg_image.text_field.left
ymin = 0 if page.svg_image is None or page.svg_image.text_field is None else page.svg_image.text_field.top
for word in page.words:
reset_tp_with_matrix(word.transkription_positions, tr_xmin=xmin, tr_ymin=ymin)
for wp in word.word_parts:
reset_tp_with_matrix(wp.transkription_positions, tr_xmin=xmin, tr_ymin=ymin)
if not UNITTESTING:
print(f'writing to {page.page_tree.docinfo.URL}')
save_page(page, attach_first=True)
return True
def _fix_old_transkription_positions(page, redo=False) -> bool:
"""Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top
[:return:] fixed
"""
if page.svg_image is not None\
and page.svg_image.text_field is None:
if page.svg_image is None:
if page.svg_file is not None:
transkription_field = TranskriptionField(page.svg_file)
width = round(tf.documentWidth, 3)
height = round(tf.documentHeight, 3)
page.svg_image = SVGImage(file_name=svg_file, width=width,\
height=height, text_field=transkription_field.convert_to_text_field())
page.svg_image.attach_object_to_tree(page.page_tree)
else:
raise Exception(f'ERROR page {page.page_tree.docinfo.URL} does not have a svg_file!')
elif page.svg_image.text_field is None:
page.svg_image.text_field = TranskriptionField(page.svg_image.file_name).convert_to_text_field()
page.svg_image.attach_object_to_tree(page.page_tree)
for line_number in page.line_numbers:
line_number.top += page.svg_image.text_field.top
line_number.bottom += page.svg_image.text_field.top
line_number.attach_object_to_tree(page.page_tree)
for word in page.words:
_fix_tp_of_word(page, word, page.svg_image.text_field)
for mark in page.mark_foreign_hands:
_fix_tp_of_word(page, mark, page.svg_image.text_field)
for tcm in page.text_connection_marks:
_fix_tp_of_word(page, tcm, page.svg_image.text_field)
if not UNITTESTING:
print(f'writing to {page.page_tree.docinfo.URL}')
save_page(page, attach_first=True)
return True
return False
def _fix_old_pwps(page, old_tps):
"""Adjust positional_word_parts to corrected transkription_positions.
"""
for tp in old_tps:
for pwp in tp.xpath(f'./{PositionalWordPart.XML_TAG}'):
left = float(pwp.get('left'))
top = float(pwp.get('top'))
bottom = float(pwp.get('bottom'))
pwp.set('left', str(left + page.svg_image.text_field.left))
pwp.set('top', str(top + page.svg_image.text_field.top))
pwp.set('bottom', str(bottom + page.svg_image.text_field.top))
def _fix_quotation_mark_tps(page, old_tps):
"""Fix the height of transkription_positions of words with quotation marks.
"""
for tp in old_tps:
heighest_pwp = sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('height')), reverse=True)[0]
toppest_pwp = sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('top')))[0]
new_height = float(tp.get('height')) + abs(float(heighest_pwp.get('top'))-float(toppest_pwp.get('top')))
tp.set('height', str(new_height))
def fix_transkription_positions(page, redo=False) -> bool:
"""Fix transkription positions ->set relative to 0,0 instead of text_field.left,text_field.top
[:return:] fixed
"""
THRESHOLD = 10
if page.svg_image is not None\
and page.svg_image.text_field is None:
if not _fix_old_transkription_positions(page):
return False
_fix_old_pwps(page, [ pwp.getparent() for pwp in page.page_tree.xpath(f'//{PositionalWordPart.XML_TAG}[@id="0"]')\
if abs(float(pwp.get('left')) - float(pwp.getparent().get('left'))) > THRESHOLD ])
_fix_quotation_mark_tps(page, [ tp for tp in page.page_tree.xpath(f'//{TranskriptionPosition.XML_TAG}')\
if len(tp.xpath(f'./{PositionalWordPart.XML_TAG}')) > 0\
and sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('height')), reverse=True)[0]\
!= sorted(tp.xpath(f'./{PositionalWordPart.XML_TAG}'), key=lambda pwp: float(pwp.get('top')))[0] ])
if not UNITTESTING:
print(f'writing to {page.page_tree.docinfo.URL}')
save_page(page)
return True
def fix_styles(page, redo=False):
"""Remove unused styles from tree.
"""
if len(page.page_tree.xpath('//style')) > 1:
for node in page.page_tree.xpath('//style')[1:]: node.getparent().remove(node)
if not UNITTESTING:
print(f'writing to {page.page_tree.docinfo.URL}')
save_page(page)
return True
+def fix_imprints(page, redo=False):
+    """Add imprints to the page tree if none exist yet.
+    """
+ if len(page.page_tree.xpath('//' + Imprint.XML_TAG)) == 0:
+ save_imprints(page)
+ return True
+
def merge_transkription_positions(page, redo=False) -> bool:
"""Fix transkription positions of merged words
[:return:] fixed
"""
if not isdir(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR)\
or not isfile(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL)):
return False
merged_page = Page(dirname(page.page_tree.docinfo.URL) + sep + MERGED_DIR + sep + basename(page.page_tree.docinfo.URL))
sync_dictionary = sync_words_linewise(merged_page.words, page.words, merged_page.line_numbers)
words = []
for source_word in merged_page.words:
words.append(source_word)
if bool(sync_dictionary.get(source_word)):
_sync_transkriptions_with_words(source_word, sync_dictionary)
if source_word.text != ''.join([ t.get_text() for t in source_word.transkription_positions ]):
text = ''.join([ t.get_text() for t in source_word.transkription_positions ])
print(f'{source_word.line_number}: {source_word.text} has transkription_positions with text "{text}".')
response = input('Change? [Y/n]>')
if not response.startswith('n'):
new_sync_dictionary = sync_words_linewise(merged_page.words, page.words,\
[ line for line in merged_page.line_numbers if line.id == source_word.line_number ], force_sync_on_word=source_word)
if bool(new_sync_dictionary.get(source_word)):
_sync_transkriptions_with_words(source_word, new_sync_dictionary)
else:
raise Exception(f'Could not find sourc_word {source_word.text} in {new_sync_dictionary}!')
page.words = words
page.update_and_attach_words2tree()
if not UNITTESTING:
print(f'writing to {page.page_tree.docinfo.URL}')
save_page(page)
return True
def fix_graphical_svg_file(page, redo=False) -> bool:
"""Fix glyphs of word for which there is a /changed-word in page.page_tree
"""
svg_tree = ET.parse(page.svg_file)
transkription_field = TranskriptionField(page.source)
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
back_up_svg_file(svg_tree, namespaces=namespaces)
tr_xmin = transkription_field.xmin if (page.svg_image is None or page.svg_image.text_field is None) else 0
tr_ymin = transkription_field.ymin if (page.svg_image is None or page.svg_image.text_field is None) else 0
for deleted_word_node in page.page_tree.xpath('//deleted-word'):
deleted_word = Word.create_cls(deleted_word_node)
_run_function_on_nodes_for_word(svg_tree, namespaces, deleted_word, tr_xmin, tr_ymin, _set_node_attribute_to, 'visibility', 'hidden')
for changed_word_node in page.page_tree.xpath('//changed-word'):
changed_word = Word.create_cls(changed_word_node)
try:
word = [ word for word in page.words if word.id == changed_word.id and word.text == changed_word.text ][0]
left_difference = word.transkription_positions[0].left - changed_word.transkription_positions[0].left
_run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin, _add_value2attribute, 'x', left_difference)
except IndexError:
warnings.warn(f'There is no word for changed_word {changed_word.id}: "{changed_word.text}" in {page.page_tree.docinfo.URL}!')
copy_faksimile_svg_file(target_file=page.svg_file, faksimile_tree=svg_tree, namespaces=namespaces)
def _add_value2attribute(node, attribute, value):
"""Add left_difference to x of node.
"""
node.set(attribute, str(float(node.get(attribute)) + value))
node.set('changed', 'true')
def _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=0.1) -> list:
"""Return nodes with symbol_id n x = svg_x and y = svg_y.
"""
nodes = [ node for node in svg_tree.xpath(\
f'//ns:use[@xlink:href="#{symbol_id}" and @x > {svg_x-threshold} and @x < {svg_x+threshold} and @y > {svg_y-threshold} and @y < {svg_y+threshold} ]',\
namespaces=namespaces) if not bool(node.get('changed')) ]
if len(nodes) == 0 and threshold < MAX_SVG_XY_THRESHOLD:
return _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y, threshold=threshold+1)
return nodes
def _run_function_on_nodes_for_word(svg_tree, namespaces, word, tr_xmin, tr_ymin, function_on_node, attribute, value):
"""Run function on nodes for words.
"""
for tp in word.transkription_positions:
for pwp in tp.positional_word_parts:
symbol_id = pwp.symbol_id
svg_x = pwp.left + tr_xmin
svg_y = pwp.bottom + tr_ymin
nodes = _get_nodes_with_symbol_id(svg_tree, namespaces, symbol_id, svg_x, svg_y)
if len(nodes) > 0:
node = nodes[0]
function_on_node(node, attribute, value)
def _set_node_attribute_to(node, attribute, value):
"""Set attribute of node to value.
"""
node.set(attribute, str(value))
node.set('changed', 'true')
def sync_words_linewise(source_words, target_words, lines, force_sync_on_word=None) -> dict:
"""Sync words an create a dictionary with source_words as keys, refering to a list of corresponding words.
"""
result_dict = {}
for word in target_words + source_words: word.processed = False
for line in lines:
source_words_on_line = sorted([ word for word in source_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left)
target_words_on_line = sorted([ word for word in target_words if word.line_number == line.id ], key=lambda word: word.transkription_positions[0].left)
if len(target_words_on_line) == len(source_words_on_line):
_sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word)
elif len(source_words_on_line) < len(target_words_on_line):
_sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=force_sync_on_word)
else:
print('okey dokey')
return result_dict
def _force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict):
"""Force sync on word.
"""
unprocessed_target_words = [t_word for t_word in target_words_on_line if not t_word.processed]
if len(unprocessed_target_words) > 0:
print([ (i, t_word.text) for i, t_word in enumerate(unprocessed_target_words)])
response = input(f'Please specify indices of words to sync {force_sync_on_word.text} with: [default:0-{len(unprocessed_target_words)-1}]>')
indices = [ i for i in range(0, len(unprocessed_target_words)) ]
if re.match(r'\d+-\d+', response):
index_strings = response.split('-')
indices = [ i for i in range(int(index_strings[0]), int(index_strings[1])+1) ]
elif response != '':
indices = [ int(i) for i in response.split(' ') ]
target_words = []
for i in indices: target_words.append(unprocessed_target_words[i])
result_dict.update({ force_sync_on_word: target_words })
else:
raise Exception(f'There are no unprocessed target_words for {force_sync_on_word.text} on line {force_sync_on_word.line_number}!')
def _sync_transkriptions_with_words(word, sync_dictionary):
"""Sync transkription_positions of word with syncronized words.
"""
word.transkription_positions = []
for target_word in sync_dictionary[word]:
word.transkription_positions += target_word.transkription_positions
def _sync_more_target_words(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None):
"""Sync if there are more target words.
"""
current_source_word = None
for target_word in target_words_on_line:
if current_source_word is not None\
and current_source_word.text.startswith(''.join([ w.text for w in result_dict[current_source_word]]) + target_word.text):
result_dict[current_source_word].append(target_word)
target_word.processed = True
if current_source_word.text == ''.join([ w.text for w in result_dict[current_source_word]]):
current_source_word = None
elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ]) > 0:
source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text == target_word.text ][0]
target_word.processed = True
source_word.processed = True
result_dict.update({ source_word: [ target_word ] })
elif len([ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ]) > 0:
current_source_word = [ s_word for s_word in source_words_on_line if not s_word.processed and s_word.text.startswith(target_word.text) ][0]
current_source_word.processed = True
target_word.processed = True
result_dict.update({ current_source_word: [ target_word ] })
else:
msg = f'On line {target_word.line_number}: target_word "{target_word.text}" does not have a sibling in {[ s.text for s in source_words_on_line if not s.processed ]}'
warnings.warn(msg)
if force_sync_on_word is not None:
_force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict)
def _sync_same_length(result_dict, source_words_on_line, target_words_on_line, force_sync_on_word=None):
"""Sync same length
"""
for i, word in enumerate(source_words_on_line):
if word.text == target_words_on_line[i].text:
word.processed = True
target_words_on_line[i].processed = True
result_dict.update({ word: [ target_words_on_line[i] ] })
elif len([ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ]) > 0:
target_word = [ t_word for t_word in target_words_on_line if not t_word.processed and t_word.text == word.text ][0]
word.processed = True
target_word.processed = True
result_dict.update({ word: [ target_word ] })
else:
msg = f'On line {word.line_number}: source_word "{word.text}" does not have a sibling in {[ s.text for s in target_words_on_line]}'
warnings.warn(msg)
if force_sync_on_word is not None:
_force_sync_on_word(force_sync_on_word, target_words_on_line, result_dict)
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to fix old data.
svgscripts/fix_old_data.py [OPTIONS]
a xml file about a manuscript, containing information about its pages.
a xml file about a page, containing information about svg word positions.
OPTIONS:
-h|--help show help
-c|--check-faksimile-positions check whether faksimile positions have been updated
+ -i|--fix-imprints add imprints to page
-l|--faksimile-line-position create faksimile line positions
-p|--faksimile-positions fix old faksimile positions
-r|--redo rerun
-s|--fix-graphical-svg fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file.
-S|--fix-styles fix use position of glyphs for words changed by 'changed-word' and 'deleted-word' in xml file.
-t|--transkription-positions fix old transkription positions
-M|--matrix fix old transkription positions with transform matrix
:return: exit code (int)
"""
function_list = []
function_dict = create_function_dictionary(['-c', '--check-faksimile-positions'], check_faksimile_positions)
function_dict = create_function_dictionary(['-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-m', '--merge-positions'], merge_transkription_positions, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-s', '--fix-graphical-svg'], fix_graphical_svg_file, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-M', '--matrix'], fix_tp_with_matrix, function_dictionary=function_dict)
function_dict = create_function_dictionary(['-t', '--transkription-positions'], fix_transkription_positions, function_dictionary=function_dict)
- function_dict = create_function_dictionary(['default', '-S', '--fix-styles'], fix_styles, function_dictionary=function_dict)
+ function_dict = create_function_dictionary(['-S', '--fix-styles'], fix_styles, function_dictionary=function_dict)
+ function_dict = create_function_dictionary(['default', '-i', '--fix-imprints'], fix_imprints, function_dictionary=function_dict)
redo = False;
try:
- opts, args = getopt.getopt(argv, "hcplrmsStM", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position",\
- "redo", "merge-positions", "fix-graphical-svg", "fix-styles", "transkription-positions", 'matrix' ])
+ opts, args = getopt.getopt(argv, "hcplrmsStMi", ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position",\
+ "redo", "merge-positions", "fix-graphical-svg", "fix-styles", "transkription-positions", 'matrix', 'fix-imprints' ])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
elif opt in ('-r', '--redo'):
redo = True;
elif opt in function_dict.keys():
function_list.append(function_dict[opt])
if len(function_list) == 0:
function_list.append(function_dict['default'])
if len(args) < 1:
usage()
return 2
exit_status = 0
for xml_file in get_manuscript_files(args):
if isfile(xml_file):
counters = { f.__name__: 0 for f in function_list }
for current_function in function_list:
status_contains = STATUS_MERGED_OK if 'faksimile' in current_function.__name__ else 'OK'
for page in Page.get_pages_from_xml_file(xml_file, status_contains=status_contains):
if not UNITTESTING:
print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' + Style.RESET_ALL)
back_up(page, page.xml_file)
counters[current_function.__name__] += 1 if current_function(page, redo=redo) else 0
if not UNITTESTING:
for function_name, counter in counters.items():
print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]')
else:
raise FileNotFoundError('File {} does not exist!'.format(xml_file))
return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: py2ttl/py2ttl_ontology.py
===================================================================
--- py2ttl/py2ttl_ontology.py (revision 109)
+++ py2ttl/py2ttl_ontology.py (revision 110)
@@ -1,369 +1,371 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert py classes that are
subclasses of class_spec.SemanticClass to
a owl ontology in turtle format.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
+from datetime import datetime
import getopt
import importlib
import importlib.util
import inspect
import lxml.etree as ET
from os import sep, path, listdir
from os.path import isfile, isdir, dirname, basename
from progress.bar import Bar
from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD
import re
import requests
import sys
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from class_spec import SemanticClass, UnSemanticClass
from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL, PROJECT_ONTOLOGY_FILE_URL
from data_handler import RDFDataHandler
sys.path.append('shared_util')
from myxmlwriter import dict2xml
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Py2TTLOntologyConverter:
"""This class can be used convert semantic_dictionaries to a owl ontology in turtle format.
"""
UNITTESTING = False
INFERRED_SUB_CLASS = RDFS.subClassOf * '*'
def __init__(self, project_ontology_file=None):
self.class_uri_dict = {}
self.uri_mapping4cls_and_properties = {}
self.project_graph = Graph()
self.base_uriref = URIRef(PROJECT_URL)
self.project_name = PROJECT_NAME
self.ns = { self.base_uriref + '#': self.project_name }
if project_ontology_file is not None and isfile(project_ontology_file):
if project_ontology_file == PROJECT_ONTOLOGY_FILE:
r = requests.get(PROJECT_ONTOLOGY_FILE_URL)
with open(project_ontology_file, 'wb') as f:
f.write(r.content)
print(f'{project_ontology_file} updated from github repository')
self.project_graph.parse(project_ontology_file, format="turtle")
if len(self.project_graph) > 0:
self.base_uriref = self.project_graph.value(predicate=RDF.type, object=OWL.Ontology, any=False)
self.ns = { uriref: ns for ns, uriref in self.project_graph.namespace_manager.namespaces() }
self.project_name = self.ns.get(self.base_uriref + '#')
self.project_graph.bind(self.project_name, self.base_uriref + '#')
+ self.project_graph.add((self.base_uriref, OWL.versionInfo, Literal(datetime.now().strftime('%Y-%m-%d'))))
self.uri_mapping4cls_and_properties.update({ 'ontology': { 'project_name': self.project_name, 'project_uri': self.base_uriref + '#' }})
self.uri_mapping4cls_and_properties.update({ 'classes': {} })
def addClass2Graph(self, cls, semantic_dict=None) -> (URIRef, type):
"""Add a class to project_graph.
:return: (cls_uri (URIRef), super_cls (cls))
"""
if semantic_dict is None:
semantic_dict = cls.get_semantic_dictionary()
comment, label = self.get_comment_label(cls)
cls_uri = URIRef(self.base_uriref + '#' + cls.__name__)
self.project_graph.add((cls_uri, RDF.type, OWL.Class))
self.project_graph.add((cls_uri, RDFS.isDefinedBy, self.base_uriref))
if comment != '':
self.project_graph.add((cls_uri, RDFS.comment, Literal(comment, lang='en')))
if label != '':
self.project_graph.add((cls_uri, RDFS.label, Literal(label, lang='en')))
super_uri = None
super_cls = None
if bool(semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.TYPE)):
super_cls = semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.TYPE)
super_uri = self.createClassAndProperties(super_cls)
if super_uri is not None:
self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri))
if SemanticClass.SUBCLASS_OF in semantic_dict[SemanticClass.CLASS_KEY].keys()\
and len(semantic_dict[SemanticClass.CLASS_KEY][SemanticClass.SUBCLASS_OF]) > 0:
for super_uri_string in semantic_dict[SemanticClass.CLASS_KEY].get(SemanticClass.SUBCLASS_OF):
super_uri = URIRef(super_uri_string)
if not (cls_uri, self.INFERRED_SUB_CLASS, super_uri) in self.project_graph:
self.project_graph.add((cls_uri, RDFS.subClassOf, super_uri))
return cls_uri, super_cls
def addProperty2Graph(self, property_uri, domain_uri, range_uri, info_dict, property_type=OWL.ObjectProperty):
    """Add a property to self.project_graph.

    :param property_uri: URIRef of the new property
    :param domain_uri: URIRef set as rdfs:domain
    :param range_uri: URIRef set as rdfs:range
    :param info_dict: semantic property dictionary (may contain label,
       comment and cardinality entries)
    :param property_type: rdf type of the property, default OWL.ObjectProperty
    """
    # fallback label "has Xyz" is derived from the uri fragment when
    # info_dict does not provide an explicit label
    label = 'has ' + property_uri.split('#')[1].replace('has','')\
            if SemanticClass.PROPERTY_LABEL not in info_dict.keys() else info_dict[SemanticClass.PROPERTY_LABEL]
    self.project_graph.add((property_uri, RDF.type, property_type))
    self.project_graph.add((property_uri, RDFS.isDefinedBy, self.base_uriref))
    self.project_graph.add((property_uri, RDFS.domain, domain_uri))
    self.project_graph.add((property_uri, RDFS.range, range_uri))
    if SemanticClass.PROPERTY_COMMENT in info_dict.keys():
        comment = info_dict[SemanticClass.PROPERTY_COMMENT]
        self.project_graph.add((property_uri, RDFS.comment, Literal(comment, lang='en')))
    self.project_graph.add((property_uri, RDFS.label, Literal(label, lang='en')))
    # a positive cardinality adds an owl:Restriction to the domain class
    if SemanticClass.CARDINALITY in info_dict.keys()\
    and info_dict[SemanticClass.CARDINALITY] > 0:
        self.addRestriction2Class(domain_uri, property_uri, info_dict)
def addRestriction2Class(self, cls_uri, property_uri, info_dict):
    """Adds restriction on property_uri to class cls_uri.

    The restriction is modelled as a blank node of rdf type owl:Restriction;
    the cardinality predicate defaults to owl:cardinality but can be
    overridden via SemanticClass.CARDINALITY_RESTRICTION in info_dict.
    """
    if SemanticClass.CARDINALITY in info_dict.keys()\
    and info_dict[SemanticClass.CARDINALITY] > 0:
        # warn (but proceed) when the class has no triples in the graph yet
        if (cls_uri, None, None) not in self.project_graph:
            warnings.warn('{} not in graph!'.format(cls_uri))
        restriction = BNode()
        cardinality_restriction = URIRef(OWL + info_dict[SemanticClass.CARDINALITY_RESTRICTION])\
                if SemanticClass.CARDINALITY_RESTRICTION in info_dict.keys()\
                else OWL.cardinality
        cardinality = info_dict[SemanticClass.CARDINALITY]
        self.project_graph.add((cls_uri, RDFS.subClassOf, restriction))
        self.project_graph.add((restriction, RDF.type, OWL.Restriction))
        self.project_graph.add((restriction, OWL.onProperty, property_uri))
        self.project_graph.add((restriction, cardinality_restriction, Literal(str(cardinality), datatype=XSD.nonNegativeInteger)))
def create_ontology(self, datatypes_dir, target_ontology_file):
    """Convert all classes contained in datatypes_dir that are subclasses of class_spec.SemanticClass to rdf.

    :param datatypes_dir: directory containing the datatype modules
    :param target_ontology_file: path of the turtle file to be written
    :return: exit code (int): 0 on success, 1 if datatypes_dir does not exist
    """
    if not isdir(datatypes_dir):
        print('Error: dir {} does not exist!'.format(datatypes_dir))
        usage()  # BUGFIX: was a bare `usage` expression that had no effect
        return 1
    semantic_classes = self.get_semantic_classes(datatypes_dir)
    show_progress = not Py2TTLOntologyConverter.UNITTESTING
    if show_progress:
        bar = Bar('creating classes and properties', max=len(semantic_classes))
    for cls in semantic_classes:
        self.createClassAndProperties(cls)
        if show_progress:
            bar.next()
    if show_progress:
        bar.finish()
    self.uri_mapping4cls_and_properties['ontology'].update({'ontology_file': target_ontology_file})
    # serialize the graph as turtle; `with` guarantees the handle is closed
    with open(target_ontology_file, 'wb+') as ontology_file:
        ontology_file.write(self.project_graph.serialize(format="turtle"))
    if show_progress:
        # also persist the class/property uri mapping as xml
        xml_file = 'mapping_file4' + datatypes_dir.replace(sep, '.') + '2' + target_ontology_file.replace('.' + sep, '').replace(sep, '.').replace('.ttl', '.xml')
        dict2xml(self.uri_mapping4cls_and_properties, xml_file)
    return 0
def createClassAndProperties(self, cls):
    """Creates a owl:Class and some owl:ObjectProperty from semantic_dictionary of a python class.

    Classes already recorded in self.class_uri_dict are not created twice
    (this also terminates the recursion through super classes).
    :return: the URIRef of the (possibly pre-existing) class
    """
    if not cls.__name__ in self.class_uri_dict:
        self.class_uri_dict.update({cls.__name__: cls})
        semantic_dict = cls.get_semantic_dictionary()
        cls_uri, super_cls = self.addClass2Graph(cls, semantic_dict)
        uri_mapping4properties = {}
        # process keys super class first so subclasses can reuse property uris
        for property_key in self._get_semantic_dictionary_keys_super_first(semantic_dict['properties']):
            super_semantic_dict = {} if super_cls is None else super_cls.get_semantic_dictionary()
            # create a new property only when the super class does not define it
            if len(super_semantic_dict) == 0 or not bool(super_semantic_dict['properties'].get(property_key)):
                property_dict4key = semantic_dict['properties'].get(property_key)
                property_cls = property_dict4key.get('class')
                subject_uri, property_uri = self.createProperty(cls_uri, property_key, property_cls, property_dict4key)
                uri_mapping4properties.update({ property_key: property_uri })
            elif bool(self.uri_mapping4cls_and_properties.get('classes').get(super_cls.__name__).get('properties').get(property_key)):
                # reuse the property uri already recorded for the super class
                property_uri = self.uri_mapping4cls_and_properties['classes'][super_cls.__name__]['properties'][property_key]
                uri_mapping4properties.update({ property_key: property_uri})
        self.uri_mapping4cls_and_properties.get('classes').update({ cls.__name__: { 'class_uri': cls_uri, 'properties': uri_mapping4properties }})
    return URIRef(self.base_uriref + '#' + cls.__name__)
def createProperty(self, domain_uri, property_name, range_cls, info_dict) -> (URIRef, URIRef):
    """Creates a owl:ObjectProperty.

    :param domain_uri: URIRef of the class the property belongs to
    :param property_name: semantic dictionary key of the property
    :param range_cls: python class of the property values
    :param info_dict: semantic property dictionary for property_name
    :return: tuple of domain_uri (rdflib.URIRef) and property_uri (rdflib.URIRef) of created property
    """
    # an explicit PROPERTY_NAME in info_dict overrides the generated name
    name = self.createPropertyName(property_name=property_name)\
            if SemanticClass.PROPERTY_NAME not in info_dict.keys() else info_dict[SemanticClass.PROPERTY_NAME]
    property_uri = URIRef(self.base_uriref + '#' + name)
    # property path matching any chain of rdfs:subClassOf (rdflib path syntax)
    inferredSubClass = RDFS.subClassOf * '*'
    range_uri = URIRef(self.base_uriref + '#' + range_cls.__name__)
    super_property_uri = None
    if SemanticClass.SUBPROPERTYOF in info_dict.keys():
        # an explicit super property uri is given
        super_property_uri = URIRef(info_dict[SemanticClass.SUBPROPERTYOF])
    elif SemanticClass.SUPER_PROPERTY in info_dict.keys():
        # create the super property first (recursive call)
        domain_uri, super_property_uri = self.createProperty(domain_uri,\
                info_dict[SemanticClass.SUPER_PROPERTY].get(SemanticClass.PROPERTY_NAME),\
                range_cls, info_dict[SemanticClass.SUPER_PROPERTY])
    if (property_uri, None, None) not in self.project_graph:
        # new property: builtin range classes (except list) become
        # owl:DatatypeProperty with a mapped xsd range
        property_type = OWL.ObjectProperty
        if range_cls.__module__ == 'builtins':
            if range_cls != list:
                property_type = OWL.DatatypeProperty
                range_uri = RDFDataHandler.SIMPLE_DATA_TYPE_MAPPING.get(range_cls)
                if range_uri == XSD.string and property_name == 'URL':
                    range_uri = XSD.anyURI
        self.addProperty2Graph(property_uri, domain_uri, range_uri, info_dict, property_type=property_type)
    elif not True in [\
            (domain_uri, inferredSubClass, o) in self.project_graph\
            for o in self.project_graph.objects(property_uri, RDFS.domain)\
    ]:
        # if domain_uri is NOT a subclass of a cls specified by RDFS.domain
        if SemanticClass.CARDINALITY in info_dict.keys()\
        and info_dict[SemanticClass.CARDINALITY] > 0:
            self.addRestriction2Class(domain_uri, property_uri, info_dict)
        self.project_graph.add((property_uri, RDFS.domain, domain_uri))
    if super_property_uri is not None\
    and (property_uri, RDFS.subPropertyOf, super_property_uri) not in self.project_graph:
        self.project_graph.add((property_uri, RDFS.subPropertyOf, super_property_uri))
    return domain_uri, property_uri
def createPropertyName(self, property_name=None, subject_uri=None, object_uri=None, connector='BelongsTo', prefix='has'):
    """Returns a camelCased property name built either from a snake_case
    name, from a subject/object uri pair, or from an object uri alone.
    """
    if property_name is not None:
        # snake_case -> camelCase, then prepend the prefix
        head, *tail = property_name.split('_')
        camel = ''.join([head.lower()] + [part.capitalize() for part in tail])
        if camel[0].islower():
            return prefix + camel[0].upper() + camel[1:]
        return prefix + camel
    if subject_uri is not None:
        # subjectFragment + connector + objectFragment, lowercased first char
        combined = subject_uri.split('#')[1] + self.createPropertyName(object_uri=object_uri, prefix=connector)
        return combined[0].lower() + combined[1:]
    if object_uri is not None:
        return prefix + object_uri.split('#')[1]
    return prefix
def get_comment_label(self, cls):
    """Derive an rdfs:comment and rdfs:label from the __doc__ of cls.
    The comment is the first sentence/line of the docstring; the label is
    either taken from an @label tag or derived from the CamelCase name.
    """
    doc = cls.__doc__
    comment = doc.replace('\n', '').lstrip()
    label = cls.__name__
    if '.' in doc:
        # use only the first non-empty docstring line as the comment
        non_empty = [line for line in doc.split('\n') if line != '']
        comment = non_empty[0].lstrip()
    if '@label' in doc:
        # an explicit @label tag in the docstring wins
        label_tag, label = re.search('(@label[:]*\s)(.*[\.]*)', doc).groups()
    elif re.search('([A-Z][a-z]+)', label):
        # derive a space-separated lowercase label from the CamelCase name
        parts = re.split(r'([A-Z][a-z]+)', label)
        label = ' '.join(part.lower() for part in parts if part != '')
    return comment, label
def get_semantic_classes(self, datatypes_dir):
    """Returns a list of all classes that are contained in datatypes_dir that are subclasses of class_spec.SemanticClass.

    UnSemanticClass subclasses and SemanticClass itself are excluded.
    :return: a list of classes
    """
    base_dir = dirname(dirname(__file__))
    sys.path.append(base_dir)
    root_modul_name = datatypes_dir.replace('/','.')
    # collect module names: skip tests and private modules
    files = [ file.replace('.py','') for file in listdir(datatypes_dir) if file.endswith('.py') and not file.startswith('test_') and not file.startswith('_')]
    all_modules = []
    for name in files:
        all_modules.append(importlib.import_module('{}.{}'.format(root_modul_name, name)))
    all_classes = []
    for modul in all_modules:
        all_classes += inspect.getmembers(modul, inspect.isclass)
    # deduplicate and sort by class name to keep the result deterministic
    all_classes = sorted(set(all_classes), key=lambda current_class: current_class[0])
    semantic_classes = [ cls for name, cls in all_classes if issubclass(cls, SemanticClass)\
            and not issubclass(cls, UnSemanticClass)\
            and not (cls == SemanticClass)]
    return semantic_classes
def _get_builtin_cls_keys(self, property_dict):
"""Returns a list of keys for classes that are builtin.
"""
builtin_cls_keys = []
for key in property_dict.keys():
property_cls = property_dict.get(key).get('class')\
if type(property_dict.get(key)) is dict\
else property_dict.get(key)[0]
if type(property_cls) != dict\
and property_cls.__module__ == 'builtins':
builtin_cls_keys.append(key)
return builtin_cls_keys
def _get_semantic_dictionary_keys_super_first(self, property_dict):
    """Sorts the keys of the property part of a semantic dictionary:
    builtin-typed keys first, then complex class keys with super classes
    placed before their subclasses.
    :return: a sorted list of keys.
    """
    builtin_keys = self._get_builtin_cls_keys(property_dict)
    ordered_complex = []
    for key in property_dict.keys():
        if key in builtin_keys:
            continue
        current_cls = property_dict.get(key).get('class')
        for position, placed_key in enumerate(ordered_complex):
            # insert before the first already-placed key whose class
            # is a subclass of the current class
            if issubclass(property_dict.get(placed_key).get('class'), current_cls):
                ordered_complex.insert(position, key)
                break
        else:
            ordered_complex.append(key)
    return builtin_keys + ordered_complex
def usage():
    """Prints information on how to use the script (main's docstring).
    """
    print(main.__doc__)
def main(argv):
    """This program can be used to convert py classes that are subclasses of class_spec.SemanticClass to owl:Class
    and its properties to owl:ObjectProperty.

    py2ttl/py2ttl_ontology.py [OPTIONS ]

    [optional] directory containing datatypes that are subclasses of class_spec.SemanticClass.
    Overwrites DATATYPES_DIR in py2ttl/config.py.

    OPTIONS:
    -h|--help: show help
    -s|--source=source_ontology_file source ontology ttl file, option overwrites PROJECT_ONTOLOGY_FILE in py2ttl/config.py
    -t|--target=target_ontology_file target ontology ttl file, default: 'PROJECT_PREFIX-ontology_autogenerated.ttl'

    :return: exit code (int)
    """
    check_config_files_exist()
    # defaults come from py2ttl/config.py; options below may override them
    datatypes_dir = get_datatypes_dir()
    source_ontology_file = PROJECT_ONTOLOGY_FILE
    target_ontology_file = ''
    try:
        opts, args = getopt.getopt(argv, "hs:t:", ["help","source=", "target="])
    except getopt.GetoptError:
        # unknown option / missing argument -> exit code 2
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-t', '--target'):
            target_ontology_file = arg
        elif opt in ('-s', '--source'):
            source_ontology_file = arg
    converter = Py2TTLOntologyConverter(project_ontology_file=source_ontology_file)
    # a positional argument overrides the configured datatypes directory
    if len(args) > 0:
        datatypes_dir = args[0]
    if target_ontology_file == '':
        # default target file name is derived from the project prefix
        target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, converter.project_name)
    return converter.create_ontology(datatypes_dir, target_ontology_file)
if __name__ == "__main__":
    # delegate to main() and propagate its exit code to the shell
    sys.exit(main(sys.argv[1:]))
Index: py2ttl/xml_conform_dictionary.py
===================================================================
--- py2ttl/xml_conform_dictionary.py (revision 109)
+++ py2ttl/xml_conform_dictionary.py (revision 110)
@@ -1,121 +1,124 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This represents a xml conform dictionary of data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import inspect
import re
import warnings
from class_spec import SemanticClass
class XMLConformDictionary:
    """
    This represents a xml conform dictionary of data.

    Builtin-typed values are stored as xml attributes (lists space-joined),
    non-builtin values are kept as "attachables" that attach themselves
    as sub nodes.
    """

    def __init__(self):
        self.attachables = []   # non-builtin values, attached later as sub nodes
        self.builtins = {}      # builtin values, keyed by xml attribute name
        self.builtin_list = {}  # lists of builtin values, keyed by xml attribute name

    def attach_data_to_tree(self, node):
        """Attach data to node: builtins become attributes, builtin lists
        become space-joined attributes, attachables append sub nodes.
        """
        for xml_key, value in self.builtins.items():
            node.set(xml_key, value)
        for xml_key, value_list in self.builtin_list.items():
            node.set(xml_key, ' '.join([ str(i) for i in value_list]))
        for attachable in self.attachables:
            attachable.attach_object_to_tree(node)

    @classmethod
    def create_cls_from_data_object(cls, data_object):
        """Create a XMLConformDictionary.

        :param data_object: a SemanticClass instance; its semantic dictionary
           determines which attributes are recorded
        :raise TypeError: if data_object is not a SemanticClass instance, or
           if a bool-typed property holds a non-bool value
        """
        if not issubclass(type(data_object), SemanticClass):
            msg = f'{type(data_object)} is not a subclass of {SemanticClass}'
            raise TypeError(msg)
        property_d = data_object.get_semantic_dictionary()[data_object.PROPERTIES_KEY]
        xml_d = cls()
        for key in property_d.keys():
            value = data_object.__dict__.get(key)
            # skip missing values and empty lists
            if value is not None and (type(value) != list or len(value) > 0):
                # the semantic type is stored under CLASS_KEY for dict entries,
                # otherwise as the first tuple/list item
                semantic_type = property_d[key][data_object.CLASS_KEY]\
                        if type(property_d[key]) is dict\
                        else property_d[key][0]
                if type(value) != list and semantic_type.__module__ == 'builtins':
                    if semantic_type == bool:
                        # reject values that are not actual booleans
                        if value != True and value != False:
                            msg = f'Value "{value}" for key "{key}" is not of type "bool"'
                            raise TypeError(msg)
                        # xml convention: lowercase 'true'/'false'
                        xml_d.builtins.update({key.replace('_','-'): str(value).lower()})
                    else:
                        xml_d.builtins.update({key.replace('_','-'): str(value)})
                elif semantic_type.__module__ != 'builtins':
                    # complex values attach themselves later as sub nodes
                    if type(value) != list:
                        xml_d.attachables.append(value)
                    else:
                        for item in value:
                            xml_d.attachables.append(item)
                else:
                    # a list of builtin values
                    xml_d.builtin_list.update({key.replace('_','-'): value})
        return xml_d

    @staticmethod
    def CREATE_INSTANCEOF_CLASS_FROM_NODE(semantic_class, node):
        """Create a instance of semantic_class from node.

        Inverse of create_cls_from_data_object: reads builtin values from
        the node's attributes and recursively instantiates complex values
        from sub nodes (using the type's own create_cls_from_node if defined).
        :raise TypeError: if semantic_class is not a subclass of SemanticClass
        """
        if not issubclass(semantic_class, SemanticClass):
            msg = f'{semantic_class} is not a subclass of {SemanticClass}'
            raise TypeError(msg)
        property_d = semantic_class.get_semantic_dictionary()[semantic_class.PROPERTIES_KEY]
        class_instance = semantic_class()
        for key in property_d.keys():
            semantic_type = property_d[key][semantic_class.CLASS_KEY]\
                    if type(property_d[key]) is dict\
                    else property_d[key][0]
            if semantic_type.__module__ == 'builtins' and semantic_type != list:
                value = node.get(key.replace('_','-'))
                if semantic_type == bool:
                    # attribute was serialized as 'true'/'false'
                    class_instance.__dict__.update({key: (value == 'true')})
                elif semantic_type != str:
                    # whitespace inside the attribute marks a space-joined list
                    if re.match(r'(.*)(\s)', value):
                        class_instance.__dict__.update({key: [ semantic_type(item) for item in value.split(' ')] })
                    else:
                        class_instance.__dict__.update({key: semantic_type(value)})
                else:
                    class_instance.__dict__.update({key: value})
            else:
                attachables = []
                for sub_node in node.xpath(semantic_type.XML_TAG):
                    sub_instance = semantic_type.create_cls_from_node(sub_node)\
                            if 'create_cls_from_node' in semantic_type.__dict__\
                            else XMLConformDictionary.CREATE_INSTANCEOF_CLASS_FROM_NODE(semantic_type, sub_node)
                    attachables.append(sub_instance)
                if len(attachables) > 0:
                    # a single sub instance is stored directly, several as a list
                    if len(attachables) > 1:
                        class_instance.__dict__.update({key: attachables})
                    else:
                        class_instance.__dict__.update({key: attachables[0]})
        return class_instance
Index: tests_py2ttl/test_data/mapping_dict.xml
===================================================================
--- tests_py2ttl/test_data/mapping_dict.xml (revision 109)
+++ tests_py2ttl/test_data/mapping_dict.xml (revision 110)
@@ -1,391 +1,408 @@
tln
http://www.nie.org/ontology/nietzsche#
./tln-ontology_autogenerated.ttl
http://www.nie.org/ontology/nietzsche#ManuscriptUnity
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasManuscriptType
http://www.nie.org/ontology/nietzsche#hasPages
http://www.nie.org/ontology/nietzsche#hasDescription
http://www.nie.org/ontology/nietzsche#ArchivalManuscriptUnity
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasManuscriptType
http://www.nie.org/ontology/nietzsche#hasPages
http://www.nie.org/ontology/nietzsche#hasStyles
+ http://www.nie.org/ontology/nietzsche#hasGsaSignature
http://www.nie.org/ontology/nietzsche#hasDescription
http://www.nie.org/ontology/nietzsche#partsBelongToReconstructedKonvolut
http://www.nie.org/ontology/nietzsche#hasEarlierDescriptions
http://www.nie.org/ontology/nietzsche#EditorComment
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#AtypicalWriting
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#atypicalWritingHasText
http://www.nie.org/ontology/nietzsche#Path
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#Box
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#hasEarlierText
http://www.nie.org/ontology/nietzsche#Clarification
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#clarificationHasText
http://www.nie.org/ontology/nietzsche#Color
http://www.nie.org/ontology/nietzsche#colorHasName
http://www.nie.org/ontology/nietzsche#hasHexadecimalValue
http://www.nie.org/ontology/nietzsche#Text
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#Description
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#EarlierDescription
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#hasAuthor
http://www.nie.org/ontology/nietzsche#hasCitation
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#EditorCorrection
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#hasCorrectionText
http://www.nie.org/ontology/nietzsche#Image
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#FaksimileImage
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasApiurl
http://www.nie.org/ontology/nietzsche#hasThumburl
http://www.nie.org/ontology/nietzsche#hasMediumurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#PositionalObject
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#WordPosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#FaksimilePosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
+
+ http://www.nie.org/ontology/nietzsche#Imprint
+
+ http://www.nie.org/ontology/nietzsche#imprintHasReference
+ http://www.nie.org/ontology/nietzsche#imprintRefersToLines
+
+
http://www.nie.org/ontology/nietzsche#Line
http://www.nie.org/ontology/nietzsche#lineHasNumber
http://www.nie.org/ontology/nietzsche#lineHasBottomValueOnTranskription
http://www.nie.org/ontology/nietzsche#lineHasTopValueOnTranskription
http://www.nie.org/ontology/nietzsche#lineHasInnerBottomValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasInnerTopValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasOuterBottomValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasOuterTopValueOnFaksimile
http://www.nie.org/ontology/nietzsche#isMainLine
http://www.nie.org/ontology/nietzsche#lineHasEditorComment
http://www.nie.org/ontology/nietzsche#LineContinuation
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#isLineAContinuationTo
http://www.nie.org/ontology/nietzsche#lineContinuationHasReference
http://www.nie.org/ontology/nietzsche#SimpleWord
http://www.nie.org/ontology/nietzsche#hasText
+ http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#SpecialWord
http://www.nie.org/ontology/nietzsche#hasText
+ http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#MarkForeignHands
http://www.nie.org/ontology/nietzsche#hasText
+ http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#penOfForeignHands
+ http://www.nie.org/ontology/nietzsche#resolutionOfAbbreviation
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#textOfForeignHands
http://www.nie.org/ontology/nietzsche#Page
http://www.nie.org/ontology/nietzsche#hasNumber
http://www.nie.org/ontology/nietzsche#hasOrientation
+ http://www.nie.org/ontology/nietzsche#hasImprints
http://www.nie.org/ontology/nietzsche#hasLines
http://www.nie.org/ontology/nietzsche#hasMarkForeignHands
http://www.nie.org/ontology/nietzsche#hasWords
http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
http://www.nie.org/ontology/nietzsche#hasFaksimileImage
http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField
http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField
http://www.nie.org/ontology/nietzsche#hasSvgImage
http://www.nie.org/ontology/nietzsche#NonExistentPage
http://www.nie.org/ontology/nietzsche#hasNumber
http://www.nie.org/ontology/nietzsche#hasOrientation
+ http://www.nie.org/ontology/nietzsche#hasImprints
http://www.nie.org/ontology/nietzsche#hasLines
http://www.nie.org/ontology/nietzsche#hasMarkForeignHands
http://www.nie.org/ontology/nietzsche#hasWords
http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
http://www.nie.org/ontology/nietzsche#hasStatus
http://www.nie.org/ontology/nietzsche#hasFaksimileImage
http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField
http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField
http://www.nie.org/ontology/nietzsche#hasSvgImage
http://www.nie.org/ontology/nietzsche#ReconstructedKonvolut
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasManuscriptType
http://www.nie.org/ontology/nietzsche#hasPages
http://www.nie.org/ontology/nietzsche#hasDescription
http://www.nie.org/ontology/nietzsche#Reference
http://www.nie.org/ontology/nietzsche#firstLineOfReference
http://www.nie.org/ontology/nietzsche#lastLineOfReference
http://www.nie.org/ontology/nietzsche#wordReference
http://www.nie.org/ontology/nietzsche#IsUncertain
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasPageNumber
http://www.nie.org/ontology/nietzsche#SVGImage
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#StandoffTag
http://www.nie.org/ontology/nietzsche#standoffTagHasStartIndex
http://www.nie.org/ontology/nietzsche#standoffTagHasEndIndex
http://www.nie.org/ontology/nietzsche#standoffTagHasCSS
http://www.nie.org/ontology/nietzsche#TextConnectionMark
http://www.nie.org/ontology/nietzsche#hasText
+ http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#textConnectionMarkHasTextSource
http://www.nie.org/ontology/nietzsche#TextField
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#TranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#UncertainDecipherment
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#Word
http://www.nie.org/ontology/nietzsche#hasText
+ http://www.nie.org/ontology/nietzsche#hasCleanText
http://www.nie.org/ontology/nietzsche#hasEditedText
+ http://www.nie.org/ontology/nietzsche#hasCleanEditedText
http://www.nie.org/ontology/nietzsche#wordHasWordParts
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#wordHasStyle
http://www.nie.org/ontology/nietzsche#overwritesWord
http://www.nie.org/ontology/nietzsche#isTransformationOfWord
http://www.nie.org/ontology/nietzsche#isExtensionOfWord
http://www.nie.org/ontology/nietzsche#isDeletionOfWord
http://www.nie.org/ontology/nietzsche#isClarificationOfWord
http://www.nie.org/ontology/nietzsche#wordHasEarlierVersion
http://www.nie.org/ontology/nietzsche#wordHasCorrection
http://www.nie.org/ontology/nietzsche#wordIsDeletedByPath
http://www.nie.org/ontology/nietzsche#wordHasEditorComment
http://www.nie.org/ontology/nietzsche#WordDeletionPath
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#WordInsertionMark
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasMarkType
http://www.nie.org/ontology/nietzsche#hasSymbolId
http://www.nie.org/ontology/nietzsche#hasNextWord
http://www.nie.org/ontology/nietzsche#hasPreviousWord
http://www.nie.org/ontology/nietzsche#wordInsertionMarkBelongsToLine
xml-dictionary
- 2021-02-26 15:46:21
+ 2021-08-23 09:52:15
Index: Friedrich-Nietzsche-late-work-ontology.ttl
===================================================================
--- Friedrich-Nietzsche-late-work-ontology.ttl (revision 109)
+++ Friedrich-Nietzsche-late-work-ontology.ttl (revision 110)
@@ -1,160 +1,167 @@
@prefix dct: .
-@prefix document: .
+@prefix document: .
@prefix homotypic: .
@prefix stoff: .
@prefix text: .
@prefix owl: .
@prefix rdfs: .
@prefix rdf: .
@prefix skos: .
@prefix xsd: .
@prefix tln: .
a owl:Ontology;
dct:license ;
dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en;
dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsche's late work."""@en;
dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en;
dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en;
dct:publisher "Basel University, Switzerland"@en.
tln:TextGenesis a owl:Class ;
rdfs:label "identifies a genetic order of text versions"@en ;
rdfs:comment "Identifies a genetic order of text versions, i.e. groups text units as earlier and later versions of each other."@en ;
rdfs:isDefinedBy .
tln:IdentifiedTextVersion a owl:Class ;
rdfs:label "identifies a list of text unities as a text version"@en ;
rdfs:comment "Identification of a list of text unities (e.g. pages or parts of pages) as a text version for which there is an earlier or later version."@en ;
rdfs:isDefinedBy .
tln:PartOfPageTextUnit a owl:Class ;
rdfs:label "identifies a part of a page as a text unity"@en ;
rdfs:comment "Identification of a part of page as a text unity."@en ;
rdfs:isDefinedBy ;
rdfs:subClassOf [ a owl:Restriction ;
owl:cardinality "1"^^xsd:nonNegativeInteger ;
owl:onProperty tln:belongsToPage ],
[ a owl:Restriction ;
owl:cardinality "1"^^xsd:nonNegativeInteger ;
owl:onProperty tln:startLine ],
[ a owl:Restriction ;
owl:cardinality "1"^^xsd:nonNegativeInteger ;
owl:onProperty tln:endLine ] .
tln:ExternalTextUnit a owl:Class ;
rdfs:label "a text unit that has been published external to the digital edition"@en ;
rdfs:comment "A text unit that has been published external to the digital edition."@en ;
rdfs:isDefinedBy ;
rdfs:subClassOf tln:IdentifiedTextVersion .
tln:Page a owl:Class ;
rdfs:subClassOf document:Page .
tln:belongsToPage a owl:ObjectProperty ;
rdfs:label "relates a part of a page with the page it is a part of"@en ;
rdfs:comment "Relates a part of a page with the page it is a part of."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:PartOfPageTextUnit ;
rdfs:range tln:Page.
tln:startLine a owl:ObjectProperty ;
rdfs:label "relates a part of a page with the line it starts with"@en ;
rdfs:comment "Relates a part of a page with the line it starts with."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:PartOfPageTextUnit ;
rdfs:range tln:Line.
tln:endLine a owl:ObjectProperty ;
rdfs:label "relates a part of a page with the line it ends with"@en ;
rdfs:comment "Relates a part of a page with the line it ends with."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:PartOfPageTextUnit ;
rdfs:range tln:Line.
tln:identifiesAsVersion a owl:ObjectProperty ;
rdfs:label "groups a list of text unities together as an identified text version"@en ;
rdfs:comment "Groups a list of text unities together as an identified text version for which there is an earlier or later version."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:IdentifiedTextVersion ;
rdfs:range rdf:List.
tln:hasGeneticOrder a owl:ObjectProperty ;
rdfs:label "relates a list of text versions to an identified genetic order"@en ;
rdfs:comment "Relates a list of text versions to an identified genetic order. The position in the list determines the version of a text unit."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:TextGenesis ;
rdfs:range rdf:List.
tln:textUnitHasTitle a owl:ObjectProperty ;
rdfs:label "relates an externally published text unit with a title"@en ;
rdfs:comment "Relates an externally published text unit with a title by which it can be identified."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:ExternalTextUnit ;
rdfs:range xsd:string .
tln:textUnitHasUrl a owl:ObjectProperty ;
rdfs:label "relates an externally published text unit with a URL"@en ;
rdfs:comment "Relates an externally published text unit with a URL by which it can be visited."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:ExternalTextUnit ;
rdfs:range xsd:anyURI .
tln:hasImage a owl:ObjectProperty ;
rdfs:label "relates a page to a image"@en ;
rdfs:comment "relates a page to an image that has a textfield that specifies the area where the writing that constitutes the page can be found."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Page ;
rdfs:range tln:Image .
tln:hasUrl a owl:DatatypeProperty ;
rdfs:label "has Url"@en ;
rdfs:domain tln:Image ;
rdfs:isDefinedBy ;
rdfs:range xsd:anyURI .
-tln:inheritOverwritesWord a owl:ObjectProperty ;
- rdfs:subPropertyOf tln:overwritesWord;
- rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ;
- rdfs:comment "The author has used this word in order to overwrite that word."@en ;
- rdfs:isDefinedBy ;
- owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ).
+#tln:inheritOverwritesWord a owl:ObjectProperty ;
+# rdfs:subPropertyOf tln:overwritesWord;
+# rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ;
+# rdfs:comment "The author has used this word in order to overwrite that word."@en ;
+# rdfs:isDefinedBy ;
+# owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ).
+
+tln:hasStandoffMarkup4PartThatOverwritesWord a owl:ObjectProperty ;
+ rdfs:label "word has standoff markup for the part that overwrites a word"@en ;
+ rdfs:comment "word has standoff markup that highlights the part of its text that overwrites a word"@en ;
+ rdfs:isDefinedBy ;
+ rdfs:domain tln:Word ;
+ rdfs:range stoff:StandoffMarkup .
tln:lineContinuesOn a owl:ObjectProperty ;
rdfs:label "writing from subject line continues on object line"@en ;
rdfs:comment "the writing that ends on subject line continues on object line"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Line ;
rdfs:range tln:Line .
tln:pageIsOnTextField a owl:ObjectProperty ;
rdfs:label "page is on text field"@en ;
rdfs:comment "the writing that is referred to as subject can be found on object"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Page ;
rdfs:range tln:TextField .
tln:writingContinuesWithWord a owl:ObjectProperty ;
rdfs:label "writing continues with next word"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Word ;
rdfs:range tln:Word .
tln:selectableWordProperty a owl:ObjectProperty ;
rdfs:label "a property of a word for which it can be selected"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Word .
tln:cardinalityGreaterOne a rdf:Property ;
rdfs:label "whether a tln:selectableWordProperty can have a greater cardinality then one"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:selectableWordProperty ;
rdfs:range xsd:boolean .
tln:suggestedMaxCardinality a rdf:Property ;
rdfs:label "the suggested max cardinality of a tln:selectableWordProperty on a word"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:selectableWordProperty ;
rdfs:range xsd:nonNegativeInteger .