Index: fixes/interactive_editor.py
===================================================================
--- fixes/interactive_editor.py (revision 106)
+++ fixes/interactive_editor.py (revision 107)
@@ -1,967 +1,1026 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from datetime import datetime
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
from svgpathtools.parser import parse_path
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
from checker_handler import CheckerHandler
from fix_old_data import save_page
from fix_boxes import attach_box, split_into_parts_and_attach_box
sys.path.append('svgscripts')
from convert_wordPositions import HTMLConverter, JSONConverter
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word, update_transkription_position_ids
from datatypes.word_deletion_path import WordDeletionPath
from join_faksimileAndTranskription import sort_words, add_faksimile_image
from util import back_up, back_up_svg_file, copy_faksimile_svg_file, change_title_of_svg
from process_files import update_svgposfile_status
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
MAX_SVG_XY_THRESHOLD = 10
class ResponseHandler:
    """Base class for editor response handlers.

    A handler carries dialog metadata (action name, description, trigger
    prefix) and implements the default workflow: collect words and
    requirements from a json response and run a (no-op) change.
    """
    def __init__(self, response_starts_with=None, dialog_string=None, action_name=None, description=None):
        # Prefix that triggers this handler; None means "matches everything".
        self.response_starts_with = response_starts_with
        self.dialog_string = dialog_string
        self.action_name = action_name
        self.description = description

    def create_requirement_list(self) ->list:
        """Return the list of requirement dictionaries (none by default)."""
        return []

    def create_json_dict(self)->dict:
        """Return a json dictionary describing this handler."""
        handler_dict = { 'action_name': self.action_name, 'description': self.description }
        requirement_list = self.create_requirement_list()
        if requirement_list:
            handler_dict['requirements'] = requirement_list
        return handler_dict

    def get_transkription_words(self, json_dict: dict) ->list:
        """Return those words from json_dict that have a transkription position id."""
        all_words = json_dict.get('words') or []
        return [ word for word in all_words if bool(word.get('tp_id')) ]

    def get_requirement(self, json_dict: dict, index=0) ->tuple:
        """Return the requirement tuple (name, input) at *index*, or (None, None)."""
        name = requirement = None
        if dict_contains_keys(json_dict, ['response_handler','requirements'])\
        and index < len(json_dict['response_handler']['requirements']):
            requirement_dict = json_dict['response_handler']['requirements'][index]
            if dict_contains_keys(requirement_dict, ['name'])\
            and dict_contains_keys(requirement_dict, ['input']):
                name, requirement = requirement_dict['name'], requirement_dict['input']
        return name, requirement

    def match(self, response: str) ->bool:
        """Return whether *response* matches this handler."""
        if self.response_starts_with is None:
            return True
        return response.startswith(self.response_starts_with)

    def print_dialog(self):
        """Print the handler's dialog string, if one is set."""
        if self.dialog_string is not None:
            print(f'[{self.dialog_string}]')

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Build an action dictionary from json_dict and run the change."""
        transkription_words = self.get_transkription_words(json_dict)
        selected_ids = [ jword.get('id') for jword in transkription_words ]
        action_dictionary = { 'words': [ word for word in page.words if word.id in selected_ids ] }
        # one requirement input per declared requirement, matched by position
        for index in range(len(self.create_requirement_list())):
            name, requirement = self.get_requirement(json_dict, index=index)
            action_dictionary[name] = requirement
        return self.run_change(page, action_dictionary)

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle an interactive response; the default runs an empty change."""
        return self.run_change(page, {})

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code (0 = ok)."""
        return 0
class JoinWords(ResponseHandler):
    # Joins several words of a page into a single word.
    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response interactively and return exit code.

        A non-digit prefix followed by whitespace at the start of the
        response signals that white space should be kept between the
        joined words; the prefix is stripped before parsing the word ids.
        """
        action_dictionary = { 'words' : shell._get_words_from_response(re.compile('^\D+\s').sub('', response), page.words),\
                'add_white_space_between_words': re.match(r'^\D+\s', response) }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2
    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code.
        """
        exit_code = 0
        add_white_space_between_words = action_dictionary['add_white_space_between_words']\
                if bool(action_dictionary.get('add_white_space_between_words'))\
                else False
        words = action_dictionary['words']\
                if bool(action_dictionary.get('words'))\
                else []
        if len(words) > 0:
            # Words on one line with a uniform deletion status are joined
            # in place onto the first word; otherwise a new joined word is
            # created and substituted for all source words.
            if len(set([ word.line_number for word in words ])) == 1\
            and len(set([ word.deleted for word in words ])) == 1:
                new_word = words[0]
                for word2join in words[1:]:
                    page.words.remove(word2join)
                    new_word.join(word2join, add_white_space_between_words=add_white_space_between_words)
            else:
                new_word = Word.join_words(words, add_white_space_between_words=add_white_space_between_words)
                # Insert position: where the first source word (or the page
                # word containing it as a word part) was located.
                index = len(page.words)
                if words[0] in page.words:
                    index = page.words.index(words[0])
                elif len([ word for word in page.words if words[0] in word.word_parts ]) > 0:
                    index = page.words.index([ word for word in page.words if words[0] in word.word_parts ][0])
                for word2join in words:
                    if word2join in page.words:
                        page.words.remove(word2join)
                    elif len([ word for word in page.words if word2join in word.word_parts ]) > 0:
                        page.words.remove([ word for word in page.words if word2join in word.word_parts ][0])
                page.words.insert(index, new_word)
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
class SimpleJoinWords(JoinWords):
    """JoinWords variant triggered by a response that is a bare list of word ids."""
    def match(self, response: str) ->bool:
        """Return whether the response starts with a digit sequence (a word id).

        BUGFIX: the original returned the `re.Match`/`None` result despite the
        declared `bool` return type; convert explicitly so identity comparisons
        (`is True`/`is False`) by callers behave as annotated.
        """
        return re.match(r'\d+', response) is not None
class SaveChanges(ResponseHandler):
    """Save changes to word properties (deletion status, line number) and to
    faksimile word/rect assignments, then persist the page.

    NOTE: this block contained unresolved diff markers (`+`/`-` lines from a
    rev 106→107 patch); they are resolved here to the rev-107 state.
    """
    # Indices into the (word_attribute, word_dict_key) pairs below.
    WORD_INDEX = 0
    WDICT_INDEX = 1
    # Maps a Word attribute name to the corresponding key in the json word dict.
    RELEVANT_PROPERTIES = [ ('deleted','deleted'), ('line_number','line') ]

    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Handle response and return exit code.
        """
        self.run_change(page, {})
        return shell.run_interactive_editor(page)

    def _update_transkription_word(self, word, word_dict) ->int:
        """Update properties of word according to word_dict, return exit code."""
        exit_code = 0
        for relevant_property in self.RELEVANT_PROPERTIES:
            if len(word.word_parts) > 0:
                # a three-part tp_id ('<id>:w<wp_index>:tp<tp_index>') addresses a word part
                if len(word_dict['tp_id'].split(':')) == 3:
                    wp_index = int(word_dict['tp_id'].split(':')[1].replace('w',''))
                    word.word_parts[wp_index].__dict__[relevant_property[self.WORD_INDEX]] = word_dict[relevant_property[self.WDICT_INDEX]]
                else:
                    return 2
            else:
                word.__dict__[relevant_property[self.WORD_INDEX]] = word_dict[relevant_property[self.WDICT_INDEX]]
        return exit_code

    def _update_faksimile_word(self, word, word_dict, words) ->int:
        """Move the faksimile position named by word_dict['fp_id'] from its old
        word (word_dict['old_id']) to *word*; return exit code
        (0 = ok, 2 = position not found, 3 = old word not found)."""
        exit_code = 0
        if word_dict.get('old_id') is not None:
            fp_id = word_dict['fp_id']
            old_id = int(word_dict['old_id'])
            if len([w for w in words if w.id == old_id ]) > 0:
                old_word = [w for w in words if w.id == old_id ][0]
                faksimile_position = None
                # search the old word itself first, then its word parts
                if len([ fp for fp in old_word.faksimile_positions if fp.id == fp_id ]) > 0:
                    faksimile_position = [ fp for fp in old_word.faksimile_positions if fp.id == fp_id ][0]
                    old_word.faksimile_positions.remove(faksimile_position)
                elif len([ fp for w in old_word.word_parts for fp in w.faksimile_positions if fp.id == fp_id ]) > 0:
                    for w in old_word.word_parts:
                        for fp in w.faksimile_positions:
                            if fp.id == fp_id:
                                faksimile_position = fp
                                w.faksimile_positions.remove(faksimile_position)
                                break
                if faksimile_position is not None:
                    word.faksimile_positions.append(faksimile_position)
                else:
                    return 2
            else:
                return 3
        else:
            # No source word given: nothing to move, just log the addressed position.
            fp_id = word_dict['fp_id']
            print(word.id, fp_id)
        return exit_code

    def _update_word(self, word, word_dict, words) ->int:
        """Dispatch to the transkription or faksimile update, return exit code."""
        exit_code = 0
        if bool(word_dict.get('tp_id')):
            exit_code = self._update_transkription_word(word, word_dict)
            if exit_code > 0:
                return exit_code
        elif bool(word_dict.get('fp_id')):
            exit_code = self._update_faksimile_word(word, word_dict, words)
            if exit_code > 0:
                print(exit_code)
                return exit_code
        else:
            return 2
        return exit_code

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code.
        """
        # Ids starting with 'rect' address rects in the faksimile svg file:
        # their title is changed in the svg instead of updating a page word.
        svg_words = [ word for word in json_dict['words'] if str(word.get('id')).startswith('rect') ]
        if page.faksimile_svgFile is not None:
            for word in svg_words:
                word_id = word.get('id')
                word_text = word.get('text')
                print(f'Changing rect {word_id} to {word_text}')
                change_title_of_svg(page.faksimile_svgFile, word_id, word_text)
        json_word_ids = [ int(jw.get('id')) for jw in json_dict['words'] if not str(jw.get('id')).startswith('rect') ]
        for word in page.words:
            if word.id in json_word_ids:
                print('updating word', word.id, word.text)
                # BUGFIX: exclude 'rect*' ids here as well — int() on them
                # would raise ValueError.
                word_dict = [ jw for jw in json_dict['words']\
                        if not str(jw.get('id')).startswith('rect') and int(jw.get('id')) == word.id ][0]
                if self._update_word(word, word_dict, page.words) > 0:
                    return 2
        return self.run_change(page, {})

    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Persist the page (with backup) and return exit code."""
        exit_code = 0
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            page = Page(page.page_tree.docinfo.URL)
        return exit_code
class SavePositions(SaveChanges):
    # Saves updated transkription/faksimile position coordinates for words.
    def _update_word(self, word, word_dict_list) ->int:
        """Update properites of word according to word_dict,
        return exit_code
        """
        # NOTE(review): overrides SaveChanges._update_word with a different
        # signature (list of word dicts instead of a single dict + words) —
        # callers must use the SavePositions.handle_response path.
        exit_code = 0
        for word_dict in word_dict_list:
            if bool(word_dict.get('tp_id')):
                exit_code = self._update_transkription_position(word, word_dict)
                if exit_code > 0:
                    return exit_code
            elif bool(word_dict.get('fp_id')):
                exit_code = self._update_faksimile_position(word, word_dict)
                if exit_code > 0:
                    return exit_code
        return exit_code
    def _update_transkription_position(self, word, word_dict) ->int:
        """Update transkription position properites of word according to word_dict,
        return exit_code
        """
        # tp_id has either 3 parts (second addresses a word part, 'w<i>',
        # third a transkription position, 'tp<j>') or 2 parts (whole word) —
        # inferred from the replace('w','')/replace('tp','') parsing below.
        tp_id_list = word_dict['tp_id'].split(':')
        if len(tp_id_list) == 3 and len(word.word_parts) > 0:
            wp_index = int(tp_id_list[1].replace('w',''))
            tp_index = int(tp_id_list[2].replace('tp',''))
            if wp_index < len(word.word_parts) and tp_index < len(word.word_parts[wp_index].transkription_positions):
                word.word_parts[wp_index].transkription_positions[tp_index].left = float(word_dict['left'])
                word.word_parts[wp_index].transkription_positions[tp_index].top = float(word_dict['top'])
                # keep bottom consistent with the new top and the unchanged height
                word.word_parts[wp_index].transkription_positions[tp_index].bottom = word.word_parts[wp_index].transkription_positions[tp_index].top\
                        + word.word_parts[wp_index].transkription_positions[tp_index].height
            else:
                return 2
        elif len(tp_id_list) == 2:
            tp_index = int(tp_id_list[1].replace('tp',''))
            if tp_index < len(word.transkription_positions):
                word.transkription_positions[tp_index].left = float(word_dict['left'])
                word.transkription_positions[tp_index].top = float(word_dict['top'])
                word.transkription_positions[tp_index].bottom = word.transkription_positions[tp_index].top\
                        + word.transkription_positions[tp_index].height
            else:
                return 2
        else:
            return 2
        return 0
    def _update_faksimile_position(self, word, word_dict) ->int:
        """Update faksimile position properites of word according to word_dict,
        return exit_code
        """
        exit_code = 0
        fp_id = word_dict['fp_id']
        faksimile_position = None
        # search the word itself first; a match on a word part takes precedence
        if len([ fp for fp in word.faksimile_positions if fp.id == fp_id ]) > 0:
            faksimile_position = [ fp for fp in word.faksimile_positions if fp.id == fp_id ][0]
        if len([ fp for w in word.word_parts for fp in w.faksimile_positions if fp.id == fp_id ]) > 0:
            faksimile_position = [ fp for w in word.word_parts for fp in w.faksimile_positions if fp.id == fp_id ][0]
        if faksimile_position is not None:
            faksimile_position.left = float(word_dict['left'])
            faksimile_position.top = float(word_dict['top'])
            faksimile_position.bottom = faksimile_position.top + faksimile_position.height
        else:
            return 2
        return exit_code
    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code.
        """
        # NOTE(review): ids are compared without int() conversion here, unlike
        # SaveChanges.handle_response — confirm json ids and word.id share a type.
        json_word_ids = [ jw.get('id') for jw in json_dict['words'] ]
        for word in page.words:
            if word.id in json_word_ids:
                word_dict_list = [ jw for jw in json_dict['words'] if jw.get('id') == word.id ]
                if self._update_word(word, word_dict_list) > 0:
                    return 2
        return self.run_change(page, {})
class AddDeletionPath(SaveChanges):
    """Attach deletion paths selected in the editor to words."""
    def _add_deletion_path(self, page, word, word_dict_list) ->int:
        """Append the deletion path named in each word dict to *word*.

        Returns 0 on success, 2 when a path cannot be resolved on the page.
        """
        exit_code = 0
        for word_dict in word_dict_list:
            d_attribute = word_dict['deletion_path']
            already_attached = len([ path for path in word.deletion_paths if path.d_attribute == d_attribute ]) > 0
            if not already_attached:
                dpath = page.get_word_deletion_path(d_attribute=d_attribute)
                if dpath is None:
                    exit_code = 2
                else:
                    word.deletion_paths.append(dpath)
        return exit_code

    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code."""
        transkription_words = self.get_transkription_words(json_dict)
        relevant_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ]
        for word in page.words:
            if word.id not in relevant_ids:
                continue
            word_dicts = [ jw for jw in transkription_words if jw.get('id') == word.id ]
            if self._add_deletion_path(page, word, word_dicts) > 0:
                return 2
        return self.run_change(page, {})
class RemoveDeletionPath(SaveChanges):
    # Removes deletion paths from words and from the page's xml tree.
    def _remove_deletion_path(self, page, word, word_dict_list) ->int:
        """Update properites of word according to word_dict,
        return exit_code
        """
        # Exit code 2 means "nothing removed"; it flips to 0 as soon as a
        # path is removed here or in a recursively processed word part.
        exit_code = 2
        if len(word.word_parts) > 0:
            exit_code = 2
            for wpart in word.word_parts:
                result = self._remove_deletion_path(page, wpart, word_dict_list)
                if result == 0:
                    exit_code = 0
        deletion_paths = [ path for path in word.deletion_paths if path.d_attribute in\
                [ word_dict['deletion_path'] for word_dict in word_dict_list ] ]
        if len(deletion_paths) > 0:
            for path in deletion_paths:
                if path in word.deletion_paths:
                    word.deletion_paths.remove(path)
                # also drop the matching node(s) from the page xml tree
                for node in page.page_tree.xpath(f'./{WordDeletionPath.XML_TAG}[@d="{path.d_attribute}"]'):
                    node.getparent().remove(node)
            exit_code = 0
        return exit_code
    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code.
        """
        transkription_words = self.get_transkription_words(json_dict)
        json_word_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ]
        for word in page.words:
            if word.id in json_word_ids:
                word_dict_list = [ jw for jw in transkription_words if jw.get('id') == word.id ]
                if self._remove_deletion_path(page, word, word_dict_list) > 0:
                    return 2
        return self.run_change(page, {})
class JoinDeletionPath(SaveChanges):
    # Joins several deletion paths of a word into a single svg path.
    def _join_deletion_path(self, page, word, word_dict_list) ->int:
        """Update properites of word according to word_dict,
        return exit_code
        """
        deletion_paths = [ path for path in word.deletion_paths if path.d_attribute in\
                [ word_dict['deletion_path'] for word_dict in word_dict_list ] ]
        if len(deletion_paths) > 1:
            # Concatenate the svg path data: the 'M' (moveto) commands of all
            # but the first path are turned into 'L' (lineto) so the result
            # is one continuous path.
            # NOTE(review): .replace('M', 'L') replaces every 'M' in the d
            # attribute, not only the leading one — confirm that multi-subpath
            # d attributes cannot occur here.
            path_string = ''
            for p in deletion_paths:
                path_string = path_string + ' ' + p.d_attribute.replace('M', 'L')\
                        if path_string != ''\
                        else p.d_attribute
                word.deletion_paths.remove(p)
                if p in page.word_deletion_paths:
                    page.word_deletion_paths.remove(p)
            new_path = parse_path(path_string)
            # the joined path inherits id and style from the first source path
            word.deletion_paths.append(WordDeletionPath(Path(id=deletion_paths[0].id, path=new_path), deletion_paths[0].style))
            page.word_deletion_paths.append(word.deletion_paths[-1])
            # rewrite all deletion path nodes in the xml tree from scratch
            for node in page.page_tree.xpath(f'./{WordDeletionPath.XML_TAG}'): node.getparent().remove(node)
            for p in page.word_deletion_paths: p.attach_object_to_tree(page.page_tree)
            return 0
        return 2
    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Handle response and return exit code.
        """
        transkription_words = self.get_transkription_words(json_dict)
        json_word_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ]
        for word in page.words:
            if word.id in json_word_ids:
                word_dict_list = [ jw for jw in transkription_words if jw.get('id') == word.id ]
                if self._join_deletion_path(page, word, word_dict_list) > 0:
                    return 2
        return self.run_change(page, {})
class RequestPathsNearWords(SaveChanges):
    """Flag selected words so that paths near them will be requested."""
    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Set the 'add_paths_near_words' process flag on each selected word."""
        transkription_words = self.get_transkription_words(json_dict)
        flagged_ids = [ jw.get('id') for jw in transkription_words if bool(jw.get('deletion_path')) ]
        for word in page.words:
            if word.id not in flagged_ids:
                continue
            # avoid duplicating the flag on repeated requests
            if 'add_paths_near_words' not in word.process_flags:
                word.process_flags.append('add_paths_near_words')
        return self.run_change(page, {})
class SetTaskDone(SaveChanges):
    """Mark a checker task as done and persist the page."""
    def handle_response(self, page: Page, json_dict: dict) -> int:
        """Mark json_dict['task'] as done; return 2 when no task is given."""
        task = json_dict.get('task')
        if not bool(task):
            return 2
        CheckerHandler(page).set_task_done(task)
        return self.run_change(page, {})
class Reload(ResponseHandler):
    """Reload the page from its xml source file."""
    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Re-read the page from disk and restart the interactive editor."""
        reloaded_page = Page(page.page_tree.docinfo.URL)
        return shell.run_interactive_editor(reloaded_page)
class RestoreBackup(ResponseHandler):
    """Restore the page from its backup file, if one exists."""
    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Open the backup file in the editor, or report failure (exit code 2)."""
        if page.bak_file is None:
            print('Could not restore backup file, please restore manually!')
            return 2
        return shell.run_interactive_editor(Page(page.bak_file))
class ChangeLine2Value(ResponseHandler):
    """Change the line number of selected words."""
    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Parse line number and word ids from the response, prompting the
        user for whatever is missing, then run the change.
        """
        words = []
        line_number = -1
        if re.match(r'l:\d+\s\d+', response):
            # response already contains both line number and word ids
            line_number = int(response.replace('l:', '').split(' ')[0])
            words = shell._get_words_from_response(re.compile(r'l:\d+\s').sub('', response), page.words)
        else:
            if not re.match(r'l:\d+$', response):
                new_response_line = input('Specify new line number>')
                if re.match(r'^\d+$', new_response_line):
                    line_number = int(new_response_line)
            else:
                line_number = int(response.replace('l:', ''))
            new_response = input(f'Specify ids of words for which line number should be changed to {line_number}>')
            if re.match(r'\d+', new_response):
                # BUGFIX: was the undefined name 'shell_get_words_from_response'
                # (missing 'shell._'), which raised NameError at runtime.
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words, 'line_number' : line_number }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2
    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code (2 when no line number given).
        """
        exit_code = 0
        # NOTE(review): truthiness test means a line number of 0 is treated
        # as "not set" — presumably line numbers start at 1; confirm.
        line_number = action_dictionary['line_number']\
                if bool(action_dictionary.get('line_number'))\
                else -1
        words = action_dictionary['words']\
                if bool(action_dictionary.get('words'))\
                else []
        if line_number != -1:
            for word in words: word.line_number = line_number
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
class CreateCorrectionHistory(ResponseHandler):
    """Create a correction history for selected words."""
    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Parse word ids from the response (prompting if absent) and run the change.
        """
        # BUGFIX: 'words' was unbound (NameError) when neither the response
        # nor the follow-up input matched; default to an empty selection,
        # which makes run_change return 2.
        words = []
        if re.match(r'c\w*\s\d+', response):
            words = shell._get_words_from_response(re.compile(r'c\w*\s').sub('', response), page.words)
        else:
            new_response = input(f'Specify ids of words to create a correction history. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2
    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code (2 when no words selected).
        """
        exit_code = 0
        words = action_dictionary['words']\
                if bool(action_dictionary.get('words'))\
                else []
        if len(words) > 0:
            for word in words: word.create_correction_history()
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
class DeleteCorrectionHistory(ResponseHandler):
    """Delete the correction history of selected words."""
    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Parse word ids from the response (prompting if absent) and run the change.
        """
        # BUGFIX: 'words' was unbound (NameError) when neither the response
        # nor the follow-up input matched; default to an empty selection.
        words = []
        if re.match(r'D\w*\s\d+', response):
            words = shell._get_words_from_response(re.compile(r'D\w*\s').sub('', response), page.words)
        else:
            new_response = input(f'Specify ids of words to delete their correction history. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words' : words }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2
    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code (2 when no words selected).
        """
        exit_code = 0
        words = action_dictionary['words']\
                if bool(action_dictionary.get('words'))\
                else []
        if len(words) > 0:
            for word in words:
                print(word.text)
                # dropping both fields erases the word's correction history
                word.earlier_version = None
                word.corrections = []
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
class ChangeDeletionStatus(ResponseHandler):
    """Mark selected words as deleted ('d…') or undeleted ('u…')."""
    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Parse word ids from the response (prompting if absent) and run the change.
        """
        # BUGFIX: 'words' was unbound (NameError) when neither the response
        # nor the follow-up input matched; default to an empty selection.
        words = []
        if re.match(r'[du]\w*\s\d+', response):
            words = shell._get_words_from_response(re.compile(r'[du]\w*\s').sub('', response), page.words)
        else:
            deletion_target = 'delete' if response.startswith('d') else 'undelete'
            new_response = input(f'Specify ids of words to {deletion_target}. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words, 'deleted': response.startswith('d') }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2
    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code (2 when no words selected).
        """
        exit_code = 0
        words = action_dictionary['words']\
                if bool(action_dictionary.get('words'))\
                else []
        word_should_be_deleted = bool(action_dictionary.get('deleted'))
        if len(words) > 0:
            for word in words: word.deleted = word_should_be_deleted
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
class SplitWords(ResponseHandler):
    """Split selected words according to a split text."""
    def _split_word(self, page, word, split_text):
        """Split *word* at *split_text* and replace it in page.words by the
        two resulting words."""
        index = page.words.index(word)
        _, left, right = word.split(split_text)
        page.words[index] = left
        page.words.insert(index+1, right)
    def create_requirement_list(self) ->list:
        """Return the requirements needed for splitting (the split text)."""
        return [{ 'name': 'split_text', 'type': 'string', 'input': None }]
    def handle_interactive_response(self, page: Page, response: str, shell) -> int:
        """Parse split text and word ids from the response (prompting for
        whatever is missing) and run the change.
        """
        # BUGFIX: 'words' was unbound (NameError) when the response did not
        # match and the follow-up input contained no ids; default to empty.
        words = []
        if re.match(r's\s\w+\s\d+', response):
            words = shell._get_words_from_response(re.compile(r's\s\w+\s').sub('', response), page.words)
            split_text = response.split(' ')[1]
        else:
            split_text = input('Input split text>')
            new_response = input(f'Specify ids of words to split. >')
            if re.match(r'\d+', new_response):
                words = shell._get_words_from_response(new_response, page.words)
        action_dictionary = { 'words': words, 'split_text': split_text }
        if self.run_change(page, action_dictionary) == 0:
            return shell.run_interactive_editor(page)
        return 2
    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Run changes on page and return exit code (2 when words or split
        text are missing).
        """
        exit_code = 0
        words = action_dictionary['words']\
                if bool(action_dictionary.get('words'))\
                else []
        split_text = action_dictionary['split_text']\
                if bool(action_dictionary.get('split_text'))\
                else ''
        if len(words) > 0 and split_text != '':
            for word in words: self._split_word(page, word, split_text)
            if not UNITTESTING:
                print(f'writing to {page.page_tree.docinfo.URL}')
                save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
                page = Page(page.page_tree.docinfo.URL)
        else:
            exit_code = 2
        return exit_code
class AddBox(ResponseHandler):
    """Attach a box with overwritten text to selected words."""
    def create_requirement_list(self) ->list:
        """Return the requirements needed to add a box."""
        return [
            { 'name': 'box_text', 'type': 'string', 'input': None },
            { 'name': 'overwritten_by', 'type': 'string', 'input': None },
            { 'name': 'is_earlier_version', 'type': 'boolean', 'input': False },
        ]
    def run_change(self, page: Page, action_dictionary: dict) -> int:
        """Attach boxes to the selected words and save the page.

        Returns 0 on success, 2 when no words or no box text were supplied.
        """
        selected_words = action_dictionary.get('words') or []
        missing_text = action_dictionary.get('box_text')
        is_earlier_version = action_dictionary.get('is_earlier_version')
        overwritten_by = action_dictionary.get('overwritten_by')
        if not selected_words or missing_text is None:
            return 2
        for word in selected_words:
            if overwritten_by is not None:
                # the word is split so the box attaches to the overwritten part
                split_into_parts_and_attach_box(word, 0, missing_text, is_earlier_version, overwritten_by)
            else:
                attach_box(word, 0, missing_text, False)
            word.create_correction_history()
            if len(word.corrections) > 0:
                for wp in word.word_parts:
                    wp.overwrites_word = None
        if not UNITTESTING:
            print(f'writing to {page.page_tree.docinfo.URL}')
            save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
            page = Page(page.page_tree.docinfo.URL)
        return 0
class ResponseOrganizer:
RESULT = 'result'
TIMESTAMP_NOT_SET = -1
def __init__(self, manuscript=None):
self.manuscript = manuscript
self.do_not_send = []
+ self.after_faksimile_merged = []
+ self.join_faksimile_positions = False
self.response_handler_dictionary = {}
self._add_response_handler(JoinWords(action_name='join words', description='join words'))
self._add_response_handler(SplitWords(action_name='split words', description='split word according to split text'))
- self._add_response_handler(CreateCorrectionHistory(action_name='create correction history', description='creates a correction history for selected words'))
- self._add_response_handler(DeleteCorrectionHistory(action_name='delete correction history', description='deletes the correction history of selected words'))
- self._add_response_handler(AddBox(action_name='add box', description='add box with overwritten text'))
+ self._add_response_handler(CreateCorrectionHistory(action_name='create correction history', description='creates a correction history for selected words'),\
+ is_after_faksimile_merged=True)
+ self._add_response_handler(DeleteCorrectionHistory(action_name='delete correction history', description='deletes the correction history of selected words'),\
+ is_after_faksimile_merged=True)
+ self._add_response_handler(AddBox(action_name='add box', description='add box with overwritten text'),\
+ is_after_faksimile_merged=True)
self._add_response_handler(SaveChanges(action_name='save changes', description='save change to line number/deletion status for word(s)' ))
self._add_response_handler(SavePositions(action_name='save positions', description='save new transkription position(s)' ))
- self._add_response_handler(AddDeletionPath(action_name='add deletion paths', description='add new deletion paths to word' ))
- self._add_response_handler(JoinDeletionPath(action_name='join deletion paths', description='join deletion paths of selected words' ))
- self._add_response_handler(RemoveDeletionPath(action_name='remove deletion paths', description='remove deletion paths of selected words' ))
- self._add_response_handler(RequestPathsNearWords(action_name='request paths near words', description='request paths near selected words' ))
+ self._add_response_handler(AddDeletionPath(action_name='add deletion paths', description='add new deletion paths to word' ),\
+ is_after_faksimile_merged=True)
+ self._add_response_handler(JoinDeletionPath(action_name='join deletion paths', description='join deletion paths of selected words' ),\
+ is_after_faksimile_merged=True)
+ self._add_response_handler(RemoveDeletionPath(action_name='remove deletion paths', description='remove deletion paths of selected words' ),\
+ is_after_faksimile_merged=True)
+ self._add_response_handler(RequestPathsNearWords(action_name='request paths near words', description='request paths near selected words' ),\
+ is_after_faksimile_merged=True)
self._add_response_handler(Reload(action_name='reload', description='reload page from file' ))
self._add_response_handler(SetTaskDone(action_name='set task done', description='reload page from file' ), add_to_do_not_send=True)
+ def _add_faksimile_image(self, page, faksimile_page):
+ """Add faksimile image to page.
+ """
+ if faksimile_page.faksimile_image.text_field is None\
+ and faksimile_page.text_field is not None:
+ faksimile_page.faksimile_image.text_field = faksimile_page.text_field
+ page.faksimile_image = faksimile_page.faksimile_image
+ page.faksimile_image.attach_object_to_tree(page.page_tree)
+ page.update_data_source(faksimile_svgFile=faksimile_page.svg_source_file)
+ if not UNITTESTING:
+ print(f'writing to {page.page_tree.docinfo.URL}')
+ save_page(page, backup=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
- def _add_response_handler(self, response_handler: ResponseHandler, add_to_do_not_send=False):
+ def _add_response_handler(self, response_handler: ResponseHandler, add_to_do_not_send=False, is_after_faksimile_merged=False):
"""Add response_handler to response_handler_dictionary.
"""
if add_to_do_not_send:
self.do_not_send.append(response_handler)
+ if is_after_faksimile_merged:
+ self.after_faksimile_merged.append(response_handler)
self.response_handler_dictionary.update({response_handler.action_name: response_handler})
def _get_response_handlers(self) ->list:
"""Return a list of response_handlers.
"""
return [ response_handler for response_handler in self.response_handler_dictionary.values()\
- if response_handler not in self.do_not_send ]
-
- def create_json_dict(self, xml_file: str, last_operation_result=None) ->dict:
+ if response_handler not in self.do_not_send\
+ and (not self.join_faksimile_positions or response_handler not in self.after_faksimile_merged) ]
+
+
+ def create_json_dict(self, xml_file: str, svg_file=None, last_operation_result=None) ->dict:
"""Return a json dict of page with information about action.
"""
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
page = Page(xml_file, add_paths_near_words=True, warn=True)
checker = CheckerHandler(page)
+ todos = checker.get_todos()
replace_ligatures(page)
- converter = JSONConverter(page)
+ faksimile_page = None
+ faksimile_source_file = None
+ if svg_file is None and page.faksimile_svgFile is not None:
+ svg_file = page.faksimile_svgFile
+ if svg_file is not None:
+ fps = FaksimilePage.get_faksimile_pages(svg_file, page_number=page.number)
+ if len(fps) > 0:
+ faksimile_page = fps[0]
+ if page.faksimile_image is None:
+ self._add_faksimile_image(page, faksimile_page)
+ if not UNITTESTING:
+ print(f'writing to {page.page_tree.docinfo.URL}')
+ save_page(page, backup=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
+ if not isfile(xml_file.replace('xml/', 'xml/merged/'))\
+ and len([ word for word in page.words if len(word.faksimile_positions) == 0 ]) > 0:
+ self.join_faksimile_positions = True
+ faksimile_source_file = svg_file
+ todos = []
+ converter = JSONConverter(page, faksimile_page=faksimile_page)
json_dict = converter.create_json_dict()
pages = []
if self.manuscript is not None and isfile(self.manuscript):
manuscript_tree = ET.parse(self.manuscript)
pages = [ p.replace('./', '') for p in manuscript_tree.xpath('//page/@output') if isfile(p) ]
action_dict = { 'target_file': xml_file,\
+ 'faksimile_source_file': faksimile_source_file,\
'pages': pages,\
'date_stamp': os.path.getmtime(xml_file),\
- 'tasks': checker.get_todos() }
+ 'join_faksimile_positions': str(self.join_faksimile_positions).lower(),\
+ 'tasks': todos }
if last_operation_result is not None:
action_dict.update({self.RESULT: last_operation_result })
if len(w) > 0:
msg = str(w[-1].message)\
if last_operation_result is None\
else last_operation_result + '\n' + str(w[-1].message)
action_dict.update({self.RESULT: msg })
response_handlers = []
for response_handler in self._get_response_handlers():
response_handlers.append(response_handler.create_json_dict())
action_dict.update({ 'response_handlers': response_handlers })
json_dict.update({ 'actions': action_dict})
return json_dict
def handle_response(self, json_dict: dict) ->dict:
"""Handle response in json_dict and return new data json_dict.
"""
if bool(json_dict.get('target_file')):
target_file = json_dict['target_file']
+ svg_file = json_dict['faksimile_source_file']\
+ if bool(json_dict.get('faksimile_source_file'))\
+ else None
if bool(json_dict.get('date_stamp')):
if json_dict['date_stamp'] == self.TIMESTAMP_NOT_SET\
or os.path.getmtime(target_file) <= json_dict['date_stamp']:
exit_code = 2
operation = 'unknown'
if bool(json_dict.get('response_handler'))\
and bool(self.response_handler_dictionary.get(json_dict['response_handler']['action_name'])):
operation = json_dict['response_handler']['action_name']
response_handler = self.response_handler_dictionary[operation]
exit_code = response_handler.handle_response(Page(target_file), json_dict)
message = f'Operation "{operation}" succeeded!' if exit_code == 0 else f'Operation "{operation}" failed'
- return self.create_json_dict(target_file, last_operation_result=message)
+ return self.create_json_dict(target_file, svg_file=svg_file, last_operation_result=message)
else:
return self.create_json_dict(target_file,\
last_operation_result=f'FAIL: file {target_file} was changed between operations!')
else:
return self.create_json_dict(target_file,\
last_operation_result='ERROR: there was no key "date_stamp" in json')
else:
return { 'actions': { self.RESULT: 'ERROR: there was no key "target_file" in json!' }}
class InteractiveShell:
def __init__(self):
self.response_handlers = []
self.response_handlers.append(SimpleJoinWords(dialog_string='specify ids of words to join [default]'))
self.response_handlers.append(RestoreBackup(response_starts_with='b', dialog_string='b=restore backup'))
self.response_handlers.append(CreateCorrectionHistory(response_starts_with='c', dialog_string='c=create correction history [+ ids]'))
self.response_handlers.append(DeleteCorrectionHistory(response_starts_with='D', dialog_string='D=delete correction history [+ ids]'))
self.response_handlers.append(ChangeDeletionStatus(response_starts_with='d', dialog_string='d=mark deleted [+ ids]'))
self.response_handlers.append(SaveChanges(response_starts_with='i', dialog_string='i=fix ids' ))
self.response_handlers.append(ChangeLine2Value(response_starts_with='l', dialog_string='l[:value]=change line to value for ids' ))
self.response_handlers.append(Reload(response_starts_with='r', dialog_string='r=reload xml file'))
self.response_handlers.append(SplitWords(response_starts_with='s', dialog_string='s=split and join word ("s splittext id")'))
self.response_handlers.append(ChangeDeletionStatus(response_starts_with='u', dialog_string='u=undelete [+ ids]'))
self.response_handlers.append(JoinWords(response_starts_with='w', dialog_string='w=join words with whitespace between them [+ ids]'))
self.response_handlers.append(ResponseHandler())
def _get_words_from_response(self, response, words) ->list:
"""Return a list of word that correspond to indices
"""
if re.match(r'\d+-\d+', response)\
or re.match(r'\d+\+', response):
index_boundaries = []
if response[-1] == '+':
index_boundaries.append(int(response[:response.index('+')]))
index_boundaries.append(index_boundaries[0]+1)
else:
index_boundaries = [ int(i) for i in response.split('-') ]
index_boundaries_length_diff = len(response.split('-')[0]) - len(response.split('-')[1])
if index_boundaries_length_diff > 0:
index_boundaries[1] = int(response.split('-')[0][0-index_boundaries_length_diff-1] + response.split('-')[1])
indices = [ i for i in range(index_boundaries[0], index_boundaries[1]+1) ]
if index_boundaries[0] > index_boundaries[1]:
indices = [ index_boundaries[0] ]
while indices[-1] > index_boundaries[1]:
indices.append(indices[-1]-1)
else:
indices = [ int(i) for i in response.split(' ') ]
result_words = []
for index in indices:
if len([ word for word in words if word.id == index ]) > 0:
result_words += [ word for word in words if word.id == index ]
return result_words
def run_interactive_editor(self, page) -> int:
"""Run interactive shell.
"""
replace_ligatures(page)
HTMLConverter(page).convert()
for response_handler in self.response_handlers: response_handler.print_dialog()
response = input('>')
for response_handler in self.response_handlers:
if response_handler.match(response):
return response_handler.handle_interactive_response(page, response, self)
def replace_ligatures(page):
"""Replace ligatures
"""
if len([ word for word in page.words if re.match(r'.*[flfi]', word.text) ]) > 0:
for word in [ word for word in page.words if re.match(r'.*[fi]', word.text) ]:
word.text = word.text.replace('fi', 'fi')
for word in [ word for word in page.words if re.match(r'.*[fl]', word.text) ]:
word.text = word.text.replace('fl', 'fl')
+ save_page(page, backup=True, attach_first=True, script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}')
def dict_contains_keys(a_dict, key_list)->bool:
"""Return whether dict a_dict contains key path given by key_list.
"""
if len(key_list) == 0:
return True
else:
if key_list[0] in a_dict.keys():
return dict_contains_keys(a_dict[key_list[0]], key_list[1:])
return False
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to fix faksimile position ->set them to their absolute value.
fixes/interactive_editor.py [OPTIONS]
a xml file about a manuscript, containing information about its pages.
a xml file about a page, containing information about svg word positions.
OPTIONS:
-h|--help show help
:return: exit code (int)
"""
try:
opts, args = getopt.getopt(argv, "h", ["help"])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
if len(args) < 1:
usage()
return 2
exit_status = 0
xml_file = args[0]
if isfile(xml_file):
counter = 0
shell = InteractiveShell()
for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK):
if not UNITTESTING:
print(Fore.CYAN + f'Processing {page.title}, {page.number} with interactive editor ...' + Style.RESET_ALL)
back_up(page, page.xml_file)
counter += 1 if shell.run_interactive_editor(page) == 0 else 0
if not UNITTESTING:
print(Style.RESET_ALL + f'[{counter} pages changed by interactive shell]')
else:
raise FileNotFoundError('File {} does not exist!'.format(xml_file))
return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: fixes/server.py
===================================================================
--- fixes/server.py (revision 106)
+++ fixes/server.py (revision 107)
@@ -1,175 +1,176 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to send xml data as json over http.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import getopt
from http.server import BaseHTTPRequestHandler, HTTPServer, SimpleHTTPRequestHandler
import http.client
import simplejson as json
from os.path import exists, isfile, isdir, dirname, basename
import cgi
import sys
from interactive_editor import ResponseOrganizer
sys.path.append('svgscripts')
from convert_wordPositions import JSONConverter
from datatypes.page import Page
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
class Server(BaseHTTPRequestHandler):
CONTENT_TYPE = 'Content-Type'
CONTENT_LENGTH = 'Content-Length'
CONFIG_FILE = '.local_variables'
XML = 'xml'
SVG = 'svg'
MANUSCRIPT = 'manuscript'
@classmethod
def get_local_file_dictionary(cls) ->dict:
"""Return a dictionary about local files with keys: XML, SVG, MANUSCRIPT.
"""
local_file_dictionary = {}
if isfile(cls.CONFIG_FILE):
with open(cls.CONFIG_FILE, 'r') as reader:
for raw_line in reader.readlines():
line = raw_line.replace('\n', '')
line_content = line.split('=')
if len(line_content) == 2\
and isfile(line_content[1]):
local_file_dictionary.update({line_content[0]: line_content[1]})
return local_file_dictionary
def _set_headers(self, response_code):
self.send_response(response_code)
self.send_header('Content-type', 'application/json')
#self.send_header('Access-Control-Allow-Credentials', 'true')
self.send_header("Cache-Control", "no-cache")
self.send_header("Access-Control-Allow-Origin", "*")
self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
self.send_header("Access-Control-Allow-Headers", "X-Requested-With")
self.send_header("Access-Control-Allow-Headers", "Content-Type")
self.end_headers()
def do_HEAD(self):
self._set_headers(200)
def do_OPTIONS(self):
"""Process OPTIONS.
"""
self.send_response(200, "ok")
self.send_header('Access-Control-Allow-Origin', '*')
self.send_header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS')
self.send_header("Access-Control-Allow-Headers", "X-Requested-With")
self.send_header("Access-Control-Allow-Headers", "Content-Type")
self.end_headers()
def do_GET(self):
"""Process GET.
"""
self._set_headers(200)
local_file_dictionary = Server.get_local_file_dictionary()
if self.XML in local_file_dictionary.keys():
+ svg_file = local_file_dictionary[self.SVG] if self.SVG in local_file_dictionary.keys() else None
response_organizer = ResponseOrganizer(local_file_dictionary.get(self.MANUSCRIPT))
- json_dict = response_organizer.create_json_dict(local_file_dictionary[self.XML])
+ json_dict = response_organizer.create_json_dict(local_file_dictionary[self.XML], svg_file=svg_file)
try:
self.wfile.write(str.encode(json.dumps(json_dict)))
except Exception:
print(json_dict)
def _parse_header(self, key) ->str:
"""Return content of header for key.
"""
headers = [ header for header in self.headers._headers if key in header ]
if len(headers) > 0:
return headers[0][1]
return ''
def do_POST(self):
"""Process POST.
"""
ctype = self._parse_header(self.CONTENT_TYPE)
if ctype != 'application/json':
length = int(self._parse_header(self.CONTENT_LENGTH))
self._send_error()
return
# read the message and convert it into a python dictionary
length = int(self._parse_header(self.CONTENT_LENGTH))
response = json.loads(self.rfile.read(length))
local_file_dictionary = Server.get_local_file_dictionary()
response_organizer = ResponseOrganizer(local_file_dictionary.get(self.MANUSCRIPT))
json_dict = response_organizer.handle_response(response)
self._set_headers(200)
self.wfile.write(str.encode(json.dumps(json_dict)))
def _send_error(self):
"""Send error msg.
"""
self._set_headers(400)
self.end_headers()
def run(port=8008):
server_address = ('', port)
httpd = HTTPServer(server_address, Server)
print(f'Starting httpd on port {port}...')
httpd.serve_forever()
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to send xml data as json over http.
fixes/server.py OPTIONS
OPTIONS:
-h|--help: show help
:return: exit code (int)
"""
try:
opts, args = getopt.getopt(argv, "h", ["help"])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help') or not args:
usage()
return 0
run()
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: tests_py2ttl/test_data/mapping_dict.xml
===================================================================
--- tests_py2ttl/test_data/mapping_dict.xml (revision 106)
+++ tests_py2ttl/test_data/mapping_dict.xml (revision 107)
@@ -1,355 +1,391 @@
tln
http://www.nie.org/ontology/nietzsche#
./tln-ontology_autogenerated.ttl
+
+ http://www.nie.org/ontology/nietzsche#ManuscriptUnity
+
+ http://www.nie.org/ontology/nietzsche#hasTitle
+ http://www.nie.org/ontology/nietzsche#hasManuscriptType
+ http://www.nie.org/ontology/nietzsche#hasPages
+ http://www.nie.org/ontology/nietzsche#hasDescription
+
+
http://www.nie.org/ontology/nietzsche#ArchivalManuscriptUnity
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasManuscriptType
- http://www.nie.org/ontology/nietzsche#hasStyles
http://www.nie.org/ontology/nietzsche#hasPages
+ http://www.nie.org/ontology/nietzsche#hasStyles
http://www.nie.org/ontology/nietzsche#hasDescription
+ http://www.nie.org/ontology/nietzsche#partsBelongToReconstructedKonvolut
http://www.nie.org/ontology/nietzsche#hasEarlierDescriptions
http://www.nie.org/ontology/nietzsche#EditorComment
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#AtypicalWriting
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#atypicalWritingHasText
http://www.nie.org/ontology/nietzsche#Path
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#Box
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#hasEarlierText
http://www.nie.org/ontology/nietzsche#Clarification
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#clarificationHasText
http://www.nie.org/ontology/nietzsche#Color
http://www.nie.org/ontology/nietzsche#colorHasName
http://www.nie.org/ontology/nietzsche#hasHexadecimalValue
http://www.nie.org/ontology/nietzsche#Text
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#Description
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#EarlierDescription
http://www.nie.org/ontology/nietzsche#textHasContent
http://www.nie.org/ontology/nietzsche#hasAuthor
http://www.nie.org/ontology/nietzsche#hasCitation
http://www.nie.org/ontology/nietzsche#textHasMarkup
http://www.nie.org/ontology/nietzsche#EditorCorrection
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#hasCorrectionText
http://www.nie.org/ontology/nietzsche#Image
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
- http://www.nie.org/ontology/nietzsche#hasTransform
+ http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#FaksimileImage
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
- http://www.nie.org/ontology/nietzsche#hasTransform
+ http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasApiurl
http://www.nie.org/ontology/nietzsche#hasThumburl
http://www.nie.org/ontology/nietzsche#hasMediumurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#PositionalObject
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#WordPosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#FaksimilePosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#Line
http://www.nie.org/ontology/nietzsche#lineHasNumber
http://www.nie.org/ontology/nietzsche#lineHasBottomValueOnTranskription
http://www.nie.org/ontology/nietzsche#lineHasTopValueOnTranskription
http://www.nie.org/ontology/nietzsche#lineHasInnerBottomValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasInnerTopValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasOuterBottomValueOnFaksimile
http://www.nie.org/ontology/nietzsche#lineHasOuterTopValueOnFaksimile
http://www.nie.org/ontology/nietzsche#isMainLine
http://www.nie.org/ontology/nietzsche#lineHasEditorComment
http://www.nie.org/ontology/nietzsche#LineContinuation
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#isLineAContinuationTo
http://www.nie.org/ontology/nietzsche#lineContinuationHasReference
http://www.nie.org/ontology/nietzsche#SimpleWord
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#SpecialWord
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#MarkForeignHands
http://www.nie.org/ontology/nietzsche#hasText
- http://www.nie.org/ontology/nietzsche#textOfForeignHands
http://www.nie.org/ontology/nietzsche#penOfForeignHands
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
+ http://www.nie.org/ontology/nietzsche#textOfForeignHands
http://www.nie.org/ontology/nietzsche#Page
http://www.nie.org/ontology/nietzsche#hasNumber
http://www.nie.org/ontology/nietzsche#hasOrientation
http://www.nie.org/ontology/nietzsche#hasLines
http://www.nie.org/ontology/nietzsche#hasMarkForeignHands
http://www.nie.org/ontology/nietzsche#hasWords
http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
http://www.nie.org/ontology/nietzsche#hasFaksimileImage
http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField
http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField
http://www.nie.org/ontology/nietzsche#hasSvgImage
+
+ http://www.nie.org/ontology/nietzsche#NonExistentPage
+
+ http://www.nie.org/ontology/nietzsche#hasNumber
+ http://www.nie.org/ontology/nietzsche#hasOrientation
+ http://www.nie.org/ontology/nietzsche#hasLines
+ http://www.nie.org/ontology/nietzsche#hasMarkForeignHands
+ http://www.nie.org/ontology/nietzsche#hasWords
+ http://www.nie.org/ontology/nietzsche#hasWordDeletionPaths
+ http://www.nie.org/ontology/nietzsche#hasWordInsertionMarks
+ http://www.nie.org/ontology/nietzsche#hasStatus
+ http://www.nie.org/ontology/nietzsche#hasFaksimileImage
+ http://www.nie.org/ontology/nietzsche#pageIsOnSVGTextField
+ http://www.nie.org/ontology/nietzsche#pageIsOnFaksimileTextField
+ http://www.nie.org/ontology/nietzsche#hasSvgImage
+
+
+
+ http://www.nie.org/ontology/nietzsche#ReconstructedKonvolut
+
+ http://www.nie.org/ontology/nietzsche#hasTitle
+ http://www.nie.org/ontology/nietzsche#hasManuscriptType
+ http://www.nie.org/ontology/nietzsche#hasPages
+ http://www.nie.org/ontology/nietzsche#hasDescription
+
+
http://www.nie.org/ontology/nietzsche#Reference
http://www.nie.org/ontology/nietzsche#firstLineOfReference
http://www.nie.org/ontology/nietzsche#lastLineOfReference
http://www.nie.org/ontology/nietzsche#wordReference
http://www.nie.org/ontology/nietzsche#IsUncertain
http://www.nie.org/ontology/nietzsche#hasTitle
http://www.nie.org/ontology/nietzsche#hasPageNumber
http://www.nie.org/ontology/nietzsche#SVGImage
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasFileName
- http://www.nie.org/ontology/nietzsche#hasTransform
+ http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasPrimaryurl
http://www.nie.org/ontology/nietzsche#hasSecondaryurl
http://www.nie.org/ontology/nietzsche#hasTextField
http://www.nie.org/ontology/nietzsche#StandoffTag
http://www.nie.org/ontology/nietzsche#standoffTagHasStartIndex
http://www.nie.org/ontology/nietzsche#standoffTagHasEndIndex
http://www.nie.org/ontology/nietzsche#standoffTagHasCSS
http://www.nie.org/ontology/nietzsche#TextConnectionMark
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#textConnectionMarkHasTextSource
http://www.nie.org/ontology/nietzsche#TextField
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#TranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#UncertainDecipherment
http://www.nie.org/ontology/nietzsche#isUncertain
http://www.nie.org/ontology/nietzsche#hasComment
http://www.nie.org/ontology/nietzsche#Word
http://www.nie.org/ontology/nietzsche#hasText
http://www.nie.org/ontology/nietzsche#hasEditedText
http://www.nie.org/ontology/nietzsche#wordHasWordParts
http://www.nie.org/ontology/nietzsche#wordBelongsToLine
http://www.nie.org/ontology/nietzsche#hasTranskriptionPosition
http://www.nie.org/ontology/nietzsche#hasFaksimilePosition
http://www.nie.org/ontology/nietzsche#wordHasStyle
http://www.nie.org/ontology/nietzsche#overwritesWord
http://www.nie.org/ontology/nietzsche#isTransformationOfWord
http://www.nie.org/ontology/nietzsche#isExtensionOfWord
http://www.nie.org/ontology/nietzsche#isDeletionOfWord
http://www.nie.org/ontology/nietzsche#isClarificationOfWord
http://www.nie.org/ontology/nietzsche#wordHasEarlierVersion
http://www.nie.org/ontology/nietzsche#wordHasCorrection
http://www.nie.org/ontology/nietzsche#wordIsDeletedByPath
http://www.nie.org/ontology/nietzsche#wordHasEditorComment
http://www.nie.org/ontology/nietzsche#WordDeletionPath
http://www.nie.org/ontology/nietzsche#hasDAttribute
http://www.nie.org/ontology/nietzsche#WordInsertionMark
http://www.nie.org/ontology/nietzsche#hasHeight
http://www.nie.org/ontology/nietzsche#hasWidth
http://www.nie.org/ontology/nietzsche#hasLeft
http://www.nie.org/ontology/nietzsche#hasTop
http://www.nie.org/ontology/nietzsche#hasBottom
http://www.nie.org/ontology/nietzsche#hasTransform
http://www.nie.org/ontology/nietzsche#hasMarkType
http://www.nie.org/ontology/nietzsche#hasSymbolId
http://www.nie.org/ontology/nietzsche#hasNextWord
http://www.nie.org/ontology/nietzsche#hasPreviousWord
http://www.nie.org/ontology/nietzsche#wordInsertionMarkBelongsToLine
xml-dictionary
- 2020-12-07 10:55:12
+ 2021-02-26 15:46:21
Index: svgscripts/datatypes/mark_foreign_hands.py
===================================================================
--- svgscripts/datatypes/mark_foreign_hands.py (revision 106)
+++ svgscripts/datatypes/mark_foreign_hands.py (revision 107)
@@ -1,148 +1,155 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the mark for text by some foreign hand.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from .matrix import Matrix
from .special_word import SpecialWord
+from .standoff_tag import StandoffTag
+from .text import Text
class MarkForeignHands(SpecialWord):
"""
This class represents the mark for text by some foreign hand.
"""
XML_TAG = 'mark-foreign-hands'
XML_SUB_TAG = 'text'
CLASS_MARK = '$'
REPLACE_DICT = { '+': 'x' }
- def __init__(self, id=0, line_number=-1, text=CLASS_MARK, foreign_hands_text='', pen='', transkription_positions=[], faksimile_positions=[]):
+ def __init__(self, id=0, line_number=-1, text=CLASS_MARK, foreign_hands_text=None, pen='', transkription_positions=[], faksimile_positions=[]):
super(MarkForeignHands, self).__init__(id=id, text=text, line_number=line_number,\
transkription_positions=transkription_positions, faksimile_positions=faksimile_positions)
self.foreign_hands_text = foreign_hands_text
self.pen = pen
def add_content(self, node):
"""Adds content to MarkForeignHands.
"""
- self.foreign_hands_text = node.text
self.pen = node.get('pen')
+ if node.text is not None:
+ self.foreign_hands_text = Text(content=node.text)
+ else:
+ standoff_markups = [ StandoffTag.create_cls_from_node(stf) for stf in node.xpath('./' + '|./'.join(StandoffTag.MARKUP_STYLES))]
+ content = node.xpath('./content')[0].text if len(node.xpath('./content')) > 0 else None
+ self.foreign_hands_text = Text(content=content, standoff_markups=standoff_markups, tag='content')
def attach_word_to_tree(self, target_tree):
"""Attaches MarkForeignHands to tree target_tree.
"""
node = super(MarkForeignHands,self).attach_word_to_tree(target_tree)
- if self.foreign_hands_text != '':
+ if self.foreign_hands_text is not None:
content_node = ET.SubElement(node, MarkForeignHands.XML_SUB_TAG)
- content_node.text = self.foreign_hands_text
- if self.pen != '':
+ content_node.text = self.foreign_hands_text if type(self.foreign_hands_text) == str else self.foreign_hands_text.content
+ if self.pen is not None and self.pen != '':
content_node.set('pen', self.pen)
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = super(MarkForeignHands,cls).get_semantic_dictionary()
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('foreign_hands_text',\
- str, cardinality=1, name='textOfForeignHands', label='text traces of some foreign hand'))
+ Text, cardinality=1, name='textOfForeignHands', label='text traces of some foreign hand'))
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('pen',\
str, cardinality=1, cardinality_restriction='maxCardinality',\
name='penOfForeignHands', label='pen used to write text by some foreign hand'))
return cls.return_dictionary_after_updating_super_classes(dictionary)
@classmethod
def get_special_char_list(cls):
"""Returns a list of the chars that define this special word.
"""
return [ cls.CLASS_MARK ]
@staticmethod
def find_content(list_of_special_words, transkription_field, svg_tree, style_dict=None, italic_classes=None, SonderzeichenList=None, marginals_extra=False, set_to_text_field_zero=True):
    """Find content for the MarkForeignHands.

    Scans the margin field of the svg document for text nodes that are
    vertically close to each mark and fills in, on each mark (mutated in
    place): ``foreign_hands_text`` (non-italic text, with Sonderzeichen
    replaced) and ``pen`` (text from the first italic node onwards).

    :param list_of_special_words: MarkForeignHands instances to fill in.
    :param transkription_field: field used to select margin nodes and,
        if set_to_text_field_zero, as coordinate offset for is_close.
    :param svg_tree: parsed svg document (lxml ElementTree).
    :param style_dict: style class -> css property dict; used to derive
        italic_classes and SonderzeichenList when those are empty.
    :param italic_classes: style classes rendered italic (pen text).
    :param SonderzeichenList: style classes of special characters.
    :param marginals_extra: marginals are on an extra page.
    :param set_to_text_field_zero: pass transkription_field to is_close.
    """
    if style_dict is None:
        style_dict = {}
    if italic_classes is None:
        italic_classes = []
    if SonderzeichenList is None:
        SonderzeichenList = []
    if len(style_dict) > 0:
        # derive the italic / Sonderzeichen style classes from the style dict
        # only when the caller did not supply them
        if len(italic_classes) == 0:
            italic_classes = [ key for key in style_dict\
                    if bool(style_dict[key].get('font-family')) and style_dict[key]['font-family'].endswith('Italic') ]
        if len(SonderzeichenList) == 0:
            SonderzeichenList = [ key for key in style_dict\
                    if bool(style_dict[key].get('font-family')) and style_dict[key]['font-family'].startswith('Sonderzeichen') ]
    # text nodes that lie in the margin field of the page
    nodes_in_margin_field = [ item for item in filter(lambda x: Matrix.IS_IN_MARGIN_FIELD(x.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra),\
            svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
    provide_tf = transkription_field if set_to_text_field_zero else None
    for mark_foreign_hands in list_of_special_words:
        # nodes on (roughly) the same vertical position as the mark, left to right
        relevant_nodes = [ node for node in nodes_in_margin_field\
                if is_close((mark_foreign_hands.transkription_positions[0].bottom+mark_foreign_hands.transkription_positions[0].top)/2,\
                node.get('transform'), transkription_field=provide_tf) ]
        relevant_nodes = sorted(relevant_nodes, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
        italic_found = False
        mark_foreign_hands_text = ''
        pen = ''
        for node in relevant_nodes:
            if len(node.getchildren()) == 0:
                if italic_found:
                    pen += node.text
                elif any(style in italic_classes for style in node.get('class').split(' ')):
                    # first italic node starts the pen description
                    italic_found = True
                    pen = node.text
                else:
                    mark_foreign_hands_text += get_text_from_node(node, SonderzeichenList)
            else:
                # node has tspan children: apply the same logic per tspan
                for tspan in node.getchildren():
                    if italic_found:
                        pen += tspan.text
                    elif any(style in italic_classes for style in tspan.get('class').split(' ')):
                        italic_found = True
                        pen = tspan.text
                    else:
                        mark_foreign_hands_text += get_text_from_node(tspan, SonderzeichenList)
        mark_foreign_hands.foreign_hands_text = mark_foreign_hands_text
        mark_foreign_hands.pen = pen
def get_text_from_node(node, SonderzeichenList):
    """Return the text of node.

    If the node carries a style class listed in SonderzeichenList and its
    text has a replacement in MarkForeignHands.REPLACE_DICT, the
    replacement is returned instead of the raw text.
    """
    node_styles = node.get('class').split(' ')
    is_sonderzeichen = any(style in SonderzeichenList for style in node_styles)
    if is_sonderzeichen and bool(MarkForeignHands.REPLACE_DICT.get(node.text)):
        return MarkForeignHands.REPLACE_DICT[node.text]
    return node.text
def is_close(mark_foreign_hands_position, matrix_string, transkription_field=None):
    """Return True if mark_foreign_hands_position lies within
    THRESHOLD_Y of matrix.getY().
    """
    THRESHOLD_Y = 4
    matrix = Matrix(transform_matrix_string=matrix_string, transkription_field=transkription_field)
    delta = mark_foreign_hands_position - matrix.getY()
    return -THRESHOLD_Y < delta < THRESHOLD_Y
Index: svgscripts/datatypes/standoff_tag.py
===================================================================
--- svgscripts/datatypes/standoff_tag.py (revision 106)
+++ svgscripts/datatypes/standoff_tag.py (revision 107)
@@ -1,151 +1,151 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the standoff markup of a text.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
import re
import sys
from .attachable_object import AttachableObject
sys.path.append('py2ttl')
from class_spec import SemanticClass
class StandoffTag(AttachableObject,SemanticClass):
    """
    This class represents the standoff markup of a text.
    """
    # revision 107 adds 'delete' (the block contained raw diff marker lines)
    MARKUP_STYLES = [ 'bold', 'italic', 'delete' ]
    RDFS_SUBCLASSOF_LIST = ['http://www.nie.org/ontology/standoff#StandoffMarkup']
    # style-dict entry that decides whether a style class carries markup
    RELEVANT_STYLE_KEY = 'font-family'
    RELEVANT_CONTENT_STARTSWITH = 'Frutiger-'
    RELEVANT_PATTERN = re.compile('.*(Italic|Bold)$')
    RELEVANT_SUB_PATTERN = re.compile('Frutiger-(Light)*')
    STOFF_HAS_CSS_URL_STRING = 'http://www.nie.org/ontology/standoff#hasCSS'
    STOFF_HAS_START_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasStartIndex'
    STOFF_HAS_END_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasEndIndex'
    # NOTE(review): the keys of this dictionary were garbled to empty strings in
    # the source (they would have collapsed into a single dict entry); they are
    # reconstructed here as the HTML tags matching the markup values — TODO confirm.
    HTML_TAG_DICTIONARY = { '<i>': 'italic', '<b>': 'bold', '<del>': 'delete' }
    CSS_DICTIONARY = { 'bold': 'font-weight:bold;',
            'italic': 'font-style: italic;',
            'delete': 'text-decoration:line-through;' }

    def __init__(self, markup: str, startIndex: int, endIndex: int, id=0):
        self.id = str(id)
        # css for this markup; None if markup is not in CSS_DICTIONARY
        self.css_string = self.CSS_DICTIONARY.get(markup)
        self.markup = markup
        self.startIndex = startIndex
        self.endIndex = endIndex

    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.

        Reuses an existing node with the same markup tag and id if present,
        otherwise creates a new subelement.
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        # evaluate the xpath once (the original ran the same query twice)
        matching_nodes = target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)
        obj_node = matching_nodes[0]\
                if len(matching_nodes) > 0\
                else ET.SubElement(target_tree, self.markup)
        obj_node.set('id', self.id)
        obj_node.set('start', str(self.startIndex))
        obj_node.set('end', str(self.endIndex))

    @classmethod
    def create_cls(cls, start_index, end_index, style_string, page=None, style_dict=None):
        """Creates a StandoffTag from a style_string.
        :return: a list of (datatypes.standoff_tag) StandoffTag
        """
        if page is not None:
            style_dict = cls.create_relevant_style_dictionary(page)
        standoff_tags = []
        # guard BEFORE touching style_dict: the original computed relevant_keys
        # first and raised AttributeError when style_dict was None
        if style_dict is None or len(style_dict) == 0:
            return standoff_tags
        relevant_keys = [ key for key in set(style_string.split(' '))\
                if key in style_dict.keys() ]
        for relevant_key in relevant_keys:
            font_family = style_dict[relevant_key][cls.RELEVANT_STYLE_KEY]
            if re.match(cls.RELEVANT_PATTERN, font_family):
                # e.g. 'Frutiger-LightItalic' -> 'italic'
                markup = re.sub(cls.RELEVANT_SUB_PATTERN, '', font_family).lower()
                standoff_tags.append(cls(markup, start_index, end_index))
        return standoff_tags

    @classmethod
    def create_cls_from_node(cls, node):
        """Creates a StandoffTag from a node.
        :return: (datatypes.standoff_tag) StandoffTag
        """
        return cls(node.tag, int(node.get('start')), int(node.get('end')), id=node.get('id'))

    @classmethod
    def create_relevant_style_dictionary(cls, page):
        """Return a style dictionary that contains only relevant keys and contents.
        """
        return { key: key_dict for key, key_dict in page.style_dict.items()\
                if cls.RELEVANT_STYLE_KEY in key_dict.keys()\
                and key_dict[cls.RELEVANT_STYLE_KEY].startswith(cls.RELEVANT_CONTENT_STARTSWITH) }

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.
        """
        properties = {}
        properties.update(cls.create_semantic_property_dictionary('startIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_START_INDEX,\
                name='standoffTagHasStartIndex', label='standoff tag has a start index', comment='Connects a standoff tag with its start index.'))
        properties.update(cls.create_semantic_property_dictionary('endIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_END_INDEX,\
                name='standoffTagHasEndIndex', label='standoff tag has a end index', comment='Connects a standoff tag with its end index.'))
        properties.update(cls.create_semantic_property_dictionary('css_string', str,\
                subPropertyOf=cls.STOFF_HAS_CSS_URL_STRING,\
                name='standoffTagHasCSS', label='standoff tag has css', comment='Connects a standoff tag with CSS style.'))
        dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
        return cls.return_dictionary_after_updating_super_classes(dictionary)

    def is_joinable(self, other):
        """Return true if self and other have same markup and self.endIndex == other.startIndex.
        """
        return self.markup == other.markup and self.endIndex == other.startIndex

    def join(self, other):
        """Join self with other.
        """
        self.endIndex = other.endIndex

    def join_list(self, others):
        """Join all others that are joinable, return remaining others as a list.
        """
        unjoinable_others = []
        for other in others:
            if self.is_joinable(other):
                self.join(other)
            else:
                unjoinable_others.append(other)
        return unjoinable_others
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py (revision 106)
+++ svgscripts/datatypes/word.py (revision 107)
@@ -1,895 +1,905 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
import inspect
from lxml import etree as ET
from operator import attrgetter
import re
import string
import sys
import warnings
from .box import Box
from .editor_comment import EditorComment
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .style import Style
from .word_deletion_path import WordDeletionPath
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess
# matches a word consisting of exactly one punctuation char (incl. en dash)
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)

def execute_function_on_parts(word_parts, func_name):
    """Execute function on parts and add those parts instead of original word to word_parts.

    :param word_parts: list of word objects; mutated words have their
        word_parts spliced into the returned list in their place.
    :param func_name: name of the zero-argument method to call on each word.
    :return: new word_parts, output from func (of the last word; None for
        an empty list — the original raised NameError in that case)
    """
    copy_parts = word_parts[:]
    output = None
    for word in word_parts:
        # getattr instead of eval(): no code execution from a string
        output = getattr(word, func_name)()
        if len(word.word_parts) > 0:
            # replace the word by its parts, preserving order
            for part_word in word.word_parts:
                copy_parts.insert(copy_parts.index(word), part_word)
            copy_parts.remove(word)
            word.word_parts = []
    return copy_parts, output
def update_transkription_position_ids(word):
    """Renumber ids of word's parts and transkription positions.

    Word part ids are renumbered sequentially only when they contain
    duplicates; transkription positions are renumbered by ascending left
    coordinate, and their has_box/deleted flags are reset.
    """
    part_ids = [part.id for part in word.word_parts]
    if len(set(part_ids)) != len(part_ids):
        # duplicate ids: renumber by list position
        for new_id, part in enumerate(word.word_parts):
            part.id = new_id
    ordered_positions = sorted(word.transkription_positions, key=lambda tp: tp.left)
    for new_id, transkription_position in enumerate(ordered_positions):
        transkription_position.id = new_id
        transkription_position.has_box = None
        transkription_position.deleted = False
class Word(SimpleWord):
    """
    This class represents a word.
    """
    # attribute names copied from one word to another
    COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ]
    # source attribute -> target list attribute when appending properties
    APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' }
    # xml tag of the debug data container
    DATA = 'debug-data'
    RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText']
    XML_TAG = 'word'
    XML_EARLIER_VERSION = 'earlier-version'
    XML_OVERWRITES = 'overwrites'
    # correction property name -> xml attribute name used for (de)serialization
    XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\
            'isDeletionOfWord': 'deletesEarlierPart',\
            'isExtensionOfWord': 'extendsEarlierVersion',\
            'isTransformationOfWord': 'transformsEarlierPart' }
def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None):
    """Initialize a Word.

    :param id: word id within its page
    :param text: word text; derived from transkription positions if empty
    :param line_number: line of the word, -1 if unknown
    :param deleted: word has been deleted by the author
    :param transkription_positions: positions on the transkription
    :param faksimile_positions: positions on the faksimile
    :param word_part_objs: raw svg word part dictionaries
    :param word_parts: sub-words this word consists of
    :param writing_process_id: id of the writing process, -1 if unknown
    :param earlier_version: earlier version of this word
    :param box_paths: unused here — TODO confirm it can be dropped from the signature
    :param styles: list of Style objects of this word
    """
    super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
            faksimile_positions=faksimile_positions)
    self.corrections = []              # word parts that correct an earlier version
    self.deleted = deleted
    self.deletion_paths = []           # paths that strike through this word
    self.deletion_paths_near_word = []
    self.debug_container = {}
    self.debug_msg = None
    self.earlier_version = earlier_version
    self.edited_text = None            # text after automatic edits (e.g. hyphen removal)
    self.editor_comment = None
    # correction relations, serialized via XML_CORRECTION_DICT
    self.isClarificationOfWord = None
    self.isDeletionOfWord = None
    self.isExtensionOfWord = None
    self.isTransformationOfWord = None
    # fall back to the text of the transkription positions if no text was given
    if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0:
        self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ])
    self.overwrites_word = None
    self.process_flags = []
    self.styles = styles\
            if styles is not None\
            else []
    self.verified = None
    self.writing_process_id = writing_process_id
    self.writing_processes = []
    self.word_insertion_mark = None
    self.word_box = None
    self.word_parts = word_parts if word_parts is not None else []
    self.word_part_objs = word_part_objs if word_part_objs is not None else []
def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Add a word deletion path to word.

    Delegates to word_parts when present; otherwise, if the word is marked
    deleted, collects the deletion paths that intersect the word's
    transkription positions into self.deletion_paths.

    :param deletion_paths: candidate deletion paths of the page
    :param tr_xmin: x offset of the transkription field
    :param tr_ymin: y offset of the transkription field
    """
    if len(self.word_parts) > 0:
        for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
    elif self.deleted:
        index = 0
        # try successive transkription positions until one of them intersects a path
        while len(self.deletion_paths) == 0 and index < len(self.transkription_positions):
            # use positional word parts only if their left edge is close (<10) to the position's
            include_pwps = (len(self.transkription_positions[index].positional_word_parts) > 0
                    and abs(self.transkription_positions[index].left-self.transkription_positions[index].positional_word_parts[0].left) < 10)
            word_path = Path.create_path_from_transkription_position(self.transkription_positions[index],\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin, include_pwps=include_pwps)
            # keep only new, intersecting paths
            self.deletion_paths += [ deletion_path for deletion_path in deletion_paths\
                    if not Path.is_path_contained(self.deletion_paths, deletion_path)\
                    and deletion_path.do_paths_intersect(word_path) ]
            index += 1
def attach_word_to_tree(self, target_tree):
    """Attaches word to tree target_tree.

    Serializes this word, its parts, earlier version, overwritten word,
    box, deletion paths and correction flags as xml under target_tree.

    :return: the created word node (lxml element)
    """
    word_node = super(Word,self).attach_word_to_tree(target_tree)
    if self.deleted is not None:
        word_node.set('deleted', str(self.deleted).lower())
    if self.verified is not None:
        word_node.set('verified', str(self.verified).lower())
    if self.edited_text is not None:
        word_node.set('edited-text', self.edited_text)
    if self.editor_comment is not None:
        self.editor_comment.attach_object_to_tree(word_node)
    if self.writing_process_id > -1:
        word_node.set('writing-process-id', str(self.writing_process_id))
    if len(self.process_flags) > 0:
        word_node.set('process-flags', ' '.join(self.process_flags))
    # word parts are renumbered by position before serialization
    for index, word_part in enumerate(self.word_parts):
        word_part.id = index
        word_part.attach_word_to_tree(word_node)
    if self.earlier_version is not None:
        earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
        self.earlier_version.attach_word_to_tree(earlier_node)
    # overwritten words without transkription positions are not serialized
    if self.overwrites_word is not None\
            and len(self.overwrites_word.transkription_positions) > 0:
        overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES)
        self.overwrites_word.attach_word_to_tree(overwrite_node)
    if self.word_box is not None:
        self.word_box.attach_object_to_tree(word_node)
    # corrections are stored as the ids of the correcting word parts
    if len(self.corrections) > 0:
        word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ])))
    for deletion_id, deletion_path in enumerate(self.deletion_paths):
        deletion_path.id = deletion_id
        deletion_path.tag = WordDeletionPath.XML_TAG
        deletion_path.attach_object_to_tree(word_node)
    # correction relations become boolean attributes (see XML_CORRECTION_DICT)
    for key in self.XML_CORRECTION_DICT.keys():
        if self.__dict__[key] is not None:
            word_node.set(self.XML_CORRECTION_DICT[key], 'true')
    return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
    """Return True if the writing process ids involved are not all identical.

    With include_parts=True (and existing word parts), the word parts'
    ids are compared; otherwise those of the transkription positions.
    """
    if include_parts and len(self.word_parts) > 0:
        part_ids = { part.writing_process_id for part in self.word_parts }
        return len(part_ids) > 1
    position_ids = { tp.writing_process_id for tp in self.transkription_positions }
    return len(position_ids) > 1
def set_parent_word_writing_process_id(self):
    """Set writing_process_id for parent word.

    If the word parts' first transkription positions carry more than one
    distinct style, the parent id becomes the maximum of those styles'
    writing_process_ids; it is incremented once more if the styles still
    differ when their writing process ids are ignored.
    """
    ids = set(word.transkription_positions[0].style for word in self.word_parts\
            if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None)
    if len(ids) > 1:
        self.writing_process_id = max([style.writing_process_id for style in ids])
        # styles differ beyond their writing process id -> bump the id once more
        if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\
                for word in self.word_parts\
                if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\
                > 1:
            self.writing_process_id += 1
@classmethod
def create_cls(cls, word_node):
    """Creates a word from a (lxml.Element) node.
    [:return:] Word

    NOTE: ``cls`` is rebound to the instance returned by the super-class
    factory; every following assignment is on that instance, and nested
    ``cls.create_cls(...)`` calls are instance-bound class method calls.
    """
    cls = super(Word,cls).create_cls(word_node)
    cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
    cls.split_strings = None
    cls.join_string = word_node.get('join')
    if bool(word_node.get('split')):
        cls.split_strings = word_node.get('split').split(' ')
        # the split attribute must reproduce the word's text exactly
        if ''.join(cls.split_strings) != cls.text:
            error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
                    format(word_node.getroottree().docinfo.URL, str(cls.id))\
                    + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
                    + 'Text attribute: "{0}".\n'.format(cls.text)
            raise Exception(error_msg)
    # tri-state flags: True/False when the attribute is present, else None
    cls.verified = word_node.get('verified') == 'true'\
            if bool(word_node.get('verified')) else None
    cls.deleted = word_node.get('deleted') == 'true'\
            if bool(word_node.get('deleted')) else None
    cls.edited_text = word_node.get('edited-text')
    cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\
            if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None
    cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ]
    # the corrections attribute stores indices into word_parts
    if bool(word_node.get('corrections')):
        for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
            if index < len(cls.word_parts):
                cls.corrections.append(cls.word_parts[index])
    cls.earlier_version = None
    if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0:
        cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0]
    # boolean correction attributes -> instance flags (see XML_CORRECTION_DICT)
    for key_value in cls.XML_CORRECTION_DICT.values():
        if word_node.get(key_value) == 'true':
            cls.__dict__[key_value] = True
    if cls.earlier_version is not None:
        # replace the boolean flags on the word parts by references to the
        # earlier version (or its parts / this word), depending on the relation
        for word_part in cls.word_parts:
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\
                        and len(cls.word_parts) <= len(cls.earlier_version.word_parts):
                    try:
                        word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id]
                    except Exception:
                        msg = f'{cls.id} {cls.text}: {word_part.id}'
                        raise Exception(msg)
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                    word_part.__dict__[key] = cls.earlier_version
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                    word_part.__dict__[key] = cls
    cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\
            if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\
            else None
    cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\
            if len(word_node.xpath('./' + Box.XML_TAG)) > 0\
            else None
    cls.deletion_paths = [ Path(node=node) for node in word_node.xpath(f'./{WordDeletionPath.XML_TAG}') ]
    cls.process_flags = word_node.get('process-flags').split(' ')\
            if bool(word_node.get('process-flags'))\
            else []
    return cls
@classmethod
def join_words(cls, list_of_words, add_white_space_between_words=False):
    """Creates a word from a list of words.

    Words that have word_parts are replaced in place by their parts
    before joining. Returns the single word for a one-element list and
    None for an empty list.
    [:return:] Word
    """
    if len(list_of_words) > 1:
        # the joined word counts as deleted only if every input word is deleted
        deleted = True in [ word.deleted for word in list_of_words ]\
                and len(set([ word.deleted for word in list_of_words ])) == 1
        line_number = list_of_words[0].line_number\
                if len(set([ word.line_number for word in list_of_words ])) == 1\
                else -1
        # iterate over a snapshot: list_of_words is mutated in the loop body,
        # and removing from the list being iterated skips the following element
        for word in list_of_words[:]:
            if len(word.word_parts) > 0:
                index = list_of_words.index(word)
                list_of_words.remove(word)
                for part_word in reversed(word.word_parts):
                    list_of_words.insert(index, part_word)
        new_word_text = ''.join([word.text for word in list_of_words])\
                if not add_white_space_between_words\
                else ' '.join([word.text for word in list_of_words])
        new_word = cls(id=list_of_words[0].id, text=new_word_text,\
                line_number=line_number, deleted=deleted, word_parts=list_of_words)
        # a trailing '-'/'=' on a non-final part marks a line break:
        # remove it in the edited text
        if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]:
            change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0]
            new_word.edited_text = new_word.text.replace(change_text, change_text[:-1])
        for id, word in enumerate(new_word.word_parts): word.id = id
        return new_word
    if len(list_of_words) > 0:
        return list_of_words[0]
    else:
        return None
def create_earlier_version(self, root_word=None, id=0):
    """Create an earlier version of word.

    Recursively builds the earlier versions of the word parts, records
    correction relations (deletion/transformation/extension) on the
    parts, and returns a new Word representing the earlier state.

    :param root_word: the top-level word of the recursion (defaults to self)
    :param id: id of the created earlier-version word
    :return: Word
    """
    if root_word is None:
        root_word = self
        root_word.set_parent_word_writing_process_id()
    word_parts = []
    non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\
            if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ]
    non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts)
    # if every non-punctuation part is deleted, the whole word counts as deleted
    # and the parts themselves are reset to not-deleted
    if non_single_punctuation_word_parts_length > 0\
            and len([ word_part for word_part in non_single_punctuation_word_parts\
            if word_part.deleted ])\
            == non_single_punctuation_word_parts_length:
        self.deleted = True
        for word_part in non_single_punctuation_word_parts: word_part.deleted = False
    for id, word_part in enumerate(self.word_parts):
        earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id)
        if word_part.deleted:
            # deleted part: earlier version keeps the part, current word corrects it
            word_part.isDeletionOfWord = earlierWordPart
            word_parts.append(earlierWordPart)
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        elif word_part.overwrites_word is not None\
                and ((len(word_part.transkription_positions) > 0\
                and word_part.overwrites_word.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style\
                != word_part.overwrites_word.transkription_positions[0].style)
                or word_part.word_box.earlier_version):
            # NOTE(review): word_part.word_box is assumed non-None here — TODO confirm
            # overwritten word with a different style: it belongs to the earlier version
            word_part.overwrites_word.id = word_part.id
            word_parts.append(word_part.overwrites_word)
            word_part.isTransformationOfWord = word_part.overwrites_word
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        elif root_word.writing_process_id > -1\
                and (len(word_part.transkription_positions) > 0\
                and word_part.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style.writing_process_id\
                == root_word.writing_process_id):
            # part written in the latest writing process extends the earlier version
            word_part.extendsEarlierVersion = True
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        else:
            # NOTE(review): word_part.deleted is False on this path (the first
            # branch catches deleted parts), so the nested if looks unreachable
            if word_part.deleted:
                word_part.isDeletionOfWord = earlierWordPart
                word_parts.append(earlierWordPart)
                if word_part not in self.corrections:
                    self.corrections.append(word_part)
            else:
                word_parts.append(earlierWordPart)
    text = ''.join([ word.text for word in word_parts ])\
            if len(word_parts) > 0\
            else self.text
    # a single part is merged into the word itself
    if len(word_parts) == 1:
        self.transkription_positions += word_parts[0].transkription_positions
        self.faksimile_positions += word_parts[0].faksimile_positions
        word_parts = []
    new_transkription_positions = copy.deepcopy(self.transkription_positions)
    if len(self.transkription_positions) > 0\
            and self.transkription_positions[0].style is not None:
        writing_process_id = self.transkription_positions[0].style.writing_process_id
        for new_tp in new_transkription_positions:
            new_tp.style.writing_process_id = writing_process_id
    return Word(id=id, text=text, transkription_positions=new_transkription_positions,\
            faksimile_positions=self.faksimile_positions, line_number=self.line_number,\
            word_parts=word_parts)
def create_correction_history(self, page=None, box_style=None):
    """Create correction history.

    Builds self.overwrites_word from the word box (if any), recurses into
    the word parts, and creates self.earlier_version when there are
    corrections; sets edited_text when deletion status is mixed.

    :param page: page used to derive the box text style
    :param box_style: style to use for the overwritten word instead of
        deriving it from the page
    """
    if self.word_box is not None:
        manuscript = self.transkription_positions[0].style.manuscript\
                if len(self.transkription_positions) > 0\
                and self.transkription_positions[0].style is not None\
                else None
        style = Style()
        if box_style is not None:
            style = box_style
        if page is not None:
            style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript)
            # map the box's font size keys to a writing stage
            for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]:
                style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
        transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions)
        for transkription_position in transkription_positions:
            transkription_position.style = style
        self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\
                line_number=self.line_number)
    for word_part in self.word_parts:
        word_part.create_correction_history(page=page, box_style=box_style)
    if len(self.word_parts) > 0:
        earlier_version = self.create_earlier_version()
        extending_words = self._get_parts_with_property_key('extendsEarlierVersion')
        if len(extending_words) > 0:
            for word in extending_words:
                word.isExtensionOfWord = earlier_version
        # some parts deleted, others not: the edited text drops the deleted parts
        if self.has_mixed_status('deleted', include_parts=True):
            self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ])
        if len(self.corrections) > 0:
            self.earlier_version = earlier_version
@staticmethod
def CREATE_WORD(word_node=None, page=None, word_part_objs=None, id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
    """Creates a word from a (lxml.Element) node or word_part_objs.

    :param word_node: xml node to deserialize the word from (takes precedence)
    :param page: page providing style information for geometry
    :param word_part_objs: raw svg word part dicts ('x', 'y', 'text', 'class')
    :param id, height, endX, endSign, matrix, line_number, debug_msg:
        geometry/debug data used when building from word_part_objs
    [:return:] Word
    """
    # None-sentinel instead of the original mutable default argument []
    if word_part_objs is None:
        word_part_objs = []
    if word_node is not None: # init word from xml node
        id = int(word_node.get('id'))
        line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
        text = word_node.get('text')
        deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
        transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
        faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
        word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
                if len(word_node.findall('.//' + Word.DATA)) > 0\
                else [ item.attrib for item in word_node.findall('.//part')]
        return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
                faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
    elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
        WIDTH = 5
        TOPCORRECTION = 2.0
        FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
        x = round(float(word_part_objs[0]['x']), 3)
        if(page is not None and bool(page.style_dict)):
            HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
            style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
            biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
            height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
            TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
            if endSign is not None and '%' in endSign:
                lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
                        for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
                        if bool(page.style_dict[key].get('font-size'))]
                lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
                endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
            # NOTE(review): this branch is unreachable — its condition duplicates the
            # one above; possibly a plain 'elif endSign is not None:' was intended.
            elif endSign is not None and '%' in endSign:
                endX = float(endX) + WIDTH
        bottom = round(float(word_part_objs[0]['y']), 3)
        y = round(bottom - height + TOPCORRECTION, 3)
        width = round(float(endX) - x, 3)
        transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
        text = ''.join([ dict['text'] for dict in word_part_objs])
        line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
        word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
        word.debug_msg = debug_msg
        return word
    else:
        error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
        raise Exception('Error: {}'.format(error_msg))
@classmethod
def get_semantic_dictionary(cls):
    """ Creates and returns a semantic dictionary as specified by SemanticClass.
    """
    dictionary = super(Word,cls).get_semantic_dictionary()
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\
            cardinality=1, cardinality_restriction='minCardinality',\
            name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\
            name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\
            name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\
            comment='Word has been deleted by the author using a deletion path.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comment', EditorComment,\
            name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\
            name='wordHasEarlierVersion', label='word has an earlier version', comment='There is a earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\
            name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\
            name='isClarificationOfWord', label='word is a clarification of word',\
            comment='The author has used this part of the word in order to clarify the appearance of that word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\
            name='isDeletionOfWord', label='word is a deletion of word',\
            comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\
            name='isExtensionOfWord', label='word is a extension of word',\
            comment='The author has used this part of a word in order to extend an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\
            name='isTransformationOfWord', label='word is a transformation of word',\
            comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.'))
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\
            name='overwritesWord', label='word overwrites word',\
            comment='The author has used this word in order to overwrite that word.'))
    # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING,
    # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class.
    dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\
            name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\
            subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING))
    # every correction relation is a subproperty of isCorrectionOfWord
    super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\
            name='isCorrectionOfWord', label='word is a correction of word',\
            comment='The author has used this word in order to correct that word.')
    for key in cls.XML_CORRECTION_DICT.keys():
        correction_dict = dictionary[cls.PROPERTIES_KEY].get(key)
        correction_dict.update(super_property_dictionary)
        dictionary[cls.PROPERTIES_KEY].update({key: correction_dict})
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
    """Return True if the transkription_positions (or, optionally, the word
    parts) carry more than one distinct value for property_key.

    Returns False whenever any inspected object lacks property_key in its
    __dict__.
    """
    if not all(property_key in tp.__dict__ for tp in self.transkription_positions):
        return False
    if include_parts and len(self.word_parts) > 0:
        if not concerns_word:
            # compare the value on each part's first transkription position
            first_tp_values = set(part.transkription_positions[0].__dict__[property_key]
                    for part in self.word_parts
                    if len(part.transkription_positions) > 0
                    and property_key in part.transkription_positions[0].__dict__)
            return len(first_tp_values) > 1
        if not all(property_key in part.__dict__ for part in self.word_parts):
            return False
        return len(set(part.__dict__[property_key] for part in self.word_parts)) > 1
    return len(set(tp.__dict__[property_key] for tp in self.transkription_positions)) > 1
def init_word(self, page):
    """Initialize this word and all of its related words with objects from page.

    Resolves writing processes from page for this word's stage, recursively
    initializes word_parts, overwrites_word and earlier_version, and maps
    deletion_paths onto the page's word deletion paths.

    :param page: page object providing writing_processes and
                 get_word_deletion_path().
    """
    super(Word,self).init_word(page)
    if self.writing_process_id > -1:
        self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ]
    # NOTE: removed the unused local alias `writing_processes = self.writing_processes`
    for word_part in self.word_parts:
        word_part.init_word(page)
        self.lines += word_part.lines
        self.writing_processes += word_part.writing_processes
    # deduplicate what was collected from the parts
    self.lines = [ line for line in set(self.lines) ]
    self.writing_processes = [ wp for wp in set(self.writing_processes)]
    if self.overwrites_word is not None:
        self.overwrites_word.init_word(page)
    if self.earlier_version is not None:
        # an earlier version without its own stage/line inherits them from self
        if self.earlier_version.writing_process_id == -1:
            self.earlier_version.writing_process_id = self.writing_process_id-1
        if self.earlier_version.line_number == -1:
            self.earlier_version.line_number = self.line_number
        self.earlier_version.init_word(page)
    self.deletion_paths = [ page.get_word_deletion_path(path) for path in self.deletion_paths if path.path is not None ]
def join(self, other_word, append_at_end_of_new_word=True, add_white_space_between_words=False):
    """Join other_word with this word by changing the text of the current word
    and taking over other_word's transkription and faksimile positions.

    :param other_word: the word to merge into this one
    :param append_at_end_of_new_word: append other_word's text/positions at the
           end (True) or prepend them at the beginning (False)
    :param add_white_space_between_words: insert a ' ' between the two texts
           (only used when appending)
    """
    if append_at_end_of_new_word:
        self.text = self.text + other_word.text\
                if not add_white_space_between_words\
                else self.text + ' ' + other_word.text
        for position in other_word.transkription_positions:
            position.id = str(len(self.transkription_positions))
            self.transkription_positions.append(position)
        for position in other_word.faksimile_positions:
            position.id = str(len(self.faksimile_positions))
            self.faksimile_positions.append(position)
    else:
        self.text = other_word.text + self.text
        index = 0
        for position in other_word.transkription_positions:
            self.transkription_positions.insert(index, position)
            index += 1
        # renumber the positions that were shifted to the right
        while index < len(self.transkription_positions):
            self.transkription_positions[index].id = str(index)
            index += 1
        index = 0
        for position in other_word.faksimile_positions:
            # BUG FIX: was `self.faksimile_positions.insert(indexposition)`
            # (missing comma) which raised a TypeError/NameError at runtime.
            self.faksimile_positions.insert(index, position)
            index += 1
        while index < len(self.faksimile_positions):
            self.faksimile_positions[index].id = str(index)
            index += 1
    self.simplify_transkription_positions()
def partition_according_to_deletion(self):
    """Partition a word according to its transkription_positions' deletion status
    ->split word and add partial words as its parts.

    Groups consecutive transkription positions with the same `deleted` flag;
    each group becomes a new partial Word carrying that flag. If the word is
    uniformly deleted, only its own `deleted` flag and deletion_paths are set.
    """
    if self.has_mixed_status('deleted'):
        transkription_positions = []
        last_status = None
        for transkription_position in self.transkription_positions:
            if transkription_position.deleted != last_status\
               and len(transkription_positions) > 0:
                # deletion status flipped: flush the current group into a new part
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
                for tp in transkription_positions:
                    newWord.deletion_paths += tp._deletion_paths
                self.word_parts.append(newWord)
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_status = transkription_position.deleted
        if len(transkription_positions) > 0:
            # flush the trailing group
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
            for tp in transkription_positions:
                newWord.deletion_paths += tp._deletion_paths
            self.word_parts.append(newWord)
        # the positions now live on the parts; the parent becomes a container
        self.transkription_positions = []
        self.line_number = -1
        self.deleted = False
    elif len(self.word_parts) > 0:
        # already partitioned: recurse into the existing parts
        self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
    elif not self.deleted\
         and len(self.transkription_positions) > 0\
         and self.transkription_positions[0].deleted:
        # uniform deletion status: mark the whole word as deleted
        self.deleted = True
        for tp in self.transkription_positions:
            self.deletion_paths += tp._deletion_paths
def partition_according_to_writing_process_id(self):
    """Partition a word according to its transkription_positions' writing_process_ids
    ->split word and add partial words as its parts.

    Groups consecutive transkription positions by writing_process_id; each
    group becomes a new partial Word with that writing process id.
    """
    if self.belongs_to_multiple_writing_processes():
        last_writing_process_id = -1
        transkription_positions = []
        for transkription_position in self.transkription_positions:
            if transkription_position.writing_process_id != last_writing_process_id\
               and len(transkription_positions) > 0:
                # stage changed: flush the current group into a new part
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
                self.word_parts.append(newWord)
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_writing_process_id = transkription_position.writing_process_id
        if len(transkription_positions) > 0:
            # flush the trailing group
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
            self.word_parts.append(newWord)
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        # already partitioned: recurse into parts
        self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
        if self.belongs_to_multiple_writing_processes(include_parts=True):
            # the word as a whole gets the latest (highest) stage of its parts
            self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0]
    elif len(self.transkription_positions) > 0:
        self.writing_process_id = self.transkription_positions[0].writing_process_id
def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False):
    """Determines whether word is over a word box.

    :param box_paths: box path candidates; matched boxes are removed from this list
    :param tr_xmin: x offset of the transkription field
    :param tr_ymin: y offset of the transkription field
    :param previous_word_has_box: True if the preceding word was matched to a box
    :return: the (partial) word that lies over a box, or None
    """
    word_over_box = None
    if len(self.word_parts) > 0:
        # partitioned word: delegate to the parts, keep the last part with a box
        for word in self.word_parts:
            current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None))
            if current_word is not None and current_word.word_box is not None:
                word_over_box = current_word
    else:
        new_tp_dict = {}
        for index, transkription_position in enumerate(self.transkription_positions):
            if previous_word_has_box and index == 0:
                # nudge the start of this word to the right so it is not
                # captured by the previous word's box
                if len(transkription_position.positional_word_parts) > 0:
                    transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2
                    #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}')
                else:
                    transkription_position.left += 1
            word_path = Path.create_path_from_transkription_position(transkription_position,\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            containing_boxes = [ box_path for box_path in box_paths\
                    if word_path.is_partially_contained_by(box_path)\
                    or box_path.do_paths_intersect(word_path) ]
            if len(containing_boxes) > 0:
                if previous_word_has_box:
                    print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}')
                # may split the transkription position; splits are recorded in new_tp_dict
                self._set_box_to_transkription_position(containing_boxes[0], word_path,\
                        transkription_position, new_tp_dict, tr_xmin)
                box_paths.remove(containing_boxes[0])  # each box is consumed at most once
        # replace split transkription positions by their fragments, in place
        for replace_tp in new_tp_dict.keys():
            for tp in new_tp_dict.get(replace_tp):
                self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
            self.transkription_positions.remove(replace_tp)
        word_over_box = self._get_partial_word_over_box()
        update_transkription_position_ids(self)
    return word_over_box
def set_word_insertion_mark(self, word_insertion_mark):
    """Attach the given word insertion mark to this word."""
    self.word_insertion_mark = word_insertion_mark
def set_writing_process_id_to_transkription_positions(self, page):
    """Derive each transkription position's writing_process_id from the
    font-size style classes of its first positional word part, looked up
    in page.fontsizekey2stage_mapping (the last matching key wins).
    """
    mapping = page.fontsizekey2stage_mapping
    for tp in self.transkription_positions:
        if len(tp.positional_word_parts) == 0:
            continue
        for style_key in tp.positional_word_parts[0].style_class.split(' '):
            if style_key in mapping:
                tp.writing_process_id = mapping.get(style_key)
def simplify_transkription_positions(self):
    """Merge transkription_positions if possible.

    Walks the positions from the end towards the front, merging each pair of
    adjacent mergeable positions into one position rebuilt from their combined
    positional word parts. Only runs while every position carries
    positional_word_parts.
    """
    index = len(self.transkription_positions)-1
    while index > 0\
          and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
        current_tp = self.transkription_positions[index]
        index -= 1
        previous_tp = self.transkription_positions[index]
        if previous_tp.is_mergebale_with(current_tp):
            positional_word_parts = previous_tp.positional_word_parts
            positional_word_parts += current_tp.positional_word_parts
            transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                    positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
            if len(transkription_positions) == 1:
                # merge succeeded: keep the earlier position's writing process id if set
                transkription_positions[0].writing_process_id = previous_tp.writing_process_id\
                        if previous_tp.writing_process_id != -1\
                        else current_tp.writing_process_id
                self.transkription_positions.pop(index+1)
                self.transkription_positions[index] = transkription_positions[0]
    #print(self.text, len(self.transkription_positions))
def split(self, split_string, start_id=0):
    """Splits the word and returns an 3-tuple of new words.

    self.text is partitioned on split_string; the positional word parts of
    all transkription positions are then distributed onto the resulting
    (previous, current, next) words.

    :param split_string: substring of self.text to split on
    :param start_id: id of the first resulting word
    :return: (previousWord, currentWord, nextWord); previous/next may be None
    """
    previousString, currentString, nextString = self.text.partition(split_string)
    currentWord = None
    previousWord = None
    nextWord = None
    previousIndex = 0
    current_id = start_id
    # flatten the positional word parts of all transkription positions
    all_positional_word_parts = []
    for position in self.transkription_positions:
        all_positional_word_parts += position.positional_word_parts
    if len(all_positional_word_parts) == 0:
        warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
    if len(previousString) > 0:
        # consume parts from the front until they spell previousString
        previous_pwps = []
        while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            previous_pwps.append(all_positional_word_parts[previousIndex])
            previousIndex += 1
        if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
        else:
            previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
            previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
            previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
            # NOTE(review): all resulting words share this word's faksimile_positions
            # list object — confirm that sharing (not copying) is intended.
            previousWord.faksimile_positions = self.faksimile_positions
            current_id += 1
            all_positional_word_parts = all_positional_word_parts[previousIndex:]
    if len(nextString) > 0:
        # consume parts until they spell currentString; the remainder becomes the next word
        tmp_pwps = []
        index = 0
        while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            tmp_pwps.append(all_positional_word_parts[index])
            index += 1
        if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
        else:
            next_pwps = all_positional_word_parts[index:]
            next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
            next_text = ''.join([ pwp.text for pwp in next_pwps ])
            nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
            nextWord.faksimile_positions = self.faksimile_positions
            all_positional_word_parts = all_positional_word_parts[:index]
    # whatever remains spells the current (split) string
    current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
    current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
    currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
    currentWord.faksimile_positions = self.faksimile_positions
    return previousWord, currentWord, nextWord
def split_according_to_status(self, status, splits_are_parts=False):
    """Split this word into new words at every change of the given
    transkription-position property.

    :param status: name of the property to group by
    :param splits_are_parts: if True, attach the new words as word_parts
    :return: a list of new word.Word
    """
    new_words = []
    if not self.has_mixed_status(status):
        return new_words
    group = []
    previous_value = None
    for tp in self.transkription_positions:
        if len(group) > 0 and tp.__dict__[status] != previous_value:
            # value changed: flush the current group into a new word
            new_words.append(\
                    self._create_new_word(group, status, new_id=self.id+len(new_words)))
            group = []
        group.append(tp)
        previous_value = tp.__dict__[status]
    if len(group) > 0:
        new_words.append(\
                self._create_new_word(group, status, new_id=self.id+len(new_words)))
    if splits_are_parts:
        self.word_parts += new_words
        if len(self.word_parts) > 0:
            self.transkription_positions = []
    return new_words
def undo_partitioning(self):
    """Undo a previous partitioning: recursively dissolve word_parts,
    reclaim their transkription positions and clear partition-related state.
    """
    if len(self.word_parts) == 0:
        return
    for part in self.word_parts:
        part.undo_partitioning()
        # reclaim the part's positions until our own positions spell self.text
        current_text = ''.join(tp.get_text() for tp in self.transkription_positions)
        if self.text != current_text:
            self.transkription_positions += part.transkription_positions
    self.earlier_version = None
    self.edited_text = None
    self.word_box = None
    self.word_parts = []
    self.corrections = []
    self.earlier_versions = []
    self.box_paths = []
def _create_new_word(self, transkription_positions, status, new_id=0):
    """Create a new Word from self and the given transkription_positions.

    Copies the shared properties listed in COPY_PROPERTY_KEY (except status)
    and transfers the status value taken from the first transkription position,
    either into a target list (per APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS)
    or as a plain attribute.
    """
    word = Word(id=new_id, transkription_positions=transkription_positions)
    for prop in self.COPY_PROPERTY_KEY:
        if prop != status and prop in self.__dict__:
            word.__dict__[prop] = self.__dict__[prop]
    status_value = transkription_positions[0].__dict__[status]
    if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS:
        word.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(status_value)
    else:
        word.__dict__[status] = status_value
    return word
def _get_parts_with_property_key(self, property_key):
"""Return a list of word_parts with property == property_key.
"""
word_parts = []
for word_part in self.word_parts:
if property_key in word_part.__dict__.keys():
word_parts.append(word_part)
else:
word_parts += word_part._get_parts_with_property_key(property_key)
return word_parts
def _get_partial_word_over_box(self):
    """Partition a word according to its transkription_positions' has_box
    ->split word and add partial words as its parts.

    :return: word over box or self (when the whole word lies over one box), or None
    """
    word_over_box = None
    if self.has_mixed_status('has_box'):
        # group consecutive transkription positions by their box
        transkription_positions = []
        last_word_box = None
        for transkription_position in self.transkription_positions:
            if transkription_position.has_box != last_word_box\
               and len(transkription_positions) > 0:
                # box changed: flush the current group into a new part
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_word_box is not None:
                    word_over_box = newWord
                    word_over_box.word_box = last_word_box
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_word_box = transkription_position.has_box
        if len(transkription_positions) > 0:
            # flush the trailing group
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
            self.word_parts.append(newWord)
            if last_word_box is not None:
                word_over_box = newWord
                word_over_box.word_box = last_word_box
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box')
        # recurse into the parts; stop after the first part over a box
        for word_part in self.word_parts:
            if word_over_box is None:
                word_over_box = word_part._get_partial_word_over_box()
            else:
                break
    elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
        # exactly one position carries a box: the word itself is the word over box
        word_over_box = self
        word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
    return word_over_box
def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
    """Set box_path to transkription_position that is contained by box_path.

    Create new transkription_positions by splitting old ones if necessary and
    add them to new_transkription_positions_dictionary.

    :param box_path: box that (partially) covers word_path
    :param word_path: path of transkription_position in page coordinates
    :param transkription_position: the position to mark and possibly split
    :param new_transkription_positions_dictionary: maps an original position to its split fragments
    :param tr_xmin: x offset of the transkription field, subtracted to convert box coordinates back
    """
    if box_path.contains_path(word_path):
        # box covers the whole position
        transkription_position.has_box = box_path
    elif box_path.contains_start_of_path(word_path):
        # box covers the beginning: split and mark the first fragment
        split_position = box_path.path.bbox()[1] - tr_xmin
        new_tps = transkription_position.split(split_position)
        if len(new_tps) == 2:
            new_tps[0].has_box = box_path
            new_transkription_positions_dictionary.update({ transkription_position: new_tps })
        else:
            # split did not yield two fragments: mark the whole position
            transkription_position.has_box = box_path
    elif box_path.contains_end_of_path(word_path):
        # box covers the end: split and mark the second fragment
        split_position = box_path.path.bbox()[0] - tr_xmin
        new_tps = transkription_position.split(split_position)
        if len(new_tps) == 2:
            new_tps[1].has_box = box_path
            new_transkription_positions_dictionary.update({ transkription_position: new_tps })
        else:
            transkription_position.has_box = box_path
    else: # box_path in the middle of word_path
        split_position1 = box_path.path.bbox()[0] - tr_xmin
        split_position2 = box_path.path.bbox()[1] - tr_xmin
        new_tps = transkription_position.split(split_position1, split_position2)
        if len(new_tps) >= 2:
            # middle fragment is the one under the box
            new_tps[1].has_box = box_path
            new_transkription_positions_dictionary.update({ transkription_position: new_tps })
        else:
            transkription_position.has_box = box_path
def do_paths_intersect_saveMode(mypath1, mypath2):
    """Return a truthy value if the paths intersect or mypath1 is partially
    contained by mypath2; AssertionErrors raised by the underlying svg path
    routines are swallowed and reported as False.
    """
    try:
        intersects = mypath1.path.intersect(mypath2.path, justonemode=True)
        return intersects or mypath1.is_partially_contained_by(mypath2)
    except AssertionError:
        return False
Index: svgscripts/process_words_post_merging.py
===================================================================
--- svgscripts/process_words_post_merging.py (revision 106)
+++ svgscripts/process_words_post_merging.py (revision 107)
@@ -1,482 +1,487 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path as PathlibPath
from progress.bar import Bar
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.box import Box
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids
from extract_line_continuation import extract_line_continuations
from util import back_up, process_warnings4status
from process_files import update_svgposfile_status
from process_footnotes import categorize_footnotes
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import extract_paths_on_tf
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# When True, progress bars and error raising on missing sources are suppressed
# (see the `not bool(UNITTESTING) and bar.next()` guards below).
UNITTESTING = False
# Optional Word-like object; when set, mark_word_if_it_intersects_with_paths_as_deleted
# collects extra debug data for words matching its text and line_number.
DEBUG_WORD = None
# Name of the subfolder (next to the svg_pos files) that keeps the first merged
# version of each file (see copy_page_to_merged_directory).
MERGED_DIR = 'merged'
# Warning messages scanned by process_warnings4status to decide the final page status.
WARNING_FOOTNOTES_ERROR = 'footnotes not processed'
WARNING_LINE_CONTINUATION = 'line continuation fail'
def categorize_paths(page, transkription_field=None):
    """Categorize all paths that are part of the transkription field.

    :param page: the Page whose source svg is scanned
    :param transkription_field: optional TranskriptionField; created from page.source if None
    :return: a dictionary containing a list for each category of path
    :raise FileNotFoundError: if page.source does not exist (suppressed under UNITTESTING)
    """
    if page.source is not None and isfile(page.source):
        MAX_HEIGHT_LINES = 1
        # max_line: height of the tallest even-numbered line plus 2, or 17 when
        # no line numbers are known
        max_line = sorted(\
                [line_number.bottom-line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0],\
                reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17
        # coordinate offsets are only needed when the svg image has no text field of its own
        tr_xmin = 0.0
        tr_ymin = 0.0
        if (page.svg_image is None or page.svg_image.text_field is None)\
           and transkription_field is not None:
            tr_xmin = transkription_field.xmin
            tr_ymin = transkription_field.ymin
        paths, attributes = svg_to_paths.svg2paths(page.source)
        allpaths_outside_tf = []
        attributes_outside_tf = []
        if transkription_field is None:
            transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index)
        allpaths_on_tf = extract_paths_on_tf(page, outsiders=allpaths_outside_tf, outsider_attributes=attributes_outside_tf, transkription_field=transkription_field)
        path_dict = { 'text_area_deletion_paths': [],\
                'deletion_or_underline_paths': [],\
                'box_paths': [],\
                'dots_paths': [],\
                'word_connector_paths': [],\
                'uncategorized_paths': [] }
        for mypath in allpaths_on_tf:
            xmin, xmax, ymin, ymax = mypath.path.bbox()
            start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin)
            if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1:
                # tiny bounding box -> a dot
                path_dict.get('dots_paths').append(mypath)
            elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
                # closed continuous path, roughly one line high -> a word box
                path_dict.get('box_paths').append(mypath)
            elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
                # open continuous path taller than a line -> connects words across lines
                path_dict.get('word_connector_paths').append(mypath)
            elif abs(ymax-ymin) < MAX_HEIGHT_LINES:
                # flat path -> a deletion stroke or an underline
                mypath.start_line_number = start_line_number
                path_dict.get('deletion_or_underline_paths').append(mypath)
            elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin):
                # Check for "ladder", i.e. a path with 3 segments (seg0 is horizontal on line x, seg1 moves to line x+1, seg2 is horizontal on line x+1)
                if start_line_number + 1 == page.get_line_number(mypath.path.end.imag-tr_ymin)\
                   and len(mypath.path._segments) == 3\
                   and abs(mypath.path._segments[0].bbox()[3]-mypath.path._segments[0].bbox()[2]) < MAX_HEIGHT_LINES\
                   and abs(mypath.path._segments[2].bbox()[3]-mypath.path._segments[2].bbox()[2]) < MAX_HEIGHT_LINES:
                    # split the ladder into its two horizontal deletion strokes
                    for index in 0, 2:
                        new_path = Path(parent_path=mypath, path=SVGPath(mypath.path._segments[index]))
                        new_path.start_line_number = page.get_line_number(new_path.path.start.imag-tr_ymin)
                        path_dict.get('deletion_or_underline_paths').append(new_path)
                else:
                    path_dict.get('text_area_deletion_paths').append(mypath)
            else:
                path_dict.get('uncategorized_paths').append(mypath)
        # mark deleted words; remaining strokes are underline candidates
        underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin)
        path_dict.update({'underline_path': underline_path})
        path_dict['uncategorized_paths'] += process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\
                paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)
        return path_dict
    elif not UNITTESTING:
        error_msg = 'Svg source file {} does not exist!'.format(page.source)\
                if page.source is not None else 'Page does not contain a source file!'
        raise FileNotFoundError(error_msg)
    return {}
def copy_page_to_merged_directory(page, manuscript_file=None):
    """Copy page to directory that contains the first version of all svg_pos_files that have been
    merged with the faksimile position data. MERGED_DIR is a subfolder of svg_pos_files-directory.
    """
    source_file = PathlibPath(page.page_tree.docinfo.URL)
    merged_dir = source_file.parent / MERGED_DIR
    if not merged_dir.is_dir():
        merged_dir.mkdir()
    save_page(page, str(source_file), target_svg_pos_file=str(merged_dir / source_file.name),\
            status=STATUS_MERGED_OK, manuscript_file=manuscript_file)
def find_special_words(page, transkription_field=None):
    """Find special words, remove them from words, process their content.

    Single-character words matching the special characters of MarkForeignHands /
    TextConnectionMark are moved from page.words into page.mark_foreign_hands
    resp. page.text_connection_marks; their content is then extracted from the
    svg sources.

    :raise FileNotFoundError: if page.source is missing
    """
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page does not have a source!')
    if transkription_field is None:
        transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index)
    set_to_text_field_zero = (page.svg_image is None or page.svg_image.text_field is None)
    special_char_list = MarkForeignHands.get_special_char_list()
    special_char_list += TextConnectionMark.get_special_char_list()
    single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ]
    if not UNITTESTING:
        bar = Bar('find special words', max=len(single_char_words))
    for word in single_char_words:
        not bool(UNITTESTING) and bar.next()
        if word.text == MarkForeignHands.CLASS_MARK:
            id = len(page.mark_foreign_hands)
            page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id))
            page.words.remove(word)
        elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\
             or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\
             and any(style in page.sonderzeichen_list for style\
             in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))):
            # either the primary special char, or one of the others written in a
            # "Sonderzeichen" style class
            id = len(page.text_connection_marks)
            page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id))
            page.words.remove(word)
    not bool(UNITTESTING) and bar.finish()
    svg_tree = ET.parse(page.source)
    page.update_page_type(transkription_field=transkription_field)
    page.update_line_number_area(transkription_field, svg_tree=svg_tree, set_to_text_field_zero=set_to_text_field_zero)
    if page.marginals_source is not None:
        # marginals live in a separate svg file; search their tree instead
        svg_tree = ET.parse(page.marginals_source)
    italic_classes = [ key for key in page.style_dict\
            if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ]
    if len(page.mark_foreign_hands) > 0:
        MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\
                SonderzeichenList=page.sonderzeichen_list, set_to_text_field_zero=set_to_text_field_zero)
    if len(page.text_connection_marks) > 0:
        TextConnectionMark.find_content_in_footnotes(page, transkription_field, svg_tree)
def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Mark every word (and word part) that intersects a deletion path as deleted
    and register those paths as word deletion paths on the page.

    [:return:] list of .path.Path that might be word_underline_paths
    """
    bar = Bar('mark words that intersect with deletion paths', max=len(page.words))\
            if not UNITTESTING else None
    for word in page.words:
        if bar is not None:
            bar.next()
        word = mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        for part_word in word.word_parts:
            part_word = mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        word.partition_according_to_deletion()
    if bar is not None:
        bar.finish()
    # paths that did not mark any word are candidates for word underlines
    return list(set(deletion_paths) - set(page.word_deletion_paths))
def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Marks word if it intersects with deletion paths as deleted
    and adds these paths to word_deletion_paths.

    The deletion flag is set per transkription position; word.deleted itself is
    reset to False here (the word-level flag is derived later by
    partition_according_to_deletion).

    [:return:] word
    """
    word.deleted = False
    for transkription_position in word.transkription_positions:
        word_path = Path.create_path_from_transkription_position(transkription_position,\
                tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        intersecting_paths = [ deletion_path for deletion_path in deletion_paths\
                if do_paths_intersect_saveMode(deletion_path, word_path) ]
        if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number:
            # debugging aid: gather the deletion paths on the debug word's line
            relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ]
            #print(word.line_number, word_path.path.bbox(), [ path.path.bbox() for path in relevant_paths])
        if len(intersecting_paths) > 0:
            #print(f'{word.line_number}: {word.id}, {word.text}: {intersecting_paths}')
            transkription_position.deleted = True
            transkription_position._deletion_paths += intersecting_paths
            for deletion_path in intersecting_paths:
                # register the top-level parent path on the page, exactly once
                if deletion_path.parent_path is not None:
                    deletion_path = deletion_path.parent_path
                if deletion_path not in page.word_deletion_paths:
                    deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                    deletion_path.attach_object_to_tree(page.page_tree)
                    page.word_deletion_paths.append(deletion_path)
    return word
def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None):
    """Process words after merging with faksimile word positions.

    :param svg_pos_file: xml file to instantiate the Page from (if page is None)
    :param new_words: optional replacement word list for the page
    :param page: a Page object (if svg_pos_file is None)
    :param manuscript_file: optional manuscript xml file for status bookkeeping
    :param target_svg_pos_file: optional output file
    :raise Exception: if neither page nor svg_pos_file is given
    :raise FileNotFoundError: if the page has no existing source svg
    """
    if page is None and svg_pos_file is None:
        raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!')
    if page is None:
        page = Page(svg_pos_file)
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file))
    if svg_pos_file is None:
        svg_pos_file = page.page_tree.docinfo.URL
    if new_words is not None:
        # replace the page's words with the merged word list and drop the old word nodes
        page.words = sorted(new_words, key=attrgetter('id'))
        for word_node in page.page_tree.xpath('.//word'):
            word_node.getparent().remove(word_node)
    manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\
            if manuscript_file is not None\
            else None
    copy_page_to_merged_directory(page, manuscript_file=manuscript_file)
    transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index)
    update_faksimile_line_positions(page)
    status = STATUS_MERGED_OK
    page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
    # intermediate save: persist the merged state before path/footnote processing
    save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file)
    categorize_paths(page, transkription_field=transkription_field)
    # record warnings from footnote / line-continuation processing to derive the final status
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('default')
        try:
            find_special_words(page, transkription_field=transkription_field)
            categorize_footnotes(page)
            extract_line_continuations(page, warning_message=WARNING_LINE_CONTINUATION)
        except Exception:
            warnings.warn(WARNING_FOOTNOTES_ERROR)
        status = process_warnings4status(w, [ WARNING_FOOTNOTES_ERROR, WARNING_LINE_CONTINUATION ], status, STATUS_POSTMERGED_OK)
    save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file)
def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list:
    """Process word boxes: partition words according to word boxes.

    NOTE(review): this span still carries unified-diff markers ('-'/'+') from
    revision 106 -> 107.  The '+' hunk wraps the body in a broad
    try/except that only prints the exception (silently degrading errors),
    and adds a debug print of len(boxes) — both look like leftovers to
    revisit.  The lines are preserved byte-for-byte below.
    [:return:] a list of paths that are not boxes
    """
    MAX_HEIGHT_LINES = 1
    not_boxes = []
- if not UNITTESTING:
- bar = Bar('process word boxes', max=len(page.words))
- svg_tree = ET.parse(page.source)
- namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
- allpaths_on_margin_field = []
- tr_xmin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\
- else transkription_field.xmin
- tr_ymin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\
- else transkription_field.ymin
- if paths is None or attributes is None:
- paths = []
- raw_paths, attributes = svg_to_paths.svg2paths(page.source)
- for index, raw_path in enumerate(raw_paths):
- paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page))
- for index, mypath in enumerate(paths):
- path = mypath.path
- xmin, xmax, ymin, ymax = path.bbox()
- attribute = attributes[index]
- if len(path) > 0\
- and path != transkription_field.path\
- and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
- or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
- and abs(ymax-ymin) < max_line:
- allpaths_on_margin_field.append(mypath)#Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
- box_line_number_dict = {}
- for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
- line_number = page.get_line_number(box_path.get_median_y(tr_ymin=tr_ymin))
- if line_number > 0:
- if line_number not in box_line_number_dict.keys():
- box_line_number_dict.update({ line_number: [ box_path ]})
+ try:
+ if not UNITTESTING:
+ bar = Bar('process word boxes', max=len(page.words))
+ svg_tree = ET.parse(page.source)
+ namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+ allpaths_on_margin_field = []
+ tr_xmin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\
+ else transkription_field.xmin
+ tr_ymin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\
+ else transkription_field.ymin
+ if paths is None or attributes is None:
+ paths = []
+ raw_paths, attributes = svg_to_paths.svg2paths(page.source)
+ for index, raw_path in enumerate(raw_paths):
+ paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page))
+ for index, mypath in enumerate(paths):
+ path = mypath.path
+ xmin, xmax, ymin, ymax = path.bbox()
+ attribute = attributes[index]
+ if len(path) > 0\
+ and path != transkription_field.path\
+ and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
+ or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
+ and abs(ymax-ymin) < max_line:
+ allpaths_on_margin_field.append(mypath)#Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
+ box_line_number_dict = {}
+ for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
+ line_number = page.get_line_number(box_path.get_median_y(tr_ymin=tr_ymin))
+ if line_number > 0:
+ if line_number not in box_line_number_dict.keys():
+ box_line_number_dict.update({ line_number: [ box_path ]})
+ else:
+ box_line_number_dict.get(line_number).append(box_path)
+ boxes = []
+ for line_number in box_line_number_dict.keys():
+ box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
+ margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
+ if page.get_line_number(margin_box.get_median_y(tr_ymin=tr_ymin)) == line_number ],\
+ key=lambda path: path.get_x())
+ threshold = 3 if line_number % 2 == 0 else 1.5
+ if len(margin_boxes_on_line) > 0:
+ for box_path in box_paths_on_line:
+ #print(line_number, box_path.path.d(), len(margin_boxes_on_line))
+ box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
+ namespaces=namespaces, threshold=threshold)
+ if box is not None:
+ boxes.append(box)
else:
- box_line_number_dict.get(line_number).append(box_path)
- boxes = []
- for line_number in box_line_number_dict.keys():
- box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
- margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
- if page.get_line_number(margin_box.get_median_y(tr_ymin=tr_ymin)) == line_number ],\
- key=lambda path: path.get_x())
- threshold = 3 if line_number % 2 == 0 else 1.5
- if len(margin_boxes_on_line) > 0:
- for box_path in box_paths_on_line:
- #print(line_number, box_path.path.d(), len(margin_boxes_on_line))
- box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
- namespaces=namespaces, threshold=threshold)
- if box is not None:
- boxes.append(box)
- else:
- not_boxes += box_paths_on_line
- if len(boxes) > 0:
- for word in page.words:
- word.process_boxes(boxes, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
- word.create_correction_history(page)
- if not bool(UNITTESTING):
- bar.next()
- elif word.earlier_version is not None:
- #print(f'{word.text} -> {word.earlier_version.text}')
- if word.earlier_version.earlier_version is not None:
- print(f'{word.earlier_version.earlier_version.text}')
- not bool(UNITTESTING) and bar.finish()
+ not_boxes += box_paths_on_line
+ if len(boxes) > 0:
+ print(len(boxes))
+ for word in page.words:
+ word.process_boxes(boxes, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
+ word.create_correction_history(page)
+ if not bool(UNITTESTING):
+ bar.next()
+ elif word.earlier_version is not None:
+ #print(f'{word.text} -> {word.earlier_version.text}')
+ if word.earlier_version.earlier_version is not None:
+ print(f'{word.earlier_version.earlier_version.text}')
+ not bool(UNITTESTING) and bar.finish()
+ except Exception as e:
+ print(e)
    return not_boxes
def reset_page(page):
    """Reset all words that have word_parts in order to run the script a second time.

    If a first merged version of the page exists in MERGED_DIR, that file is
    loaded instead of undoing partitions.
    NOTE(review): in that branch ``page`` is only rebound locally — the
    caller's Page instance is not replaced; confirm this is intended.
    """
    svg_pos_file = PathlibPath(page.page_tree.docinfo.URL)
    first_merge_version = svg_pos_file.parent / MERGED_DIR / svg_pos_file.name
    if first_merge_version.exists():
        page = Page(str(first_merge_version))
        return
    page_changed = False
    # Undo every partitioning and every correction-history split.
    partitioned_words = [ word for word in page.words if len(word.word_parts) > 0 ]
    partitioned_words += [ word for word in page.words if word.earlier_version is not None ]
    for word in partitioned_words:
        word.undo_partitioning()
        update_transkription_position_ids(word)
        page_changed = True
    # Re-derive missing line numbers from the vertical center of the first
    # transkription position.
    for word in [ w for w in page.words if w.line_number == -1 ]:
        if len(word.transkription_positions) > 0:
            word.line_number = page.get_line_number((word.transkription_positions[0].top+word.transkription_positions[0].bottom)/2)
        else:
            warnings.warn(f'Word {word.id} {word.text} has no transkription_position!')
        page_changed = True
    if page_changed:
        page.update_and_attach_words2tree()
def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None):
    """Save page to target_file and update status of file.

    Words are re-attached to the tree first; under UNITTESTING nothing is
    written to disk.
    """
    page.update_and_attach_words2tree()
    if UNITTESTING:
        return
    target = svg_pos_file if target_svg_pos_file is None else target_svg_pos_file
    if status is not None:
        # Record the new processing status before writing the tree out.
        update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status)
    write_pretty(xml_element_tree=page.page_tree, file_name=target, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def update_faksimile_line_positions(page):
    """Update faksimile_positions of the lines

    For every line that has at least one word with faksimile positions, the
    inner top/bottom are set to the min top / max bottom of the first
    faksimile position of those words.  Lines whose inner bottom is unset or
    inverted get a fallback value from their neighbours.
    """
    num_lines = len(page.line_numbers)
    # Vertical offset of the text field; 0.0 when the page has no text field.
    ymin = page.text_field.ymin\
            if page.text_field is not None\
            else 0.0
    for line_number in page.line_numbers:
        if len([ word.faksimile_positions[0] for word in page.words\
                if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) > 0:
            line_number.faksimile_inner_top = min([ word.faksimile_positions[0].top for word in page.words\
                    if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ])
            line_number.faksimile_inner_bottom = max([ word.faksimile_positions[0].bottom for word in page.words\
                    if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ])
            # NOTE(review): outer positions are only set for even line ids —
            # presumably only main lines get outer coordinates; confirm.
            if line_number.id % 2 == 0:
                line_number.faksimile_outer_top = line_number.faksimile_inner_top - ymin
                line_number.faksimile_outer_bottom = line_number.faksimile_inner_bottom - ymin
    for index, line_number in enumerate(page.line_numbers):
        # Repair unset (0.0) or inverted inner bottoms using neighbouring lines.
        if line_number.faksimile_inner_bottom == 0.0\
        or line_number.faksimile_inner_bottom < line_number.faksimile_inner_top:
            if index == 0 and num_lines > 1:
                # NOTE(review): uses the next line's ``.top`` here, while the
                # other branches use ``faksimile_inner_top`` — confirm intended.
                line_number.faksimile_inner_bottom = page.line_numbers[index+1].top
            elif index == num_lines-1 and page.text_field is not None:
                line_number.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3)
            elif index > 0 and index < num_lines-1:
                line_number.faksimile_inner_bottom = page.line_numbers[index+1].faksimile_inner_top\
                        if page.line_numbers[index+1].faksimile_inner_top > page.line_numbers[index-1].faksimile_inner_bottom\
                        else page.line_numbers[index-1].faksimile_inner_bottom
        line_number.attach_object_to_tree(page.page_tree)
def update_writing_process_ids(page):
    """Update the writing_process_ids of the words and split accordingly.
    """
    for current_word in page.words:
        # First tag each transkription position with its writing process id,
        # then split the word wherever those ids differ.
        current_word.set_writing_process_id_to_transkription_positions(page)
        current_word.partition_according_to_writing_process_id()
def usage():
    """Print the usage information of this script (main's docstring).
    """
    print(main.__doc__)
def main(argv):
    """This program can be used to process words after they have been merged with faksimile data.
    svgscripts/process_words_post_merging.py [OPTIONS]
    a xml file about a manuscript, containing information about its pages.
    a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help show help
    -i|--include-missing-line-number run script on files that contain words without line numbers
    -r|--rerun rerun script on a svg_pos_file that has already been processed
    :return: exit code (int)
    """
    status_not_contain = STATUS_POSTMERGED_OK
    include_missing_line_number = False
    try:
        opts, args = getopt.getopt(argv, "hir", ["help", "include-missing-line-number", "rerun" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        if opt in ('-i', '--include-missing-line-number'):
            include_missing_line_number = True
        elif opt in ('-r', '--rerun'):
            # Rerunning drops the "already post-merged" filter.
            status_not_contain = ''
    if len(args) < 1:
        usage()
        return 2
    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    manuscript_file = file_a\
            if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
            else None
    counter = 0
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain):
        reset_page(page)
        words_without_line = [ word for word in page.words if word.line_number == -1 ]
        if not include_missing_line_number and len(words_without_line) > 0:
            # Refuse to process pages with unnumbered words unless -i was given.
            if not UNITTESTING:
                print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!')
                for word in words_without_line:
                    print(f'Word {word.id}: {word.text}')
        else:
            back_up(page, page.xml_file)
            if not UNITTESTING:
                print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
            post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file)
            counter += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages processed]')
    return 0
if __name__ == "__main__":
    # Script entry point: forward the CLI arguments (without the program name)
    # to main() and use its return value as the process exit code.
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/util.py
===================================================================
--- svgscripts/util.py (revision 106)
+++ svgscripts/util.py (revision 107)
@@ -1,525 +1,535 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to copy a faksimile svg file with the option of highlighting some word boxes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from datetime import datetime
from functools import cmp_to_key
import getopt
import inspect
import itertools
import lxml.etree as ET
import re
import shutil
import signal
import string
import subprocess
from svgpathtools import svg_to_paths
import sys
import tempfile
import os
from os import listdir, sep, path, setpgrp, devnull, makedirs
from os.path import basename, commonpath, dirname, exists, isfile, isdir, realpath, splitext
import warnings
import wget
import xml.etree.ElementTree as XET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.faksimile_image import FaksimileImage
from datatypes.lineNumber import LineNumber
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, update_transkription_position_ids
from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
class ExternalViewer:
    """This class can be used to show files with external viewers.
    """
    # Maps file extension -> external viewer executable (from local_config).
    file_format_viewer_dict = { '.pdf': PDF_READER, '.svg': SVG_EDITOR }

    @classmethod
    def show_files(cls, single_file=None, list_of_files=None):
        """Opens file(s) with corresponding external viewer(s).

        All files but the last are opened detached in their own process
        groups with suppressed output; the last file is opened blocking
        (subprocess.run).  When the blocking viewer exits, the detached
        viewer process groups are terminated.

        :param single_file: a single file name, or a list of file names.
        :param list_of_files: list of file names (default: a new empty list).
        """
        # BUGFIX: the default used to be a shared mutable list ([]) that was
        # mutated via append(), so file names leaked between calls.
        if list_of_files is None:
            list_of_files = []
        DEVNULL = None
        if type(single_file) == list:
            list_of_files = single_file
        elif single_file is not None:
            list_of_files.append(single_file)
        if len(list_of_files) > 1:
            # Silence stdout/stderr of the background viewers.
            DEVNULL = open(devnull, 'wb')
        process_list = []
        list_of_files.reverse()
        while len(list_of_files) > 0:
            file2open = list_of_files.pop()
            viewer = cls.file_format_viewer_dict.get(splitext(file2open)[1])
            if viewer is not None:
                if len(list_of_files) > 0:
                    # Not the last file: detach via setsid so the whole viewer
                    # process group can be killed later.
                    process_list.append(\
                            subprocess.Popen([viewer, file2open], stdout=DEVNULL, stderr=DEVNULL, preexec_fn=os.setsid))
                else:
                    # Last file: block until its viewer is closed.
                    subprocess.run([viewer, file2open])
        for process in process_list:
            os.killpg(os.getpgid(process.pid), signal.SIGTERM)
        if DEVNULL is not None:
            DEVNULL.close()
def back_up(page: Page, reference_file, bak_dir='./bak') -> str:
    """Back up a xml_source_file.
    :return: target_file_name
    """
    # Timestamped file name so repeated runs never overwrite earlier backups.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    target_file_name = bak_dir + sep + basename(page.page_tree.docinfo.URL) + '_' + timestamp
    page.bak_file = target_file_name
    write_pretty(xml_element_tree=page.page_tree, file_name=page.bak_file,\
            script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, reference_file),\
            file_type=FILE_TYPE_SVG_WORD_POSITION)
    return target_file_name
def back_up_svg_file(svg_tree: ET.ElementTree, namespaces=None, bak_dir='./bak') -> str:
    """Back up a xml_source_file.
    :return: target_file_name
    """
    if namespaces is None:
        # Derive prefix->uri mapping from the tree; map the default namespace to 'ns'.
        namespaces = { key if key is not None else 'ns': value for key, value in svg_tree.getroot().nsmap.items() }
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    bak_file = bak_dir + sep + timestamp + '_' + basename(svg_tree.docinfo.URL)
    copy_faksimile_svg_file(target_file=bak_file, faksimile_tree=svg_tree, namespaces=namespaces)
    return bak_file
def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, abs_image_path=None, local_image_path=None, namespaces=None):
    """Copy a faksimile_svg_file to target_file.

    Either faksimile_source_file or faksimile_tree must be given, and either
    target_file or target_directory.  Optionally rewrites the image node's
    xlink:href / sodipodi:absref attributes before writing.

    :raises Exception: when the source or the target cannot be determined.
    """
    if faksimile_source_file is None and faksimile_tree is not None:
        faksimile_source_file = faksimile_tree.docinfo.URL
    elif faksimile_source_file is None:
        raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
    if target_file is not None and target_directory is not None:
        target_file = target_directory + sep + target_file
    elif target_file is None and target_directory is not None:
        target_file = target_directory + sep + basename(faksimile_source_file)
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    # Parse once with svgpathtools to obtain the svg root attributes (namespace
    # declarations) of the source file.
    paths, attributes, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True)
    # Register every declared namespace prefix globally with xml.etree so the
    # output keeps the original prefixes; invalid prefixes are skipped.
    for key in [ key for key in svg_attributes.keys() if key.startswith('xmlns:') ]:
        try:
            XET.register_namespace(key.replace('xmlns:', ''), svg_attributes[key])
        except ValueError: pass
    XET.register_namespace('', 'http://www.w3.org/2000/svg')
    if namespaces is None:
        namespaces = { 'ns': svg_attributes['xmlns'], 'xlink': svg_attributes['xmlns:xlink'],\
                'sodipodi': svg_attributes['xmlns:sodipodi'] }
    if faksimile_tree is not None:
        # Convert the lxml tree to a stdlib xml.etree tree via serialization.
        element = XET.fromstring(ET.tostring(faksimile_tree))\
                if type(faksimile_tree) == ET._ElementTree\
                else XET.fromstring(XET.tostring(faksimile_tree.getroot()))
        target_tree = XET.ElementTree(element)
    else:
        target_tree = XET.parse(faksimile_source_file)
    if (local_image_path is not None or abs_image_path is not None)\
    and len(target_tree.findall('.//ns:image', namespaces=namespaces)) > 0:
        # Only the first image node is updated.
        image_node = target_tree.findall('.//ns:image', namespaces=namespaces)[0]
        if local_image_path is not None:
            image_node.set('{%s}href' % namespaces['xlink'], local_image_path)
        if abs_image_path is not None:
            image_node.set('{%s}absref' % namespaces['sodipodi'], abs_image_path)
    target_tree.write(target_file)
def copy_faksimile_update_image_location(faksimile_source_file=None, faksimile_tree=None, target_file=None, target_directory=None, overwrite=False):
    """Copy a faksimile_svg_file to target_file and update image location.

    Resolves the faksimile image referenced by the svg: computes a relative
    local path when the target lies inside FAKSIMILE_LOCATION, maps the
    absolute path through USER_ROOT_LOCATION_DICT for known users, and
    downloads the image when it is missing locally.  Warns instead of
    overwriting an existing target unless overwrite is True.
    """
    if faksimile_source_file is None and faksimile_tree is not None:
        faksimile_source_file = faksimile_tree.docinfo.URL
    elif faksimile_source_file is None:
        raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
    if target_file is not None and target_directory is not None:
        target_file = target_directory + sep + target_file
    elif target_file is None and target_directory is not None:
        target_file = target_directory + sep + basename(faksimile_source_file)
    elif target_directory is None and target_file is not None:
        target_directory = dirname(target_file)
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    source_tree = ET.parse(faksimile_source_file) if faksimile_tree is None else faksimile_tree
    namespaces = { k if k is not None else 'ns': v for k, v in source_tree.getroot().nsmap.items() }
    image_nodes = source_tree.xpath('//ns:image', namespaces=namespaces)
    local_image_path = None
    abs_image_path = None
    user_abs_image_path = None
    if len(image_nodes) > 0:
        image = FaksimileImage.CREATE_IMAGE(image_nodes[0], source_file=faksimile_source_file)
        abs_image_path = image.local_path
        # Map the image path into a user-specific root when the target
        # directory belongs to a known user.
        for user_name in USER_ROOT_LOCATION_DICT.keys():
            if user_name in target_directory:
                user_abs_image_path = abs_image_path.replace(FAKSIMILE_LOCATION, USER_ROOT_LOCATION_DICT[user_name]).replace('//','/')
                break
        # if target_directory is subdir of FAKSIMILE_LOCATION
        if realpath(target_directory).startswith(realpath(FAKSIMILE_LOCATION)):
            # Build a '../..'-style relative path from target_directory to the image.
            common_path = commonpath([ realpath(target_directory), realpath(dirname(image.local_path)) ])
            relative_directory = '/'.join(\
                    [ '..' for d in realpath(target_directory).replace(common_path + '/', '').split('/') ])
            local_image_path = relative_directory + realpath(image.local_path).replace(common_path, '')
            if not isfile(target_directory + sep + local_image_path):
                local_image_path = None
        elif abs_image_path is not None:
            local_image_path = abs_image_path
        if abs_image_path is not None and not isfile(abs_image_path):
            # Fetch the missing image next to where it is expected locally.
            wget.download(image.URL, out=dirname(abs_image_path))
    if not isfile(target_file) or overwrite:
        abs_image_path = user_abs_image_path if user_abs_image_path is not None else abs_image_path
        copy_faksimile_svg_file(target_file=target_file, faksimile_source_file=faksimile_source_file,\
                faksimile_tree=faksimile_tree, abs_image_path=abs_image_path,\
                local_image_path=local_image_path, namespaces=namespaces)
    else:
        msg = 'File {0} not copied to directory {1}, it already contains a file {2}.'.format(faksimile_source_file, target_directory, target_file)
        warnings.warn(msg)
def copy_xml_file_word_pos_only(xml_source_file, target_directory):
    """Copy word positions of a xml file to target directory.
    :return: (str) xml_target_file
    """
    target_path = target_directory + sep + basename(xml_source_file)
    source_page = Page(xml_source_file)
    # Create a fresh page that carries over only title/number/orientation and
    # the words themselves.
    target_page = PageCreator(target_path, title=source_page.title, page_number=source_page.number, orientation=source_page.orientation)
    target_page.words = source_page.words
    target_page.update_and_attach_words2tree()
    write_pretty(xml_element_tree=target_page.page_tree, file_name=target_path,\
            script_name=__file__ + '({})'.format(inspect.currentframe().f_code.co_name), file_type=FILE_TYPE_SVG_WORD_POSITION)
    return target_path
def create_highlighted_svg_file(faksimile_tree, node_ids, nodes_color_dict=None, target_file=None, target_directory=None, local_image_path=None, namespaces=None, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY):
    """Highlights the nodes of a faksimile_tree that are specified by the list of node_ids and writes the tree to a file.
    """
    if namespaces is None:
        namespaces = { key if key is not None else 'ns': value for key, value in faksimile_tree.getroot().nsmap.items() }
    for node_id in node_ids:
        # Highlight every rect/path carrying this id; clear any inline style
        # so fill/opacity take effect.
        for node in faksimile_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces):
            node.set('fill', highlight_color)
            node.set('opacity', opacity)
            node.set('style', '')
    copy_faksimile_update_image_location(target_file=target_file, faksimile_tree=faksimile_tree, target_directory=target_directory)
def get_empty_node_ids(faksimile_tree, x_min=0.0, x_max=0.0, y_min=0.0, y_max=0.0, text_field_id=None, faksimile_page=None, namespaces=None):
    """Returns a list of ids of rect and path nodes that do not have a title element.

    :param faksimile_tree: lxml element tree of the faksimile svg file.
    :param x_min/x_max/y_min/y_max: bounding box to search in (overridden when faksimile_page is given).
    :param text_field_id: id of the text field node, excluded from the result.
    :param faksimile_page: if not None, bounding box and text_field_id are taken from it.
    :param namespaces: optional prefix->uri mapping; derived from the tree when empty or None.
    :return: list of node id strings.
    """
    THRESHOLD_X = 10
    if faksimile_page is not None:
        x_min = faksimile_page.text_field.xmin + faksimile_page.faksimile_image.x
        x_max = faksimile_page.text_field.xmax + faksimile_page.faksimile_image.x - THRESHOLD_X
        y_min = faksimile_page.text_field.ymin + faksimile_page.faksimile_image.y
        y_max = faksimile_page.text_field.ymax + faksimile_page.faksimile_image.y
        text_field_id = faksimile_page.text_field.id
    # BUGFIX: default was a shared mutable dict ({}); None is the safe default
    # and remains backward compatible with callers passing an empty dict.
    if namespaces is None or len(namespaces) == 0:
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    empty_node_ids = []
    nodes_without_title = faksimile_tree.xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\
            x_min, x_max, y_min, y_max, text_field_id), namespaces=namespaces)
    nodes_without_title += get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]', x_min, x_max, y_min, y_max, text_field_id, namespaces=namespaces)
    for node_without_title in nodes_without_title:
        empty_node_ids.append(node_without_title.get('id'))
    return empty_node_ids
def get_mismatching_ids(words, faksimile_positions):
    """ Return the list of mismatching words and the list of mismatching faksimile_positions
    as a 2-tuple.
    """
    mismatching_words = []
    mismatching_faksimile_positions = []
    # Normalize characters first so e.g. '"'/'“' and 'ss'/'ß' do not count as mismatches.
    faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions)
    word_texts = [ word.text for word in words if word.text != '.' ]
    unique_word_texts = set(word_texts)
    for text in unique_word_texts:
        if text not in unique_faksimile_words:
            mismatching_words.extend(word for word in words if word.text == text)
    for faksimile_text in unique_faksimile_words:
        if faksimile_text not in unique_word_texts:
            mismatching_faksimile_positions.extend(faksimile_position for faksimile_position in faksimile_positions\
                    if faksimile_position.text == faksimile_text)
    return mismatching_words, mismatching_faksimile_positions
def process_warnings4status(warnings, warning_messages, current_status, ok_status, status_prefix='') ->str:
    """Process potential warnings and return actual status.

    If any recorded warning starts with one of ``warning_messages``, the
    matching messages are appended to ``status_prefix`` as ':<msg>:' parts;
    otherwise (and when there are no warnings) the returned status is
    '<current_status>:<ok_status>:'.
    """
    if warnings is None or len(warnings) == 0:
        return f'{current_status}:{ok_status}:'
    status = status_prefix
    for warning_message in warning_messages:
        if any(str(warn.message).startswith(warning_message) for warn in warnings):
            status += f':{warning_message}:'
    # No warning matched: fall back to the ok status.
    if status == status_prefix:
        return f'{current_status}:{ok_status}:'
    return status
+def change_title_of_svg(svg_file, node_id, text):
+    """Change the title text of the rect/path node with id ``node_id`` in ``svg_file``;
+    the svg file is rewritten in place (no-op when no such title node exists).
+    """
+    svg_tree = ET.parse(svg_file)
+    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+    nodes = svg_tree.xpath('//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=namespaces)
+    if len(nodes) > 0:
+        nodes[0].text = text
+    copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
+
def record_changes(original_svg_file, changed_svg_file, node_ids, namespaces={}):
    """Copy changes made to changed_svg_file to original_svg_file.

    For each node id: when the changed file still has a title for that node,
    its text is copied into the original node (creating a title element if
    missing); when the title is gone, the node is removed from the original.
    """
    old_tree = ET.parse(original_svg_file)
    new_tree = ET.parse(changed_svg_file)
    if len(namespaces) == 0:
        namespaces = { key if key is not None else 'ns': value for key, value in new_tree.getroot().nsmap.items() }
    for node_id in node_ids:
        title_nodes = new_tree.xpath('//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=namespaces)
        target_nodes = old_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces)
        if len(target_nodes) == 0:
            continue
        if len(title_nodes) > 0:
            existing_title = target_nodes[0].find('ns:title', namespaces=namespaces)
            if existing_title is not None:
                existing_title.text = title_nodes[0].text
            else:
                created_title = ET.SubElement(target_nodes[0], 'title', attrib={ 'id': title_nodes[0].get('id') })
                created_title.text = title_nodes[0].text
        else:
            # Title removed in the changed file -> drop the node(s) entirely.
            for target_node in target_nodes:
                target_node.getparent().remove(target_node)
    copy_faksimile_svg_file(target_file=original_svg_file, faksimile_tree=old_tree)
def record_changes_on_svg_file_to_page(xml_source_file, svg_file, word_ids=None):
    """Copy changes made to svg_file to xml_source_file.

    Transkription positions edited in the svg (rect geometry and title text)
    are written back to the corresponding words; extra rects whose
    inkscape:label references another word's position move that position to
    the current word.  Words with mixed text status are split; words left
    without transkription positions are dropped.
    :return: datatypes.page.Page
    """
    svg_tree = ET.parse(svg_file)
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    transkription_field = TranskriptionField(svg_file)
    page = Page(xml_source_file)
    words = [ word for word in page.words if word.id in word_ids ]\
            if word_ids is not None else page.words
    new_page_words = []
    for word in words:
        word_id = 'word_' + str(word.id) + '_'
        recorded_ids = []
        for transkription_position in word.transkription_positions:
            transkription_position_id = word_id + str(transkription_position.id)
            tp_nodes = svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]'.format(transkription_position_id), namespaces=namespaces)
            if len(tp_nodes) > 0:
                record_changes_to_transkription_position(tp_nodes[0], transkription_position,\
                        transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                recorded_ids.append(transkription_position_id)
        # Rects with this word's id prefix that did not match any known
        # transkription position: positions moved from another word in the editor.
        extra_nodes = [ node for node in\
                svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[contains(@id, "{0}")]'.format(word_id), namespaces=namespaces)\
                if node.get('id') not in recorded_ids ]
        if len(extra_nodes) > 0:
            for extra_node in extra_nodes:
                # inkscape:label holds the original '#word_<wid>_<tpid>' id.
                old_ids = [ inkscape_id.replace('#','') for inkscape_id in\
                        svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]/@inkscape:label'.format(extra_node.get('id')),\
                        namespaces=namespaces) ]
                if len(old_ids) > 0 and re.match(r'word_[0-9]+_[0-9]+', old_ids[0]):
                    old_id_list = old_ids[0].split('_')
                    ref_word_id = int(old_id_list[1])
                    # NOTE(review): ref_tp_id stays a string and is compared to
                    # tp.id below — presumably tp.id is a string too; confirm.
                    ref_tp_id = old_id_list[2]
                    ref_words = [ word for word in page.words if word.id == ref_word_id ]
                    if len(ref_words) > 0:
                        ref_tps = [ tp for tp in ref_words[0].transkription_positions\
                                if tp.id == ref_tp_id ]
                        if len(ref_tps) > 0:
                            # Move the position from its old word to this word.
                            ref_words[0].transkription_positions.remove(ref_tps[0])
                            record_changes_to_transkription_position(extra_node,\
                                    ref_tps[0], transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                            word.transkription_positions.append(ref_tps[0])
    for word in page.words:
        if word.has_mixed_status('text'):
            # Split words whose positions now disagree on text; drop empty parts.
            new_page_words += [ word for word in word.split_according_to_status('text') if word.text is not None and word.text != '' ]
        elif len(word.transkription_positions) > 0:
            new_text = [ tp.text for tp in word.transkription_positions if tp.text is not None and tp.text != '' ]
            if len(new_text) > 0:
                word.text = new_text[0]
            new_page_words.append(word)
        # Words without transkription positions are intentionally dropped here.
    page.words = new_page_words
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    page.unlock()
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + ' -> ' + inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_on_xml_file_to_page(xml_source_file, xml_file) -> Page:
    """Copy changes made to xml_file to xml_source_file.

    Words carrying split_strings are split accordingly; words carrying a
    join_string are joined with their neighbour when the concatenated text
    matches.  The source page is backed up before it is modified.
    :return: datatypes.page.Page
    """
    copy_page = Page(xml_file)
    page = Page(xml_source_file)
    page.unlock()
    back_up(page, xml_file)
    page.words = []
    for word in copy_page.words:
        if word.split_strings is None\
        or len(word.split_strings) == 0:
            page.words.append(word)
        else:
            # Split the word repeatedly: each split yields the part before the
            # split string (discarded), the matching new word, and the rest.
            next_word = word
            for split_string in word.split_strings:
                _, new_word, next_word = next_word.split(split_string)
                page.words.append(new_word)
            if next_word is not None:
                page.words.append(next_word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    remove_words_if_done = []
    for word in page.words:
        if 'join_string' in word.__dict__.keys()\
        and word.join_string is not None:
            # Join with the previous or next word when the concatenation of
            # the two texts equals join_string.
            if word.id > 0\
            and page.words[word.id-1].text + word.text == word.join_string:
                page.words[word.id-1].join(word)
                remove_words_if_done.append(word)
            # NOTE(review): this guard allows word.id == len(page.words)-1, so
            # page.words[word.id+1] can raise IndexError for the last word
            # (assuming word.id equals the list index) — confirm.
            elif word.id < len(page.words)\
            and word.text + page.words[word.id+1].text == word.join_string:
                word.join(page.words[word.id+1])
                remove_words_if_done.append(page.words[word.id+1])
    for word in remove_words_if_done:
        page.words.remove(word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, xml_file), file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_to_transkription_position(node, transkription_position, xmin=0.0, ymin=0.0, namespaces=None):
    """Record changes made to node to transkription_position.

    Geometry attributes present on the (svg rect) node overwrite the
    corresponding fields of transkription_position (x/y are shifted by
    xmin/ymin); the text of a child <title> element, if any, overwrites
    transkription_position.text.
    """
    if namespaces is None:
        # map the default (None) prefix to 'ns' so it can be used in XPath
        namespaces = { k if k is not None else 'ns': v for k, v in node.nsmap.items() }
    if bool(node.get('x')):
        transkription_position.left = float(node.get('x')) - xmin
    if bool(node.get('y')):
        transkription_position.top = float(node.get('y')) - ymin
    if bool(node.get('width')):
        transkription_position.width = float(node.get('width'))
    if bool(node.get('height')):
        transkription_position.height = float(node.get('height'))
    # evaluate the XPath once instead of twice
    title_texts = node.xpath('./ns:title/text()', namespaces=namespaces)
    if len(title_texts) > 0:
        transkription_position.text = title_texts[0]
def replace_chars(words, faksimile_positions, unique_faksimile_words=None):
    """Return unique_faksimile_words and faksimile_positions, with characters changed according to transcription words.

    For every faksimile word text without a matching transcription word the
    typographic substitutions '"'->'“', 'ss'->'ß' and '-'->'–' are tried; if
    the substituted text does occur among the transcription words it is
    adopted and propagated to all faksimile positions carrying the old text.
    """
    if unique_faksimile_words is None:
        unique_faksimile_words = sorted(set(position.text for position in faksimile_positions), key=len)
    transcription_texts = set(word.text for word in words)
    for index, word_text in enumerate(unique_faksimile_words):
        if word_text not in transcription_texts:
            # try the known substitutions in order; first hit wins
            for old_char, new_char in (('"', '“'), ('ss', 'ß'), ('-', '–')):
                if old_char in word_text\
                and word_text.replace(old_char, new_char) in transcription_texts:
                    unique_faksimile_words[index] = word_text.replace(old_char, new_char)
                    break
            for position in faksimile_positions:
                if position.text == word_text:
                    position.text = unique_faksimile_words[index]
        elif word_text == '-' and '–' in transcription_texts:
            # debug output: both dash variants occur among the words
            print([ word.text for word in words if word.text == word_text ])
            print([ word.text for word in words if word.text == '–' ])
    return faksimile_positions, unique_faksimile_words
def reset_tp_with_matrix(transkription_positions, new_left=0, new_top=-5, tr_xmin=0.0, tr_ymin=0.0):
    """Fix transkription_positions that carry a rotation transform matrix.

    Shifts each rotated position's matrix offsets by (tr_xmin, tr_ymin) and
    rebases left/bottom/top relative to the shifted matrix.
    NOTE(review): new_left/new_top are currently unused; kept for a
    backward-compatible signature.
    """
    # iterating an empty list is a no-op, so the former explicit length
    # check was redundant
    for tp in transkription_positions:
        if tp.transform is not None\
        and tp.transform.isRotationMatrix():
            tp.transform.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + tr_xmin, 3)
            # zero out left when it (almost) coincides with the matrix x offset
            tp.left = round(tp.left, 3) - tp.transform.matrix[Matrix.XINDEX]\
                    if abs(round(tp.left, 3) - tp.transform.matrix[Matrix.XINDEX]) > 1\
                    else 0
            tp.bottom = round(tp.bottom, 3) - tp.transform.matrix[Matrix.YINDEX]
            tp.transform.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + tr_ymin, 3)
            tp.top = tp.bottom - tp.height + 2  # +2: empirical offset — TODO confirm
def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True):
    """Updates svg position file's status. Changes its status to status if it does not contain 'OK',
    else it appends new status to old status (or overwrites it when append is False).
    """
    if isfile(file_name):
        parser = ET.XMLParser(remove_blank_text=True)
        file_tree = ET.parse(file_name, parser)
        old_status = file_tree.getroot().get('status')
        if old_status is None or 'OK' not in old_status.split(':'):
            file_tree.getroot().set('status', status)
        elif append:
            if status not in old_status.split(':'):
                file_tree.getroot().set('status', old_status + ':' + status)
        else:
            # BUGFIX: this branch referenced the unbound name 'new_status'
            # and raised NameError; overwrite the old status instead.
            file_tree.getroot().set('status', status)
        write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
        if manuscript_file is not None and isfile(manuscript_file):
            page_number = file_tree.getroot().get('number')
            update_manuscript_file(manuscript_file, page_number, file_name, status=status)
def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True):
    """Updates manuscript file: adds status information about page.

    If the page is already registered its status is updated (appended when
    append is True, overwritten otherwise); otherwise a new <page> entry is
    created under <pages>.
    """
    if isfile(manuscript_file):
        parser = ET.XMLParser(remove_blank_text=True)
        manuscript_tree = ET.parse(manuscript_file, parser)
        # evaluate the xpath once instead of twice
        page_nodes = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)
        if len(page_nodes) > 0:
            node = page_nodes[0]
            old_status = node.get('status')
            if old_status is None or 'OK' not in old_status.split(':'):
                node.set('status', status)
            elif append:
                if status not in old_status.split(':'):
                    node.set('status', old_status + ':' + status)
            else:
                # BUGFIX: this branch referenced the unbound name
                # 'new_status' and raised NameError; overwrite instead.
                node.set('status', status)
            if not bool(node.get('output')):
                node.set('output', file_name)
        else:
            # page not registered yet: append a new <page> entry
            pages_node = manuscript_tree.getroot().find('pages')\
                    if manuscript_tree.getroot().find('pages') is not None\
                    else ET.SubElement(manuscript_tree.getroot(), 'pages')
            new_id = len(pages_node.findall('page')) + 1
            ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name})
        write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT)
Index: svgscripts/convert_wordPositions.py
===================================================================
--- svgscripts/convert_wordPositions.py (revision 106)
+++ svgscripts/convert_wordPositions.py (revision 107)
@@ -1,740 +1,755 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import cairosvg
import getopt
import json
from lxml.html import builder as E
from lxml.html import open_in_browser
import lxml
from pathlib import Path as PathLibPath
from os import sep, listdir, mkdir, path, remove
from os.path import exists, isfile, isdir, dirname
import re
import sys
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.text_field import TextField
from datatypes.writing_process import WritingProcess
from datatypes.word import Word
sys.path.append('shared_util')
from main_util import extract_paths_on_tf, get_paths_near_position
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
EXIST_DB = 'http://existdb-test.dasch.swiss/exist/rest/db/storage/nietzsche/'
LOCAL_SERVER = 'http://localhost:8000/'
class Converter:
"""The converter super class.
"""
    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        """Initialize the converter.

        :param page: datatypes.page.Page to convert
        :param non_testing: if True, converters may open external viewers
        :param show_word_insertion_mark: whether word insertion marks are rendered
        """
        self.page = page
        self.non_testing = non_testing
        self.show_word_insertion_mark = show_word_insertion_mark
def _get_transkription_positions(self, transkription_positions, stage_version=''):
"""Returns the transkription_positions of the indicated stage_version.
"""
convertable_transkription_positions = transkription_positions
if stage_version != '':
convertable_transkription_positions = []
if re.match(r'^\d$', stage_version):
writing_process_id = int(stage_version)
for transkription_position in transkription_positions:
if transkription_position.writing_process_id == writing_process_id:
convertable_transkription_positions.append(transkription_position)
elif re.match(r'^\d\+$', stage_version):
version_range = [ *range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)) ]
for transkription_position in transkription_positions:
if transkription_position.writing_process_id in version_range:
convertable_transkription_positions.append(transkription_position)
elif re.match(r'^\d\-\d$', stage_version):
start_stop = [ int(i) for i in re.split(r'-', stage_version) ]
version_range = [ *range(start_stop[0], start_stop[1]+1) ]
for transkription_position in transkription_positions:
if transkription_position.writing_process_id in version_range:
convertable_transkription_positions.append(transkription_position)
return convertable_transkription_positions
def _get_words(self, words, highlighted_words=None):
"""Return the words that will be hightlighted.
"""
return highlighted_words if highlighted_words is not None else words
def convert(self, output_file=None, stage_version='', highlighted_words=None):
"""Prints all words.
"""
first_word_of_line = None
out = sys.stdout
if output_file is not None:
out = open(output_file, 'w')
for word in self.page.words:
if first_word_of_line is None or first_word_of_line.line_number != word.line_number:
out.write('\n')
first_word_of_line = word
if word.line_number % 2 == 0:
out.write(str(word.line_number).zfill(2) + ' ')
else:
out.write(' ')
if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0:
if word.text is not None:
out.write(word.text + ' ')
out.close()
return 0
@classmethod
def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False, key=''):
"""Returns a converter of type converter_type.
[:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None
"""
cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() }
cls_key = converter_type + 'Converter'
if bool(cls_dict.get(cls_key)):
converter_cls = cls_dict[cls_key]
if converter_cls == JSONConverter:
- return converter_cls(page, non_testing, key=key)
+ return converter_cls(page, non_testing=non_testing, key=key)
return converter_cls(page, non_testing, show_word_insertion_mark)
else:
return Converter(page, non_testing, show_word_insertion_mark)
class JSONConverter(Converter):
"""This class can be used to convert a 'svgWordPositions' xml file to a json file.
"""
- def __init__(self, page, non_testing=True, key=''):
+ def __init__(self, page, faksimile_page=None, non_testing=True, key=''):
Converter.__init__(self, page, non_testing, False)
+ self.faksimile_page = faksimile_page
def _add_word_to_list(self, words, word, text, text_field=None, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1, faksimile_positions=None):
"""Add word to list.
"""
id = word.id\
if parent_id == -1\
else parent_id
edited_text = word.edited_text\
if edited_text is None\
else edited_text
earlier_version = word.earlier_version\
if earlier_version is None\
else earlier_version
overwrites_word = word.overwrites_word\
if overwrites_word is None\
else overwrites_word
line_number = word.line_number
for tp in word.transkription_positions:
tp_id = f'w{word.id}:tp{tp.id}'\
if parent_id == -1\
else f'w{parent_id}:w{word.id}:tp{tp.id}'
if text_field is not None:
word_dict = { 'id': id, 'text': text, 'left': tp.left + text_field.left, 'top': tp.top + text_field.top,\
'width': tp.width, 'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted }
if tp.transform is not None:
matrix = tp.transform.clone_transformation_matrix()
xmin = text_field.left
ymin = text_field.top
matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3)
matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3)
word_dict.update({ 'transform': matrix.toString() })
if tp.left > 0:
word_dict.update({ 'left': round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)})
else:
word_dict.update({ 'left': 0})
word_dict.update({ 'top': round((tp.height-1.5)*-1, 3)})
else:
word_dict = { 'id': id, 'text': text, 'left': tp.left, 'top': tp.top, 'width': tp.width,\
'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted }
if tp.transform is not None:
word_dict.update({ 'transform': tp.transform.toString() })
if edited_text is not None:
word_dict.update({'edited_text': edited_text})
if earlier_version is not None:
word_dict.update({'earlier_version': earlier_version.text })
if overwrites_word is not None:
word_dict.update({'overwrites_word': overwrites_word.text })
if parent_id > -1:
word_dict.update({'part_text': word.text })
if len(word.deletion_paths) > 0:
for dp_index, dp in enumerate(word.deletion_paths):
if bool(word_dict.get('deletion_path')):
word_dict = word_dict.copy()
word_dict.update({'deletion_path': dp.d_attribute})
words.append(word_dict)
if len(word.deletion_paths_near_word) > 0:
word_dict.update({'paths_near_word': word.deletion_paths_near_word })
words.append(word_dict)
else:
words.append(word_dict)
if faksimile_positions is not None:
faksimile_dict = {}
for fp in word.faksimile_positions:
- faksimile_dict = { 'id': id, 'text': text, 'left': fp.left, 'top': fp.top,\
- 'width': fp.width, 'height': fp.height, 'line': line_number, 'fp_id': fp.id, 'deleted': word.deleted }
- if fp.transform is not None:
- faksimile_dict.update({ 'transform': fp.transform.toString() })
- if len(faksimile_dict) > 0:
- if edited_text is not None:
- faksimile_dict.update({'edited_text': edited_text})
- if earlier_version is not None:
- faksimile_dict.update({'earlier_version': earlier_version.text })
- if overwrites_word is not None:
- faksimile_dict.update({'overwrites_word': overwrites_word.text })
- if parent_id > -1:
- faksimile_dict.update({'part_text': word.text })
- faksimile_positions.append(faksimile_dict)
+ self._add_faksimile_to_list(id, line_number, fp, word.deleted, faksimile_positions, text, edited_text=edited_text,\
+ earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=parent_id, word_text=word.text)
for wp in word.word_parts:
self._add_word_to_list(words, wp, text, text_field=text_field, edited_text=edited_text,\
earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=word.id, faksimile_positions=faksimile_positions)
+
+ def _add_faksimile_to_list(self, id, line_number, fp, deleted, faksimile_positions, text, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1, word_text='') ->dict:
+ """Create and return a json dictionary.
+ """
+ faksimile_dict = { 'id': id, 'text': text, 'left': fp.left, 'top': fp.top,\
+ 'width': fp.width, 'height': fp.height, 'line': line_number, 'fp_id': fp.id, 'deleted': deleted }
+ if fp.transform is not None:
+ faksimile_dict.update({ 'transform': fp.transform.toString() })
+ if len(faksimile_dict) > 0:
+ if edited_text is not None:
+ faksimile_dict.update({'edited_text': edited_text})
+ if earlier_version is not None:
+ faksimile_dict.update({'earlier_version': earlier_version.text })
+ if overwrites_word is not None:
+ faksimile_dict.update({'overwrites_word': overwrites_word.text })
+ if parent_id > -1:
+ faksimile_dict.update({'part_text': word_text })
+ faksimile_positions.append(faksimile_dict)
+
def create_json_dict(self) ->dict:
"""Create and return a json dictionary.
"""
words = []
faksimile_positions = []
text_field = None
if self.page.svg_image is not None:
if self.page.svg_image.text_field is None:
text_field = self.page.svg_image.text_field = TranskriptionField(self.page.svg_image.file_name).convert_to_text_field()
- #self.page.svg_image.decontextualize_file_name(update_url=EXIST_DB)
for word in self.page.words:
self._add_word_to_list(words, word, word.text, text_field=text_field, faksimile_positions=faksimile_positions)
lines = []
faksimile_lines = []
offset = 0 if text_field is None else text_field.ymin
svg_image = self.add_object2dict(self.page.svg_image)
+ if self.faksimile_page is not None:
+ if self.page.faksimile_image is None:
+ if self.faksimile_page.faksimile_image.text_field is None\
+ and self.faksimile_page.text_field is not None:
+ self.faksimile_page.faksimile_image.text_field = self.faksimile_page.text_field
+ self.page.faksimile_image = self.faksimile_page.faksimile_image
+ for fp in self.faksimile_page.word_positions:
+ if fp.id not in [ f_dict.get('fp_id') for f_dict in faksimile_positions ]:
+ self._add_faksimile_to_list(fp.id, -1, fp, False, faksimile_positions, fp.text)
+ faksimile_image = self.add_object2dict(self.page.faksimile_image)
if svg_image is not None:
svg_image.update({ 'URL': self.page.svg_image.primaryURL })
svg_image.update({ 'x': self.page.svg_image.text_field.left })
svg_image.update({ 'y': self.page.svg_image.text_field.top })
- faksimile_image = self.add_object2dict(self.page.faksimile_image)
if faksimile_image is not None:
faksimile_image.update({ 'secondaryURL': LOCAL_SERVER + "faksimiles/" + self.page.faksimile_image.file_name })
faksimile_image.update({ 'x': 0 })
faksimile_image.update({ 'y': 0 })
for line in self.page.lines:
lines.append({ 'id': line.id, 'number': line.id, 'top': line.top + offset, 'bottom': line.bottom })
faksimile_lines.append({ 'id': line.id, 'number': line.id, 'top': line.faksimile_inner_top, 'bottom': line.faksimile_inner_bottom })
return { 'title': self.page.title, 'number': self.page.number, 'words': words, 'svg': svg_image, 'lines': lines,\
'faksimile': faksimile_image, 'faksimile_positions': faksimile_positions, 'faksimile_lines': faksimile_lines }
def convert(self, output_file=None, stage_version='', highlighted_words=None):
"""Converts Page to JSON.
"""
if output_file is None:
output_file = 'output.json'
json_file = open(output_file, "w+")
try:
json.dump(self.create_json_dict(), json_file)
except Exception:
raise Exception('Error in json.dump')
json_file.close()
return 0
def add_object2dict(self, object_instance):
"""Add an object to json_dict and generate json data and interfaces.
[:return:] json dict or object_instance
"""
json_dict = {}
object_type = type(object_instance)
if object_type.__module__ == 'builtins':
if object_type != list:
return object_instance
else:
items = []
for item in object_instance:
items.append(self.add_object2dict(item))
if len(items) > 0:
return items
else:
return { self.key: [] }
semantic_dictionary = object_type.get_semantic_dictionary()
for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]:
content = object_instance.__dict__.get(key)
if content_type == list\
and content is not None\
and len(content) > 0\
and type(content[0]).__module__ != 'builtins':
content_list = []
for content_item in content:
content_list.append(self.add_object2dict(content_item))
json_dict.update({key: content_list})
elif content_type.__module__ == 'builtins':
if content is not None:
json_dict.update({key: content})
else:
if content is not None and type(content) == list:
content_list = []
for content_item in content:
content_list.append(self.add_object2dict(content_item))
json_dict.update({key: content_list})
else:
if content is not None:
json_dict.update({key: self.add_object2dict(content)})
return json_dict
class oldJSONConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a json file.

    Legacy variant: besides the json data it also generates TypeScript
    interface files (one per encountered class) in self.interface_output_dir.
    """
    # mapping from python builtin types to their TypeScript counterparts
    PY2TS_DICT = { float: 'number', int: 'number', bool: 'boolean', str: 'string' }

    def __init__(self, page, non_testing=True, key=''):
        """Initialize the converter.

        :param key: name of the Page attribute to convert ('' -> whole page)
        """
        Converter.__init__(self, page, non_testing, False)
        self.key = key
        self.interface_output_dir = PathLibPath('ts_interfaces')
        if not self.interface_output_dir.is_dir():
            self.interface_output_dir.mkdir()
        elif len(list(self.interface_output_dir.glob('*.ts'))) > 0:
            # start from a clean slate: drop interfaces from earlier runs
            for ts_file in self.interface_output_dir.glob('*.ts'):
                remove(ts_file)

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to JSON.

        Writes json data to output_file (default 'output.json') and a
        TypeScript import file for all generated interfaces.
        [:return:] 0 on success, 2 when self.key names a missing attribute
        """
        if output_file is None:
            output_file = 'output.json'
        class_dict = {}
        if self.key != '':
            object_instance = self.page.__dict__.get(self.key)
            if object_instance is not None:
                json_dict = self.add_object2dict(object_instance, class_dict)
                if type(json_dict) == list:
                    json_dict = { self.key : json_dict }
            else:
                print(f'Page initialized from {self.page.page_tree.docinfo.URL} does not have an object at "{self.key}"!')
                return 2
        else:
            json_dict = self.add_object2dict(self.page, class_dict)
        # NOTE(review): json_file is not closed when json.dump raises.
        json_file = open(output_file, "w+")
        try:
            json.dump(json_dict, json_file)
        except Exception:
            raise Exception('Error in json.dump')
        json_file.close()
        self.create_imports(class_dict)
        return 0

    def add_object2dict(self, object_instance, class_dict):
        """Add an object to json_dict and generate json data and interfaces.

        Recursively converts object_instance; records a TypeScript interface
        for every newly seen class in class_dict.
        [:return:] json dict or object_instance
        """
        json_dict = {}
        interface_list = []
        object_type = type(object_instance)
        if object_type.__module__ == 'builtins':
            if object_type != list:
                return object_instance
            else:
                items = []
                for item in object_instance:
                    items.append(self.add_object2dict(item, class_dict))
                if len(items) > 0:
                    return { self.key: items }
                else:
                    return { self.key: 'null' }
        semantic_dictionary = object_type.get_semantic_dictionary()
        for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]:
            content = object_instance.__dict__.get(key)
            if content_type == list\
            and content is not None\
            and len(content) > 0\
            and type(content[0]).__module__ != 'builtins':
                # list of project objects -> convert each element
                content_list = []
                for content_item in content:
                    content_list.append(self.add_object2dict(content_item, class_dict))
                json_dict.update({key: content_list})
                interface_list.append(f'{key}: {type(content[0]).__name__}[];')
            elif content_type.__module__ == 'builtins':
                if content_type != list:
                    # builtin scalar -> map to its TypeScript type
                    ts_type = self.PY2TS_DICT[content_type]\
                            if content_type in self.PY2TS_DICT.keys()\
                            else 'string'
                    interface_list.append(f'{key}: {ts_type};')
                json_dict.update({key: content})
            else:
                if content is not None and type(content) == list:
                    interface_list.append(f'{key}: {content_type.__name__}[];')
                    content_list = []
                    for content_item in content:
                        content_list.append(self.add_object2dict(content_item, class_dict))
                    json_dict.update({key: content_list})
                else:
                    interface_list.append(f'{key}: {content_type.__name__};')
                    if content is not None:
                        json_dict.update({key: self.add_object2dict(content, class_dict)})
        if object_type not in class_dict.keys():
            class_dict.update({object_type: self.create_interface(object_type.__name__, interface_list)})
        return json_dict

    def create_imports(self, class_dict):
        """Create the ts file that imports all generated interfaces.

        [:return:] file_name of the import file
        """
        ts_file = PathLibPath('ts_imports.ts')
        file = open(ts_file, "w+")
        file.write(f'//import all interfaces from {self.interface_output_dir} ' + '\n')
        for interface_name, path_name in class_dict.items() :
            file.write('import {' + interface_name.__name__ + '} from \'./' + str(self.interface_output_dir.joinpath(path_name.stem)) + '\';\n')
        file.close()
        return ts_file

    def create_interface(self, class_name, interface_list) -> PathLibPath:
        """Create a ts interface file from a list of 'key: type;' strings.

        [:return:] file_name of the interface file
        """
        ts_file = self.interface_output_dir.joinpath(PathLibPath(f'{class_name.lower()}.ts'))
        # interface entries referencing non-builtin types need an import line
        import_list = [ import_class_name for import_class_name in\
                [ import_class_name.split(': ')[1].replace(';','').replace('[]','') for import_class_name in interface_list ]\
                if import_class_name not in set(self.PY2TS_DICT.values()) ]
        file = open(ts_file, "w")
        for import_class_name in set(import_list):
            file.write('import {' + import_class_name + '} from \'./' + import_class_name.lower() + '\';\n')
        file.write(f'export interface {class_name} ' + '{\n')
        for interace_string in interface_list:
            file.write(f'\t' + interace_string + '\n')
        file.write('}')
        file.close()
        return ts_file
class SVGConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text.
    """
    BG_COLOR = 'yellow'  # default highlight color
    OPACITY = '0.2'      # default highlight opacity

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY):
        """Initialize the converter with highlight color and opacity."""
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
        self.bg_color = bg_color
        self.opacity = opacity

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to SVG

        Overlays a semi-transparent rect (grouped under 'Transkription') on
        each transkription position of the page's svg file; highlighted words
        get self.bg_color, other words alternate between two colors.
        [:return:] 0
        """
        # NOTE(review): title is computed here but not used below — confirm.
        title = self.page.title if(self.page.title is not None) else 'Test Page'
        title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
        svg_file = self.page.svg_file
        if svg_file is None and self.page.svg_image is not None:
            svg_file = self.page.svg_image.file_name
        elif svg_file is None:
            # NOTE(review): should this be self.page.page_tree.docinfo.URL
            # (as used elsewhere in this file)? verify before relying on it.
            msg = f'ERROR: xml_source_file {self.page.docinfo.URL} does neither have a svg_file nor a svg_image!'
            raise Exception(msg)
        transkription_field = TranskriptionField(svg_file)
        # preserve the source file's namespace declarations in the output
        if bool(transkription_field.get_svg_attributes('xmlns')):
            ET.register_namespace('', transkription_field.get_svg_attributes('xmlns'))
        if bool(transkription_field.get_svg_attributes('xmlns:xlink')):
            ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink'))
        svg_tree = ET.parse(svg_file)
        transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'})
        colors = [ 'yellow', 'orange' ] if self.bg_color == self.BG_COLOR else [ self.bg_color ]
        if highlighted_words is not None:
            colors = ['yellow']
        else:
            highlighted_words = []
        color_index = 0
        for word in self.page.words:
            word_id = 'word_' + str(word.id)
            for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
                transkription_position_id = word_id + '_' + str(transkription_position.id)
                color = colors[color_index] if word not in highlighted_words else self.bg_color
                rect_node = ET.SubElement(transkription_node, 'rect',\
                        attrib={'id': transkription_position_id, 'x': str(transkription_position.left + transkription_field.xmin),\
                        'y': str(transkription_position.top + transkription_field.ymin), 'width': str(transkription_position.width),\
                        'height': str(transkription_position.height), 'fill': color, 'opacity': self.opacity})
                if transkription_position.transform is not None:
                    # rotated positions: shift the matrix into page coordinates
                    # and rebase x/y relative to the transform
                    matrix = transkription_position.transform.clone_transformation_matrix()
                    matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3)
                    matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3)
                    rect_node.set('transform', matrix.toString())
                    rect_node.set('x', str(round(transkription_position.left - transkription_position.transform.matrix[Matrix.XINDEX], 3)))
                    rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3)))
                ET.SubElement(rect_node, 'title').text = word.text
            color_index = (color_index + 1) % len(colors)
        if output_file is not None:
            svg_tree.write(output_file)
        return 0
class HTMLConverter(Converter):
"""This class can be used to convert a 'svgWordPositions' xml file to a test HTML file.
"""
CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; }
.highlight1 { background-color: pink; opacity: 0.2; }
.highlight2 { background-color: red; opacity: 0.2; }
.foreign { background-color: blue; opacity: 0.4; }
.overwritten { background-color: green; opacity: 0.4; }
.word-insertion-mark { background-color: orange; opacity: 0.2; }
.deleted { background-color: grey; opacity: 0.2; }
"""
def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
Converter.__init__(self, page, non_testing, show_word_insertion_mark)
self.text_field = TextField()
def convert(self, output_file=None, stage_version='', highlighted_words=None):
"""Converts Page to HTML
"""
title = self.page.title if(self.page.title is not None) else 'Test Page'
title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
if stage_version != '':
title = title + ', Schreibstufe: ' + stage_version
if self.page.svg_image is not None:
width = self.page.svg_image.width
height = self.page.svg_image.height
svg_file = self.page.svg_image.file_name
if self.page.svg_image.text_field is not None:
self.text_field = self.page.svg_image.text_field
print('Textfield found ->adjusting data')
elif self.page.svg_file is not None:
svg_file = self.page.svg_file
transkription_field = TranskriptionField(svg_file)
width = transkription_field.getWidth()
height = transkription_field.getHeight()
style_content = ' position: relative; width: {}px; height: {}px; background-image: url("{}"); background-size: {}px {}px '\
.format(width, height, path.abspath(svg_file), width, height)
style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS)
head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style)
transkription = E.DIV(id="transkription")
counter = 0
for word in self.page.words:
highlight_class = 'highlight' + str(counter)\
if not word.deleted else 'deleted'
if highlighted_words is not None\
and word in highlighted_words:
highlight_class = 'highlight2'
earlier_text = '' if word.earlier_version is None else word.earlier_version.text
if earlier_text == '' and len(word.word_parts) > 0:
earlier_versions = [ word for word in word.word_parts if word.earlier_version is not None ]
earlier_text = earlier_versions[0].text if len(earlier_versions) > 0 else ''
if earlier_text != '':
word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(word.id), str(word.line_number), earlier_text, word.text)
else:
word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text)
if word.edited_text is not None:
word_title += f'\n>{word.edited_text}'
for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
self._append2transkription(transkription, highlight_class, word_title, transkription_position)
if word.overwrites_word is not None:
overwritten_title = f'{word.text} overwrites {word.overwrites_word.text}'
for overwritten_transkription_position in word.overwrites_word.transkription_positions:
self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
for part_word in word.word_parts:
highlight_class = 'highlight' + str(counter)\
if not part_word.deleted else 'deleted'
for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version):
self._append2transkription(transkription, highlight_class, word_title, part_transkription_position)
if part_word.overwrites_word is not None:
overwritten_title = f'{word.text} overwrites {part_word.overwrites_word.text}'
for overwritten_transkription_position in part_word.overwrites_word.transkription_positions:
self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
counter = (counter + 1) % 2
word_insertion_mark_class = 'word-insertion-mark'
counter = 0
for mark_foreign_hands in self.page.mark_foreign_hands:
highlight_class = 'foreign'
title = 'id: {}/line: {}\n{} {}'.format(str(mark_foreign_hands.id), str(mark_foreign_hands.line_number),\
mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen)
for transkription_position in mark_foreign_hands.transkription_positions:
self._append2transkription(transkription, highlight_class, title, transkription_position)
if self.show_word_insertion_mark:
for word_insertion_mark in self.page.word_insertion_marks:
wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content)
transkription.append(link)
html = E.HTML(head,E.BODY(transkription))
bool(self.non_testing) and open_in_browser(html)
if output_file is not None:
with open(output_file, 'wb') as f:
f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))
f.closed
return 0
def _append2transkription(self, transkription, highlight_class, title, transkription_position):
    """Create an absolutely positioned link element for a transkription position
    and append it to the transkription-div.
    """
    tp = transkription_position
    css = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(
            tp.top - self.text_field.top, tp.left - self.text_field.left, tp.width, tp.height)
    if tp.transform is not None:
        css += ' transform: {}; '.format(tp.transform.toCSSTransformString())
        # shift the transform origin to the left only when the transformed x
        # lies right of the position's left edge (negative offset)
        offset = (tp.left - round(tp.transform.getX(), 1)) * -1
        origin_x = offset if offset < 0 else 0
        css += ' transform-origin: {}px {}px; '.format(origin_x, tp.height)
    anchor = E.A(' ', E.CLASS(highlight_class), title=title, style=css)
    transkription.append(anchor)
def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR):
    """Creates a pdf file highlighting some words.

    An intermediate svg file is rendered via create_svg_with_highlighted_words,
    converted to pdf with cairosvg, and removed afterwards.
    """
    if not pdf_file_name.endswith('pdf'):
        pdf_file_name += '.pdf'
    tmp_svg_file = pdf_file_name.replace('.pdf', '.svg')
    create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page,
            highlighted_words=highlighted_words, svg_file_name=tmp_svg_file, bg_color=bg_color)
    if isfile(tmp_svg_file):
        cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
        remove(tmp_svg_file)
def create_svg_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, svg_file_name='output.svg', bg_color=SVGConverter.BG_COLOR):
    """Creates a svg file highlighting some words.

    When no page object is supplied it is loaded from xml_source_file.
    """
    if page is None and xml_source_file is not None:
        page = Page(xml_source_file)
    if not svg_file_name.endswith('svg'):
        svg_file_name += '.svg'
    converter = SVGConverter(page, bg_color=bg_color)
    converter.convert(output_file=svg_file_name, highlighted_words=highlighted_words)
def usage():
    """prints information on how to use the script

    The help text printed is main's docstring.
    """
    print(main.__doc__)
def main(argv):
    """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes.
    svgscripts/convert_wordPositions.py OPTIONS
    OPTIONS:
    -h|--help: show help
    -H|--HTML [default] convert to HTML test file
    -k|--key=key option for json converter:
    only convert object == page.__dict__[key]
    -o|--output=outputFile save output to file outputFile
    -P|--PDF convert to PDF test file
    -S|--SVG convert to SVG test file
    -s|--svg=svgFile: svg web file
    -T|--TEXT convert to TEXT output
    -t|--text=text highlight word
    -w|--word-insertion-mark show word insertion mark on HTML
    -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. }
    -x|--testing execute in test mode, do not write to file or open browser
    :return: exit code (int)
    """
    convert_to_type = None
    key = ''
    non_testing = True
    output_file = None
    page = None
    show_word_insertion_mark = False
    stage_version = ''
    svg_file = None
    text = None
    try:
        opts, args = getopt.getopt(argv, "hk:t:HPSTws:o:v:x", ["help", "key=", "text=", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version=", "testing"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-v', '--version'):
            # VERSION must be a single digit, "N+" or "N-M"
            if re.match(r'^(\d|\d\+|\d\-\d)$', arg):
                stage_version = arg
            else:
                raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg))
        elif opt in ('-w', '--word-insertion-mark'):
            show_word_insertion_mark = True
        elif opt in ('-P', '--PDF'):
            convert_to_type = 'PDF'
        elif opt in ('-S', '--SVG'):
            convert_to_type = 'SVG'
        elif opt in ('-T', '--TEXT'):
            convert_to_type = 'TEXT'
        elif opt in ('-H', '--HTML'):
            convert_to_type = 'HTML'
        elif opt in ('-x', '--testing'):
            non_testing = False
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-o', '--output'):
            output_file = arg
        elif opt in ('-k', '--key'):
            key = arg
        elif opt in ('-t', '--text'):
            # BUGFIX: removed leftover debug print of the option argument
            text = arg
    if len(args) < 1:
        usage()
        return 2
    if convert_to_type is None:
        # derive the target type from the output file's extension, default to HTML
        if output_file is not None and len(re.split(r'\.', output_file)) > 1:
            convert_to_type = re.split(r'\.', output_file)[-1].upper()
        else:
            convert_to_type = 'HTML'
    exit_code = 0
    for word_position_file in args:
        if not isfile(word_position_file):
            print("'{}' does not exist!".format(word_position_file))
            return 2
        if convert_to_type == 'PDF':
            if output_file is None:
                output_file = 'output.pdf'
            highlighted_words = None
            if text is not None:
                page = Page(word_position_file)
                highlighted_words = [ word for word in page.words if word.text == text ]
            create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file, highlighted_words=highlighted_words)
        else:
            if svg_file is not None:
                if isfile(svg_file):
                    page = PageCreator(word_position_file, svg_file=svg_file)
                else:
                    # BUGFIX: report the missing svg file, not the word position file
                    print("'{}' does not exist!".format(svg_file))
                    return 2
            else:
                page = Page(word_position_file)
                if page.svg_file is None:
                    print('Please specify a svg file!')
                    usage()
                    return 2
            highlighted_words = None
            if text is not None:
                highlighted_words = [ word for word in page.words if word.text == text ]
                print([ (word.id, word.text) for word in highlighted_words ])
            converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark, key=key)
            exit_code = converter.convert(output_file=output_file, stage_version=stage_version, highlighted_words=highlighted_words)
    return exit_code
# Script entry point: forward the CLI arguments (without the program name) to main.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/join_faksimileAndTranskription.py
===================================================================
--- svgscripts/join_faksimileAndTranskription.py (revision 106)
+++ svgscripts/join_faksimileAndTranskription.py (revision 107)
@@ -1,604 +1,666 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of a faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path
from progress.bar import Bar
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convert_wordPositions import create_pdf_with_highlighted_words, create_svg_with_highlighted_words
from create_task import CorrectWords
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.lineNumber import LineNumber
from datatypes.page import Page, STATUS_MERGED_OK
from datatypes.transkriptionField import TranskriptionField
from process_files import update_svgposfile_status
from process_words_post_merging import post_merging_processing_and_saving
from util import ExternalViewer, create_highlighted_svg_file, get_empty_node_ids, record_changes,\
record_changes_on_svg_file_to_page, record_changes_on_xml_file_to_page, get_mismatching_ids,\
replace_chars
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# Set to True by tests: suppresses console output and interactive processing.
UNITTESTING = False
# One single punctuation character.
PUNCTUATION_PATTERN = r"[{}]".format(string.punctuation)
# A word that ends with a double quote.
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
# Exactly one punctuation character (including the en dash).
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation)
# Exactly one word or punctuation character (used by join_single_char_words).
SINGLE_WORD_PATTERN = r"^[\w{}]$".format(string.punctuation)
# Highlight style for unjoined positions in the generated correction files.
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
def create_task_correct_words(target_dir, xml_source_file=None, source_svg_file=None, page=None, unmatched_word_ids=None, unmatched_node_ids=None):
    """Create a task CorrectWords or process corrected files.

    Depending on the task state this either creates the correction folder
    (locking the page), merges finished correction files back into the page
    and re-runs the join, or reports that the task already exists.
    :return: exit status (int)
    """
    exit_status = 0
    # derive missing source files from the page where possible
    if xml_source_file is None or source_svg_file is None:
        if xml_source_file is None and page is not None and isfile(page.page_tree.docinfo.URL):
            xml_source_file = page.page_tree.docinfo.URL if xml_source_file is None else xml_source_file
        elif xml_source_file is None:
            raise Exception('create_task_correct_words needs a xml_source_file or a page that has a valid tree source!')
        if source_svg_file is None and page is not None and isfile(page.faksimile_svgFile):
            source_svg_file = page.faksimile_svgFile if source_svg_file is None else source_svg_file
        elif source_svg_file is None:
            raise Exception('create_task_correct_words needs a source_svg_file or a page that has a faksimile_svgFile!')
    if page is None:
        page = Page(xml_source_file)
    correct_words = CorrectWords(xml_source_file, source_svg_file, target_dir, page=page,\
            unmatched_node_ids=unmatched_node_ids)
    if not correct_words.has_been_created(page):
        if not page.is_locked():
            # lock the page with a reference to the task files and a hint how to resume
            reference_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.(xml|svg)')
            lock_dict = { 'reference_file': reference_file,\
                    'message': 'Run:$ python3 {0} -c {1} {2}'.format(__file__, target_dir, source_svg_file)}
            write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file, script_name=__file__,\
                    file_type=FILE_TYPE_SVG_WORD_POSITION, **lock_dict)
        correct_words.create()
        if not UNITTESTING:
            print('Created a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description))
    elif correct_words.has_been_finished(page):
        msg = 'Task "correct words" for page {} has been finished!'.format(str(page.number))
        xml_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.xml', is_finished=True)
        transkription_svg = correct_words.get_target_filepath(page, is_faksimile_svg=False, is_finished=True)
        faksimile_svg = correct_words.get_target_filepath(page, is_finished=True)
        faksimile_file = faksimile_svg if isfile(faksimile_svg) else source_svg_file
        # prefer corrections from the finished xml file, fall back to the transkription svg
        if isfile(xml_file):
            msg += '\n Words loaded from file {}.'.format(xml_file)
            page = record_changes_on_xml_file_to_page(xml_source_file, xml_file)
            page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=xml_file)
        elif isfile(transkription_svg):
            msg += '\n Words loaded from file {}.'.format(transkription_svg)
            page = record_changes_on_svg_file_to_page(xml_source_file, transkription_svg, word_ids=unmatched_word_ids)
            page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=transkription_svg)
        msg += '\n Faksimile loaded from file {}.'.format(faksimile_file)
        if not UNITTESTING:
            print(msg)
-        exit_status = join_faksimileAndTranskription(faksimile_file, page=page)
+        exit_status = old_join_faksimileAndTranskription(faksimile_file, page=page)
    elif not UNITTESTING:
        print('There is a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description))
    return exit_status
def debug_function(words, input=''):
    """Custom debug function: print the words whose debug_container is marked.
    """
    marked = [ word for word in words if word.debug_container.get('marked') ]
    if marked:
        print(Fore.RED + 'marked word(s): {}'.format([ word.text for word in marked ]))
        if input != '':
            print('input: {}'.format(input))
        print(Fore.RESET)
def fix_errors(faksimile_file, unmerged_faksimile_positions, unmerged_words, text_field_id=None, faksimile_page=None, xml_source_file=None, manuscript_file=None, namespaces={}):
    """Creates a faksimile svg file and a pdf file highlighting the positions of the word positions
    that could not be merged. After correction, results are inserted into the original file and processed again.

    NOTE(review): the mutable default ``namespaces={}`` is shared across calls;
    it is only read here when non-empty, but confirm before relying on it.
    :return: exit status (int)
    """
    parser = ET.XMLParser(remove_blank_text=True)
    faksimile_tree = ET.parse(faksimile_file, parser)
    if len(namespaces) == 0:
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    if faksimile_page is None:
        # pick the faksimile page matching text_field_id, else the first one
        faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
        if text_field_id is not None\
        and text_field_id in [ faksimile_page.text_field.id for faksimile_page in faksimile_pages ]:
            faksimile_page = [ faksimile_page for faksimile_page in faksimile_pages if faksimile_page.text_field.id == text_field_id ][0]
        else:
            faksimile_page = faksimile_pages[0]
    if xml_source_file is None or manuscript_file is None:
        xml_source_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
    tmp_dir = tempfile.mkdtemp()
    tmp_pdf_file = tmp_dir + sep + 'output.pdf'
    tmp_svg_file = tmp_dir + sep + 'output.svg'
    tmp_faksimile = tmp_dir + sep + 'faksimile.svg'
    # when there are more unmerged words than positions, highlight empty nodes too
    empyt_node_ids = get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)\
            if len(unmerged_faksimile_positions) < len(unmerged_words) else []
    highlight_node_ids = [ faksimile_position.id for faksimile_position in unmerged_faksimile_positions ]
    highlight_node_ids += empyt_node_ids
    create_highlighted_svg_file(faksimile_tree, highlight_node_ids, target_file=tmp_faksimile,
            local_image_path=faksimile_page.faksimile_image.local_path, namespaces=namespaces, highlight_color=HIGHLIGHT_COLOR)
    #create_pdf_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, pdf_file_name=tmp_pdf_file, bg_color=HIGHLIGHT_COLOR)
    create_svg_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, svg_file_name=tmp_svg_file, bg_color=HIGHLIGHT_COLOR)
    exit_status = 2
    if isfile(tmp_svg_file) and isfile(tmp_faksimile):
        # let the user correct both files in an external viewer, then record the changes
        ExternalViewer.show_files(list_of_files=[tmp_svg_file, tmp_faksimile])
        record_changes_on_svg_file_to_page(xml_source_file, tmp_svg_file, word_ids=[ word.id for word in unmerged_words ])
        record_changes(faksimile_file, tmp_faksimile, highlight_node_ids, namespaces=namespaces)
        shutil.rmtree(tmp_dir)
-        exit_status = join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False, join_single_char_words=True)
+        exit_status = old_join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False, join_single_char_words=True)
    return exit_status
def get_filelist_and_manuscript_file(file_a, file_b=None, correction_dir=None):
    """Returns a file list and a manuscript file (or None)

    file_a/file_b may each be a faksimile svg file, a manuscript xml file or a
    directory of svg files; correction_dir may contain a finished-corrections
    sub-directory whose xml files are mapped back to page outputs.
    """
    file_list = []
    manuscript_file = None
    if isfile(file_a) and file_a.endswith('svg'):
        file_list.append(file_a)
        if file_b is not None and isfile(file_b):
            manuscript_file = file_b
    elif isfile(file_a) and file_a.endswith('xml'):
        manuscript_file = file_a
        if file_b is not None and isfile(file_b):
            file_list.append(file_b)
        elif file_b is not None and isdir(file_b):
            # NOTE(review): listdir yields bare file names without the
            # directory prefix (compare the isdir(file_a) branch below) —
            # confirm that callers expect this.
            file_list = [ svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ]
        elif correction_dir is not None and isdir(correction_dir)\
        and Path(correction_dir, CorrectWords.finish_dir).is_dir():
            finish_dir = Path(correction_dir, CorrectWords.finish_dir)
            xml_files = list(finish_dir.glob('*.xml'))
            svg_files = list(finish_dir.glob('*.svg'))
            if len(xml_files + svg_files) > 1:
                # map each finished xml file to the page output recorded in the manuscript
                manuscript_tree = ET.parse(manuscript_file)
                for xml_file in xml_files:
                    output = manuscript_tree.xpath(f'.//page[contains(@output, "{xml_file.name}")]/@output')
                    if len(output) > 0:
                        file_list.append(output[0])
    elif isdir(file_a):
        file_list = [ file_a + sep + svgfile for svgfile in listdir(file_a) if svgfile.endswith('svg') ]
        if file_b is not None and isfile(file_b):
            manuscript_file = file_b
    return file_list, manuscript_file
def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False):
    """Return svg_pos_file and manuscript_file if they are ready for processing.

    When no manuscript_file is given it is derived from the faksimile page
    title (looked up in ./xml if that directory exists). svg_pos_file stays
    None unless the page status in the manuscript is "OK" (or contains "OK"
    when redo_ok is True).
    """
    svg_pos_file = None
    manuscript_tree = None
    if manuscript_file is not None:
        #and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')):
        manuscript_tree = ET.parse(manuscript_file)
    else:
        # derive the manuscript file name from the page title
        title_string = faksimile_page.title.replace(' ', '_')
        manuscript_file = '.{}xml'.format(sep) + sep + title_string + '.xml'\
                if isdir('.{}xml'.format(sep)) else title_string + '.xml'
        if isfile(manuscript_file):
            manuscript_tree = ET.parse(manuscript_file)
    if manuscript_tree is not None:
        # redo_ok accepts any status containing "OK"; otherwise status must equal "OK"
        if redo_ok and len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0:
            svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
        if len(manuscript_tree.getroot().xpath('//page[@number="%s" and @status="OK"]/@output' % faksimile_page.page_number)) > 0:
            svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
        else:
            if not UNITTESTING:
                # explain why the page is skipped (already merged or not ready)
                if len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0:
                    msg = Fore.LIGHTBLUE_EX +'->' + Fore.CYAN + 'Data from page {0} already merged with {1}!'.format(\
                            faksimile_page.page_number,\
                            manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)[0])
                else:
                    msg = Fore.MAGENTA + 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, faksimile_page.page_number)
                print(msg, end='')
                print(Style.RESET_ALL)
    return svg_pos_file, manuscript_file
-def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, page=None, do_fix_errors=False, redo_ok=False, debug_word_text='', **kwargs):
+def old_join_faksimileAndTranskription(faksimile_file, manuscript_file=None, page=None, do_fix_errors=False, redo_ok=False, debug_word_text='', **kwargs):
    """Joins the data of a faksimile file with the data of svgposfile.

    :return: exit status (int)
    """
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
        print(Style.RESET_ALL)
    # kwargs may override the keyword defaults
    if not do_fix_errors and 'do_fix_errors' in kwargs.keys():
        do_fix_errors = kwargs.get('do_fix_errors')
    if not redo_ok and 'redo_ok' in kwargs.keys():
        redo_ok = kwargs.get('redo_ok')
    if debug_word_text == '' and 'debug_word_text' in kwargs.keys():
        debug_word_text = kwargs.get('debug_word_text')
    faksimile_tree = ET.parse(faksimile_file)
    namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
    if page is not None:
        # restrict processing to the faksimile page that belongs to the given page
        faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\
                if get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)[0]\
                == page.page_tree.docinfo.URL ]
    exit_status = 0
    for faksimile_page in faksimile_pages:
        svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)
        if svg_pos_file is not None:
            image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field)
            if page is None:
                page = Page(svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file)
            write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\
                    file_type=FILE_TYPE_SVG_WORD_POSITION)
            if not UNITTESTING:
                print(Fore.LIGHTBLUE_EX + '->', end='')
                print(Fore.CYAN + 'Joining data from page {0} with file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='')
            words = sort_words(page)
            if debug_word_text != '' and len([ word for word in words if word.text == debug_word_text ]) > 0:
                # mark matching words so debug_function reports them later
                for word in words:
                    if word.text == debug_word_text:
                        word.debug_container.update({'marked': True})
            if bool(kwargs.get('join_single_char_words')):
                removed_words = join_single_char_words(words)
                page.words = words
                page.update_and_attach_words2tree()
            #print([ word.text for word in page.words if word in removed_words ])
            faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions)
            new_words = []
            # shortest texts first, so longer ambiguous texts are joined last
            unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\
                    key=lambda text: len(text))
-            faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions, unique_faksimile_words)
+            #faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions, unique_faksimile_words)
+            for word_text in unique_faksimile_words:
+                old_process_word_text(new_words, word_text, faksimile_positions, words)
+            if False not in [ word.joined for word in words if word.text != '.' ]\
+            and False not in [ position.joined for position in faksimile_positions]\
+            and not UNITTESTING:
+                post_merging_processing_and_saving(svg_pos_file=svg_pos_file, new_words=new_words, page=page, manuscript_file=manuscript_file)
+                print(Fore.GREEN + '[OK]')
+                print(Style.RESET_ALL)
+            elif not UNITTESTING:
+                # report the ids/texts that could not be joined on either side
+                mismatch_words, mismatch_faksimile_positions = get_mismatching_ids(words, faksimile_positions)
+                not_joined_fp = [ (position.id, position.text) for position in sorted(mismatch_faksimile_positions, key=lambda fp: fp.top) ]
+                plural_fp = '' if len(not_joined_fp) < 2 else 's'
+                not_joined_tw = [ (word.id, word.line_number, word.text) for word in sorted(mismatch_words, key=lambda word: word.transkription_positions[0].top) ]
+                plural_tw = '' if len(not_joined_tw) < 2 else 's'
+                print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp))
+                print([(position.id, position.text) for position in faksimile_positions if not position.joined])
+                print('--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
+                print([(word.id, word.line_number, word.text) for word in words if not word.joined ])
+                debug_function(new_words, input='new_words')
+                debug_function(words, input='words')
+                print(Style.RESET_ALL)
+                exit_status = 2
+            elif False in [ word.joined for word in words ]:
+                print([ (word.id, word.text) for word in words if not word.joined ])
+                exit_status = 2
+            page = None
+    return exit_status
+
+def add_faksimile_image(page, faksimile_page):
+    """Add faksimile image to page.
+
+    Copies the faksimile_page's image onto the page tree (attaching the
+    faksimile text field to the image first if it is missing) and records
+    the svg source file as the page's faksimile data source.
+    """
+    if page.faksimile_image is None:
+        if faksimile_page.faksimile_image.text_field is None\
+        and faksimile_page.text_field is not None:
+            faksimile_page.faksimile_image.text_field = faksimile_page.text_field
+        page.faksimile_image = faksimile_page.faksimile_image
+        page.faksimile_image.attach_object_to_tree(page.page_tree)
+        page.update_data_source(faksimile_svgFile=faksimile_page.svg_source_file)
+
+def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, page=None):
+    """Joins the data of a faksimile file with the data of svgposfile.
+
+    Simplified successor of old_join_faksimileAndTranskription without the
+    fix-errors / correction-task keyword options.
+    :return: exit status (int)
+    """
+    if not UNITTESTING:
+        print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
+        print(Style.RESET_ALL)
+    faksimile_tree = ET.parse(faksimile_file)
+    namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
+    faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
+    if page is not None:
+        # restrict to the faksimile page with the given page number
+        faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\
+                if faksimile_page.page_number == page.number ]
+    exit_status = 0
+    for faksimile_page in faksimile_pages:
+        svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
+        if svg_pos_file is not None:
+            if page is None:
+                page = Page(svg_pos_file)
+            if page.faksimile_image is None:
+                add_faksimile_image(page, faksimile_page)
+            write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\
+                    file_type=FILE_TYPE_SVG_WORD_POSITION)
+            if not UNITTESTING:
+                print(Fore.LIGHTBLUE_EX + '->', end='')
+                print(Fore.CYAN + 'Joining data from page {0} with file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='')
+            words = sort_words(page)
+            faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions)
+            new_words = []
+            # shortest texts first, so longer ambiguous texts are joined last
+            unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\
+                    key=lambda text: len(text))
+            #faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions, unique_faksimile_words)
            for word_text in unique_faksimile_words:
                process_word_text(new_words, word_text, faksimile_positions, words)
            if False not in [ word.joined for word in words if word.text != '.' ]\
            and False not in [ position.joined for position in faksimile_positions]\
            and not UNITTESTING:
                if page.is_locked():
                    page.unlock()
                post_merging_processing_and_saving(svg_pos_file=svg_pos_file, new_words=new_words, page=page, manuscript_file=manuscript_file)
                print(Fore.GREEN + '[OK]')
                print(Style.RESET_ALL)
            elif not UNITTESTING:
                # report the ids/texts that could not be joined on either side
                mismatch_words, mismatch_faksimile_positions = get_mismatching_ids(words, faksimile_positions)
                not_joined_fp = [ (position.id, position.text) for position in sorted(mismatch_faksimile_positions, key=lambda fp: fp.top) ]
                plural_fp = '' if len(not_joined_fp) < 2 else 's'
                not_joined_tw = [ (word.id, word.line_number, word.text) for word in sorted(mismatch_words, key=lambda word: word.transkription_positions[0].top) ]
                plural_tw = '' if len(not_joined_tw) < 2 else 's'
                print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp))
                print([(position.id, position.text) for position in faksimile_positions if not position.joined])
                print('--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
                print([(word.id, word.line_number, word.text) for word in words if not word.joined ])
                debug_function(new_words, input='new_words')
                debug_function(words, input='words')
                print(Style.RESET_ALL)
-                if kwargs.get('correct_words') is not None:
-                    unmatched_node_ids = [ position.id for position in mismatch_faksimile_positions ]
-                    unmatched_node_ids += get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)
-                    exit_status = create_task_correct_words(kwargs.get('correct_words'), page=page, source_svg_file=faksimile_file,\
-                            unmatched_word_ids=[ word.id for word in mismatch_words ],\
-                            unmatched_node_ids=unmatched_node_ids)
-                elif do_fix_errors:
-                    exit_status = fix_errors(faksimile_file, [position for position in faksimile_positions if not position.joined],\
-                            [ word for word in words if not word.joined ], text_field_id=faksimile_page.text_field.id,\
-                            faksimile_page=faksimile_page, xml_source_file=svg_pos_file,\
-                            manuscript_file=manuscript_file, namespaces=namespaces)
-                else:
-                    exit_status = 2
+                exit_status = 2
            elif False in [ word.joined for word in words ]:
                print([ (word.id, word.text) for word in words if not word.joined ])
                exit_status = 2
            page = None
    return exit_status
def join_single_char_words(words, threshold_x=5, threshold_y=5):
    """Join single char words.

    Single-character words (word or punctuation character) are joined to the
    preceding word in the same line when they are close enough; punctuation
    uses the wider fixed thresholds (15, 12).
    :return: a list of removed words
    """
    #all_single_char_words = [ word for word in words if re.match(r'^\w$', word.text) ]
    removed_words = []
    all_single_char_words = [ word for word in words if re.match(SINGLE_WORD_PATTERN, word.text) ]
    if not UNITTESTING:
        bar = Bar('Joining single char words', max=len(all_single_char_words))
    line_numbers = sorted(set(word.line_number for word in all_single_char_words))
    for line_number in line_numbers:
        single_char_words = [ word for word in all_single_char_words if word.line_number == line_number ]
        # iterate from the end so popping from words does not disturb earlier candidates
        index = len(single_char_words)
        while index > 0:
            index -= 1
            # NOTE(review): 'word' is assigned but never used below
            word = None
            not UNITTESTING and bar.next()
            if single_char_words[index] in words:
                single_char_word_index = words.index(single_char_words[index])
                if re.match(SINGLE_PUNCTUATION_PATTERN, single_char_words[index].text)\
                and words_close_enough(words[single_char_word_index-1], single_char_words[index], 15, 12):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
                    #print('{0} -> {1}, {2}'.format(word.text, words[single_char_word_index-1].text))
                elif index > 0\
                and words_close_enough(single_char_words[index-1], single_char_words[index], threshold_x, threshold_y):
                    # NOTE(review): proximity is tested between entries of
                    # single_char_words while the join/pop uses
                    # single_char_word_index into words — confirm intended.
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
                elif single_char_word_index > 0\
                and words[single_char_word_index-1].line_number == line_number\
                and words_close_enough(words[single_char_word_index-1], single_char_words[index], threshold_x, threshold_y):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
    not UNITTESTING and bar.finish()
    return removed_words
-def process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text='', min_length_split=5):
+def old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text='', min_length_split=5):
    """Joins faksimile_positions with text == word_text with words with text == word_text.

    When the number of matching faksimile positions and (unjoined) words is
    equal, the pairs are linked and flagged as joined; when there are fewer
    words than positions, the match is retried recursively with 'ss'->'ß'
    resp. '-'->'–' substituted via alt_word_text. Unresolvable mismatches are
    printed. The large triple-quoted block below is disabled fallback logic.
    """
    text = word_text if alt_word_text == '' else alt_word_text
    fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ]
    words4word = [ word for word in words if word.text == word_text and not word.joined ]
    if alt_word_text != '':
        words4word += [ word for word in words if word.text == text and not word.joined ]
        words4word = sorted(words4word, key=attrgetter('id'))
    if len(fposition4word) == len(words4word):
        for index, faksimile_position in enumerate(fposition4word):
            faksimile_position.joined = True
            words4word[index].faksimile_positions = [ faksimile_position ]
            words[words.index(words4word[index])].joined = True
            new_words.append(words4word[index])
    elif len(words4word) < len(fposition4word):
        if re.match(r'(.*)ss(.*)', text):
            alt_word_text = re.sub(r'ss', 'ß', text)
-            process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
+            old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
        elif re.match(SINGLE_PUNCTUATION_PATTERN, text):
            if text == '-':
                alt_word_text = text.replace('-', '–')
-                process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
+                old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
            else:
                print('single', word_text, len(fposition4word), len(words4word))
        # Disabled fallback matching (prefix/suffix splitting and word
        # collection) — kept as reference only.
        """
        elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text):
            alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text)
            debug_function(words4word, input='elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text) text {0}'.format(text))
            if alt_word_text != '':
                pattern = r'(.*){0}(.*)'.format(alt_word_text)
                words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ]
                if len(words4word) < len(fposition4word):
-                    process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
+                    old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
                else:
                    words4word = sorted(words4word, key=attrgetter('id'))
                    for index, faksimile_position in enumerate(fposition4word):
                        faksimile_position.joined = True
                        words4word[index].faksimile_positions = [ faksimile_position ]
                        if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\
                        and words.index(words4word[index])+1 < len(words)\
                        and words[words.index(words4word[index])+1].text == word_text[len(word_text)-1]:
                            words4word[index].join(words[words.index(words4word[index])+1])
                            words[words.index(words4word[index])+1].joined = True
                        words[words.index(words4word[index])].joined = True
                        words4word[index].text = word_text
                        new_words.append(words4word[index])
        elif len(text) >= min_length_split and len([ word for word in words if word.text.startswith(text) and not word.joined ]) == len(fposition4word):
            new_words4word = [ word for word in words if word.text.startswith(text) and not word.joined ]
            debug_function(new_words4word, input='word.startswith {}'.format(text))
            for index, fposition in enumerate(fposition4word):
                old_word = new_words4word[index]
                none_word, new_word, next_word = old_word.split(text, start_id=old_word.id)
                fposition4word[index].joined = True
                new_word.faksimile_positions = [ fposition4word[index] ]
                words[words.index(old_word)] = new_word
                if next_word is not None:
                    next_word.id = len(words)
                    next_word.joined = False
                    words.append(next_word)
                new_word.joined = True
                new_words.append(new_word)
        elif len(text) >= min_length_split and len([ word for word in words if word.text.endswith(text) and not word.joined ]) == len(fposition4word):
            new_words4word = [ word for word in words if word.text.endswith(text) and not word.joined ]
            debug_function(new_words4word, input='word.endswith {}'.format(text))
            for index, fposition in enumerate(fposition4word):
                old_word = new_words4word[index]
                before_word, new_word, none_word = old_word.split(text, start_id=old_word.id)
                fposition4word[index].joined = True
                new_word.faksimile_positions = [ fposition4word[index] ]
                words[words.index(old_word)] = new_word
                if before_word is not None:
                    before_word.id = len(words)
                    before_word.joined = False
                    words.append(before_word)
                new_word.joined = True
                new_words.append(new_word)
        else:
            if len(text) > 1:
                new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ]
                debug_function(new_words4word, input='else text {0}'.format(text))
                if len(new_words4word) == 0:
                    alt_word_text = text[1:]
-                    process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
+                    old_process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
                else:
                    for new_word in new_words4word:
                        collected_text = new_word.text
                        current_word = new_word
                        while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0:
                            previous_word = words[current_word.id-1]
                            if word_text.endswith(previous_word.text + collected_text):
                                words[current_word.id].joined = True
                                previous_word.join(current_word)
                                current_word = previous_word
                                collected_text = current_word.text
                            else:
                                collected_text = previous_word.text + collected_text
                        words4word.append(current_word)
                    words4word = sorted(words4word, key=attrgetter('id'))
                    for index, faksimile_position in enumerate(fposition4word):
                        if index < len(words4word):
                            faksimile_position.joined = True
                            words4word[index].faksimile_positions = [ faksimile_position ]
                            words4word[index].text = word_text
                            words[words.index(words4word[index])].joined = True
                            new_words.append(words4word[index])
                        else:
                            print('<{0}> f{1}/t{2}, ids: {3}'.\
                                    format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ]))
        """
    else:
        print('<{0}> f{1}/t{2}'.format(word_text, len(fposition4word), len(words4word)))
+def process_word_text(new_words, word_text, faksimile_positions, words):
+    """Joins faksimile_positions with text == word_text with words with text == word_text.
+
+    Only performs the join when the number of matching faksimile positions
+    equals the number of matching, not yet joined words: each pair is then
+    flagged as joined and the word is appended to new_words. Otherwise the
+    mismatch counts are printed (f = faksimile positions, t = words).
+    """
+    fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ]
+    words4word = [ word for word in words if word.text == word_text and not word.joined ]
+    if len(fposition4word) == len(words4word):
+        for index, faksimile_position in enumerate(fposition4word):
+            faksimile_position.joined = True
+            words4word[index].faksimile_positions = [ faksimile_position ]
+            words[words.index(words4word[index])].joined = True
+            new_words.append(words4word[index])
+    else:
+        print('<{0}> f{1}/t{2}'.format(word_text, len(fposition4word), len(words4word)))
+
+def _compare_word_positions(wordA, wordB) ->int:
+    """Return ordering of wordA and wordB
+
+    Comparator for functools.cmp_to_key: compares the first transkription
+    position of each word (or of its first word part, if the word has parts).
+    Words on roughly the same baseline — bottom difference smaller than half
+    of wordA's height — are ordered left to right, otherwise top to bottom.
+    NOTE(review): despite the ``int`` annotation the result may be a float
+    when left/bottom are floats; cmp_to_key only inspects the sign, so this
+    is harmless.
+    """
+    tpA = wordA.transkription_positions[0] if len(wordA.word_parts) == 0 else wordA.word_parts[0].transkription_positions[0]
+    tpB = wordB.transkription_positions[0] if len(wordB.word_parts) == 0 else wordB.word_parts[0].transkription_positions[0]
+    if abs(tpA.bottom-tpB.bottom) < tpA.height/2:
+        return tpA.left - tpB.left
+    else:
+        return tpA.bottom - tpB. bottom
+
+
def sort_words(page)->list:
    """Returns sorted words (from top left to bottom right).

    Words are collected per line number (a word also counts as being on a
    line when its first word part lies on that line) and ordered with
    _compare_word_positions; ids are then renumbered sequentially and each
    word's ``joined`` flag is initialized.
    """
-    if -1 in [ word.line_number for word in page.words ]:
-        warnings.warn('{} in page file {} for word with ids {}'.format(LineNumber.WARN_NO_LINE_NUMBER, page.page_tree.docinfo.URL, page.page_tree.xpath('./word[not(@line-number)]/@id')))
    words = []
    for line_number in page.line_numbers:
-        word_on_line = [ word for word in page.words if word.line_number == line_number.id ]
+        word_on_line = [ word for word in page.words if word.line_number == line_number.id or (len(word.word_parts) > 0 and word.word_parts[0].line_number == line_number.id) ]
+        words += sorted(word_on_line, key=cmp_to_key(_compare_word_positions))
+        """
        if line_number.id % 2 == 0:
            words += sorted(word_on_line, key=lambda word: word.transkription_positions[0].left)
        else:
            words += sorted(word_on_line, key=cmp_to_key(\
                    lambda wordA, wordB: -1\
                            if wordA.transkription_positions[0].left < wordB.transkription_positions[0].left\
                            and abs(wordA.transkription_positions[0].bottom-wordB.transkription_positions[0].bottom) < wordA.transkription_positions[0].height/2\
                            else 1))
+        """
    for index, word in enumerate(words):
        words[index].id = index
        # A word starts out joined when it already has faksimile positions and
        # has been verified.
        words[index].joined = len(words[index].faksimile_positions) > 0 and words[index].verified
    return words
def sort_faksimile_positions(faksimile_positions, reference_list=None):
    """Return the faksimile positions in sorted order (top left to bottom right).

    Side effect: every position's ``joined`` flag is (re)initialized — False
    when no reference_list is given, otherwise True iff the position occurs
    in reference_list. Sorting relies on the positions' natural ordering.
    """
    for position in faksimile_positions:
        position.joined = reference_list is not None and position in reference_list
    return sorted(faksimile_positions)
    """
    return sorted(faksimile_positions, key=cmp_to_key(\
            lambda positionA, positionB: -1\
                    if positionA.left+positionA.width/2 <= positionB.left+positionB.width/2\
                    and positionA.top+positionA.height/2 <= positionB.top+positionB.height/2\
                    else 1\
                    )\
            )
    """
@deprecated(reason="Writing process id is now set to word not word_position, TODO: check faksimile_positions for split candidates!")
def update_writing_process(word):
    """Synchronize the faksimile word position's writing process id with the
    transkription word positions.

    The update only happens when every transkription position of the word
    belongs to one single writing process and the word has at least one
    faksimile position; words whose transkription positions span several
    writing processes are left untouched (their faksimile positions will be
    fixed manually and processed in a later stage).
    """
    process_ids = list({ tp.writing_process_id for tp in word.transkription_positions })
    if len(process_ids) == 1 and len(word.faksimile_positions) > 0:
        word.faksimile_positions[0].writing_process_id = process_ids[0]
def words_close_enough(wordA, wordB, threshold_x=10, threshold_y=5):
    """Return true if words are closer than thresholds.

    Compares wordA's last transkription position with wordB's first one:
    both the horizontal distance between their left edges and the vertical
    distance between their bottoms must stay below the given thresholds.
    (An earlier variant measured from wordA's right edge instead.)
    """
    last_of_a = wordA.transkription_positions[-1]
    first_of_b = wordB.transkription_positions[0]
    return abs(last_of_a.left - first_of_b.left) < threshold_x\
            and abs(last_of_a.bottom - first_of_b.bottom) < threshold_y
def usage():
    """Print this script's usage instructions (taken from main's docstring)."""
    print(main.__doc__)
def main(argv):
    """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
-    svgscripts/join_faksimileAndTranskription.py [OPTIONS] [xmlManuscriptFile]
+    svgscripts/old_join_faksimileAndTranskription.py [OPTIONS] [xmlManuscriptFile]
        a directory containing
        a svg file containing information about the word positions on the faksimile.
        a xml file about a manuscript, containing information about its pages.
    OPTIONS:
        -h|--help: show help
-        -c|--correct-words=DIR create a taks "CorrectWords" in target dir DIR
-        -d|--debug-word=WORD show debug information for word == WORD
-        -f|--fix-errors: open faksimilie svg file if there are errors
-        -i|--ignore-status-ok ignore status "OK:faksimile merged" in manuscript file and redo merging.
-        -j|--join-single-char-words join single char words
    :return: exit code (int)
    """
    # Revision 107 strips all option handling down to -h/--help; the former
    # commando_dict flags are gone and join_faksimileAndTranskription is
    # called with defaults only.
-    commando_dict = { 'do_fix_errors': False, 'redo_ok': False, 'debug_word_text': '', 'correct_words': None,\
-            'join_single_char_words': False }
    try:
-        opts, args = getopt.getopt(argv, "hc:d:fij", ["help", "correct-words=", "debug-word=", "fix-errors", "ignore-status-ok",\
-                "join-single-char-words" ])
+        opts, args = getopt.getopt(argv, "h", ["help" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
-        elif opt in ('-c', '--correct-words'):
-            commando_dict['correct_words'] = arg
-        elif opt in ('-d', '--debug-word'):
-            commando_dict['debug_word_text'] = arg
-        elif opt in ('-f', '--fix-errors'):
-            commando_dict['do_fix_errors'] = True
-        elif opt in ('-i', '--ignore-status-ok'):
-            commando_dict['redo_ok'] = True
-        elif opt in ('-j', '--join-single-char-words'):
-            commando_dict['join_single_char_words'] = True
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if exists(file_a):
        file_b = None
        if len(args) > 1 and exists(args[1]):
            file_b = args[1]
-        file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b, correction_dir=commando_dict['correct_words'])
-        #if commando_dict['correct_words'] is not None and isdir(commando_dict['correct_words']):
-        #    print('checking new function, please remove this condition if successful!')
-        #    for file in file_list: print(file)
-        #    return 0
+        file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
        # Process each faksimile svg file against the resolved manuscript file.
        for faksimile_file in file_list:
-            join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, **commando_dict)
+            join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file)
    else:
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    return exit_status
# Script entry point: forward the CLI arguments (without the program name) to
# main and use its return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: tests_svgscripts/test_mark_foreign_hands.py
===================================================================
--- tests_svgscripts/test_mark_foreign_hands.py (revision 106)
+++ tests_svgscripts/test_mark_foreign_hands.py (revision 107)
@@ -1,81 +1,81 @@
import unittest
from os import sep, path
from os.path import dirname, isdir
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.matrix import Matrix
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page
from datatypes.word import Word
class TestMarkForeignHands(unittest.TestCase):
    """Unit tests for datatypes.mark_foreign_hands.MarkForeignHands."""
    def setUp(self):
        # Fixtures: test data files plus a synthetic mark-foreign-hands node
        # carrying a single rotated transkription position.
        DATADIR = dirname(__file__) + sep + 'test_data'
        self.xml_file = DATADIR + sep + 'N_VII_1_page008.xml'
        self.test_content_svg = DATADIR + sep + 'N_VII_1_xp5_4_page5.svg'
        self.test_content_xml = DATADIR + sep + 'N_VII_1_page005.xml'
        self.test_contentB_svg = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg'
        self.test_contentB_xml = DATADIR + sep + 'N_VII_1_page006.xml'
        mylist = {'text': '*', 'id': '0', 'line-number': '2' }
        self.node = ET.Element(MarkForeignHands.XML_TAG, attrib=mylist)
        word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)'))
        self.transkription_positions = [ word_position ]
        word_position.attach_object_to_tree(self.node)
    def test_create_cls(self):
        # Instantiation from an xml node preserves geometry, text and line number.
        mark_foreign_hands = MarkForeignHands.create_cls(self.node)
        self.assertEqual(mark_foreign_hands.id, 0)
        self.assertEqual(mark_foreign_hands.transkription_positions[0].bottom, 11)
        self.assertEqual(mark_foreign_hands.transkription_positions[0].height, 10)
        self.assertEqual(mark_foreign_hands.transkription_positions[0].top, 1)
        self.assertEqual(mark_foreign_hands.transkription_positions[0].left, 0)
        self.assertEqual(mark_foreign_hands.transkription_positions[0].width, 10)
        self.assertEqual(mark_foreign_hands.text, '*')
        self.assertEqual(mark_foreign_hands.line_number, 2)
        self.assertEqual(mark_foreign_hands.transkription_positions[0].transform.isRotationMatrix(), True)
    def test_attach_word_to_tree(self):
        # Round trip: attach to a tree, re-create, and compare all fields.
        mark_foreign_hands = MarkForeignHands.create_cls(self.node)
        mark_foreign_hands.foreign_hands_text = 'test'
        mark_foreign_hands.pen= 'Rotstift'
        empty_tree = ET.ElementTree(ET.Element('page'))
        mark_foreign_hands.attach_word_to_tree(empty_tree)
        #print(ET.dump(empty_tree.getroot()))
        for node in empty_tree.xpath('//' + MarkForeignHands.XML_TAG):
            mark = MarkForeignHands.create_cls(node)
            self.assertEqual(mark.pen, 'Rotstift')
-            self.assertEqual(mark.foreign_hands_text, 'test')
+            self.assertEqual(mark.foreign_hands_text.content, 'test')
            self.assertEqual(mark.id, 0)
            self.assertEqual(mark.transkription_positions[0].bottom, 11)
            self.assertEqual(mark.transkription_positions[0].height, 10)
            self.assertEqual(mark.transkription_positions[0].top, 1)
            self.assertEqual(mark.transkription_positions[0].left, 0)
            self.assertEqual(mark.transkription_positions[0].width, 10)
            self.assertEqual(mark.text, '*')
            self.assertEqual(mark.line_number, 2)
            self.assertEqual(mark.transkription_positions[0].transform.isRotationMatrix(), True)
        #print(empty_tree.xpath('//mark-foreign-hands/content/text()'))
        #print(empty_tree.xpath('//mark-foreign-hands/content/@pen'))
    def test_get_semanticAndDataDict(self):
        dictionary = MarkForeignHands.get_semantic_dictionary()
        #print(dictionary)
    def test_find_content(self):
        # NOTE(review): test_attach_word_to_tree now compares
        # foreign_hands_text.content, while this test still compares
        # foreign_hands_text to a plain string — confirm both are intended.
        page = Page(self.test_contentB_xml)
        transkription_field = TranskriptionField(page.source)
        svg_tree = ET.parse(page.source)
        page.update_line_number_area(transkription_field, svg_tree=svg_tree)
        mark_foreign_hands_word = [ word for word in page.words if word.text == MarkForeignHands.CLASS_MARK ][0]
        mark_foreign_hands = MarkForeignHands.create_cls_from_word(mark_foreign_hands_word)
        MarkForeignHands.find_content([ mark_foreign_hands ] , transkription_field, svg_tree, style_dict=page.style_dict)
        self.assertEqual(mark_foreign_hands.foreign_hands_text, 'W III, 104. (MXXIX, 3)')
        self.assertEqual(mark_foreign_hands.pen, 'Bleistift')
# Allow running this test module directly.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_join_faksimileAndTranskription.py
===================================================================
--- tests_svgscripts/test_join_faksimileAndTranskription.py (revision 106)
+++ tests_svgscripts/test_join_faksimileAndTranskription.py (revision 107)
@@ -1,123 +1,127 @@
import unittest
from os import sep, path, remove
from os.path import isdir, isfile, dirname
import shutil
import sys
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
import join_faksimileAndTranskription
from datatypes.faksimile import FaksimilePage
from datatypes.page import Page
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.word_position import WordPosition
class TestJoin(unittest.TestCase):
    """Unit tests for svgscripts/join_faksimileAndTranskription.py."""
    def setUp(self):
        # Silence progress bars etc. in the module under test.
        join_faksimileAndTranskription.UNITTESTING = True
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
        self.manuscript = DATADIR + sep + 'N_VII_1.xml'
        self.manuscript_copy = self.manuscript.replace('.', '_copy.')
        self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
        self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
        self.Mp_XIV_1_mytest_421 = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml'
        self.correction_dir = DATADIR + sep + 'correction_dir'
        self.page138 = DATADIR + sep + 'N_VII_1_page138.xml'
    def test_sort_words(self):
        # Words of a single line must keep their order after sorting.
        page = Page(self.Mp_XIV_1_mytest_421)
        words_line7 = [ word for word in page.words if word.line_number == 7 ]
        page.words = words_line7
        sorted_words = join_faksimileAndTranskription.sort_words(page)
        self.assertEqual(len(sorted_words), len(words_line7))
        for index, word in enumerate(words_line7):
            self.assertEqual(sorted_words[index], word)
    def test_sort_faksimile_positions(self):
        # The first sorted faksimile positions must line up with the page words.
        faksimile_tree = ET.parse(self.faksimile_file)
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
        faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
        self.assertEqual(len(faksimile_pages), 2)
        svg_pos_file, manuscript_file = join_faksimileAndTranskription.get_svgPosFile_and_manuscriptFile(faksimile_pages[0], manuscript_file=self.manuscript, redo_ok=True)
        sorted_positions = join_faksimileAndTranskription.sort_faksimile_positions(faksimile_pages[0].word_positions)
        page = Page(svg_pos_file)
        #print(max(sorted_positions).text)
        for index in range(0, 10):
            id = sorted_positions[index].id
            if len(faksimile_tree.getroot().xpath('//ns:rect[@id="{0}"]/ns:title/text()|//ns:path[@id="{0}"]/ns:title/text()'\
                    .format(id), namespaces=namespaces)) > 0:
                word_text = faksimile_tree.getroot().xpath('//ns:rect[@id="{0}"]/ns:title/text()|//ns:path[@id="{0}"]/ns:title/text()'\
                        .format(id), namespaces=namespaces)[0]
                #print(sorted_positions[index].left, sorted_positions[index].top, word_text, page.words[index].text)
                self.assertEqual(word_text, page.words[index].text)
    @unittest.skipUnless(__name__ == "__main__", 'test uses path from within dir')
    def test_get_filelist_and_manuscript_file(self):
        file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(self.faksimile_dir, self.manuscript)
-        self.assertEqual(len(file_list), 1)
+        self.assertEqual(len(file_list), 2)
        self.assertEqual(file_list[0], self.faksimile_file)
        self.assertEqual(manuscript_file, self.manuscript)
        file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(self.manuscript, self.faksimile_file)
        self.assertEqual(len(file_list), 1)
        self.assertEqual(file_list[0], self.faksimile_file)
        self.assertEqual(manuscript_file, self.manuscript)
        file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(self.manuscript, correction_dir=self.correction_dir)
        self.assertEqual(len(file_list), 1)
        self.assertEqual(file_list[0], self.page138)
    @unittest.skipUnless(__name__ == "__main__", 'test uses path from within dir')
    def test_get_svgPosFile_and_manuscriptFile(self):
        faksimile_tree = ET.parse(self.faksimile_file)
        faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)
        self.assertEqual(len(faksimile_pages), 2)
        svg_pos_file, manuscript_file = join_faksimileAndTranskription.get_svgPosFile_and_manuscriptFile(faksimile_pages[0], manuscript_file=self.manuscript, redo_ok=True)
        self.assertEqual(svg_pos_file, self.manuscript.replace('.', '_page00{}.'.format(faksimile_pages[0].page_number)))
        self.assertEqual(manuscript_file, self.manuscript)
    @unittest.skip('join changed ... fix me')
    def test_join_faksimileAndTranskription(self):
        self.assertEqual(join_faksimileAndTranskription.join_faksimileAndTranskription(self.faksimile_file, self.manuscript), 0)
        #self.assertEqual(join_faksimileAndTranskription.join_faksimileAndTranskription(self.faksimile_file, self.manuscript, test_word_text='gar'), 0)
    @unittest.skip('function update_writing_process is deprecated')
    def testupdate_writing_process(self):
        page = Page(self.xml_file)
        word = page.words[12]
        self.assertEqual(len(word.faksimile_positions), 1)
        self.assertEqual(word.faksimile_positions[0].writing_process_id, -1)
        join_faksimileAndTranskription.update_writing_process(word)
        self.assertEqual(word.faksimile_positions[0].writing_process_id, 0)
    #@unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover')
    @unittest.skip('test takes too long, has been tested')
    def test_fix_errors(self):
        page = Page(self.xml_file)
        word_position = WordPosition(id='rect945', text='Lenken')
        exit_status = join_faksimileAndTranskription.fix_errors(self.faksimile_file, [ word_position], [page.words[12]], xml_source_file=self.xml_file, manuscript_file=self.manuscript )
        self.assertEqual(exit_status, 0)
    @unittest.skip('tested with local file')
    def test_join_single_chars(self):
        page = Page('xml/N_VII_1_page016.xml')
        words = join_faksimileAndTranskription.sort_words(page)
        join_faksimileAndTranskription.join_single_char_words(words)
        new_words = [ word for word in words if word.text == 'selber' ]
        self.assertEqual(len(new_words), 1)
        new_words = [ word for word in words if word.text == 's' ]
        self.assertEqual(len(new_words), 0)
    def test_get_mismatching_ids(self):
        page = Page(self.xml_file)
        word_position = WordPosition(id='rect945', text='Lenken')
        mwords, mfps = join_faksimileAndTranskription.get_mismatching_ids([ page.words[12]], [ word_position ])
        self.assertEqual(mwords[0].text, 'Denken')
        self.assertEqual(mfps[0].text, 'Lenken')
+    @unittest.skip('tested with local file')
+    def test_a_file(self):
+        #join_faksimileAndTranskription.UNITTESTING = False
+        join_faksimileAndTranskription.join_faksimileAndTranskription('/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile/Eric/Mp_XV/Kontrolle_und_Beschriftung_der_Wortrahmen/Fertig/Mp-XV-2d,3.svg', 'xml/Mp_XV.xml')
# Allow running this test module directly.
if __name__ == "__main__":
    unittest.main()