Page MenuHomec4science

util.py
No OneTemporary

File Metadata

Created
Wed, May 1, 05:06
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to copy a faksimile svg file with the option of highlighting some word boxes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
from datetime import datetime
from functools import cmp_to_key
import getopt
import inspect
import itertools
import lxml.etree as ET
import re
import shutil
import signal
import string
import subprocess
from svgpathtools import svg_to_paths
import sys
import tempfile
import os
from os import listdir, sep, path, setpgrp, devnull, makedirs
from os.path import basename, commonpath, dirname, exists, isfile, isdir, realpath, splitext
import warnings
import wget
import xml.etree.ElementTree as XET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.faksimile_image import FaksimileImage
from datatypes.lineNumber import LineNumber
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import update_transkription_position_ids
from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# While True, the record_changes_on_*_to_page functions do not write their result back to the xml source file.
UNITTESTING = False
# Default fill color and opacity that create_highlighted_svg_file applies to highlighted nodes.
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
class ExternalViewer:
    """This class can be used to show files with external viewers.
    """
    # Maps a file extension to the command of the viewer that opens such files.
    file_format_viewer_dict = { '.pdf': PDF_READER, '.svg': SVG_EDITOR }

    @classmethod
    def show_files(cls, single_file=None, list_of_files=None):
        """Opens file(s) with corresponding external viewer(s).

        Every file but the last one is opened in the background, in its own
        session and with its output discarded. The last file is opened in the
        foreground; as soon as that viewer returns (or raises), all background
        viewers are sent SIGTERM. Files whose extension has no entry in
        file_format_viewer_dict are skipped.

        :param single_file: a file name, or a list of file names (which then
            takes the place of list_of_files).
        :param list_of_files: file names to show before single_file. The list
            passed by the caller is left untouched.
        """
        # Work on a private copy so that neither the caller's list nor a shared default is modified.
        if isinstance(single_file, list):
            files = list(single_file)
        else:
            files = list(list_of_files) if list_of_files is not None else []
            if single_file is not None:
                files.append(single_file)
        last_index = len(files) - 1
        background_processes = []
        try:
            for index, file2open in enumerate(files):
                viewer = cls.file_format_viewer_dict.get(splitext(file2open)[1])
                if viewer is None:
                    continue
                if index < last_index:
                    # start_new_session puts the viewer in its own process group,
                    # so that the whole group can be terminated afterwards.
                    background_processes.append(\
                            subprocess.Popen([viewer, file2open], stdout=subprocess.DEVNULL,\
                            stderr=subprocess.DEVNULL, start_new_session=True))
                else:
                    subprocess.run([viewer, file2open])
        finally:
            for process in background_processes:
                try:
                    os.killpg(os.getpgid(process.pid), signal.SIGTERM)
                except ProcessLookupError:
                    # The viewer is already gone, there is nothing left to terminate.
                    pass
def back_up(page: Page, reference_file, bak_dir='./bak') -> str:
    """Write a time-stamped backup of the page's xml tree to bak_dir.

    The backup is named after the base name of the page tree's document URL,
    followed by the current date and time. Its path is stored in page.bak_file.
    bak_dir is created if it does not exist.

    :return: target_file_name
    """
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    source_name = basename(page.page_tree.docinfo.URL)
    page.bak_file = bak_dir + sep + source_name + '_' + timestamp
    # Must be evaluated in this function: the name of the current frame is part of the script name.
    function_name = inspect.currentframe().f_code.co_name
    write_pretty(
        xml_element_tree=page.page_tree,
        file_name=page.bak_file,
        script_name=__file__ + '({0},{1})'.format(function_name, reference_file),
        file_type=FILE_TYPE_SVG_WORD_POSITION,
    )
    return page.bak_file
def back_up_svg_file(svg_tree: ET.ElementTree, namespaces=None, bak_dir='./bak') -> str:
    """Write a time-stamped backup copy of a svg tree to bak_dir.

    The backup is named with the current date and time, followed by the base
    name of the tree's document URL. bak_dir is created if it does not exist.
    If no namespaces are passed, they are taken from the root node, with the
    default namespace mapped to the prefix 'ns'.

    :return: target_file_name
    """
    if namespaces is None:
        namespaces = {}
        for prefix, uri in svg_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    makedirs(bak_dir, exist_ok=True)
    source_name = basename(svg_tree.docinfo.URL)
    bak_file = bak_dir + sep + timestamp + '_' + source_name
    copy_faksimile_svg_file(target_file=bak_file, faksimile_tree=svg_tree, namespaces=namespaces)
    return bak_file
def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, abs_image_path=None, local_image_path=None, namespaces=None):
    """Copy a faksimile_svg_file to target_file.

    The content is taken from faksimile_tree if one is passed, else it is
    parsed from faksimile_source_file. If local_image_path and/or
    abs_image_path are passed, the xlink:href and/or sodipodi:absref
    attribute of the first image node is set accordingly before writing.

    :param target_file: name of the copy; joined with target_directory if both are passed.
    :param faksimile_source_file: svg file to copy; defaults to the document URL of faksimile_tree.
    :param faksimile_tree: parsed tree of the svg file.
    :param target_directory: directory of the copy; if target_file is missing,
        the base name of faksimile_source_file is used.
    :param namespaces: prefix -> uri mapping with the keys 'ns', 'xlink' and
        'sodipodi'; read from the svg root attributes if missing.
    :raises Exception: if neither source file nor tree, or neither target file
        nor target directory is passed.
    """
    if faksimile_source_file is None:
        if faksimile_tree is None:
            raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
        faksimile_source_file = faksimile_tree.docinfo.URL
    if target_directory is not None:
        file_name = target_file if target_file is not None else basename(faksimile_source_file)
        target_file = target_directory + sep + file_name
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    # Only the attributes of the svg root node are needed: they declare the namespaces.
    _, _, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True)
    for attribute_name, uri in svg_attributes.items():
        if not attribute_name.startswith('xmlns:'):
            continue
        try:
            XET.register_namespace(attribute_name.replace('xmlns:', ''), uri)
        except ValueError:
            # register_namespace refuses some prefixes; such a prefix is simply not registered.
            pass
    # Registering svg as default namespace keeps the output free of generated prefixes.
    XET.register_namespace('', 'http://www.w3.org/2000/svg')
    if namespaces is None:
        namespaces = {
            'ns': svg_attributes['xmlns'],
            'xlink': svg_attributes['xmlns:xlink'],
            'sodipodi': svg_attributes['xmlns:sodipodi'],
        }
    if faksimile_tree is None:
        target_tree = XET.parse(faksimile_source_file)
    else:
        # The tree is serialized and parsed again in order to get a xml.etree.ElementTree.
        if type(faksimile_tree) == ET._ElementTree:
            serialized_tree = ET.tostring(faksimile_tree)
        else:
            serialized_tree = XET.tostring(faksimile_tree.getroot())
        target_tree = XET.ElementTree(XET.fromstring(serialized_tree))
    if local_image_path is not None or abs_image_path is not None:
        image_nodes = target_tree.findall('.//ns:image', namespaces=namespaces)
        if len(image_nodes) > 0:
            image_node = image_nodes[0]
            if local_image_path is not None:
                image_node.set('{%s}href' % namespaces['xlink'], local_image_path)
            if abs_image_path is not None:
                image_node.set('{%s}absref' % namespaces['sodipodi'], abs_image_path)
    target_tree.write(target_file)
def copy_faksimile_update_image_location(faksimile_source_file=None, faksimile_tree=None, target_file=None, target_directory=None, overwrite=False):
    """Copy a faksimile_svg_file to target_file and update image location.

    The location of the first image node is recalculated for the target
    directory, then the actual copying is done by copy_faksimile_svg_file.
    If the image file is missing locally, it is downloaded from the image's URL.

    :param faksimile_source_file: svg file to copy; defaults to the document URL of faksimile_tree.
    :param faksimile_tree: parsed tree (lxml) of the svg file; parsed from faksimile_source_file if missing.
    :param target_file: name of the copy; joined with target_directory if both are passed.
    :param target_directory: directory of the copy; defaults to the directory of target_file.
    :param overwrite: if False and target_file already exists, nothing is copied and a warning is issued.
    :raises Exception: if neither source file nor tree, or neither target file
        nor target directory is passed.
    """
    if faksimile_source_file is None and faksimile_tree is not None:
        faksimile_source_file = faksimile_tree.docinfo.URL
    elif faksimile_source_file is None:
        raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
    if target_file is not None and target_directory is not None:
        target_file = target_directory + sep + target_file
    elif target_file is None and target_directory is not None:
        target_file = target_directory + sep + basename(faksimile_source_file)
    elif target_directory is None and target_file is not None:
        target_directory = dirname(target_file)
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    source_tree = ET.parse(faksimile_source_file) if faksimile_tree is None else faksimile_tree
    # The default namespace has no prefix in nsmap; xpath needs one, hence 'ns'.
    namespaces = { k if k is not None else 'ns': v for k, v in source_tree.getroot().nsmap.items() }
    image_nodes = source_tree.xpath('//ns:image', namespaces=namespaces)
    local_image_path = None
    abs_image_path = None
    user_abs_image_path = None
    if len(image_nodes) > 0:
        image = FaksimileImage.CREATE_IMAGE(image_nodes[0], source_file=faksimile_source_file)
        abs_image_path = image.local_path
        # If the target directory contains one of the configured user names, the absolute image path
        # is rewritten with that user's root location in place of FAKSIMILE_LOCATION.
        for user_name in USER_ROOT_LOCATION_DICT.keys():
            if user_name in target_directory:
                user_abs_image_path = abs_image_path.replace(FAKSIMILE_LOCATION, USER_ROOT_LOCATION_DICT[user_name]).replace('//','/')
                break
        # if target_directory is subdir of FAKSIMILE_LOCATION
        if realpath(target_directory).startswith(realpath(FAKSIMILE_LOCATION)):
            common_path = commonpath([ realpath(target_directory), realpath(dirname(image.local_path)) ])
            # One '..' for each directory level of target_directory below the common path ...
            relative_directory = '/'.join(\
                    [ '..' for d in realpath(target_directory).replace(common_path + '/', '').split('/') ])
            # ... followed by the part of the image path below the common path.
            local_image_path = relative_directory + realpath(image.local_path).replace(common_path, '')
            # The relative path is dropped again if it does not lead to an existing file.
            if not isfile(target_directory + sep + local_image_path):
                local_image_path = None
        elif abs_image_path is not None:
            local_image_path = abs_image_path
        if abs_image_path is not None and not isfile(abs_image_path):
            wget.download(image.URL, out=dirname(abs_image_path))
    # NOTE(review): the indentation of this function was reconstructed; confirm that the
    # 'elif abs_image_path ...' above belongs to the 'if realpath(...)' test.
    if not isfile(target_file) or overwrite:
        # The user specific path, if there is one, takes precedence as absolute image reference.
        abs_image_path = user_abs_image_path if user_abs_image_path is not None else abs_image_path
        copy_faksimile_svg_file(target_file=target_file, faksimile_source_file=faksimile_source_file,\
                faksimile_tree=faksimile_tree, abs_image_path=abs_image_path,\
                local_image_path=local_image_path, namespaces=namespaces)
    else:
        msg = 'File {0} not copied to directory {1}, it already contains a file {2}.'.format(faksimile_source_file, target_directory, target_file)
        warnings.warn(msg)
def copy_xml_file_word_pos_only(xml_source_file, target_directory):
    """Create a copy of a xml file in target_directory that contains its words only.

    The copy gets the same file name, title, page number and orientation as the source page.

    :return: (str) xml_target_file
    """
    xml_target_file = target_directory + sep + basename(xml_source_file)
    source_page = Page(xml_source_file)
    target_page = PageCreator(
        xml_target_file,
        title=source_page.title,
        page_number=source_page.number,
        orientation=source_page.orientation,
    )
    target_page.words = source_page.words
    target_page.update_and_attach_words2tree()
    # Must be evaluated in this function: the name of the current frame is part of the script name.
    function_name = inspect.currentframe().f_code.co_name
    write_pretty(
        xml_element_tree=target_page.page_tree,
        file_name=xml_target_file,
        script_name=__file__ + '({})'.format(function_name),
        file_type=FILE_TYPE_SVG_WORD_POSITION,
    )
    return xml_target_file
def create_highlighted_svg_file(faksimile_tree, node_ids, nodes_color_dict=None, target_file=None, target_directory=None, local_image_path=None, namespaces=None, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY):
    """Highlight the rect and path nodes with the given node_ids and write the tree to a file.

    Highlighted nodes get highlight_color as fill, the given opacity and an
    empty style attribute. The tree is then written by
    copy_faksimile_update_image_location. The parameters nodes_color_dict and
    local_image_path are accepted, but not used by this function.
    """
    if namespaces is None:
        namespaces = {}
        for prefix, uri in faksimile_tree.getroot().nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    node_query = '//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'
    for node_id in node_ids:
        for node in faksimile_tree.xpath(node_query.format(node_id), namespaces=namespaces):
            node.set('fill', highlight_color)
            node.set('opacity', opacity)
            # An empty style makes sure that fill and opacity are not overridden by the node's style.
            node.set('style', '')
    copy_faksimile_update_image_location(target_file=target_file, faksimile_tree=faksimile_tree, target_directory=target_directory)
def get_empty_node_ids(faksimile_tree, x_min=0.0, x_max=0.0, y_min=0.0, y_max=0.0, text_field_id=None, faksimile_page=None, namespaces={}):
    """Return the ids of the rect and path nodes inside an area that do not have a title element.

    The area is given by x_min, x_max, y_min and y_max. If a faksimile_page is
    passed, the area and text_field_id are calculated from its text field and
    the x/y of its faksimile image instead, the right border being reduced by
    THRESHOLD_X. The node with text_field_id is never part of the result.

    :return: list of node ids
    """
    THRESHOLD_X = 10
    if faksimile_page is not None:
        text_field = faksimile_page.text_field
        image = faksimile_page.faksimile_image
        x_min = text_field.xmin + image.x
        x_max = text_field.xmax + image.x - THRESHOLD_X
        y_min = text_field.ymin + image.y
        y_max = text_field.ymax + image.y
        text_field_id = text_field.id
    if len(namespaces) == 0:
        namespaces = { ('ns' if prefix is None else prefix): uri\
                for prefix, uri in faksimile_tree.getroot().nsmap.items() }
    rect_query = '//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\
            x_min, x_max, y_min, y_max, text_field_id)
    untitled_nodes = faksimile_tree.xpath(rect_query, namespaces=namespaces)
    untitled_nodes += get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]',\
            x_min, x_max, y_min, y_max, text_field_id, namespaces=namespaces)
    return [ untitled_node.get('id') for untitled_node in untitled_nodes ]
def get_mismatching_ids(words, faksimile_positions):
    """Return the words and the faksimile_positions whose text has no counterpart on the other side.

    Before comparing, the texts of the faksimile_positions are adjusted by
    replace_chars. Words with the text '.' are not taken into account.

    :return: 2-tuple (mismatching words, mismatching faksimile_positions)
    """
    faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions)
    word_texts = [ word.text for word in words if word.text != '.' ]
    distinct_word_texts = set(word_texts)
    mismatching_words = []
    for text in distinct_word_texts:
        if text in unique_faksimile_words:
            continue
        mismatching_words.extend(word for word in words if word.text == text)
    mismatching_faksimile_positions = []
    for text in unique_faksimile_words:
        if text in distinct_word_texts:
            continue
        mismatching_faksimile_positions.extend(\
                position for position in faksimile_positions if position.text == text)
    return mismatching_words, mismatching_faksimile_positions
def process_warnings4status(warnings, warning_messages, current_status, ok_status, status_prefix='') ->str:
    """Derive a status string from the warnings that have been recorded.

    For every entry of warning_messages that is the beginning of at least one
    recorded warning's message, ':<entry>:' is appended to status_prefix.
    If nothing has been appended, or if there are no warnings at all,
    '<current_status>:<ok_status>:' is returned instead.
    """
    status = status_prefix
    if warnings is not None and len(warnings) > 0:
        for expected_start in warning_messages:
            if any(str(warn.message).startswith(expected_start) for warn in warnings):
                status += f':{expected_start}:'
    if status == status_prefix:
        return f'{current_status}:{ok_status}:'
    return status
def record_changes(original_svg_file, changed_svg_file, node_ids, namespaces=None):
    """Copy changes made to changed_svg_file to original_svg_file.

    For each id in node_ids, the rect/path node of the original file is updated:
    - if the node has a title in the changed file, the text of that title is
      copied to the original node (a title element is created if necessary);
    - if there is no such title in the changed file, the node is removed from
      the original file.
    Finally the original file is rewritten by copy_faksimile_svg_file.

    :param namespaces: prefix -> uri mapping, the svg namespace having the
        prefix 'ns'; read from the root node of the changed file if missing or empty.
    """
    old_tree = ET.parse(original_svg_file)
    new_tree = ET.parse(changed_svg_file)
    if not namespaces:
        namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() }
    title_query = '//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'
    node_query = '//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'
    for node_id in node_ids:
        new_titles = new_tree.xpath(title_query.format(node_id), namespaces=namespaces)
        old_nodes = old_tree.xpath(node_query.format(node_id), namespaces=namespaces)
        if len(old_nodes) == 0:
            continue
        if len(new_titles) == 0:
            # No title in the changed file: the node is removed from the original.
            for old_node in old_nodes:
                old_node.getparent().remove(old_node)
            continue
        new_title = new_titles[0]
        old_title = old_nodes[0].find('ns:title', namespaces=namespaces)
        if old_title is None:
            # lxml does not accept None as attribute value, so the id is only set if there is one.
            title_attributes = {}
            if new_title.get('id') is not None:
                title_attributes['id'] = new_title.get('id')
            # The title is created in the svg namespace, like the titles that are searched for above.
            old_title = ET.SubElement(old_nodes[0], '{%s}title' % namespaces['ns'], attrib=title_attributes)
        old_title.text = new_title.text
    copy_faksimile_svg_file(target_file=original_svg_file, faksimile_tree=old_tree)
def record_changes_on_svg_file_to_page(xml_source_file, svg_file, word_ids=None):
    """Copy changes made to svg_file to xml_source_file.

    The rect nodes of the svg group with id "Transkription" are matched with the
    transkription positions of the page's words by their id 'word_<word id>_<position id>'.

    :param xml_source_file: xml file of the page that is updated (and rewritten unless UNITTESTING is set).
    :param svg_file: svg file that contains the changed rect nodes.
    :param word_ids: ids of the words that should be updated; all words of the page if None.
    :return: datatypes.page.Page
    """
    svg_tree = ET.parse(svg_file)
    # The default namespace has no prefix in nsmap; xpath needs one, hence 'ns'.
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    transkription_field = TranskriptionField(svg_file)
    page = Page(xml_source_file)
    words = [ word for word in page.words if word.id in word_ids ]\
            if word_ids is not None else page.words
    new_page_words = []
    for word in words:
        word_id = 'word_' + str(word.id) + '_'
        recorded_ids = []
        # 1. Update the positions of this word that still have a rect node with their id.
        for transkription_position in word.transkription_positions:
            transkription_position_id = word_id + str(transkription_position.id)
            tp_nodes = svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]'.format(transkription_position_id), namespaces=namespaces)
            if len(tp_nodes) > 0:
                record_changes_to_transkription_position(tp_nodes[0], transkription_position,\
                        transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                recorded_ids.append(transkription_position_id)
        # 2. Rect nodes that carry this word's id prefix, but have not been matched with one of its positions.
        extra_nodes = [ node for node in\
                svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[contains(@id, "{0}")]'.format(word_id), namespaces=namespaces)\
                if node.get('id') not in recorded_ids ]
        if len(extra_nodes) > 0:
            for extra_node in extra_nodes:
                # The inkscape:label of such a node is read as the id of the position it originally
                # belonged to ('#' removed).
                old_ids = [ inkscape_id.replace('#','') for inkscape_id in\
                        svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]/@inkscape:label'.format(extra_node.get('id')),\
                        namespaces=namespaces) ]
                if len(old_ids) > 0 and re.match(r'word_[0-9]+_[0-9]+', old_ids[0]):
                    old_id_list = old_ids[0].split('_')
                    ref_word_id = int(old_id_list[1])
                    # NOTE(review): ref_tp_id stays a str, while ref_word_id is converted to int;
                    # confirm that tp.id is a str, else the comparison below never matches.
                    ref_tp_id = old_id_list[2]
                    ref_words = [ word for word in page.words if word.id == ref_word_id ]
                    if len(ref_words) > 0:
                        ref_tps = [ tp for tp in ref_words[0].transkription_positions\
                                if tp.id == ref_tp_id ]
                        if len(ref_tps) > 0:
                            # Move the position from the word it belonged to to the current word.
                            ref_words[0].transkription_positions.remove(ref_tps[0])
                            record_changes_to_transkription_position(extra_node,\
                                    ref_tps[0], transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                            word.transkription_positions.append(ref_tps[0])
    # 3. Rebuild the word list of the page; words without transkription positions are dropped.
    for word in page.words:
        if word.has_mixed_status('text'):
            new_page_words += [ word for word in word.split_according_to_status('text') if word.text is not None and word.text != '' ]
        elif len(word.transkription_positions) > 0:
            new_text = [ tp.text for tp in word.transkription_positions if tp.text is not None and tp.text != '' ]
            if len(new_text) > 0:
                word.text = new_text[0]
            # NOTE(review): the indentation of this function was reconstructed; confirm that the word
            # is kept even if none of its positions has a text (i.e. that this append is not part of
            # the 'if' above).
            new_page_words.append(word)
    page.words = new_page_words
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    page.unlock()
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + ' -> ' + inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_on_xml_file_to_page(xml_source_file, xml_file) -> Page:
    """Copy changes made to xml_file to xml_source_file.

    The page of xml_source_file is unlocked and backed up, then its words are
    replaced by the words of xml_file:
    - words with split_strings are split at each of these strings;
    - words with a join_string are joined with their predecessor or successor,
      if the concatenation of both texts is equal to the join_string.
    Unless UNITTESTING is set, the result is written to xml_source_file.

    :return: datatypes.page.Page
    """
    copy_page = Page(xml_file)
    page = Page(xml_source_file)
    page.unlock()
    back_up(page, xml_file)
    page.words = []
    for word in copy_page.words:
        if not word.split_strings:
            page.words.append(word)
            continue
        next_word = word
        for split_string in word.split_strings:
            _, new_word, next_word = next_word.split(split_string)
            page.words.append(new_word)
        # What remains after the last split is a word of its own.
        if next_word is not None:
            page.words.append(next_word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    # Words that have been joined to a neighbour are removed after the loop,
    # since the list must not change while it is iterated (and indexed by word.id).
    remove_words_if_done = []
    for word in page.words:
        if 'join_string' not in word.__dict__ or word.join_string is None:
            continue
        if word.id > 0\
        and page.words[word.id-1].text + word.text == word.join_string:
            page.words[word.id-1].join(word)
            remove_words_if_done.append(word)
        # Only a word that is not the last one has a successor to join with.
        elif word.id + 1 < len(page.words)\
        and word.text + page.words[word.id+1].text == word.join_string:
            word.join(page.words[word.id+1])
            remove_words_if_done.append(page.words[word.id+1])
    for word in remove_words_if_done:
        page.words.remove(word)
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + '({0},{1})'.format(inspect.currentframe().f_code.co_name, xml_file), file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_to_transkription_position(node, transkription_position, xmin=0.0, ymin=0.0, namespaces=None):
    """Transfer geometry and title of a svg node to a transkription_position.

    The attributes x, y, width and height of the node are written to left, top,
    width and height of the transkription_position, x and y being reduced by
    xmin and ymin. Attributes that are missing or empty are left alone. If the
    node has a title, its text becomes the text of the transkription_position.
    """
    if namespaces is None:
        namespaces = {}
        for prefix, uri in node.nsmap.items():
            namespaces['ns' if prefix is None else prefix] = uri
    # (attribute of the node, attribute of the position, offset to subtract)
    geometry = (
        ('x', 'left', xmin),
        ('y', 'top', ymin),
        ('width', 'width', None),
        ('height', 'height', None),
    )
    for node_attribute, position_attribute, offset in geometry:
        raw_value = node.get(node_attribute)
        if not bool(raw_value):
            continue
        value = float(raw_value)
        if offset is not None:
            value = value - offset
        setattr(transkription_position, position_attribute, value)
    title_texts = node.xpath('./ns:title/text()', namespaces=namespaces)
    if len(title_texts) > 0:
        transkription_position.text = title_texts[0]
def replace_chars(words, faksimile_positions, unique_faksimile_words=None):
    """Adjust the characters of faksimile texts to the characters used by the transcription words.

    A faksimile text that does not match the text of any word is replaced, if
    one of the following substitutions turns it into the text of a word (the
    first substitution that succeeds is used):
        '"' -> '“', 'ss' -> 'ß', '-' -> '–'
    The replacement is recorded in unique_faksimile_words and in the text of
    all faksimile_positions that have the old text. Both are changed in place.

    :param words: objects with a text attribute (the transcription words).
    :param faksimile_positions: objects with a text attribute.
    :param unique_faksimile_words: list of the texts to check; defaults to the
        distinct texts of faksimile_positions, sorted by length.
    :return: 2-tuple (faksimile_positions, unique_faksimile_words)
    """
    if unique_faksimile_words is None:
        unique_faksimile_words = sorted(\
                { faksimile_position.text for faksimile_position in faksimile_positions }, key=len)
    substitutions = (('"', '“'), ('ss', 'ß'), ('-', '–'))
    # The words do not change in here, so their texts are collected once for fast lookup.
    word_texts = { word.text for word in words }
    for index, word_text in enumerate(unique_faksimile_words):
        if word_text in word_texts:
            continue
        for faksimile_chars, word_chars in substitutions:
            # If word_text lacks faksimile_chars, the candidate equals word_text, which is known not to match.
            candidate = word_text.replace(faksimile_chars, word_chars)
            if candidate not in word_texts:
                continue
            unique_faksimile_words[index] = candidate
            for faksimile_position in faksimile_positions:
                if faksimile_position.text == word_text:
                    faksimile_position.text = candidate
            break
    return faksimile_positions, unique_faksimile_words
def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True):
    """Update the status of a svg position file and, if given, of its page in the manuscript file.

    The status attribute of the file's root node is
    - set to status, if the old status is missing or does not contain 'OK',
      or if append is False;
    - extended by ':' + status, if the old status contains 'OK', append is
      True and status is not yet part of the old status;
    - left as it is otherwise.
    Nothing happens if file_name does not exist.

    :param file_name: the svg position file.
    :param manuscript_file: if this file exists, the page with the number of
        the svg position file is updated there as well.
    :param status: the new status.
    :param append: whether status is appended to an old status containing 'OK'.
    """
    if not isfile(file_name):
        # Without the file there is neither a status to update nor a page number for the manuscript.
        return
    parser = ET.XMLParser(remove_blank_text=True)
    file_tree = ET.parse(file_name, parser)
    root = file_tree.getroot()
    old_status = root.get('status')
    if not append or old_status is None or 'OK' not in old_status.split(':'):
        root.set('status', status)
    elif status not in old_status.split(':'):
        root.set('status', old_status + ':' + status)
    write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    if manuscript_file is not None and isfile(manuscript_file):
        page_number = root.get('number')
        update_manuscript_file(manuscript_file, page_number, file_name, status=status)
def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True):
    """Update the manuscript file: add status information about a page.

    If there is a page node with the given number, its status attribute is
    - set to status, if the old status is missing or does not contain 'OK',
      or if append is False;
    - extended by ':' + status, if the old status contains 'OK', append is
      True and status is not yet part of the old status;
    - left as it is otherwise.
    Its output attribute is set to file_name unless it already has a value.
    If there is no such page node, a new one is appended to the 'pages' node
    (which is created if necessary).
    Nothing happens if manuscript_file does not exist.

    :param manuscript_file: the xml manuscript file.
    :param page_number: number of the page.
    :param file_name: the output file of the page.
    :param status: the new status.
    :param append: whether status is appended to an old status containing 'OK'.
    """
    if not isfile(manuscript_file):
        return
    parser = ET.XMLParser(remove_blank_text=True)
    manuscript_tree = ET.parse(manuscript_file, parser)
    root = manuscript_tree.getroot()
    page_nodes = root.xpath('//page[@number="%s"]' % page_number)
    if len(page_nodes) > 0:
        node = page_nodes[0]
        old_status = node.get('status')
        if not append or old_status is None or 'OK' not in old_status.split(':'):
            node.set('status', status)
        elif status not in old_status.split(':'):
            node.set('status', old_status + ':' + status)
        if not bool(node.get('output')):
            node.set('output', file_name)
    else:
        pages_node = root.find('pages')
        if pages_node is None:
            pages_node = ET.SubElement(root, 'pages')
        # Ids are consecutive numbers, starting with 1.
        new_id = len(pages_node.findall('page')) + 1
        ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name})
    write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT)

Event Timeline