Page MenuHomec4science

super_page.py
No OneTemporary

File Metadata

Created
Thu, May 2, 08:50

super_page.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a super page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata (authorship, licensing and release state).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = "University of Basel"
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename, dirname
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import sys
import warnings
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .mark_foreign_hands import MarkForeignHands
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .writing_process import WritingProcess
class SuperPage:
    """
    This super class represents a page that is backed by an xml file.

    Args:
        xml_file (str): name of the xml file the page is read from / written to.
        title (str): default value for the attribute 'page/@title'.
        page_number (str): default value for the attribute 'page/@number'.
        orientation (str): default value for the attribute 'page/@orientation'.
        multipage_index (int): default value for the attribute 'page/@multipage-index'.
        page_type (str): default value for the attribute 'page/@pageType'.
        should_xml_file_exist (bool): if True, a missing xml_file raises a
            FileNotFoundError instead of starting with an empty tree.
    """
    FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
    FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
    ADD2Y = 7
    PAGE_RECTO = 'recto'
    PAGE_VERSO = 'verso'
    STATUS_MERGED_OK = 'faksimile merged'
    STATUS_POSTMERGED_OK = 'words processed'
    UNITTESTING = False
    XML_TAG = 'page'

    def __init__(self, xml_file, title=None, page_number='', orientation='North', multipage_index=-1, page_type=PAGE_VERSO, should_xml_file_exist=False):
        # property name -> (xpath relative to '//', default value, class used to instantiate the value)
        self.properties_dictionary = {
            'faksimile_image': (FaksimileImage.XML_TAG, None, FaksimileImage),
            'faksimile_svgFile': ('data-source/@file', None, str),
            'multipage_index': ('page/@multipage-index', multipage_index, int),
            'marginals_source': ('page/@marginals-source', None, str),
            'number': ('page/@number', str(page_number), str),
            'orientation': ('page/@orientation', orientation, str),
            'page_type': ('page/@pageType', page_type, str),
            'pdfFile': ('pdf/@file', None, str),
            'source': ('page/@source', None, str),
            'svg_file': ('svg/@file', None, str),
            'svg_image': (SVGImage.XML_TAG, None, SVGImage),
            'text_field': (FaksimileImage.XML_TAG + '/' + TextField.XML_TAG, None, TextField),
            'title': ('page/@title', title, str),
        }
        self.bak_file = None
        # keys of properties_dictionary that have already been initialized
        self.online_properties = []
        self.line_numbers = []
        self.lines = []
        self.mark_foreign_hands = []
        self.page_tree = None
        self.sonderzeichen_list = []
        self.style_dict = {}
        self.text_connection_marks = []
        self.word_deletion_paths = []
        self.word_insertion_marks = []
        self.words = []
        self.writing_processes = []
        self.xml_file = xml_file
        if not self.is_page_source_xml_file():
            # The class constant has to be accessed via self: class scope is not visible inside methods.
            msg = f'ERROR: xml_source_file {self.xml_file} is not of type "{self.FILE_TYPE_SVG_WORD_POSITION}"'
            raise Exception(msg)
        self._init_tree(should_xml_file_exist=should_xml_file_exist)

    def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
        """Adds a list of classes that are sonderzeichen and a style dictionary to page.

        If style_node is given, the style information is read from its 'class' children;
        otherwise a non-empty style_dict is written to a new 'style' node of the page tree.
        In both cases the mapping self.fontsizekey2stage_mapping is (re)created.

        Args:
            sonderzeichen_list (list): names of style classes that are sonderzeichen.
            letterspacing_list (list): names of style classes with letterspacing.
            style_dict (dict): style class name -> dictionary of style attributes.
            style_node: xml node containing 'class' nodes to read the style from.
        """
        self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else []
        self.letterspacing_list = letterspacing_list if letterspacing_list is not None else []
        self.style_dict = style_dict if style_dict is not None else {}
        if style_node is not None:
            class_nodes = style_node.findall('.//class')
            self.style_dict = {
                item.get('name'): {key: value for key, value in item.attrib.items() if key != 'name'}
                for item in class_nodes
            }
            self.sonderzeichen_list = [
                item.get('name') for item in class_nodes
                if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family')
            ]
            # NOTE(review): 'letterspacing-list' is written to the style node below, but read
            # from the class nodes here -- confirm that this asymmetry is intended.
            self.letterspacing_list = [
                item.get('name') for item in class_nodes
                if bool(item.get('letterspacing-list'))
            ]
        elif bool(self.style_dict):
            # replace any existing style node by a fresh one
            for node in self.page_tree.xpath('//style'):
                node.getparent().remove(node)
            style_node = ET.SubElement(self.page_tree.getroot(), 'style')
            if len(self.sonderzeichen_list) > 0:
                style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
            if len(self.letterspacing_list) > 0:
                style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
            for key, style in self.style_dict.items():
                style['name'] = key
                ET.SubElement(style_node, 'class', attrib=style)
        fontsize_dict = {
            key: float(value.get('font-size').replace('px', ''))
            for key, value in self.style_dict.items() if 'font-size' in value
        }
        fontsizes = sorted(fontsize_dict.values(), reverse=True)
        # create a mapping between fontsizes and word stages:
        # sizes within 1 of the biggest/smallest size count as biggest/smallest
        self.fontsizekey2stage_mapping = {}
        for fontsize_key, value in fontsize_dict.items():
            if value >= fontsizes[0] - 1:
                stage = WritingProcess.FIRST_VERSION
            elif value <= fontsizes[-1] + 1:
                stage = WritingProcess.LATER_INSERTION_AND_ADDITION
            else:
                stage = WritingProcess.INSERTION_AND_ADDITION
            self.fontsizekey2stage_mapping[fontsize_key] = stage

    def get_biggest_fontSize4styles(self, style_set=None):
        """Returns biggest font size from style_dict for a set of style class names.

        Args:
            style_set: iterable of style class names (default: no names).

        [:returns:] (float) biggest font size OR 1 if style_dict is empty
                    or none of the styles has a font size
        """
        if style_set is None:
            style_set = ()
        if not self.style_dict:
            return 1
        font_sizes = [
            float(self.style_dict[key]['font-size'].replace('px', ''))
            for key in style_set if bool(self.style_dict[key].get('font-size'))
        ]
        return max(font_sizes) if font_sizes else 1

    def get_line_number(self, y):
        """Returns line number id for element at y.

        [:return:] (int) id of the first line number with top <= y <= bottom, or -1
        """
        for line_number in self.line_numbers:
            if line_number.top <= y <= line_number.bottom:
                return line_number.id
        return -1

    def init_all_properties(self, overwrite=False):
        """Initialize all properties that have not been initialized yet.
        """
        for property_key in self.properties_dictionary:
            if property_key not in self.online_properties:
                self.init_property(property_key, overwrite=overwrite)

    def init_property(self, property_key, value=None, overwrite=False):
        """Initialize a single property.

        Args:
            property_key: key of property in self.__dict__
            value: new value to set to property
            overwrite: whether or not to update values from xml_file (default: read only)
        """
        if value is None:
            if property_key in self.online_properties:
                return
            xpath, value, cls = self.properties_dictionary.get(property_key)
            # a value found in the tree takes precedence over the default value
            tree_values = self.page_tree.xpath('//' + xpath)
            if len(tree_values) > 0:
                value = tree_values[0]
            if value is None:
                self.__dict__[property_key] = value
            elif cls.__module__ == 'builtins':
                self.update_tree(value, xpath)
                self.__dict__[property_key] = cls(value)
            else:
                if type(value) is not cls:
                    value = cls(node=value)
                self.__dict__[property_key] = value
                value.attach_object_to_tree(self.page_tree)
            self.online_properties.append(property_key)
        elif overwrite or property_key not in self.online_properties:
            xpath, default_value, cls = self.properties_dictionary.get(property_key)
            if cls.__module__ == 'builtins':
                self.__dict__[property_key] = cls(value)
                self.update_tree(value, xpath)
            else:
                self.__dict__[property_key] = value
                value.attach_object_to_tree(self.page_tree)
            # overwriting an initialized property must not register its key twice
            if property_key not in self.online_properties:
                self.online_properties.append(property_key)

    def is_locked(self):
        """Return true if page is locked.
        """
        return len(self.page_tree.xpath('//metadata/lock')) > 0

    def is_page_source_xml_file(self, source_tree=None):
        """Return true if xml_file is of type FILE_TYPE_SVG_WORD_POSITION.

        A xml_file that does not exist (yet) is accepted, a file without
        a 'metadata/type' node is not.

        Args:
            source_tree: already parsed tree of xml_file (default: parse xml_file)
        """
        if not isfile(self.xml_file):
            return True
        if source_tree is None:
            source_tree = ET.parse(self.xml_file)
        type_node = source_tree.getroot().find('metadata/type')
        return type_node is not None and type_node.text == self.FILE_TYPE_SVG_WORD_POSITION

    def lock(self, reference_file, message=''):
        """Lock tree such that ids of words etc. correspond to ids
        in reference_file, optionally add a message that will be shown.
        """
        if self.is_locked():
            return
        metadata_nodes = self.page_tree.xpath('./metadata')
        metadata = metadata_nodes[0]\
            if len(metadata_nodes) > 0\
            else ET.SubElement(self.page_tree.getroot(), 'metadata')
        lock = ET.SubElement(metadata, 'lock')
        ET.SubElement(lock, 'reference-file').text = reference_file
        if message != '':
            ET.SubElement(lock, 'message').text = message

    def unlock(self):
        """Unlock tree by removing the lock node from its metadata (no-op if tree is not locked).
        """
        if self.is_locked():
            lock = self.page_tree.xpath('//metadata/lock')[0]
            lock.getparent().remove(lock)

    def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=None):
        """Update word ids and attach them to page.page_tree.

        Prints 'locked' and does nothing else if the page is locked.

        Args:
            update_function_on_word: a callable or a list of callables that will be called on each word.
            include_special_words_of_type: list of classes (MarkForeignHands, TextConnectionMark)
                whose instances should be passed to the update functions as well.
        """
        if self.is_locked():
            print('locked')
            return
        if include_special_words_of_type is None:
            include_special_words_of_type = []
        if type(update_function_on_word) != list:
            update_function_on_word = [update_function_on_word]
        # entries that are not callable (e.g. the default None) are skipped
        update_functions = [func for func in update_function_on_word if callable(func)]
        # remove all old nodes before attaching the current state
        old_nodes_xpath = './/word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG
        for node in self.page_tree.xpath(old_nodes_xpath):
            node.getparent().remove(node)
        for index, word in enumerate(self.words):
            word.id = index
            for func in update_functions:
                func(word)
            word.attach_word_to_tree(self.page_tree)
        for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
            mark_foreign_hands.id = index
            if MarkForeignHands in include_special_words_of_type:
                for func in update_functions:
                    func(mark_foreign_hands)
            mark_foreign_hands.attach_word_to_tree(self.page_tree)
        for index, text_connection_mark in enumerate(self.text_connection_marks):
            text_connection_mark.id = index
            if TextConnectionMark in include_special_words_of_type:
                for func in update_functions:
                    func(text_connection_mark)
            text_connection_mark.attach_word_to_tree(self.page_tree)

    def update_property_dictionary(self, property_key, default_value):
        """Update the default value of a property in properties_dictionary.

        Raises:
            Exception: if properties_dictionary does not contain property_key.
        """
        content = self.properties_dictionary.get(property_key)
        if content is None:
            msg = f'ERROR: properties_dictionary does not contain a key {property_key}!'
            raise Exception(msg)
        self.properties_dictionary[property_key] = (content[0], default_value, content[2])

    def update_tree(self, value, xpath):
        """Update tree: set value as attribute on the node specified by xpath.

        Args:
            value: attribute value (will be converted to str).
            xpath: path of the form 'node/@attribute'; the node is created
                as a child of the root if the tree does not contain it.
        """
        node_name = dirname(xpath)
        nodes = self.page_tree.xpath('//' + node_name)
        node = nodes[0]\
            if len(nodes) > 0\
            else ET.SubElement(self.page_tree.getroot(), node_name)
        node.set(basename(xpath).replace('@', ''), str(value))

    def _init_tree(self, should_xml_file_exist=False):
        """Initialize page_tree from xml_file if it exists.

        Raises:
            FileNotFoundError: if xml_file does not exist although should_xml_file_exist is True.
        """
        if isfile(self.xml_file):
            parser = ET.XMLParser(remove_blank_text=True)
            self.page_tree = ET.parse(self.xml_file, parser)
        elif not should_xml_file_exist:
            self.page_tree = ET.ElementTree(ET.Element('page'))
            self.page_tree.docinfo.URL = self.xml_file
        else:
            msg = f'ERROR: xml_source_file {self.xml_file} does not exist!'
            raise FileNotFoundError(msg)

Event Timeline