Page MenuHomec4science

page.py
No OneTemporary

File Metadata

Created
Fri, May 3, 13:45
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import re
import sys
import warnings
from .box import Box
from .color import Color
from .image import Image, SVGImage
from .editor_comment import EditorComment
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
from .imprint import Imprint
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .super_page import SuperPage
from .style import Style
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_deletion_path import WordDeletionPath
from .word_insertion_mark import WordInsertionMark
sys.path.append('py2ttl')
from class_spec import SemanticClass
sys.path.append('shared_util')
from main_util import extract_paths_on_tf, get_paths_near_position
# Module-level aliases for constants defined on SuperPage, so that other
# modules can use them without importing SuperPage directly.
FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT
STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK
STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK
class Page(SemanticClass,SuperPage):
    """
    This class represents a page.
    Args:
        xml_source_file (str): name of the xml file to be instantiated.
        faksimile_image: FaksimileImage.
        faksimile_svgFile: svg file containing information about word positions.
    """
    # Toggled by tests; presumably suppresses output/side effects elsewhere — TODO confirm.
    UNITTESTING = False
    def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_paths_near_words=False, warn=False, number=None):
        """Initialize a page from *xml_source_file*; when no source file is
        given, create a dummy page carrying only *number* (page_tree is None).

        Args:
            add_paths_near_words: also collect deletion paths near words (see add_deletion_paths_to_words).
            warn: emit warnings for deleted words without deletion paths.
        """
        if xml_source_file is not None:
            # Full initialization: SuperPage parses the xml file into page_tree.
            super(Page,self).__init__(xml_source_file)
            self.update_property_dictionary('faksimile_image', faksimile_image)
            self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile)
            self.init_all_properties()
            self.add_style(style_node=self.page_tree.getroot().find('.//style'))
            # Text fields are filled in by init_node_objects if the images define them.
            self.faksimile_text_field = None
            self.svg_text_field = None
            self.init_node_objects()
            self.warn = warn
            self.add_deletion_paths_to_words(add_paths_near_words)
        else:
            # Dummy page: no xml tree, just an identifying number.
            self.page_tree = None
            self.number = number
    def add_deletion_paths_to_words(self, add_paths_near_words=False):
        """Add deletion paths to words.

        Candidates are words that are marked deleted but still lack deletion
        paths (directly or in one of their word_parts), plus words whose
        process_flags contain 'add_paths_near_words'. When
        *add_paths_near_words* is True, paths located near such words are
        additionally collected on word.deletion_paths_near_word.
        """
        # Words without parts that are deleted but have no paths yet, or that
        # were explicitly flagged for the near-word path search.
        words = [ word for word in self.words if (len(word.word_parts) == 0 and word.deleted and len(word.deletion_paths) == 0)\
                or 'add_paths_near_words' in word.process_flags ]
        # Words with at least one deleted part that has no paths yet.
        words += [ word for word in self.words\
                if len(word.word_parts) > 0 and True in\
                [ (wp.deleted and len(wp.deletion_paths) == 0) for wp in word.word_parts ]]
        if len(words) > 0 and ((self.svg_file is not None and isfile(self.svg_file))\
           or (self.source is not None and isfile(self.source))):
            svg_file = self.svg_file if self.svg_file is not None else self.source
            transkription_field = TranskriptionField(svg_file)
            # Coordinates are shifted by the transkription field origin unless
            # the svg image defines its own text field.
            tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0
            tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0
            word_deletion_paths = self.word_deletion_paths
            index = 0
            dp_updated = False
            while index < len(words):
                word = words[index]
                word.add_deletion_paths(word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
                if len(word.deletion_paths) > 0 or True in [ len(w.deletion_paths) > 0 for w in word.word_parts ]:
                    # Register any newly found paths on the page.
                    deletion_paths = word.deletion_paths
                    for wp in word.word_parts: deletion_paths += wp.deletion_paths
                    for deletion_path in deletion_paths:
                        if deletion_path not in self.word_deletion_paths:
                            self.word_deletion_paths.append(deletion_path)
                elif not dp_updated:
                    # No match: re-extract the paths from the file once and
                    # retry the same word (index is decremented here and
                    # incremented again at the bottom of the loop).
                    word_deletion_paths = extract_paths_on_tf(self)
                    dp_updated = True
                    index -= 1
                if add_paths_near_words\
                   and ('add_paths_near_words' in word.process_flags\
                   or ((word.deleted and len(word.deletion_paths) == 0)\
                        or True in [ (w.deleted and len(w.deletion_paths) == 0) for w in word.word_parts ])):
                    # Make sure the freshly extracted paths are used for the
                    # near-word search as well.
                    if not dp_updated\
                       and 'add_paths_near_words' in word.process_flags:
                        word_deletion_paths = extract_paths_on_tf(self)
                        dp_updated = True
                    transform = None
                    tp = None
                    target_word = word
                    # NOTE(review): paths_near_word is assigned but never used.
                    paths_near_word = []
                    if word.deleted and len(word.transkription_positions) > 0:
                        transform = word.transkription_positions[0].transform
                        for tp in word.transkription_positions:
                            word.deletion_paths_near_word += get_paths_near_position(tp, word_deletion_paths)
                    elif len(word.word_parts) > 0:
                        for wp in word.word_parts:
                            if wp.deleted and len(wp.transkription_positions) > 0:
                                target_word = wp
                                for tp in wp.transkription_positions:
                                    # NOTE(review): unlike the word case above this
                                    # overwrites per position instead of '+=' —
                                    # possibly intended to accumulate; confirm.
                                    wp.deletion_paths_near_word = get_paths_near_position(tp, word_deletion_paths)
                    if self.warn and (word.deleted and len(word.deletion_paths) == 0):
                        warnings.warn(\
                        f'WARNING: {self.title} {self.number}: {word.id} on {word.line_number}, {word.text} has no deletion paths! {target_word.deletion_paths_near_word}, {transform}')
                index += 1
@classmethod
def create_cls(cls, xml_source_file=None, create_dummy_page=False, isBlank=False, page_node=None):
"""Create a Page.
"""
if not create_dummy_page:
page = cls(xml_source_file)
page.status = 'complete'
if isBlank:
page.status = 'blank'
page.words = []
page.lines = []
page.word_deletion_paths = []
page.word_insertion_marks = []
return page
else:
m = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file)
if m is not None and len(m.groups()) > 3:
number = m.group(3)
else:
number = basename(xml_source_file).replace('.xml','')
return cls(number=number)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
"""Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
[optional: instantiation depends on the fulfilment of a status_contains
and/or on the selection of some words by a word_selection_function].
"""
source_tree = ET.parse(xml_file)
if source_tree.getroot().find('metadata/type').text == cls.FILE_TYPE_SVG_WORD_POSITION:
page = cls(xml_file)
if word_selection_function is None or len(word_selection_function(page.words)) > 0:
return [ page ]
else:
return []
elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
pages = []
xpath = '//page/@output'
if status_contains != '' and status_not_contain != '':
xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
elif status_contains != '':
xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
elif status_not_contain != '':
xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
for xml_source_file in source_tree.xpath(xpath):
if isfile(xml_source_file):
pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
return pages
else:
return []
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = { 'number': { 'class': str, 'cardinality': 1}}
properties.update(cls.create_semantic_property_dictionary('status', str,\
name='pageHasDataProcessingStatus', label='status of data processing',\
comment='The status of the data processing of this page'))
properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE))
properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\
name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\
comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
properties.update(cls.create_semantic_property_dictionary('orientation', str))
properties.update(cls.create_semantic_property_dictionary('status', str,\
name='pageHasDataProcessingStatus', label='status of data processing',\
comment='The status of the data processing of this page'))
properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE))
properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\
name='pageIsOnSVGTextField', label='page is on svg text field',\
comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
for key in [ 'lines','imprints', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks', 'editor_comments']:
properties.update(cls.create_semantic_property_dictionary(key, list))
dictionary.update({cls.CLASS_KEY: class_dict})
dictionary.update({cls.PROPERTIES_KEY: properties})
return cls.return_dictionary_after_updating_super_classes(dictionary)
def get_word_deletion_path(self, path=None, d_attribute=None) ->WordDeletionPath:
"""Return a word deletion path that belongs to page.
"""
if path is None and d_attribute is None:
raise Exception('ERROR: get_word_deletion_path needs a path or a d_attribute!')
if d_attribute is None:
d_attribute = path.d_attribute
page_paths = [ dpath for dpath in self.word_deletion_paths if dpath.d_attribute == d_attribute ]
if len(page_paths) > 0:
return page_paths[0]
else:
dpath = WordDeletionPath.create_cls(self, path=path, d_attribute=d_attribute)
if dpath is not None:
dpath.id = len(self.word_deletion_paths)
self.word_deletion_paths.append(dpath)
dpath.attach_object_to_tree(self.page_tree)
return dpath
    def init_node_objects(self):
        """Initialize all node objects.

        Instantiates words, marks, line numbers, lines, imprints, writing
        processes, deletion paths and editor comments from page_tree and
        wires them up with this page.
        """
        self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
        self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ]
        self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('./' + MarkForeignHands.XML_TAG) ]
        #self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ]
        # Text connection marks are appended to the word list as extra words.
        self.words += [ TextConnectionMark.instantiate_as_word(node, id=index+len(self.words))\
                for index, node in enumerate(self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG)) ]
        self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
        self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
        self.imprints = [ Imprint.create_cls_from_node(imprint_node, self.lines) for imprint_node in self.page_tree.getroot().xpath('//' + Imprint.XML_TAG) ]
        self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
        self.word_deletion_paths = [ WordDeletionPath.create_cls(self, node=node) for node in self.page_tree.xpath('./' + WordDeletionPath.XML_TAG) ]
        self.editor_comments = [ EditorComment.create_cls_from_node(node=node) for node in self.page_tree.xpath('./' + EditorComment.XML_TAG) ]
        # Mirror the images' text fields onto the page (used for semantic export).
        if self.faksimile_image is not None and self.faksimile_image.text_field is not None:
            self.faksimile_text_field = self.faksimile_image.text_field
        if self.svg_image is not None and self.svg_image.text_field is not None:
            self.svg_text_field = self.svg_image.text_field
        # NOTE(review): self.text_connection_marks is not assigned in this
        # method (the assignment above is commented out); presumably it is
        # initialized by init_all_properties — confirm.
        for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks:
            simple_word.init_word(self)
        for wim in self.word_insertion_marks:
            if wim.line_number > -1:
                wim.line = [ line for line in self.lines if line.id == wim.line_number ][0]
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
"""Update word ids and attach them to page.page_tree.
"""
if not self.is_locked():
update_function_on_word = [ update_function_on_word ]\
if type(update_function_on_word) != list\
else update_function_on_word
for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
node.getparent().remove(node)
for index, word in enumerate(self.words):
word.id = index
for func in update_function_on_word:
if callable(func):
func(word)
word.attach_word_to_tree(self.page_tree)
for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
mark_foreign_hands.id = index
if MarkForeignHands in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(mark_foreign_hands)
mark_foreign_hands.attach_word_to_tree(self.page_tree)
for index, text_connection_mark in enumerate(self.text_connection_marks):
text_connection_mark.id = index
if TextConnectionMark in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(text_connection_mark)
text_connection_mark.attach_word_to_tree(self.page_tree)
else:
print('locked')
def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
"""Update the data source of page.
"""
if faksimile_svgFile is not None:
self.faksimile_svgFile = faksimile_svgFile
data_node = self.page_tree.xpath('.//data-source')[0]\
if len(self.page_tree.xpath('.//data-source')) > 0\
else ET.SubElement(self.page_tree.getroot(), 'data-source')
data_node.set('file', self.faksimile_svgFile)
if xml_correction_file is not None:
data_node.set('xml-corrected-words', xml_correction_file)
    def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True):
        """Determines the width of the area where the line numbers are written in the page.source file.

        Args:
            transkription_field: TranskriptionField whose line number area width is updated.
            svg_tree: parsed svg tree of page.source (parsed on demand if None).
            set_to_text_field_zero: add transkription_field.ymin to the y
                lookup position (coordinates relative to text field zero).
        """
        THRESHOLD = 0.4  # tolerance in svg units when matching glyph positions
        if svg_tree is None:
            svg_tree = ET.parse(self.source)
        if len(self.line_numbers) > 1:
            # Reference line number: the 10th on verso pages (when available),
            # otherwise the 2nd.
            line_number = self.line_numbers[9]\
                    if transkription_field.is_page_verso() and len(self.line_numbers) > 8\
                    else self.line_numbers[1]
            # Find the svg text node that renders this line number near the
            # transkription field.
            ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
                    if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
                    and LineNumber.IS_A_LINE_NUMBER(item)\
                    and LineNumber(raw_text_node=item).id == line_number.id ]
            if len(ln_nodes) > 0:
                matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
                if transkription_field.is_page_verso():
                    # Verso: the x position of the number is the area width.
                    transkription_field.add_line_number_area_width(matrix.getX())
                elif self.svg_file is not None and isfile(self.svg_file):
                    # Recto: additionally add the rendered glyph width, looked
                    # up in the svg path file.
                    svg_path_tree = ET.parse(self.svg_file)
                    namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
                    svg_x = matrix.getX()
                    svg_y = self.line_numbers[1].bottom + transkription_field.ymin\
                            if set_to_text_field_zero\
                            else self.line_numbers[1].bottom
                    # Find <use> nodes within THRESHOLD of the expected position.
                    use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
                            .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
                    if len(use_nodes) > 0:
                        # Resolve the referenced symbol and measure its path's
                        # bounding box to obtain the glyph width.
                        symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
                        d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
                        if len(d_strings) > 0 and d_strings[0] != '':
                            path = parse_path(d_strings[0])
                            xmin, xmax, ymin, ymax = path.bbox()
                            width = xmax - xmin
                            transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
"""Adds a source to page and attaches it to page_tree.
"""
if self.number.endswith('r')\
or self.number.endswith('v'):
self.page_type = Page.PAGE_VERSO\
if self.number.endswith('v')\
else Page.PAGE_RECTO
else:
if transkription_field is None:
if self.source is None or not isfile(self.source):
raise FileNotFoundError('Page does not have a source!')
transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index)
self.page_type = Page.PAGE_VERSO\
if transkription_field.is_page_verso()\
else Page.PAGE_RECTO
self.page_tree.getroot().set('pageType', self.page_type)
    def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False, parentsPWPs=None):
        """Update styles of words and add them to their transkription_positions.
        Args:
            add_to_parents: Add styles also to word (and if not None to manuscript).
            partition_according_to_styles: Partition word if its transkription_positions have different styles.
            create_css: also create css information and key styles by deletion state.
            parentsPWPs: positional word parts inherited from a parent word,
                used as fallback when a transkription position has none.
        """
        # Cache of created Style objects, keyed by (style_class_key) or, with
        # create_css, by (style_class_key, word.deleted).
        style_dictionary = {}
        if words is None:
            words = self.words
        for word in words:
            # Recurse into the word's parts first.
            if len(word.word_parts) > 0:
                self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\
                        add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles, parentsPWPs=parentsPWPs)
            # Earlier versions / overwritten words inherit the current word's
            # positional word parts so that their styles can be resolved.
            overwritten = [] if word.overwrites_word is None else [ word.overwrites_word ]
            if word.earlier_version is not None:
                overwritten.append(word.earlier_version)
            if len(overwritten) > 0:
                parentsPWPs = parentsPWPs if parentsPWPs is not None else []
                if len(parentsPWPs) == 0:
                    cword = word.word_parts[0] if len(word.word_parts) > 0 else word
                    for tp in cword.transkription_positions:
                        parentsPWPs += tp.positional_word_parts
                self.update_styles(words=overwritten, manuscript=manuscript, create_css=create_css,\
                        add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles, parentsPWPs=parentsPWPs)
            for transkription_position in word.transkription_positions:
                # Fall back to the inherited positional word parts when the
                # position has none of its own.
                positional_word_parts = transkription_position.positional_word_parts\
                        if len(transkription_position.positional_word_parts) > 0\
                        else parentsPWPs
                if len(positional_word_parts) > 0:
                    style_class = positional_word_parts[0].style_class
                    # Map font size keys to the writing process (stage) id;
                    # -1 means no stage could be determined.
                    writing_process_id = -1
                    for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]:
                        writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
                    style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id)
                    if create_css:
                        # With css, styles are additionally keyed by deletion
                        # state so deleted words can carry a deletion color.
                        if style_dictionary.get((style_class_key, word.deleted)) is None:
                            color = None
                            if len(word.deletion_paths) > 0:
                                if word.deletion_paths[0].style_class is not None\
                                   and word.deletion_paths[0].style_class != ''\
                                   and self.style_dict.get(word.deletion_paths[0].style_class) is not None:
                                    color = Color.create_cls_from_style_object(self.style_dict.get(word.deletion_paths[0].style_class))
                                else:
                                    color = Color()
                            style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\
                                    create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] )
                        transkription_position.style = style_dictionary[(style_class_key, word.deleted)]
                        #print(style_dictionary[(style_class_key, word.deleted)])
                    else:
                        if style_dictionary.get(style_class_key) is None:
                            style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css)
                            style_dictionary[style_class_key].writing_process_id = style_class_key[1]
                        transkription_position.style = style_dictionary[style_class_key]
                    if add_to_parents and transkription_position.style not in word.styles:
                        word.styles.append(transkription_position.style)
            if partition_according_to_styles:
                word.split_according_to_status('style', splits_are_parts=True)
        if manuscript is not None\
           and add_to_parents:
            # Propagate all styles created here to the manuscript.
            manuscript.update_styles(*style_dictionary.values())
def __eq__(self, other):
"""Returns true if self is qualitatively identical to other.
"""
if other is None:
return False
if self.page_tree is None and other.page_tree is None:
return self.number == other.number
if self.page_tree is None or other.page_tree is None:
return False
return self.page_tree.docinfo.URL == other.page_tree.docinfo.URL
    def __hash__(self):
        """Return a hash value for self.

        Dummy pages (page_tree is None) hash on their number; fully
        instantiated pages hash on the URL of their xml document.
        """
        try:
            if self.page_tree is None:
                return hash(self.number)
        except AttributeError:
            # A page that never set page_tree at all: printed for debugging,
            # then hashed on its number like a dummy page.
            print(self)
            return hash(self.number)
        return hash(self.page_tree.docinfo.URL)

Event Timeline