Page MenuHomec4science

page.py
No OneTemporary

File Metadata

Created
Wed, May 1, 03:59
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import sys
import warnings
from .box import Box
from .color import Color
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .super_page import SuperPage
from .style import Style
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_deletion_path import WordDeletionPath
from .word_insertion_mark import WordInsertionMark
sys.path.append('py2ttl')
from class_spec import SemanticClass
FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT
STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK
STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK
class Page(SemanticClass,SuperPage):
"""
This class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
xml_target_file (str): name of the xml file to which page info will be written.
"""
UNITTESTING = False
def __init__(self, xml_source_file, faksimile_image=None, faksimile_svgFile=None, add_deletion_paths_to_words=True):
super(Page,self).__init__(xml_source_file)
self.update_property_dictionary('faksimile_image', faksimile_image)
self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile)
self.init_all_properties()
self.add_style(style_node=self.page_tree.getroot().find('.//style'))
self.init_node_objects()
if add_deletion_paths_to_words:
self.add_deletion_paths_to_words()
def add_deletion_paths_to_words(self):
"""Add deletion paths to words.
"""
if (self.svg_file is not None and isfile(self.svg_file))\
or (self.source is not None and isfile(self.source)):
svg_file = self.svg_file if self.svg_file is not None else self.source
transkription_field = TranskriptionField(svg_file)
words = [ word for word in self.words if word.deleted or True in [ part.deleted for part in word.word_parts ]]
for word in words:
word.add_deletion_paths(self.word_deletion_paths, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
"""Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
[optional: instantiation depends on the fulfilment of a status_contains
and/or on the selection of some words by a word_selection_function].
"""
source_tree = ET.parse(xml_file)
if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION:
page = cls(xml_file)
if word_selection_function is None or len(word_selection_function(page.words)) > 0:
return [ page ]
else:
return []
elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
pages = []
xpath = '//page/@output'
if status_contains != '' and status_not_contain != '':
xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
elif status_contains != '':
xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
elif status_not_contain != '':
xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
for xml_source_file in source_tree.xpath(xpath):
if isfile(xml_source_file):
pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
return pages
else:
return []
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = { 'number': { 'class': str, 'cardinality': 1},\
'faksimile_image': { 'class': FaksimileImage, 'cardinality': 1},\
'orientation': { 'class': str, 'cardinality': 1},\
'svg_image': { 'class': SVGImage, 'cardinality': 1}}
properties.update(cls.create_semantic_property_dictionary('text_field', TextField,\
cardinality=1, name='pageIsOnTextField', label='page is on text field',\
comment='Relates a page to the text field on a faksimile image.'))
for key in [ 'lines', 'words', 'word_deletion_paths', 'word_insertion_marks']:
properties.update(cls.create_semantic_property_dictionary(key, list))
dictionary.update({cls.CLASS_KEY: class_dict})
dictionary.update({cls.PROPERTIES_KEY: properties})
return cls.return_dictionary_after_updating_super_classes(dictionary)
def init_node_objects(self):
"""Initialize all node objects.
"""
self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ]
self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ]
self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ]
self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
self.word_deletion_paths = [ WordDeletionPath.create_cls(node, self) for node in self.page_tree.xpath('//' + WordDeletionPath.XML_TAG) ]
if self.faksimile_image is not None and self.text_field is not None:
for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks:
simple_word.init_word(self)
for wim in self.word_insertion_marks:
if wim.line_number > -1:
wim.line = [ line for line in self.lines if line.id == wim.line_number ][0]
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
"""Update word ids and attach them to page.page_tree.
"""
if not self.is_locked():
update_function_on_word = [ update_function_on_word ]\
if type(update_function_on_word) != list\
else update_function_on_word
for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
node.getparent().remove(node)
for index, word in enumerate(self.words):
word.id = index
for func in update_function_on_word:
if callable(func):
func(word)
word.attach_word_to_tree(self.page_tree)
for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
mark_foreign_hands.id = index
if MarkForeignHands in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(mark_foreign_hands)
mark_foreign_hands.attach_word_to_tree(self.page_tree)
for index, text_connection_mark in enumerate(self.text_connection_marks):
text_connection_mark.id = index
if TextConnectionMark in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(text_connection_mark)
text_connection_mark.attach_word_to_tree(self.page_tree)
else:
print('locked')
def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
"""Update the data source of page.
"""
if faksimile_svgFile is not None:
self.faksimile_svgFile = faksimile_svgFile
data_node = self.page_tree.xpath('.//data-source')[0]\
if len(self.page_tree.xpath('.//data-source')) > 0\
else ET.SubElement(self.page_tree.getroot(), 'data-source')
data_node.set('file', self.faksimile_svgFile)
if xml_correction_file is not None:
data_node.set('xml-corrected-words', xml_correction_file)
def update_line_number_area(self, transkription_field, svg_tree=None):
"""Determines the width of the area where the line numbers are written in the page.source file.
"""
THRESHOLD = 0.4
if svg_tree is None:
svg_tree = ET.parse(self.source)
if len(self.line_numbers) > 1:
line_number = self.line_numbers[9]\
if transkription_field.is_page_verso() and len(self.line_numbers) > 8\
else self.line_numbers[1]
ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
and LineNumber.IS_A_LINE_NUMBER(item)\
and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ]
if len(ln_nodes) > 0:
matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
if transkription_field.is_page_verso():
transkription_field.add_line_number_area_width(matrix.getX())
elif self.svg_file is not None and isfile(self.svg_file):
svg_path_tree = ET.parse(self.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
svg_x = matrix.getX()
svg_y = self.line_numbers[1].bottom + transkription_field.ymin
use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
.format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
if len(use_nodes) > 0:
symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
if len(d_strings) > 0 and d_strings[0] != '':
path = parse_path(d_strings[0])
xmin, xmax, ymin, ymax = path.bbox()
width = xmax - xmin
transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
"""Adds a source to page and attaches it to page_tree.
"""
if transkription_field is None:
if self.source is None or not isfile(self.source):
raise FileNotFoundError('Page does not have a source!')
transkription_field = TranskriptionField(self.source)
self.page_type = Page.PAGE_VERSO\
if transkription_field.is_page_verso()\
else Page.PAGE_RECTO
self.page_tree.getroot().set('pageType', self.page_type)
def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False):
"""Update styles of words and add them to their transkription_positions.
Args:
add_to_parents: Add styles also to word (and if not None to manuscript).
partition_according_to_styles: Partition word if its transkription_positions have different styles.
"""
style_dictionary = {}
if words is None:
words = self.words
for word in words:
if len(word.word_parts) > 0:
self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\
add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles)
for transkription_position in word.transkription_positions:
if len(transkription_position.positional_word_parts) > 0:
style_class = transkription_position.positional_word_parts[0].style_class
writing_process_id = -1
for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]:
writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id)
if create_css:
if style_dictionary.get((style_class_key, word.deleted)) is None:
color = word.deletion_paths[0].style.color\
if len(word.deletion_paths) > 0 else None
style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\
create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] )
transkription_position.style = style_dictionary[(style_class_key, word.deleted)]
#print(style_dictionary[(style_class_key, word.deleted)])
else:
if style_dictionary.get(style_class_key) is None:
style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css)
style_dictionary[style_class_key].writing_process_id = style_class_key[1]
transkription_position.style = style_dictionary[style_class_key]
if add_to_parents and transkription_position.style not in word.styles:
word.styles.append(transkription_position.style)
if partition_according_to_styles:
word.split_according_to_status('style', splits_are_parts=True)
if manuscript is not None\
and add_to_parents:
manuscript.update_styles(*style_dictionary.values())

Event Timeline