Page MenuHomec4science

page.py
No OneTemporary

File Metadata

Created
Fri, May 3, 13:45
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import re
import sys
import warnings
from .box import Box
from .color import Color
from .image import Image, SVGImage
from .editor_comment import EditorComment
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
from .imprint import Imprint
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .super_page import SuperPage
from .style import Style
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_deletion_path import WordDeletionPath
from .word_insertion_mark import WordInsertionMark
sys.path.append('py2ttl')
from class_spec import SemanticClass
sys.path.append('shared_util')
from main_util import extract_paths_on_tf, get_paths_near_position
# Module-level aliases for constants defined on SuperPage, so that other
# modules can use them without importing SuperPage directly.
FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT
STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK
STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK
class Page(SemanticClass,SuperPage):
    """
    This class represents a page.
    Args:
        xml_source_file (str): name of the xml file to be instantiated.
        faksimile_image: FaksimileImage.
        faksimile_svgFile: svg file containing information about word positions.
    """
    # Toggled by tests; presumably suppresses output/side effects elsewhere — TODO confirm.
    UNITTESTING = False
    def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_paths_near_words=False, warn=False, number=None):
        """Initialize a page from *xml_source_file*; when no source file is
        given, create a dummy page carrying only *number* (page_tree is None).

        Args:
            add_paths_near_words: also collect deletion paths near words (see add_deletion_paths_to_words).
            warn: emit warnings for deleted words without deletion paths.
        """
        if xml_source_file is not None:
            # Full initialization: SuperPage parses the xml file into page_tree.
            super(Page,self).__init__(xml_source_file)
            self.update_property_dictionary('faksimile_image', faksimile_image)
            self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile)
            self.init_all_properties()
            self.add_style(style_node=self.page_tree.getroot().find('.//style'))
            # Text fields are filled in by init_node_objects if the images define them.
            self.faksimile_text_field = None
            self.svg_text_field = None
            self.init_node_objects()
            self.warn = warn
            self.add_deletion_paths_to_words(add_paths_near_words)
        else:
            # Dummy page: no xml tree, just an identifying number.
            self.page_tree = None
            self.number = number
    def add_deletion_paths_to_words(self, add_paths_near_words=False):
        """Add deletion paths to words.

        Candidates are words that are marked deleted but still lack deletion
        paths (directly or in one of their word_parts), plus words whose
        process_flags contain 'add_paths_near_words'. When
        *add_paths_near_words* is True, paths located near such words are
        additionally collected on word.deletion_paths_near_word.
        """
        # Words without parts that are deleted but have no paths yet, or that
        # were explicitly flagged for the near-word path search.
        words = [ word for word in self.words if (len(word.word_parts) == 0 and word.deleted and len(word.deletion_paths) == 0)\
                or 'add_paths_near_words' in word.process_flags ]
        # Words with at least one deleted part that has no paths yet.
        words += [ word for word in self.words\
                if len(word.word_parts) > 0 and True in\
                [ (wp.deleted and len(wp.deletion_paths) == 0) for wp in word.word_parts ]]
        if len(words) > 0 and ((self.svg_file is not None and isfile(self.svg_file))\
           or (self.source is not None and isfile(self.source))):
            svg_file = self.svg_file if self.svg_file is not None else self.source
            transkription_field = TranskriptionField(svg_file)
            # Coordinates are shifted by the transkription field origin unless
            # the svg image defines its own text field.
            tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0
            tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0
            word_deletion_paths = self.word_deletion_paths
            index = 0
            dp_updated = False
            while index < len(words):
                word = words[index]
                word.add_deletion_paths(word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
                if len(word.deletion_paths) > 0 or True in [ len(w.deletion_paths) > 0 for w in word.word_parts ]:
                    # Register any newly found paths on the page.
                    deletion_paths = word.deletion_paths
                    for wp in word.word_parts: deletion_paths += wp.deletion_paths
                    for deletion_path in deletion_paths:
                        if deletion_path not in self.word_deletion_paths:
                            self.word_deletion_paths.append(deletion_path)
                elif not dp_updated:
                    # No match: re-extract the paths from the file once and
                    # retry the same word (index is decremented here and
                    # incremented again at the bottom of the loop).
                    word_deletion_paths = extract_paths_on_tf(self)
                    dp_updated = True
                    index -= 1
                if add_paths_near_words\
                   and ('add_paths_near_words' in word.process_flags\
                   or ((word.deleted and len(word.deletion_paths) == 0)\
                        or True in [ (w.deleted and len(w.deletion_paths) == 0) for w in word.word_parts ])):
                    # Make sure the freshly extracted paths are used for the
                    # near-word search as well.
                    if not dp_updated\
                       and 'add_paths_near_words' in word.process_flags:
                        word_deletion_paths = extract_paths_on_tf(self)
                        dp_updated = True
                    transform = None
                    tp = None
                    target_word = word
                    # NOTE(review): paths_near_word is assigned but never used.
                    paths_near_word = []
                    if word.deleted and len(word.transkription_positions) > 0:
                        transform = word.transkription_positions[0].transform
                        for tp in word.transkription_positions:
                            word.deletion_paths_near_word += get_paths_near_position(tp, word_deletion_paths)
                    elif len(word.word_parts) > 0:
                        for wp in word.word_parts:
                            if wp.deleted and len(wp.transkription_positions) > 0:
                                target_word = wp
                                for tp in wp.transkription_positions:
                                    # NOTE(review): unlike the word case above this
                                    # overwrites per position instead of '+=' —
                                    # possibly intended to accumulate; confirm.
                                    wp.deletion_paths_near_word = get_paths_near_position(tp, word_deletion_paths)
                    if self.warn and (word.deleted and len(word.deletion_paths) == 0):
                        warnings.warn(\
                        f'WARNING: {self.title} {self.number}: {word.id} on {word.line_number}, {word.text} has no deletion paths! {target_word.deletion_paths_near_word}, {transform}')
                index += 1
@classmethod
def create_cls(cls, xml_source_file=None, create_dummy_page=False, isBlank=False, page_node=None):
"""Create a Page.
"""
if not create_dummy_page:
page = cls(xml_source_file)
page.status = 'complete'
if isBlank:
page.status = 'blank'
page.words = []
page.lines = []
page.word_deletion_paths = []
page.word_insertion_marks = []
return page
else:
m = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file)
if m is not None and len(m.groups()) > 3:
number = m.group(3)
else:
number = basename(xml_source_file).replace('.xml','')
return cls(number=number)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
"""Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
[optional: instantiation depends on the fulfilment of a status_contains
and/or on the selection of some words by a word_selection_function].
"""
source_tree = ET.parse(xml_file)
if source_tree.getroot().find('metadata/type').text == cls.FILE_TYPE_SVG_WORD_POSITION:
page = cls(xml_file)
if word_selection_function is None or len(word_selection_function(page.words)) > 0:
return [ page ]
else:
return []
elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
pages = []
xpath = '//page/@output'
if status_contains != '' and status_not_contain != '':
xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
elif status_contains != '':
xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
elif status_not_contain != '':
xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
for xml_source_file in source_tree.xpath(xpath):
if isfile(xml_source_file):
pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
return pages
else:
return []
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = { 'number': { 'class': str, 'cardinality': 1}}
properties.update(cls.create_semantic_property_dictionary('status', str,\
name='pageHasDataProcessingStatus', label='status of data processing',\
comment='The status of the data processing of this page'))
properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE))
properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\
name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\
comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
properties.update(cls.create_semantic_property_dictionary('orientation', str))
properties.update(cls.create_semantic_property_dictionary('status', str,\
name='pageHasDataProcessingStatus', label='status of data processing',\
comment='The status of the data processing of this page'))
properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE))
properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\
name='pageIsOnSVGTextField', label='page is on svg text field',\
comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
for key in [ 'lines','imprints', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks', 'editor_comments']:
properties.update(cls.create_semantic_property_dictionary(key, list))
dictionary.update({cls.CLASS_KEY: class_dict})
dictionary.update({cls.PROPERTIES_KEY: properties})
return cls.return_dictionary_after_updating_super_classes(dictionary)
def get_word_deletion_path(self, path=None, d_attribute=None) ->WordDeletionPath:
"""Return a word deletion path that belongs to page.
"""
if path is None and d_attribute is None:
raise Exception('ERROR: get_word_deletion_path needs a path or a d_attribute!')
if d_attribute is None:
d_attribute = path.d_attribute
page_paths = [ dpath for dpath in self.word_deletion_paths if dpath.d_attribute == d_attribute ]
if len(page_paths) > 0:
return page_paths[0]
else:
dpath = WordDeletionPath.create_cls(self, path=path, d_attribute=d_attribute)
if dpath is not None:
dpath.id = len(self.word_deletion_paths)
self.word_deletion_paths.append(dpath)
dpath.attach_object_to_tree(self.page_tree)
return dpath
    def init_node_objects(self):
        """Initialize all node objects.

        Instantiates words, marks, line numbers, lines, imprints, writing
        processes, deletion paths and editor comments from page_tree and
        wires them up with this page.
        """
        self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
        self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ]
        self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('./' + MarkForeignHands.XML_TAG) ]
        #self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ]
        # Text connection marks are appended to the word list as extra words.
        self.words += [ TextConnectionMark.instantiate_as_word(node, id=index+len(self.words))\
                for index, node in enumerate(self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG)) ]
        self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
        self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
        self.imprints = [ Imprint.create_cls_from_node(imprint_node, self.lines) for imprint_node in self.page_tree.getroot().xpath('//' + Imprint.XML_TAG) ]
        self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
        self.word_deletion_paths = [ WordDeletionPath.create_cls(self, node=node) for node in self.page_tree.xpath('./' + WordDeletionPath.XML_TAG) ]
        self.editor_comments = [ EditorComment.create_cls_from_node(node=node) for node in self.page_tree.xpath('./' + EditorComment.XML_TAG) ]
        # Mirror the images' text fields onto the page (used for semantic export).
        if self.faksimile_image is not None and self.faksimile_image.text_field is not None:
            self.faksimile_text_field = self.faksimile_image.text_field
        if self.svg_image is not None and self.svg_image.text_field is not None:
            self.svg_text_field = self.svg_image.text_field
        # NOTE(review): self.text_connection_marks is not assigned in this
        # method (the assignment above is commented out); presumably it is
        # initialized by init_all_properties — confirm.
        for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks:
            simple_word.init_word(self)
        for wim in self.word_insertion_marks:
            if wim.line_number > -1:
                wim.line = [ line for line in self.lines if line.id == wim.line_number ][0]
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
"""Update word ids and attach them to page.page_tree.
"""
if not self.is_locked():
update_function_on_word = [ update_function_on_word ]\
if type(update_function_on_word) != list\
else update_function_on_word
for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
node.getparent().remove(node)
for index, word in enumerate(self.words):
word.id = index
for func in update_function_on_word:
if callable(func):
func(word)
word.attach_word_to_tree(self.page_tree)
for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
mark_foreign_hands.id = index
if MarkForeignHands in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(mark_foreign_hands)
mark_foreign_hands.attach_word_to_tree(self.page_tree)
for index, text_connection_mark in enumerate(self.text_connection_marks):
text_connection_mark.id = index
if TextConnectionMark in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(text_connection_mark)
text_connection_mark.attach_word_to_tree(self.page_tree)
else:
print('locked')
def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
"""Update the data source of page.
"""
if faksimile_svgFile is not None:
self.faksimile_svgFile = faksimile_svgFile
data_node = self.page_tree.xpath('.//data-source')[0]\
if len(self.page_tree.xpath('.//data-source')) > 0\
else ET.SubElement(self.page_tree.getroot(), 'data-source')
data_node.set('file', self.faksimile_svgFile)
if xml_correction_file is not None:
data_node.set('xml-corrected-words', xml_correction_file)
    def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True):
        """Determines the width of the area where the line numbers are written in the page.source file.

        Args:
            transkription_field: TranskriptionField whose line number area width is updated.
            svg_tree: parsed svg tree of page.source (parsed on demand if None).
            set_to_text_field_zero: add transkription_field.ymin to the y
                lookup position (coordinates relative to text field zero).
        """
        THRESHOLD = 0.4  # tolerance in svg units when matching glyph positions
        if svg_tree is None:
            svg_tree = ET.parse(self.source)
        if len(self.line_numbers) > 1:
            # Reference line number: the 10th on verso pages (when available),
            # otherwise the 2nd.
            line_number = self.line_numbers[9]\
                    if transkription_field.is_page_verso() and len(self.line_numbers) > 8\
                    else self.line_numbers[1]
            # Find the svg text node that renders this line number near the
            # transkription field.
            ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
                    if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
                    and LineNumber.IS_A_LINE_NUMBER(item)\
                    and LineNumber(raw_text_node=item).id == line_number.id ]
            if len(ln_nodes) > 0:
                matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
                if transkription_field.is_page_verso():
                    # Verso: the x position of the number is the area width.
                    transkription_field.add_line_number_area_width(matrix.getX())
                elif self.svg_file is not None and isfile(self.svg_file):
                    # Recto: additionally add the rendered glyph width, looked
                    # up in the svg path file.
                    svg_path_tree = ET.parse(self.svg_file)
                    namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
                    svg_x = matrix.getX()
                    svg_y = self.line_numbers[1].bottom + transkription_field.ymin\
                            if set_to_text_field_zero\
                            else self.line_numbers[1].bottom
                    # Find <use> nodes within THRESHOLD of the expected position.
                    use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
                            .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
                    if len(use_nodes) > 0:
                        # Resolve the referenced symbol and measure its path's
                        # bounding box to obtain the glyph width.
                        symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
                        d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
                        if len(d_strings) > 0 and d_strings[0] != '':
                            path = parse_path(d_strings[0])
                            xmin, xmax, ymin, ymax = path.bbox()
                            width = xmax - xmin
                            transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
"""Adds a source to page and attaches it to page_tree.
"""
if self.number.endswith('r')\
or self.number.endswith('v'):
self.page_type = Page.PAGE_VERSO\
if self.number.endswith('v')\
else Page.PAGE_RECTO
else:
if transkription_field is None:
if self.source is None or not isfile(self.source):
raise FileNotFoundError('Page does not have a source!')
transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index)
self.page_type = Page.PAGE_VERSO\
if transkription_field.is_page_verso()\
else Page.PAGE_RECTO
self.page_tree.getroot().set('pageType', self.page_type)
    def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False, parentsPWPs=None):
        """Update styles of words and add them to their transkription_positions.
        Args:
            add_to_parents: Add styles also to word (and if not None to manuscript).
            partition_according_to_styles: Partition word if its transkription_positions have different styles.
            create_css: also create css information and key styles by deletion state.
            parentsPWPs: positional word parts inherited from a parent word,
                used as fallback when a transkription position has none.
        """
        # Cache of created Style objects, keyed by (style_class_key) or, with
        # create_css, by (style_class_key, word.deleted).
        style_dictionary = {}
        if words is None:
            words = self.words
        for word in words:
            # Recurse into the word's parts first.
            if len(word.word_parts) > 0:
                self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\
                        add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles, parentsPWPs=parentsPWPs)
            # Earlier versions / overwritten words inherit the current word's
            # positional word parts so that their styles can be resolved.
            overwritten = [] if word.overwrites_word is None else [ word.overwrites_word ]
            if word.earlier_version is not None:
                overwritten.append(word.earlier_version)
            if len(overwritten) > 0:
                parentsPWPs = parentsPWPs if parentsPWPs is not None else []
                if len(parentsPWPs) == 0:
                    cword = word.word_parts[0] if len(word.word_parts) > 0 else word
                    for tp in cword.transkription_positions:
                        parentsPWPs += tp.positional_word_parts
                self.update_styles(words=overwritten, manuscript=manuscript, create_css=create_css,\
                        add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles, parentsPWPs=parentsPWPs)
            for transkription_position in word.transkription_positions:
                # Fall back to the inherited positional word parts when the
                # position has none of its own.
                positional_word_parts = transkription_position.positional_word_parts\
                        if len(transkription_position.positional_word_parts) > 0\
                        else parentsPWPs
                if len(positional_word_parts) > 0:
                    style_class = positional_word_parts[0].style_class
                    # Map font size keys to the writing process (stage) id;
                    # -1 means no stage could be determined.
                    writing_process_id = -1
                    for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]:
                        writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
                    style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id)
                    if create_css:
                        # With css, styles are additionally keyed by deletion
                        # state so deleted words can carry a deletion color.
                        if style_dictionary.get((style_class_key, word.deleted)) is None:
                            color = None
                            if len(word.deletion_paths) > 0:
                                if word.deletion_paths[0].style_class is not None\
                                   and word.deletion_paths[0].style_class != ''\
                                   and self.style_dict.get(word.deletion_paths[0].style_class) is not None:
                                    color = Color.create_cls_from_style_object(self.style_dict.get(word.deletion_paths[0].style_class))
                                else:
                                    color = Color()
                            style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\
                                    create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] )
                        transkription_position.style = style_dictionary[(style_class_key, word.deleted)]
                        #print(style_dictionary[(style_class_key, word.deleted)])
                    else:
                        if style_dictionary.get(style_class_key) is None:
                            style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css)
                            style_dictionary[style_class_key].writing_process_id = style_class_key[1]
                        transkription_position.style = style_dictionary[style_class_key]
                    if add_to_parents and transkription_position.style not in word.styles:
                        word.styles.append(transkription_position.style)
            if partition_according_to_styles:
                word.split_according_to_status('style', splits_are_parts=True)
        if manuscript is not None\
           and add_to_parents:
            # Propagate all styles created here to the manuscript.
            manuscript.update_styles(*style_dictionary.values())
def __eq__(self, other):
"""Returns true if self is qualitatively identical to other.
"""
if other is None:
return False
if self.page_tree is None and other.page_tree is None:
return self.number == other.number
if self.page_tree is None or other.page_tree is None:
return False
return self.page_tree.docinfo.URL == other.page_tree.docinfo.URL
    def __hash__(self):
        """Return a hash value for self.

        Dummy pages (page_tree is None) hash on their number; fully
        instantiated pages hash on the URL of their xml document.
        """
        try:
            if self.page_tree is None:
                return hash(self.number)
        except AttributeError:
            # A page that never set page_tree at all: printed for debugging,
            # then hashed on its number like a dummy page.
            print(self)
            return hash(self.number)
        return hash(self.page_tree.docinfo.URL)

Event Timeline