word.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, Jul 12, 10:19

word.py
View Options

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	""" This class can be used to represent a word.
	"""
	# Copyright (C) University of Basel 2019 {{{1
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}

	__author__ = "Christian Steiner"
	__maintainer__ = __author__
	__copyright__ = 'University of Basel'
	__email__ = "christian.steiner@unibas.ch"
	__status__ = "Development"
	__license__ = "GPL v3"
	__version__ = "0.0.1"

	import copy
	import inspect
	from lxml import etree as ET
	from operator import attrgetter
	import re
	import string
	import sys
	import warnings

	from .box import Box
	from .editor_comment import EditorComment
	from .matrix import Matrix
	from .path import Path
	from .simple_word import SimpleWord
	from .style import Style
	from .word_deletion_path import WordDeletionPath
	from .word_position import WordPosition
	from .transkription_position import TranskriptionPosition
	from .writing_process import WritingProcess

	# Matches a text that consists of exactly one punctuation character (ASCII
	# punctuation or an en dash). string.punctuation is escaped because it contains
	# regex metacharacters: unescaped, its "\]" sequence is read as an escaped "]",
	# so a lone backslash would never match.
	SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(re.escape(string.punctuation))

	def execute_function_on_parts(word_parts, func_name):
	    """Call a method on every word and replace each word that got split by its parts.

	    The method named ``func_name`` is called without arguments on each word of
	    ``word_parts``. If a word has a non-empty ``word_parts`` list afterwards, the
	    word is replaced in the returned list by those parts (in order) and its own
	    ``word_parts`` is reset to an empty list.

	    :param word_parts: list of words; the list itself is not modified.
	    :param func_name: name of the zero-argument method to call on each word.
	    :return: new word_parts, output from func (the return value of the last
	        call, or None if ``word_parts`` is empty)
	    """
	    copy_parts = word_parts[:]
	    # Defined up front so that an empty word_parts does not raise UnboundLocalError.
	    output = None
	    for word in word_parts:
	        # Look the method up by name instead of evaluating a code string.
	        output = getattr(word, func_name)()
	        if len(word.word_parts) > 0:
	            position = copy_parts.index(word)
	            # Swap the word for its parts at the same place, keeping their order.
	            copy_parts[position:position + 1] = word.word_parts
	            word.word_parts = []
	    return copy_parts, output

	def update_transkription_position_ids(word):
	    """Renumber the word's parts and transkription positions.

	    Word parts are renumbered by list index, but only if their current ids are
	    not unique. Transkription positions are numbered in ascending order of
	    their ``left`` value; each of them additionally gets ``has_box`` set to
	    None and ``deleted`` set to False.
	    """
	    part_ids = [part.id for part in word.word_parts]
	    ids_are_unique = len(set(part_ids)) == len(part_ids)
	    if not ids_are_unique:
	        for new_part_id, part in enumerate(word.word_parts):
	            part.id = new_part_id
	    positions_from_left = sorted(word.transkription_positions, key=attrgetter('left'))
	    for new_id, position in enumerate(positions_from_left):
	        position.id = new_id
	        position.has_box = None
	        position.deleted = False

	class Word(SimpleWord):
	"""
	This class represents a word.

	Besides what SimpleWord is initialized with (id, text, line number and
	positions), a word keeps its deletion status, its word parts (which are
	words themselves), an earlier version, a word it overwrites, a word box,
	styles and corrections.
	"""
	# NOTE(review): COPY_PROPERTY_KEY and APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS are not
	# referenced in the visible part of this file; presumably they are used elsewhere.
	COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ]
	APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' }
	# Tag name of the node under which CREATE_WORD looks for 'part' nodes.
	DATA = 'debug-data'
	RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText']
	# Tag names used when words are written to / read from an XML tree.
	XML_TAG = 'word'
	XML_EARLIER_VERSION = 'earlier-version'
	XML_OVERWRITES = 'overwrites'
	# Maps the name of a correction attribute of a word (key) to the name of the XML
	# attribute (value) that attach_word_to_tree sets to 'true' if the attribute is not None.
	XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\
	'isDeletionOfWord': 'deletesEarlierPart',\
	'isExtensionOfWord': 'extendsEarlierVersion',\
	'isTransformationOfWord': 'transformsEarlierPart' }

	def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None):
	    """Initialize a word.

	    id, text, line_number and both position lists are handed to SimpleWord.
	    If the resulting text is empty, it is taken from the texts of the
	    transkription positions. ``box_paths`` is accepted but not used here.
	    ``styles``, ``word_parts`` and ``word_part_objs`` default to new empty lists.
	    """
	    super(Word, self).__init__(
	        id=id,
	        text=text,
	        line_number=line_number,
	        transkription_positions=transkription_positions,
	        faksimile_positions=faksimile_positions)
	    self.corrections = []
	    self.deleted = deleted
	    self.deletion_paths = []
	    self.debug_container = {}
	    self.debug_msg = None
	    self.earlier_version = earlier_version
	    self.edited_text = None
	    self.editor_comment = None
	    # Correction relations; their names are the keys of XML_CORRECTION_DICT.
	    self.isClarificationOfWord = None
	    self.isDeletionOfWord = None
	    self.isExtensionOfWord = None
	    self.isTransformationOfWord = None
	    if len(self.text) == 0:
	        # Only positions of exactly type TranskriptionPosition decide whether
	        # there is text at all; the text itself is joined from all positions.
	        text_of_typed_positions = ''.join(
	            [tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition])
	        if len(text_of_typed_positions) > 0:
	            self.text = ''.join([tp.get_text() for tp in self.transkription_positions])
	    self.overwrites_word = None
	    if styles is None:
	        styles = []
	    self.styles = styles
	    self.verified = None
	    self.writing_process_id = writing_process_id
	    self.writing_processes = []
	    self.word_insertion_mark = None
	    self.word_box = None
	    if word_parts is None:
	        word_parts = []
	    self.word_parts = word_parts
	    if word_part_objs is None:
	        word_part_objs = []
	    self.word_part_objs = word_part_objs

	def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
	"""Add a word deletion path to word.
	"""
	if len(self.word_parts) > 0:
	for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
	elif self.deleted and len(self.transkription_positions) > 0:
	word_path = Path.create_path_from_transkription_position(self.transkription_positions[0],\
	tr_xmin=tr_xmin, tr_ymin=tr_ymin)
	self.deletion_paths = [ deletion_path for deletion_path in deletion_paths\
	if do_paths_intersect_saveMode(deletion_path, word_path) ]

	def attach_word_to_tree(self, target_tree):
	    """Attach the word to target_tree and return the created word node.

	    The node created by the super class is extended with the attributes
	    'deleted', 'verified', 'edited-text', 'writing-process-id' and
	    'corrections', with one 'true' flag per correction relation that is set
	    (see XML_CORRECTION_DICT), and with child nodes for the editor comment,
	    the word parts, the earlier version, the overwritten word and the word box.

	    :param target_tree: tree or node that is passed on to the super class.
	    :return: the word node
	    """
	    word_node = super(Word,self).attach_word_to_tree(target_tree)
	    if self.deleted is not None:
	        word_node.set('deleted', str(self.deleted).lower())
	    if self.verified is not None:
	        word_node.set('verified', str(self.verified).lower())
	    if self.edited_text is not None:
	        word_node.set('edited-text', self.edited_text)
	    if self.editor_comment is not None:
	        self.editor_comment.attach_object_to_tree(word_node)
	    if self.writing_process_id > -1:
	        word_node.set('writing-process-id', str(self.writing_process_id))
	    for index, word_part in enumerate(self.word_parts):
	        # Part ids always mirror the position in word_parts when written out.
	        word_part.id = index
	        word_part.attach_word_to_tree(word_node)
	    if self.earlier_version is not None:
	        earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
	        self.earlier_version.attach_word_to_tree(earlier_node)
	    if self.overwrites_word is not None\
	    and len(self.overwrites_word.transkription_positions) > 0:
	        overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES)
	        self.overwrites_word.attach_word_to_tree(overwrite_node)
	    if self.word_box is not None:
	        self.word_box.attach_object_to_tree(word_node)
	    if len(self.corrections) > 0:
	        # dict.fromkeys removes duplicate ids but keeps their order, so the
	        # attribute value is the same on every run (a set of strings is not ordered).
	        correction_ids = dict.fromkeys(str(word.id) for word in self.corrections)
	        word_node.set('corrections', ' '.join(correction_ids))
	    for key, xml_attribute in self.XML_CORRECTION_DICT.items():
	        if getattr(self, key, None) is not None:
	            word_node.set(xml_attribute, 'true')
	    return word_node

	def belongs_to_multiple_writing_processes(self, include_parts=False):
	"""Returns true if transkription_positions belong to different WritingProcesses.
	"""
	if len(self.word_parts) > 0 and include_parts:
	return len(set(word.writing_process_id for word in self.word_parts)) > 1
	return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1

	def set_parent_word_writing_process_id(self):
	"""Set writing_process_id for parent word.

	The id is derived from the styles of the first transkription position of
	each word part; parts without transkription positions or without a style
	are ignored.
	"""
	# Despite its name, 'ids' is a set of style objects, not of ids.
	ids = set(word.transkription_positions[0].style for word in self.word_parts\
	if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None)
	if len(ids) > 1:
	# More than one distinct style: use the highest writing process id among them.
	self.writing_process_id = max([style.writing_process_id for style in ids])
	# If the styles still differ once copied without their writing process id,
	# the id is incremented by one.
	# NOTE(review): the original indentation is lost in this view; presumably this
	# check is nested inside the 'len(ids) > 1' branch -- confirm.
	if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\
	for word in self.word_parts\
	if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\
	> 1:
	self.writing_process_id += 1


	@classmethod
	def create_cls(cls, word_node):
	"""Creates a word from a (lxml.Element) node.

	Reads the attributes 'writing-process-id', 'join', 'split', 'verified',
	'deleted', 'edited-text' and 'corrections' as well as the child nodes for
	the editor comment, word parts, earlier version, overwritten word and box.

	:param word_node: the node that represents the word.
	:raises Exception: if the joined 'split' strings differ from the text.
	[:return:] Word
	"""
	# From here on 'cls' no longer names the class but the object returned by the super class.
	cls = super(Word,cls).create_cls(word_node)
	cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
	cls.split_strings = None
	cls.join_string = word_node.get('join')
	if bool(word_node.get('split')):
	cls.split_strings = word_node.get('split').split(' ')
	# The split strings must add up to exactly the text of the word.
	if ''.join(cls.split_strings) != cls.text:
	error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
	format(word_node.getroottree().docinfo.URL, str(cls.id))\
	+ 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
	+ 'Text attribute: "{0}".\n'.format(cls.text)
	raise Exception(error_msg)
	# 'verified' and 'deleted' are None when the attribute is missing or empty.
	cls.verified = word_node.get('verified') == 'true'\
	if bool(word_node.get('verified')) else None
	cls.deleted = word_node.get('deleted') == 'true'\
	if bool(word_node.get('deleted')) else None
	cls.edited_text = word_node.get('edited-text')
	# Only the first editor comment child is used.
	cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\
	if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None
	# Direct child word nodes become the word parts (created recursively).
	cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ]
	# 'corrections' holds space separated indices into word_parts; indices out of range are ignored.
	if bool(word_node.get('corrections')):
	for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
	if index < len(cls.word_parts):
	cls.corrections.append(cls.word_parts[index])
	cls.earlier_version = None
	if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0:
	cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0]
	# XML flags are stored under the XML attribute name (e.g. 'deletesEarlierPart'),
	# i.e. under the values, not the keys, of XML_CORRECTION_DICT.
	for key_value in cls.XML_CORRECTION_DICT.values():
	if word_node.get(key_value) == 'true':
	cls.__dict__[key_value] = True
	# Resolve the flags found on the word parts into references: '...Part' flags point to
	# the part with the same id in the earlier version, '...EarlierVersion' flags to the
	# earlier version itself, '...Word' flags to this word.
	if cls.earlier_version is not None:
	for word_part in cls.word_parts:
	for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]:
	if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\
	and len(cls.word_parts) <= len(cls.earlier_version.word_parts):
	try:
	word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id]
	except Exception:
	# Re-raised with the ids and the text that identify the failing word part.
	msg = f'{cls.id} {cls.text}: {word_part.id}'
	raise Exception(msg)
	for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]:
	if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
	word_part.__dict__[key] = cls.earlier_version
	for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]:
	if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
	word_part.__dict__[key] = cls
	# Only the first overwritten word and the first box are used.
	cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\
	if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\
	else None
	cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\
	if len(word_node.xpath('./' + Box.XML_TAG)) > 0\
	else None
	return cls

	@classmethod
	def join_words(cls, list_of_words):
	"""Creates a word from a list of words.

	For more than one word a new word is created that has the given words as its
	word parts; words that already have word parts are replaced by those parts.
	NOTE: list_of_words is modified in place and becomes the word_parts of the new word.

	:param list_of_words: the words to join.
	[:return:] Word -- the new word, the only word of a one-element list, or None for an empty list.
	"""
	if len(list_of_words) > 1:
	# The new word counts as deleted only if all words are deleted.
	deleted = True in [ word.deleted for word in list_of_words ]\
	and len(set([ word.deleted for word in list_of_words ])) == 1
	# The line number is kept only if all words share it.
	line_number = list_of_words[0].line_number\
	if len(set([ word.line_number for word in list_of_words ])) == 1\
	else -1
	# NOTE(review): the list is changed while it is being iterated; a part inserted at
	# the current index is not visited again by this loop.
	for word in list_of_words:
	if len(word.word_parts) > 0:
	index = list_of_words.index(word)
	list_of_words.remove(word)
	# Inserting in reversed order at the same index keeps the parts in their original order.
	for part_word in reversed(word.word_parts):
	list_of_words.insert(index, part_word)
	new_word = cls(id=list_of_words[0].id, text=''.join([word.text for word in list_of_words]),\
	line_number=line_number, deleted=deleted, word_parts=list_of_words)
	# If a part other than the last one ends with '-' or '=', the edited text is the
	# joined text in which the text of the first such part is replaced by that text without its last character.
	if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]:
	change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0]
	new_word.edited_text = new_word.text.replace(change_text, change_text[:-1])
	for id, word in enumerate(new_word.word_parts): word.id = id
	return new_word
	if len(list_of_words) > 0:
	return list_of_words[0]
	else:
	return None

	def create_earlier_version(self, root_word=None, id=0):
	"""Create an earlier version of word.

	Earlier versions are created recursively for all word parts. While doing so
	the word parts are marked as corrections of this word (deletion,
	transformation or extension) and collected in self.corrections.

	:param root_word: the word on which the recursion started; defaults to this word,
	    whose writing_process_id is then set from its parts first.
	:param id: id of the word that is returned.
	:return: a new Word that represents the earlier version.
	"""
	if root_word is None:
	root_word = self
	root_word.set_parent_word_writing_process_id()
	word_parts = []
	non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\
	if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ]
	non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts)
	# If every part that is not a single punctuation mark is deleted, the deletion
	# is recorded on this word and removed from those parts.
	if non_single_punctuation_word_parts_length > 0\
	and len([ word_part for word_part in non_single_punctuation_word_parts\
	if word_part.deleted ])\
	== non_single_punctuation_word_parts_length:
	self.deleted = True
	for word_part in non_single_punctuation_word_parts: word_part.deleted = False
	for id, word_part in enumerate(self.word_parts):
	earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id)
	# Case 1: a deleted part is a deletion of its own earlier version.
	if word_part.deleted:
	word_part.isDeletionOfWord = earlierWordPart
	word_parts.append(earlierWordPart)
	if word_part not in self.corrections:
	self.corrections.append(word_part)
	# Case 2: the part overwrites a word and either differs from it in style or its box has an earlier version.
	# NOTE(review): 'word_part.word_box.earlier_version' is evaluated whenever the style
	# comparison is false; presumably word_box is never None at that point -- confirm.
	elif word_part.overwrites_word is not None\
	and ((len(word_part.transkription_positions) > 0\
	and word_part.overwrites_word.transkription_positions[0].style is not None\
	and word_part.transkription_positions[0].style is not None\
	and word_part.transkription_positions[0].style\
	!= word_part.overwrites_word.transkription_positions[0].style)
	or word_part.word_box.earlier_version):
	word_part.overwrites_word.id = word_part.id
	word_parts.append(word_part.overwrites_word)
	word_part.isTransformationOfWord = word_part.overwrites_word
	#print(f'transform: {self.text}')
	if word_part not in self.corrections:
	self.corrections.append(word_part)
	# Case 3: the part was written in the writing process of the root word; it is only
	# flagged here and not added to the parts of the earlier version.
	elif root_word.writing_process_id > -1\
	and (len(word_part.transkription_positions) > 0\
	and word_part.transkription_positions[0].style is not None\
	and word_part.transkription_positions[0].style.writing_process_id\
	== root_word.writing_process_id):
	word_part.extendsEarlierVersion = True
	#print('extends')
	if word_part not in self.corrections:
	self.corrections.append(word_part)
	else:
	# NOTE(review): a deleted part is already handled by case 1, so the
	# 'word_part.deleted' check below looks unreachable -- confirm.
	if word_part.deleted:
	word_part.isDeletionOfWord = earlierWordPart
	word_parts.append(earlierWordPart)
	if word_part not in self.corrections:
	self.corrections.append(word_part)
	else:
	#print(f'default: {self.text}')
	word_parts.append(earlierWordPart)
	text = ''.join([ word.text for word in word_parts ])\
	if len(word_parts) > 0\
	else self.text
	# A single remaining part is merged into this word: its positions are added to the
	# positions of this word (in place) and the earlier version gets no word parts.
	if len(word_parts) == 1:
	self.transkription_positions += word_parts[0].transkription_positions
	self.faksimile_positions += word_parts[0].faksimile_positions
	word_parts = []
	# The earlier version works on deep copies of the transkription positions.
	new_transkription_positions = copy.deepcopy(self.transkription_positions)
	if len(self.transkription_positions) > 0\
	and self.transkription_positions[0].style is not None:
	writing_process_id = self.transkription_positions[0].style.writing_process_id
	# NOTE(review): assumes every copied position has a style if the first one has -- confirm.
	for new_tp in new_transkription_positions:
	new_tp.style.writing_process_id = writing_process_id
	return Word(id=id, text=text, transkription_positions=new_transkription_positions,\
	faksimile_positions=self.faksimile_positions, line_number=self.line_number,\
	word_parts=word_parts)

	def create_correction_history(self, page=None, box_style=None):
	"""Create correction history.

	If the word has a word box, the word it overwrites is created from the box's
	earlier text. The history of all word parts is created recursively. If the
	word has word parts, an earlier version is created, and it is stored on the
	word if that produced corrections.

	:param page: optional page used to create the style of the overwritten word.
	:param box_style: optional style for the overwritten word; replaced by the
	    style created from the page when a page is given.
	"""
	if self.word_box is not None:
	manuscript = self.transkription_positions[0].style.manuscript\
	if len(self.transkription_positions) > 0\
	and self.transkription_positions[0].style is not None\
	else None
	style = Style()
	if box_style is not None:
	style = box_style
	if page is not None:
	style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript)
	# The writing process id comes from the font keys of the box's style class that are
	# known to the page; the last matching key wins.
	# NOTE(review): this loop reads page.fontsizekey2stage_mapping, so it presumably
	# belongs to the 'page is not None' branch -- the indentation is lost in this view.
	for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]:
	style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
	# The overwritten word gets copies of this word's positions, all with the style determined above.
	transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions)
	for transkription_position in transkription_positions:
	transkription_position.style = style
	self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\
	line_number=self.line_number)
	for word_part in self.word_parts:
	word_part.create_correction_history(page=page, box_style=box_style)
	if len(self.word_parts) > 0:
	earlier_version = self.create_earlier_version()
	# Parts flagged as 'extendsEarlierVersion' are linked to the new earlier version.
	extending_words = self._get_parts_with_property_key('extendsEarlierVersion')
	if len(extending_words) > 0:
	for word in extending_words:
	word.isExtensionOfWord = earlier_version
	# If only some parts are deleted, the edited text consists of the parts that are not deleted.
	if self.has_mixed_status('deleted', include_parts=True):
	self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ])
	if len(self.corrections) > 0:
	self.earlier_version = earlier_version

	@staticmethod
	def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
	"""Creates a word from a (lxml.Element) node or word_part_objs.

	If word_node is given, all other data is read from the node. Otherwise the
	word is built from word_part_objs: mappings with the keys 'x', 'y', 'text'
	and, if the page has a style_dict, 'class'.

	:param word_node: node to read the word from (takes precedence).
	:param page: optional page; used for font sizes and for the line number.
	:param word_part_objs: parts of the word. The mutable default is only read or rebound here, never modified.
	:param id: id of the new word (ignored if word_node is given).
	:param height: height of the word; recalculated if the page has a style_dict.
	:param endX: x coordinate where the word ends.
	:param endSign: the width of the word is extended if this contains '%'.
	:param matrix: passed on to the created WordPosition.
	:param line_number: fallback line number.
	:param debug_msg: stored on a word created from word_part_objs.
	:raises Exception: if word_node is None and word_part_objs is empty.
	[:return:] Word
	"""
	if word_node is not None: # init word from xml node
	id = int(word_node.get('id'))
	line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
	text = word_node.get('text')
	deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
	transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
	faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
	# Parts are read from below the debug data node if there is one, otherwise from any 'part' node.
	word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
	if len(word_node.findall('.//' + Word.DATA)) > 0\
	else [ item.attrib for item in word_node.findall('.//part')]
	return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
	faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
	elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
	WIDTH = 5
	TOPCORRECTION = 2.0
	FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
	# (no-op assignment)
	height = height
	x = round(float(word_part_objs[0]['x']), 3)
	if(page is not None and bool(page.style_dict)):
	HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
	# All style classes that occur in the parts of the word.
	style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
	biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
	height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
	TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
	if endSign is not None and '%' in endSign:
	# Extend endX by a width derived from the font size of the last part (1 if no font size is found).
	lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
	for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
	if bool(page.style_dict[key].get('font-size'))]
	lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
	endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
	# NOTE(review): presumably this 'elif' pairs with the page/style_dict check above
	# (fixed WIDTH when no style information is available) -- the indentation is lost in this view.
	elif endSign is not None and '%' in endSign:
	endX = float(endX) + WIDTH
	bottom = round(float(word_part_objs[0]['y']), 3)
	y = round(bottom - height + TOPCORRECTION, 3)
	width = round(float(endX) - x, 3)
	transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
	text = ''.join([ dict['text'] for dict in word_part_objs])
	# The page is asked for the line number at the vertical middle between y and bottom.
	line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
	word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
	word.debug_msg = debug_msg
	return word
	else:
	# This branch is only reached when word_node is None, so the first message is always the one chosen.
	error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
	raise Exception('Error: {}'.format(error_msg))

	@classmethod
	def get_semantic_dictionary(cls):
	    """Create and return the semantic dictionary of this class.

	    The dictionary of the super class is extended by one property entry per
	    semantic property of a word. The four correction properties (the keys of
	    XML_CORRECTION_DICT) are additionally updated with the super property
	    'isCorrectionOfWord'.
	    """
	    dictionary = super(Word,cls).get_semantic_dictionary()
	    properties = dictionary[cls.PROPERTIES_KEY]
	    # (property name, type of the property, keyword arguments), in the order they are added.
	    property_specs = [
	        ('styles', Style, dict(
	            cardinality=1, cardinality_restriction='minCardinality',
	            name='wordHasStyle', label='word has style',
	            comment='Word has an appearance that is characterized by this style.')),
	        ('corrections', Word, dict(
	            name='wordHasCorrection', label='word has corrections',
	            comment='Word has a correction made by the author.')),
	        ('deletion_paths', WordDeletionPath, dict(
	            name='wordIsDeletedByPath', label='word has been deleted with a deletion path',
	            comment='Word has been deleted by the author using a deletion path.')),
	        ('editor_comment', EditorComment, dict(
	            name='wordHasEditorComment', label='word has a comment by the editors',
	            comment='Word has been commented by the editors.')),
	        ('earlier_version', Word, dict(
	            name='wordHasEarlierVersion', label='word has an earlier version',
	            comment='There is a earlier version of this word.')),
	        ('edited_text', str, dict(
	            name='hasEditedText', label='word has an edited text',
	            comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')),
	        ('isClarificationOfWord', Word, dict(
	            name='isClarificationOfWord', label='word is a clarification of word',
	            comment='The author has used this part of the word in order to clarify the appearance of that word.')),
	        ('isDeletionOfWord', Word, dict(
	            name='isDeletionOfWord', label='word is a deletion of word',
	            comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')),
	        ('isExtensionOfWord', Word, dict(
	            name='isExtensionOfWord', label='word is a extension of word',
	            comment='The author has used this part of a word in order to extend an earlier version of this word.')),
	        ('isTransformationOfWord', Word, dict(
	            name='isTransformationOfWord', label='word is a transformation of word',
	            comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')),
	        ('overwrites_word', Word, dict(
	            name='overwritesWord', label='word overwrites word',
	            comment='The author has used this word in order to overwrite that word.')),
	        # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING,
	        # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class.
	        ('word_parts', list, dict(
	            name='wordHasWordParts', label='word has word parts',
	            comment='Word consists of a list of words.',
	            subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)),
	    ]
	    for property_name, property_type, options in property_specs:
	        properties.update(cls.create_semantic_property_dictionary(property_name, property_type, **options))
	    super_property_dictionary = cls.create_semantic_property_dictionary(
	        cls.SUPER_PROPERTY, Word,
	        name='isCorrectionOfWord', label='word is a correction of word',
	        comment='The author has used this word in order to correct that word.')
	    for correction_key in cls.XML_CORRECTION_DICT.keys():
	        correction_dict = properties.get(correction_key)
	        correction_dict.update(super_property_dictionary)
	        properties.update({correction_key: correction_dict})
	    return cls.return_dictionary_after_updating_super_classes(dictionary)

	def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
	"""Returns true if transkription_positions have mixed status concerning the property_key in their __dict__.
	"""
	if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions):
	return False
	if len(self.word_parts) > 0 and include_parts:
	if concerns_word:
	if False in set(property_key in word.__dict__.keys() for word in self.word_parts):
	return False
	return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1
	else:
	return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\
	if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1
	return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1

	def init_word(self, page):
	    """Initialize word with objects from page.

	    After the initialization done by the super class, the writing processes of
	    the page that match writing_process_id are added. All word parts are
	    initialized and their lines and writing processes are collected on this
	    word; duplicates are removed while the order of first occurrence is kept.
	    The overwritten word and the earlier version are initialized as well; an
	    earlier version without writing process id or line number gets
	    ``writing_process_id - 1`` and the line number of this word.

	    :param page: page that provides ``writing_processes`` and is passed on to
	        the super class and to all nested words.
	    """
	    super(Word,self).init_word(page)
	    if self.writing_process_id > -1:
	        self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ]
	    for word_part in self.word_parts:
	        word_part.init_word(page)
	        self.lines += word_part.lines
	        self.writing_processes += word_part.writing_processes
	    # dict.fromkeys de-duplicates like set() but keeps a stable order.
	    self.lines = list(dict.fromkeys(self.lines))
	    self.writing_processes = list(dict.fromkeys(self.writing_processes))
	    if self.overwrites_word is not None:
	        self.overwrites_word.init_word(page)
	    if self.earlier_version is not None:
	        if self.earlier_version.writing_process_id == -1:
	            self.earlier_version.writing_process_id = self.writing_process_id-1
	        if self.earlier_version.line_number == -1:
	            self.earlier_version.line_number = self.line_number
	        self.earlier_version.init_word(page)
	def join(self, other_word, append_at_end_of_new_word=True):
	"""Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions.
	"""
	if append_at_end_of_new_word:
	self.text = self.text + other_word.text
	for position in other_word.transkription_positions:
	position.id = str(len(self.transkription_positions))
	self.transkription_positions.append(position)
	else:
	self.text = other_word.text + self.text
	index = 0
	for position in other_word.transkription_positions:
	self.transkription_positions.insert(index, position)
	index += 1
	while index < len(self.transkription_positions):
	self.transkription_positions[index].id = str(index)
	index += 1
	self.simplify_transkription_positions()

	def partition_according_to_deletion(self):
	"""Partition a word according to its transkription_positions' deletion status
	->split word and add partial words as its parts.

	If the transkription positions have a mixed deletion status, consecutive
	positions with the same status are grouped into new word parts. Otherwise
	existing word parts are partitioned, or -- if there are none -- the word is
	marked as deleted when its first transkription position is deleted.
	"""
	if self.has_mixed_status('deleted'):
	transkription_positions = []
	last_status = None
	for transkription_position in self.transkription_positions:
	# A change of status closes the current group and turns it into a word part.
	if transkription_position.deleted != last_status\
	and len(transkription_positions) > 0:
	newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
	transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
	self.word_parts.append(newWord)
	transkription_positions = []
	transkription_positions.append(transkription_position)
	last_status = transkription_position.deleted
	# The last group becomes a word part as well.
	if len(transkription_positions) > 0:
	newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
	transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
	self.word_parts.append(newWord)
	# The positions, the line number and the deletion status of the word itself are reset.
	# NOTE(review): presumably these resets belong to the mixed-status branch -- the indentation is lost in this view.
	self.transkription_positions = []
	self.line_number = -1
	self.deleted = False
	elif len(self.word_parts) > 0:
	# Only the new list of parts is used; the output of the function call is ignored.
	self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
	elif not self.deleted\
	and len(self.transkription_positions) > 0\
	and self.transkription_positions[0].deleted:
	self.deleted = True

	def partition_according_to_writing_process_id(self):
	"""Partition a word according to its transkription_positions' writing_process_ids
	->split word and add partial words as its parts.

	If the transkription positions belong to several writing processes,
	consecutive positions with the same writing_process_id are grouped into new
	word parts; otherwise existing word parts are partitioned. Afterwards the
	writing_process_id of the word itself is updated.
	"""
	if self.belongs_to_multiple_writing_processes():
	last_writing_process_id = -1
	transkription_positions = []
	for transkription_position in self.transkription_positions:
	# A change of writing process closes the current group and turns it into a word part.
	if transkription_position.writing_process_id != last_writing_process_id\
	and len(transkription_positions) > 0:
	newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
	transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
	self.word_parts.append(newWord)
	transkription_positions = []
	transkription_positions.append(transkription_position)
	last_writing_process_id = transkription_position.writing_process_id
	# The last group becomes a word part as well.
	if len(transkription_positions) > 0:
	newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
	transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
	self.word_parts.append(newWord)
	self.transkription_positions = []
	elif len(self.word_parts) > 0:
	# Only the new list of parts is used; the output of the function call is ignored.
	self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
	# With parts from several writing processes the word gets the highest id of its parts,
	# otherwise the id of its first transkription position (if there is one).
	if self.belongs_to_multiple_writing_processes(include_parts=True):
	self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0]
	elif len(self.transkription_positions) > 0:
	self.writing_process_id = self.transkription_positions[0].writing_process_id

	def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False):
	"""Determines whether word is over a word box.

	A word with word parts delegates to its parts. Otherwise each transkription
	position is checked against box_paths, using the first box path that
	partially contains or intersects the path of the position.

	:param box_paths: list of box paths. NOTE: a box path that has been used is removed from this list.
	:param tr_xmin: passed on when paths are created from transkription positions.
	:param tr_ymin: passed on when paths are created from transkription positions.
	:param previous_word_has_box: if True, the first transkription position is shifted to the right before it is checked.
	:return: the word (part) that is over a box, or None.
	"""
	word_over_box = None
	if len(self.word_parts) > 0:
	for word in self.word_parts:
	current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None))
	# The last part that has a word box is the one that is returned.
	if current_word is not None and current_word.word_box is not None:
	word_over_box = current_word
	else:
	new_tp_dict = {}
	for index, transkription_position in enumerate(self.transkription_positions):
	# NOTE: this shift modifies the position (or its first positional word part) permanently.
	if previous_word_has_box and index == 0:
	if len(transkription_position.positional_word_parts) > 0:
	transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2
	#print(f'{self.text}: {transkription_position.positional_word_parts[0].left}')
	else:
	transkription_position.left += 1
	word_path = Path.create_path_from_transkription_position(transkription_position,\
	tr_xmin=tr_xmin, tr_ymin=tr_ymin)
	containing_boxes = [ box_path for box_path in box_paths\
	if word_path.is_partially_contained_by(box_path)\
	or box_path.do_paths_intersect(word_path) ]
	if len(containing_boxes) > 0:
	# NOTE(review): this print writes to standard output -- presumably a leftover from debugging.
	if previous_word_has_box:
	print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}')
	self._set_box_to_transkription_position(containing_boxes[0], word_path,\
	transkription_position, new_tp_dict, tr_xmin)
	box_paths.remove(containing_boxes[0])
	# Each key of new_tp_dict is replaced, at its place, by the transkription positions stored for it.
	for replace_tp in new_tp_dict.keys():
	for tp in new_tp_dict.get(replace_tp):
	self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
	self.transkription_positions.remove(replace_tp)
	word_over_box = self._get_partial_word_over_box()
	update_transkription_position_ids(self)
	return word_over_box

	def set_word_insertion_mark(self, word_insertion_mark):
	"""Sets word_insertion_mark
	"""
	self.word_insertion_mark = word_insertion_mark

	def set_writing_process_id_to_transkription_positions(self, page):
	"""Determines the writing process id of the transkription_positions.
	"""
	for transkription_position in self.transkription_positions:
	if len(transkription_position.positional_word_parts) > 0:
	for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
	if font_key in page.fontsizekey2stage_mapping.keys():
	transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)

	def simplify_transkription_positions(self):
	    """Merge neighbouring transkription_positions if possible.

	    The list is walked from its end towards its start. Whenever the earlier
	    position of a pair reports ``is_mergebale_with`` for the later one, a new
	    position list is created from their combined positional word parts. The
	    pair is only replaced if that list consists of exactly one position,
	    which then inherits the writing process id of the earlier position
	    (or of the later one if the earlier id is -1).

	    Nothing is done unless every transkription position carries a
	    ``positional_word_parts`` attribute of its own.
	    """
	    index = len(self.transkription_positions) - 1
	    while (index > 0
	           and all('positional_word_parts' in tp.__dict__ for tp in self.transkription_positions)):
	        current_tp = self.transkription_positions[index]
	        index -= 1
	        previous_tp = self.transkription_positions[index]
	        if previous_tp.is_mergebale_with(current_tp):
	            # Build a fresh list: extending previous_tp.positional_word_parts in place
	            # would leave the parts of current_tp duplicated in previous_tp whenever
	            # the merge below is not applied and both positions stay in the word.
	            positional_word_parts = [*previous_tp.positional_word_parts, *current_tp.positional_word_parts]
	            merged_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(
	                positional_word_parts,
	                debug_msg_string='simplifying transkription positions',
	                transkription_position_id=previous_tp.id)
	            if len(merged_positions) == 1:
	                merged_positions[0].writing_process_id = previous_tp.writing_process_id\
	                    if previous_tp.writing_process_id != -1\
	                    else current_tp.writing_process_id
	                # Replace the pair (index, index+1) by the single merged position.
	                self.transkription_positions.pop(index + 1)
	                self.transkription_positions[index] = merged_positions[0]

	def split(self, split_string, start_id=0):
	    """Split this word around the first occurrence of split_string.

	    The text is partitioned with ``str.partition``. The positional word parts
	    of all transkription_positions are then distributed to up to three new
	    words; the first word that gets created receives ``start_id``, the
	    following ones count up from there.

	    A warning is issued if the word has no positional word parts, or if the
	    text before/at the split cannot be assembled from leading parts; in the
	    latter case the corresponding word stays None.

	    :param split_string: the text at which the word is split.
	    :param start_id: id of the first word that is created.
	    :return: a 3-tuple (previous word or None, current word, next word or None)
	    """
	    def take_matching_prefix(parts, expected_text):
	        """Collect leading parts until their joined text equals expected_text.

	        :return: (collected parts, True if their text equals expected_text)
	        """
	        taken = []
	        joined = ''
	        for part in parts:
	            if joined == expected_text:
	                break
	            taken.append(part)
	            joined += part.text
	        return taken, joined == expected_text

	    before_text, match_text, after_text = self.text.partition(split_string)
	    head_word = None
	    tail_word = None
	    next_id = start_id
	    remaining_pwps = []
	    for tp in self.transkription_positions:
	        remaining_pwps.extend(tp.positional_word_parts)
	    if len(remaining_pwps) == 0:
	        warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, before_text, match_text, after_text))
	    if len(before_text) > 0:
	        head_pwps, head_matches = take_matching_prefix(remaining_pwps, before_text)
	        if head_matches:
	            head_tps = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(head_pwps, debug_msg_string='word.split')
	            head_text = ''.join([ pwp.text for pwp in head_pwps ])
	            head_word = Word(text=head_text, id=next_id, line_number=self.line_number, transkription_positions=head_tps)
	            next_id += 1
	            # The parts used for the previous word are no longer available.
	            remaining_pwps = remaining_pwps[len(head_pwps):]
	        else:
	            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(before_text))
	    if len(after_text) > 0:
	        middle_pwps, middle_matches = take_matching_prefix(remaining_pwps, match_text)
	        if middle_matches:
	            tail_pwps = remaining_pwps[len(middle_pwps):]
	            tail_tps = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(tail_pwps, debug_msg_string='word.split')
	            tail_text = ''.join([ pwp.text for pwp in tail_pwps ])
	            tail_word = Word(text=tail_text, id=next_id+1, line_number=self.line_number, transkription_positions=tail_tps)
	            # Only the parts in front of the next word remain for the current word.
	            remaining_pwps = middle_pwps
	        else:
	            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(match_text))
	    middle_tps = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(remaining_pwps, debug_msg_string='word.split')
	    middle_text = ''.join([ pwp.text for pwp in remaining_pwps ])
	    middle_word = Word(text=middle_text, id=next_id, line_number=self.line_number, transkription_positions=middle_tps)
	    return head_word, middle_word, tail_word

	def split_according_to_status(self, status, splits_are_parts=False):
	"""Split a word along changes of one attribute of its transkription_positions.

	Consecutive transkription_positions that share the same value of the
	attribute named by ``status`` are grouped; each group becomes a new word
	built by ``_create_new_word``, with ids counting up from ``self.id``.
	Groups are only built if ``self.has_mixed_status(status)`` holds.

	:param status: name of the transkription_position attribute to compare.
	:param splits_are_parts: if True, the new words are appended to
	``self.word_parts`` and ``self.transkription_positions`` is emptied
	once ``self.word_parts`` is not empty.
	:return: a list of new word.Word (empty if no word was created)
	"""
	new_words = []
	if self.has_mixed_status(status):
	last_status = None
	# transkription_positions collects the group that is currently being built.
	transkription_positions = []
	for transkription_position in self.transkription_positions:
	# A change of the status value closes the current group.
	if transkription_position.__dict__[status] != last_status\
	and len(transkription_positions) > 0:
	new_words.append(\
	self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
	transkription_positions = []
	transkription_positions.append(transkription_position)
	last_status = transkription_position.__dict__[status]
	# Close the last group.
	if len(transkription_positions) > 0:
	new_words.append(\
	self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words)))
	# NOTE(review): the indentation of this listing is lost -- confirm whether the
	# following step is guarded by the has_mixed_status check above.
	if splits_are_parts:
	self.word_parts += new_words
	if len(self.word_parts) > 0:
	self.transkription_positions = []
	return new_words

	def undo_partitioning(self):
	"""Undo the partitioning of this word into word_parts.

	Every word part is asked to undo its own partitioning; transkription_positions
	of word parts are added back to this word while the text of the positions
	collected so far differs from ``self.text``. The attributes earlier_version,
	edited_text, word_box, word_parts, corrections, earlier_versions and
	box_paths are reset.
	"""
	if len(self.word_parts) > 0:
	for word_part in self.word_parts:
	# Recurse first, presumably so that the part holds its own transkription_positions again.
	word_part.undo_partitioning()
	if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]):
	self.transkription_positions += word_part.transkription_positions
	# NOTE(review): the indentation of this listing is lost -- confirm whether the
	# resets below are guarded by the word_parts check above.
	self.earlier_version = None
	self.edited_text = None
	self.word_box = None
	self.word_parts = []
	self.corrections = []
	self.earlier_versions = []
	self.box_paths = []

	def _create_new_word(self, transkription_positions, status, new_id=0):
	    """Build a new word from the given transkription_positions and properties of self.

	    Every key of ``COPY_PROPERTY_KEY`` that differs from ``status`` and is an
	    attribute of self is copied to the new word. The value of ``status`` is
	    taken from the first transkription position: it is appended to the list
	    named in ``APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS`` if ``status`` is
	    listed there, otherwise it is set as attribute ``status`` of the new word.

	    :param transkription_positions: positions of the new word (must not be empty).
	    :param status: name of the attribute that is read from the first position.
	    :param new_id: id of the new word.
	    :return: the new word.Word
	    """
	    new_word = Word(id=new_id, transkription_positions=transkription_positions)
	    own_attributes = self.__dict__
	    for key in self.COPY_PROPERTY_KEY:
	        if key == status or key not in own_attributes:
	            continue
	        new_word.__dict__[key] = own_attributes[key]
	    list_targets = self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS
	    if status not in list_targets:
	        new_word.__dict__[status] = transkription_positions[0].__dict__[status]
	    else:
	        new_word.__dict__[list_targets[status]].append(transkription_positions[0].__dict__[status])
	    return new_word

	def _get_parts_with_property_key(self, property_key):
	    """Collect the word_parts that carry an attribute named property_key.

	    Word parts without such an attribute are searched recursively.

	    :param property_key: name of the attribute to look for.
	    :return: a list of word parts, in the order they are found
	    """
	    matching_parts = []
	    for part in self.word_parts:
	        if property_key in part.__dict__:
	            matching_parts.append(part)
	            continue
	        matching_parts.extend(part._get_parts_with_property_key(property_key))
	    return matching_parts

	def _get_partial_word_over_box(self):
	    """Partition a word according to the has_box of its transkription_positions.

	    If the has_box values are mixed, consecutive positions with the same
	    has_box become partial words that are appended to ``self.word_parts``
	    and the word's own transkription_positions are emptied. Otherwise the
	    word parts are searched recursively, or, if there are no word parts and
	    exactly one position has a box, the word itself gets that box.

	    :return: the (partial) word that has a box, self, or None
	    """
	    def append_part(positions, box):
	        """Append a partial word for positions; return it if box is set, else None."""
	        part = Word(id=len(self.word_parts), line_number=self.line_number,
	                    transkription_positions=positions, deleted=self.deleted,
	                    writing_process_id=self.writing_process_id)
	        self.word_parts.append(part)
	        if box is None:
	            return None
	        part.word_box = box
	        return part

	    boxed_word = None
	    if self.has_mixed_status('has_box'):
	        group = []
	        group_box = None
	        for tp in self.transkription_positions:
	            # A change of has_box closes the group collected so far.
	            if tp.has_box != group_box and len(group) > 0:
	                boxed_part = append_part(group, group_box)
	                if boxed_part is not None:
	                    boxed_word = boxed_part
	                group = []
	            group.append(tp)
	            group_box = tp.has_box
	        if len(group) > 0:
	            boxed_part = append_part(group, group_box)
	            if boxed_part is not None:
	                boxed_word = boxed_part
	        self.transkription_positions = []
	    elif len(self.word_parts) > 0:
	        # Stop at the first word part that yields a word over a box.
	        for part in self.word_parts:
	            if boxed_word is not None:
	                break
	            boxed_word = part._get_partial_word_over_box()
	    else:
	        boxed_positions = [ tp for tp in self.transkription_positions if tp.has_box is not None ]
	        if len(boxed_positions) == 1:
	            boxed_word = self
	            boxed_word.word_box = boxed_positions[0].has_box
	    return boxed_word

	def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin):
	    """Set box_path as has_box of the transkription_position (or of a piece of it).

	    If the box contains the whole word_path, the transkription_position gets the
	    box. Otherwise the position is split at the box border(s) -- taken from
	    ``box_path.path.bbox()`` and shifted by ``tr_xmin`` -- and the piece under
	    the box gets it; the pieces are then registered in
	    new_transkription_positions_dictionary under the original position.
	    If splitting does not yield enough pieces, the unsplit position gets the box.

	    :param box_path: the box to assign.
	    :param word_path: path of the transkription_position.
	    :param transkription_position: the position that is (partially) over the box.
	    :param new_transkription_positions_dictionary: maps a position to the pieces replacing it.
	    :param tr_xmin: offset subtracted from the box borders before splitting.
	    """
	    if box_path.contains_path(word_path):
	        transkription_position.has_box = box_path
	        return
	    if box_path.contains_start_of_path(word_path):
	        # The box covers the start: split at its second bbox value, first piece is boxed.
	        pieces = transkription_position.split(box_path.path.bbox()[1] - tr_xmin)
	        boxed_index = 0
	        was_split = len(pieces) == 2
	    elif box_path.contains_end_of_path(word_path):
	        # The box covers the end: split at its first bbox value, second piece is boxed.
	        pieces = transkription_position.split(box_path.path.bbox()[0] - tr_xmin)
	        boxed_index = 1
	        was_split = len(pieces) == 2
	    else:
	        # The box lies in the middle of word_path: split at both borders.
	        first_border = box_path.path.bbox()[0] - tr_xmin
	        second_border = box_path.path.bbox()[1] - tr_xmin
	        pieces = transkription_position.split(first_border, second_border)
	        boxed_index = 1
	        was_split = len(pieces) >= 2
	    if was_split:
	        pieces[boxed_index].has_box = box_path
	        new_transkription_positions_dictionary.update({ transkription_position: pieces })
	    else:
	        transkription_position.has_box = box_path

	def do_paths_intersect_saveMode(mypath1, mypath2):
	    """Check whether two paths intersect without letting an AssertionError escape.

	    :param mypath1: object with a ``path`` and an ``is_partially_contained_by`` method.
	    :param mypath2: object with a ``path``.
	    :return: the result of ``mypath1.path.intersect`` if it is truthy, otherwise the
	        result of ``mypath1.is_partially_contained_by(mypath2)``; False if an
	        AssertionError was raised.
	    """
	    try:
	        intersection = mypath1.path.intersect(mypath2.path, justonemode=True)
	        if intersection:
	            return intersection
	        return mypath1.is_partially_contained_by(mypath2)
	    except AssertionError:
	        return False

word.pyNo OneTemporaryActions

File Metadata

word.pyView Options

Event Timeline

word.py
No OneTemporary
Actions

word.py
View Options