Page MenuHomec4science
No OneTemporary

File Metadata

Wed, Jun 5, 20:51
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
# Copyright (C) University of Basel 2019 {{{1
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata (author, licence and version information).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
# The e-mail address is an empty string in this copy of the file.
__email__ = ""
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from os.path import isfile

from lxml import etree as ET
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path

from .class_spec import SemanticClass
from .image import Image
from .lineNumber import LineNumber
from .path import Path
from .positional_word_part import PositionalWordPart
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_insertion_mark import WordInsertionMark
class Page(SemanticClass):
    """
    This class represents a page.

    Args:
        xml_source_file (str): name of the xml file to be instantiated.
        xml_target_file (str): name of the xml file to which page info will be written.
    """
def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, pdfFile=None, svg_file=None, orientation='North', extract_transkription_field_only=False):
# Builds a Page either from an existing page XML (xml_source_file) or for a
# tree that is prepared for xml_target_file.
# NOTE(review): this copy of the file has lost its indentation and apparently
# some lines (no `else:` is visible anywhere in this method), so the nesting
# cannot be read off the text. The comments below describe only the visible
# statements; confirm the structure against the upstream file.
# Defaults that hold before any XML is read.
self.title = title
self.line_numbers = []
self.style_dict = {}
self.sonderzeichen_list = []
self.svg_file = None
self.pdfFile = None
self.source = None
# -1 stands for "no page number given".
self.number = page_number if page_number is not None else -1
self.orientation = orientation
self.word_deletion_paths = []
# Case 1: read page data from an existing source XML file.
if xml_source_file is not None:
if isfile(xml_source_file):
# remove_blank_text=True makes the parser drop ignorable whitespace.
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(xml_source_file, parser)
# Attributes of the root element overwrite the constructor arguments;
# note that 'number' is read as the raw attribute value here.
self.title = self.page_tree.getroot().get('title')
self.number = self.page_tree.getroot().get('number')
self.source = self.page_tree.getroot().get('source')
self.orientation = self.page_tree.getroot().get('orientation')
# svg file name and dimensions come from the first svg node, if any;
# width/height fall back to 0.0.
self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\
if len(self.page_tree.xpath('.//svg/@file')) > 0 else None
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
# The pdfFile/svg_file arguments are used only when the tree does not
# already name such a file; the new information is added to the tree.
if pdfFile is not None and self.pdfFile is None:
self.pdfFile = pdfFile
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
if svg_file is not None and self.svg_file is None:
self.svg_file = svg_file
# Document dimensions are taken from the TranskriptionField of the svg.
tf = TranskriptionField(svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
# NOTE(review): as shown, this raise is not guarded by anything; presumably it
# belongs to a dropped `else:` of `if isfile(xml_source_file)` - confirm.
raise Exception('File "{}" does not exist!'.format(xml_source_file))
# Case 2: prepare a page whose data will be written to xml_target_file.
elif xml_target_file is not None:
self.word_insertion_marks = []
self.words = []
self.writing_processes = []
self.svg_file = svg_file
self.pdfFile = pdfFile
# An existing target file is parsed and updated.
if isfile(xml_target_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(xml_target_file, parser)
self.source = self.page_tree.getroot().get('source')
# A non-empty orientation/title attribute in the file wins over the argument;
# otherwise the argument is written to the root element.
if bool(self.page_tree.getroot().get('orientation')):
self.orientation = self.page_tree.getroot().get('orientation')
elif orientation is not None:
self.page_tree.getroot().set('orientation', orientation)
if bool(self.page_tree.getroot().get('title')):
self.title = self.page_tree.getroot().get('title')
elif title is not None:
self.page_tree.getroot().set('title', title)
# Without an svg_file argument, file name and dimensions are read from the tree.
if self.svg_file is None:
self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\
if len(self.page_tree.xpath('.//svg/@file')) > 0 else None
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
# With an svg_file argument and no svg node in the tree, a node is created.
elif len(self.page_tree.xpath('.//svg/@file')) == 0:
tf = TranskriptionField(svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
# NOTE(review): presumably the remaining case (svg_file given and an svg node
# already present) of a dropped `else:` - confirm.
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
# Same pattern for the pdf file name.
if self.pdfFile is None:
self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
elif len(self.page_tree.xpath('.//pdf/@file')) == 0:
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
# NOTE(review): the list literal below is not closed and the inner loop has no
# body in this copy; the variable name suggests the matched nodes are removed
# from the tree, but the statement doing so is not visible - confirm upstream.
for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG,\
for node in self.page_tree.xpath('//' + xpath2remove):
# NOTE(review): presumably the start of a dropped `else:` branch (target file
# does not exist yet): a fresh tree with a 'page' root is created and filled
# from the constructor arguments.
self.page_tree = ET.ElementTree(ET.Element('page'))
self.pdfFile = pdfFile
self.svg_file = svg_file
if title is not None:
self.page_tree.getroot().set('title', title)
if orientation is not None:
self.page_tree.getroot().set('orientation', orientation)
# The flag is stored as the lower-case string 'true' or 'false'.
self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower())
if page_number is not None:
self.page_tree.getroot().set('number', str(page_number))
if self.pdfFile is not None:
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
if self.svg_file is not None:
tf = TranskriptionField(self.svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
# An Image is created only when an svg file is known.
self.svg_image = Image(file_name=self.svg_file, height=self.height, width=self.width) if self.svg_file is not None\
else None
def categorize_paths(self, transkription_field=None):
"""Categorize all paths that are part of the transkription field.
"""
# NOTE(review): indentation and several statements are missing in this copy
# (see the notes below); the comments describe only what is visible.
# The paths are read from self.source, which has to be an existing file.
if self.source is not None and isfile(self.source):
# NOTE(review): the list expression below is garbled (`[ for ...` and
# `if % 2 == 0`); what is visible: the largest element plus 2 is used, with a
# fallback of 17 when there are no line numbers.
max_line = sorted(\
[ for line_number in self.line_numbers if % 2 == 0],\
reverse=True)[0] + 2 if len(self.line_numbers) > 0 else 17
# Without a transkription field the offsets are 0.0.
tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
paths, attributes = svg_to_paths.svg2paths(self.source)
allpaths_on_tf = []
if transkription_field is not None:
for index in range(0, len(paths)):
path = paths[index]
attribute = attributes[index]
# Keep non-empty paths other than the field's own path that start right of
# xmin and end left of xmax; the list index serves as the Path id.
if len(path) > 0\
and path != transkription_field.path\
and path.start.real > tr_xmin\
and path.end.real < transkription_field.xmax:
allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class')))
# One list per category.
text_area_deletion_paths = []
deletion_or_underline_paths = []
box_paths = []
dots_paths = []
word_connector_paths = []
uncategorized_paths = []
for mypath in allpaths_on_tf:
xmin, xmax, ymin, ymax = mypath.path.bbox()
start_line_number = self.get_line_number(mypath.path.start.imag-tr_ymin)
# NOTE(review): the bodies of all branches below are missing in this copy, so
# which list each condition feeds cannot be stated from here. MAX_HEIGHT_LINES
# is not defined in the visible part of the file.
if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1:
elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
elif abs(ymax-ymin) < MAX_HEIGHT_LINES:
elif start_line_number != -1 and start_line_number != self.get_line_number(mypath.path.end.imag-tr_ymin):
# Words crossed by one of the deletion_or_underline_paths are marked as deleted.
self.mark_words_intersecting_with_paths_as_deleted(deletion_or_underline_paths, tr_xmin, tr_ymin)
# NOTE(review): presumably the counterpart of the `if self.source ...` test at
# the top: a missing source raises unless Page.UNITTESTING is set - confirm.
elif not Page.UNITTESTING:
error_msg = 'Svg source file {} does not exist!'.format(self.source)\
if self.source is not None else 'Page does not contain a source file!'
raise FileNotFoundError(error_msg)
def init_line_numbers(self, line_numbers, document_bottom):
"""Init line numbers.
"""
# NOTE(review): indentation and the bodies of the two loops below are missing
# in this copy; MINABOVE is not defined in the visible part of the file and
# document_bottom is not used by any visible statement.
even_index = 0
self.line_numbers = []
if len(line_numbers) > 0:
# The first LineNumber gets id 1 and reaches from 0 down to MINABOVE above
# the top of the first given line number.
first_line_bottom = line_numbers[even_index].top - MINABOVE
self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom))
even_index += 1
# NOTE(review): only the index increment of this loop is visible.
while even_index < len(line_numbers):
even_index += 1
# NOTE(review): the body of this loop is not visible.
for line_number in self.line_numbers:
def init_words(self):
# (Re)creates the word related lists of the page from the nodes of page_tree.
# NOTE(review): indentation and the body of the final `if` are missing in
# this copy.
self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
self.words = [ Word.CREATE_WORD(word_node=word_node) for word_node in self.page_tree.getroot().xpath('//word') ]
self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
# Writing processes are created with access to the list of words.
self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
self.word_deletion_paths = [ Path(node=node) for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG) ]
# Each word is offered to the insertion marks; the value returned by
# attach_and_update_word_if_involved replaces the list entry.
for index, word in enumerate(self.words):
for word_insertion_mark in self.word_insertion_marks:
self.words[index] = word_insertion_mark.attach_and_update_word_if_involved(word)
# NOTE(review): the statement guarded by this test is not visible.
if self.words[index] != word:
def create_writing_processes_and_attach2tree(self):
"""Creates three stages of Nietzsche's process of writing.
"""
# NOTE(review): indentation is lost and the list below holds only two entries
# although the docstring speaks of three stages; a line has presumably been
# dropped - confirm against the upstream file.
self.writing_processes = [ WritingProcess(version=WritingProcess.FIRST_VERSION),\
WritingProcess(version=WritingProcess.LATER_INSERTION_AND_ADDITION) ]
# NOTE(review): the body of this loop is not visible.
for writing_process in self.writing_processes:
for word in self.words:
for transkription_position in word.transkription_positions:
# The style classes of the first positional word part are looked up in
# fontsizekey2stage_mapping; a matching key sets writing_process_id.
for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
if font_key in self.fontsizekey2stage_mapping.keys():
transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
    """Adds a list of classes that are sonderzeichen and a style dictionary to page.

    Args:
        sonderzeichen_list (list): names of style classes that are sonderzeichen.
        letterspacing_list (list): names of style classes with letterspacing.
        style_dict (dict): maps a style class name to a dictionary of its attributes.
        style_node: node with 'class' children; if given, all three collections
            are read from it instead of from the other arguments.

    Side effects:
        Sets self.sonderzeichen_list, self.letterspacing_list, self.style_dict and
        self.fontsizekey2stage_mapping. Without a style_node and with a non-empty
        style_dict, a 'style' node is appended to the root of self.page_tree.
    """
    # None replaces the former mutable defaults ([] / {}), which were shared
    # between all calls and instances. Objects passed by the caller are stored
    # as they are, as before.
    self.sonderzeichen_list = [] if sonderzeichen_list is None else sonderzeichen_list
    self.letterspacing_list = [] if letterspacing_list is None else letterspacing_list
    self.style_dict = {} if style_dict is None else style_dict
    if style_node is not None:
        class_nodes = style_node.findall('.//class')
        self.style_dict = {
            item.get('name'): {key: value for key, value in item.attrib.items() if key != 'name'}
            for item in class_nodes
        }
        self.sonderzeichen_list = [
            item.get('name') for item in class_nodes
            if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family')
        ]
        self.letterspacing_list = [
            item.get('name') for item in class_nodes
            if bool(item.get('letterspacing-list'))
        ]
    elif self.style_dict:
        style_node = ET.SubElement(self.page_tree.getroot(), 'style')
        if self.sonderzeichen_list:
            style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
        if self.letterspacing_list:
            style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
        for key, style in self.style_dict.items():
            # The class name is stored inside the attribute dictionary as well.
            style['name'] = key
            ET.SubElement(style_node, 'class', attrib=style)
    # NOTE(review): the nesting was lost in the reviewed copy; the mapping is
    # built for both branches here because other methods read
    # self.fontsizekey2stage_mapping - confirm against the upstream file.
    fontsize_dict = {
        key: float(value.get('font-size').replace('px', ''))
        for key, value in self.style_dict.items() if 'font-size' in value
    }
    # create a mapping between fontsizes and word stages
    self.fontsizekey2stage_mapping = {}
    if not fontsize_dict:
        return
    biggest = max(fontsize_dict.values())
    smallest = min(fontsize_dict.values())
    for fontsize_key, value in fontsize_dict.items():
        if value >= biggest - 1:
            stage = WritingProcess.FIRST_VERSION
        elif value <= smallest + 1:
            stage = WritingProcess.LATER_INSERTION_AND_ADDITION
        else:
            # Restored `else:` - without it every key would end up with
            # INSERTION_AND_ADDITION, making the two tests above pointless.
            stage = WritingProcess.INSERTION_AND_ADDITION
        self.fontsizekey2stage_mapping[fontsize_key] = stage
def add_source(self, source):
    """Adds a source to page and attaches it to page_tree.

    Args:
        source: value that is stored as self.source and written to the
            'source' attribute of the root element of self.page_tree.
    """
    self.source = source
    self.page_tree.getroot().set('source', self.source)
def get_biggest_fontSize4styles(self, style_set=None):
    """Returns biggest font size from style_dict for a set of style class names.

    Args:
        style_set: iterable of style class names; names that are unknown to
            style_dict or that have no 'font-size' are ignored.

    [:returns:] (float) biggest font size OR 1 if style_dict is empty
        or if none of the given classes has a font size
    """
    if not self.style_dict or not style_set:
        return 1
    # .get(key, {}) keeps a class name that is missing from style_dict from
    # raising a KeyError; such a name simply contributes no font size.
    styles = (self.style_dict.get(key, {}) for key in style_set)
    font_sizes = [
        float(style['font-size'].replace('px', ''))
        for style in styles if style.get('font-size')
    ]
    return max(font_sizes) if font_sizes else 1
def get_line_number(self, y):
    """Returns line number id for element at y.

    Args:
        y: vertical position that is compared with top and bottom of each
            line number (both bounds are inclusive).

    [:return:] (int) line number id or -1
    """
    # NOTE(review): the reviewed copy had lost two tokens on this line
    # ("[ for line_number ..." and "y >= and"); `id` and `top` are restored from
    # the docstring and from the LineNumber(id=..., top=..., bottom=...) call
    # used elsewhere in this class - confirm against the upstream file.
    return next(
        (line_number.id for line_number in self.line_numbers
         if line_number.top <= y <= line_number.bottom),
        -1,
    )
@classmethod
def get_semantic_dictionary(cls):
    """ Creates a semantic dictionary as specified by SemanticClass.

    [:return:] (dict) with the keys 'class' (result of cls.get_class_dictionary())
        and 'properties' (one entry per semantic property of a page)
    """
    # NOTE(review): the decorator is restored because the method takes `cls`
    # and calls cls.get_class_dictionary(); the reviewed copy showed no
    # decorator line - confirm against the upstream file.
    class_dict = cls.get_class_dictionary()
    # All list properties share this xpath.
    list_xpath = '/page/@number|/page/@title'
    properties = {
        'title': (str, 1, '/page/@title'),
        'number': (str, 1, '/page/@number'),
        'line_numbers': (LineNumber, SemanticClass.LIST, list_xpath),
        'orientation': {'class': str, 'cardinality': 1, 'xpath': '/page/@orientation'},
        'words': (Word, SemanticClass.LIST, list_xpath),
        'svg_image': (Image, 1, '/page/svg'),
        'writing_processes': (WritingProcess, SemanticClass.LIST, list_xpath),
        'word_deletion_paths': (Path, SemanticClass.LIST, list_xpath),
        'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST, list_xpath),
    }
    return {'class': class_dict, 'properties': properties}
def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Marks all words that intersect with deletion paths as deleted
    and adds these paths to word_deletion_paths.

    Args:
        deletion_paths: list of .path.Path that are tested against the words.
        tr_xmin: horizontal offset that is added to the word positions.
        tr_ymin: vertical offset that is added to the word positions.

    [:return:] list of .path.Path that might be word_underline_paths
    """
    # NOTE(review): the reviewed copy of this method was damaged. Restored here:
    # the progress update per word (line ended in a dangling `and`), `pwp.top`
    # in the ymin expression (parallel to `pwp.bottom` in ymax) and the append
    # to word_deletion_paths that the docstring describes. Further statements
    # of that innermost branch may have been lost - confirm upstream.
    show_progress = not bool(Page.UNITTESTING)
    if show_progress:
        bar = Bar('mark words that intersect with deletion paths', max=len(self.words))
    for word in self.words:
        if show_progress:
            bar.next()
        for transkription_position in word.transkription_positions:
            word_parts = transkription_position.positional_word_parts
            first_pwp = word_parts[0]
            last_pwp = word_parts[-1]
            # Bounding box of the word, shifted by the given offsets.
            xmin = tr_xmin + first_pwp.left
            xmax = tr_xmin + last_pwp.left + last_pwp.width
            ymin = tr_ymin + min(pwp.top for pwp in word_parts)
            ymax = tr_ymin + max(pwp.bottom for pwp in word_parts)
            word_path = parse_path('M {}, {} L {}, {} L {}, {} L {}, {} z'.format(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax))
            intersecting_paths = [
                deletion_path for deletion_path in deletion_paths
                if do_paths_intersect_saveMode(deletion_path.path, word_path)
            ]
            if intersecting_paths:
                word.deleted = True
                for deletion_path in intersecting_paths:
                    if deletion_path not in self.word_deletion_paths:
                        deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                        self.word_deletion_paths.append(deletion_path)
    if show_progress:
        bar.finish()
    # return those paths in deletion_paths that are not in self.word_deletion_paths
    return list(set(deletion_paths) - set(self.word_deletion_paths))
def do_paths_intersect_saveMode(path1, path2):
    """Returns true if paths intersect, false if not or if there was an exception.

    Args:
        path1: object providing intersect(other, justonemode=...).
        path2: the path that is handed to path1.intersect.

    [:return:] the result of path1.intersect(path2, justonemode=True), or False
        if that call raises an AssertionError
    """
    # The `try:` line was missing in the reviewed copy, leaving the `except`
    # without a matching block.
    try:
        return path1.intersect(path2, justonemode=True)
    except AssertionError:
        return False

Event Timeline