Index: svgscripts/datatypes/page.py
===================================================================
--- svgscripts/datatypes/page.py (revision 71)
+++ svgscripts/datatypes/page.py (revision 72)
@@ -1,655 +1,495 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import sys
import warnings
from .box import Box
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_insertion_mark import WordInsertionMark
sys.path.append('py2ttl')
from class_spec import SemanticClass
FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
+STATUS_MERGED_OK = 'faksimile merged'
+STATUS_POSTMERGED_OK = 'words processed'
class Page(SemanticClass):
"""
This class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
xml_target_file (str): name of the xml file to which page info will be written.
"""
UNITTESTING = False
WARNING_MISSING_USE_NODE4PWP = PositionalWordPart.WARN_NO_USE_NODE_FOUND
WARNING_MISSING_GLYPH_ID4WIM = WordInsertionMark.WARN_NO_GLYPH_ID
PAGE_RECTO = 'recto'
PAGE_VERSO = 'verso'
def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, faksimile_image=None, faksimile_svgFile=None, pdfFile=None, svg_file=None, orientation='North', page_type=PAGE_VERSO, extract_transkription_field_only=True):
self.title = title
self.mark_foreign_hands = []
self.text_connection_marks = []
self.line_numbers = []
self.style_dict = {}
self.sonderzeichen_list = []
self.svg_file = None
self.svg_image = None
self.pdfFile = None
self.faksimile_svgFile = None
self.source = None
self.number = page_number if page_number is not None else -1
self.orientation = orientation
self.page_type = page_type
self.word_deletion_paths = []
self.faksimile_image = faksimile_image
self.text_field = None
self.lines = []
if xml_source_file is not None:
if isfile(xml_source_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(xml_source_file, parser)
self.title = self.page_tree.getroot().get('title')
self.number = self.page_tree.getroot().get('number')
self.source = self.page_tree.getroot().get('source')
self.orientation = self.page_tree.getroot().get('orientation')
self.page_type = self.page_tree.getroot().get('pageType')
self.add_style(style_node=self.page_tree.getroot().find('.//style'))
self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
- self.faksimile_svgFile = self.page_tree.xpath('.//faksimile-svg/@file')[0]\
- if len(self.page_tree.xpath('.//faksimile-svg/@file')) > 0 else None
+ self.faksimile_svgFile = self.page_tree.xpath('.//data-source/@file')[0]\
+ if len(self.page_tree.xpath('.//data-source/@file')) > 0 else None
self.svg_image = SVGImage(node=self.page_tree.xpath('.//' + SVGImage.XML_TAG)[0])\
if len(self.page_tree.xpath('.//' + SVGImage.XML_TAG)) > 0 else None
if len(self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)) > 0:
self.faksimile_image = FaksimileImage(node=self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)[0])
self.text_field = self.faksimile_image.text_field
self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\
if len(self.page_tree.xpath('.//svg/@file')) > 0 else None
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
if pdfFile is not None and self.pdfFile is None:
self.pdfFile = pdfFile
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
if faksimile_svgFile is not None and self.faksimile_svgFile is None:
- self.faksimile_svgFile = faksimile_svgFile
- ET.SubElement(self.page_tree.getroot(), 'faksimile-svg', attrib={'file': self.faksimile_svgFile})
+ self.update_data_source(faksimile_svgFile=faksimile_svgFile)
if faksimile_image is not None:
self.faksimile_image = faksimile_image
self.faksimile_image.attach_object_to_tree(self.page_tree)
if svg_file is not None and self.svg_file is None:
self.svg_file = svg_file
tf = TranskriptionField(svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
self.svg_image.attach_object_to_tree(self.page_tree)
if self.svg_image is not None and self.svg_file is None:
self.svg_file = self.svg_image.file_name
if self.svg_image is not None and self.width == 0.0:
self.width = self.svg_image.width
if self.svg_image is not None and self.height == 0.0:
self.height = self.svg_image.height
self.init_node_objects()
else:
raise Exception('File "{}" does not exist!'.format(xml_source_file))
elif xml_target_file is not None:
self.word_insertion_marks = []
self.words = []
self.writing_processes = []
self.svg_file = svg_file
self.pdfFile = pdfFile
self.faksimile_svgFile = faksimile_svgFile
if isfile(xml_target_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(xml_target_file, parser)
self.source = self.page_tree.getroot().get('source')
if bool(self.page_tree.getroot().get('orientation')):
self.orientation = self.page_tree.getroot().get('orientation')
elif orientation is not None:
self.page_tree.getroot().set('orientation', orientation)
if bool(self.page_tree.getroot().get('title')):
self.title = self.page_tree.getroot().get('title')
elif title is not None:
self.page_tree.getroot().set('title', title)
if self.svg_file is None:
self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\
if len(self.page_tree.xpath('.//svg/@file')) > 0 else None
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
elif len(self.page_tree.xpath('.//svg/@file')) == 0:
tf = TranskriptionField(svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
self.svg_image.attach_object_to_tree(self.page_tree)
#ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
else:
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
if self.pdfFile is None:
self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
elif len(self.page_tree.xpath('.//pdf/@file')) == 0:
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG,\
WritingProcess.XML_TAG, Path.WORD_DELETION_PATH_TAG ]:
for node in self.page_tree.xpath('//' + xpath2remove):
node.getparent().remove(node)
else:
self.page_tree = ET.ElementTree(ET.Element('page'))
self.pdfFile = pdfFile
self.svg_file = svg_file
if title is not None:
self.page_tree.getroot().set('title', title)
if orientation is not None:
self.page_tree.getroot().set('orientation', orientation)
self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower())
if page_number is not None:
self.page_tree.getroot().set('number', str(page_number))
if self.pdfFile is not None:
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
if self.svg_file is not None:
tf = TranskriptionField(self.svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
self.svg_image.attach_object_to_tree(self.page_tree)
#ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
if self.svg_image is None and self.svg_file is not None:
self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
self.svg_image.attach_object_to_tree(self.page_tree)
def add_style(self, sonderzeichen_list=[], letterspacing_list=[], style_dict={}, style_node=None):
"""Adds a list of classes that are sonderzeichen and a style dictionary to page.
"""
self.sonderzeichen_list = sonderzeichen_list
self.letterspacing_list = letterspacing_list
self.style_dict = style_dict
if style_node is not None:
self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') }
self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\
if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ]
self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\
if bool(item.get('letterspacing-list')) ]
elif bool(self.style_dict):
style_node = ET.SubElement(self.page_tree.getroot(), 'style')
if len(self.sonderzeichen_list) > 0:
style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
if len(self.letterspacing_list) > 0:
style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
for key in self.style_dict.keys():
self.style_dict[key]['name'] = key
ET.SubElement(style_node, 'class', attrib=self.style_dict[key])
fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value }
fontsizes = sorted(fontsize_dict.values(), reverse=True)
# create a mapping between fontsizes and word stages
self.fontsizekey2stage_mapping = {}
for fontsize_key, value in fontsize_dict.items():
if value >= fontsizes[0]-1:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION })
elif value <= fontsizes[len(fontsizes)-1]+1:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION })
else:
self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION })
def add_source(self, source):
"""Adds a source to page and attaches it to page_tree.
"""
self.source = source
self.page_tree.getroot().set('source', self.source)
- def categorize_paths(self, transkription_field=None):
- """Categorize all paths that are part of the transkription field.
-
- :return: a dictionary containing a list for each category of path.
- """
- if self.source is not None and isfile(self.source):
- MAX_HEIGHT_LINES = 1
- max_line = sorted(\
- [line_number.bottom-line_number.top for line_number in self.line_numbers if line_number.id % 2 == 0],\
- reverse=True)[0] + 2 if len(self.line_numbers) > 0 else 17
- tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
- tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
- paths, attributes = svg_to_paths.svg2paths(self.source)
- allpaths_on_tf = []
- allpaths_outside_tf = []
- attributes_outside_tf = []
- if transkription_field is not None:
- for index in range(0, len(paths)):
- path = paths[index]
- attribute = attributes[index]
- if len(path) > 0\
- and path != transkription_field.path\
- and path.bbox()[0] > tr_xmin\
- and path.bbox()[1] < transkription_field.xmax:
- allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class')))
- elif len(path) > 0\
- and path != transkription_field.path:
- allpaths_outside_tf.append(path)
- attributes_outside_tf.append(attribute)
- path_dict = { 'text_area_deletion_paths': [],\
- 'deletion_or_underline_paths': [],\
- 'box_paths': [],\
- 'dots_paths': [],\
- 'word_connector_paths': [],\
- 'uncategorized_paths': [] }
- for mypath in allpaths_on_tf:
- xmin, xmax, ymin, ymax = mypath.path.bbox()
- start_line_number = self.get_line_number(mypath.path.start.imag-tr_ymin)
- if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1:
- path_dict.get('dots_paths').append(mypath)
- elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
- path_dict.get('box_paths').append(mypath)
- elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
- path_dict.get('word_connector_paths').append(mypath)
- elif abs(ymax-ymin) < MAX_HEIGHT_LINES:
- path_dict.get('deletion_or_underline_paths').append(mypath)
- elif start_line_number != -1 and start_line_number != self.get_line_number(mypath.path.end.imag-tr_ymin):
- path_dict.get('text_area_deletion_paths').append(mypath)
- else:
- path_dict.get('uncategorized_paths').append(mypath)
- underline_path = self.mark_words_intersecting_with_paths_as_deleted(path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin)
- path_dict.update({'underline_path': underline_path})
- self.process_word_boxes(path_dict.get('box_paths'), transkription_field,\
- paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)
- return path_dict
- elif not Page.UNITTESTING:
- error_msg = 'Svg source file {} does not exist!'.format(self.source)\
- if self.source is not None else 'Page does not contain a source file!'
- raise FileNotFoundError(error_msg)
- return {}
-
def create_writing_processes_and_attach2tree(self):
"""Creates three stages of Nietzsche's process of writing.
"""
self.writing_processes = [ WritingProcess(version=WritingProcess.FIRST_VERSION),\
WritingProcess(version=WritingProcess.INSERTION_AND_ADDITION),\
WritingProcess(version=WritingProcess.LATER_INSERTION_AND_ADDITION) ]
for writing_process in self.writing_processes:
writing_process.attach_object_to_tree(self.page_tree)
- for word in self.words:
- for transkription_position in word.transkription_positions:
- for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
- if font_key in self.fontsizekey2stage_mapping.keys():
- transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
-
- def find_special_words(self, transkription_field=None):
- """Find special words, remove them from words, process their content.
- """
- if self.source is None or not isfile(self.source):
- raise FileNotFoundError('Page does not have a source!')
- if transkription_field is None:
- transkription_field = TranskriptionField(self.source)
- special_char_list = MarkForeignHands.get_special_char_list()
- special_char_list += TextConnectionMark.get_special_char_list()
- single_char_words = [ word for word in self.words if len(word.text) == 1 and word.text in special_char_list ]
- for word in single_char_words:
- if word.text == MarkForeignHands.CLASS_MARK:
- id = len(self.mark_foreign_hands)
- self.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id))
- self.words.remove(word)
- elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\
- or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\
- and any(style in self.sonderzeichen_list for style\
- in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))):
- id = len(self.text_connection_marks)
- self.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id))
- self.words.remove(word)
- svg_tree = ET.parse(self.source)
- self.update_page_type(transkription_field=transkription_field)
- self.update_line_number_area(transkription_field, svg_tree=svg_tree)
- italic_classes = [ key for key in self.style_dict\
- if bool(self.style_dict[key].get('font-family')) and self.style_dict[key]['font-family'].endswith('Italic') ]
- if len(self.mark_foreign_hands) > 0:
- MarkForeignHands.find_content(self.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\
- SonderzeichenList=self.sonderzeichen_list)
- if len(self.text_connection_marks) > 0:
- TextConnectionMark.find_content_in_footnotes(self.text_connection_marks, transkription_field, svg_tree,\
- title=self.title, page_number=self.number)
+ #for word in self.words:
+ # for transkription_position in word.transkription_positions:
+ # for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
+ # if font_key in self.fontsizekey2stage_mapping.keys():
+ # transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
def get_biggest_fontSize4styles(self, style_set={}):
"""Returns biggest font size from style_dict for a set of style class names.
[:returns:] (float) biggest font size OR 1 if style_dict is empty
"""
if bool(self.style_dict):
sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True)
return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1
else:
return 1
def get_line_number(self, y):
"""Returns line number id for element at y.
[:return:] (int) line number id or -1
"""
if len(self.line_numbers) > 0:
result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ]
return result_list[0] if len(result_list) > 0 else -1
else:
return -1
@classmethod
- def get_pages_from_xml_file(cls, xml_file, status_contains='', word_selection_function=None):
+ def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
"""Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
[optional: instantiation depends on the fulfilment of a status_contains
and/or on the selection of some words by a word_selection_function].
"""
source_tree = ET.parse(xml_file)
if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION:
page = cls(xml_source_file=xml_file)
if word_selection_function is None or len(word_selection_function(page.words)) > 0:
return [ page ]
else:
return []
elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
pages = []
- xpath = '//page/@output'\
- if status_contains == ''\
- else '//page[contains(@status, "{0}")]/@output'.format(status_contains)
+ xpath = '//page/@output'
+ if status_contains != '' and status_not_contain != '':
+ xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
+ elif status_contains != '':
+ xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
+ elif status_not_contain != '':
+ xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
for xml_source_file in source_tree.xpath(xpath):
if isfile(xml_source_file):
pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
return pages
else:
return []
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = { 'number': { 'class': str, 'cardinality': 1},\
'faksimile_image': { 'class': FaksimileImage, 'cardinality': 1},\
'orientation': { 'class': str, 'cardinality': 1},\
'svg_image': { 'class': SVGImage, 'cardinality': 1}}
properties.update(cls.create_semantic_property_dictionary('text_field', TextField,\
cardinality=1, name='pageIsOnTextField', label='page is on text field',\
comment='Relates a page to the text field on a faksimile image.'))
for key in [ 'lines', 'words', 'writing_processes', 'word_deletion_paths', 'word_insertion_marks']:
properties.update(cls.create_semantic_property_dictionary(key, list))
dictionary.update({'class': class_dict})
dictionary.update({'properties': properties})
return dictionary
def init_line_numbers(self, line_numbers, document_bottom):
"""Init line numbers.
"""
even_index = 0
MINABOVE = 1
self.line_numbers = []
if len(line_numbers) > 0:
first_line_bottom = line_numbers[even_index].top - MINABOVE
self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom))
self.line_numbers.append(line_numbers[even_index])
even_index += 1
while even_index < len(line_numbers):
self.line_numbers.append(LineNumber(id=line_numbers[even_index].id-1,\
top=line_numbers[even_index-1].bottom+MINABOVE,\
bottom=line_numbers[even_index].top-MINABOVE))
self.line_numbers.append(line_numbers[even_index])
even_index += 1
self.line_numbers.append(LineNumber(id=line_numbers[even_index-1].id+1,\
top=line_numbers[even_index-1].bottom+MINABOVE,\
bottom=document_bottom))
for line_number in self.line_numbers:
line_number.attach_object_to_tree(self.page_tree)
def init_node_objects(self):
"""Initialize all node objects.
"""
self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ]
self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ]
self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ]
self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
self.word_deletion_paths = [ Path(node=node) for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG) ]
if self.faksimile_image is not None and self.text_field is not None:
for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks:
simple_word.init_word(self)
for wim in self.word_insertion_marks:
if wim.line_number > -1:
wim.line = [ line for line in self.lines if line.id == wim.line_number ][0]
def is_locked(self):
"""Return true if page is locked.
"""
return len(self.page_tree.xpath('//metadata/lock')) > 0
def lock(self, reference_file, message=''):
"""Lock tree such that ids of words etc. correspond to ids
in reference_file, optionally add a message that will be shown.
"""
if not self.is_locked():
metadata = self.page_tree.xpath('./metadata')[0]\
if len(self.page_tree.xpath('./metadata')) > 0\
else ET.SubElement(self.page_tree.getroot(), 'metadata')
lock = ET.SubElement(metadata, 'lock')
ET.SubElement(lock, 'reference-file').text = reference_file
if message != '':
ET.SubElement(lock, 'message').text = message
-
- def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
- """Marks all words that intersect with deletion paths as deleted
- and adds these paths to word_deletion_paths.
-
- [:return:] list of .path.Path that might be word_underline_paths
- """
- if not Page.UNITTESTING:
- bar = Bar('mark words that intersect with deletion paths', max=len(self.words))
- for word in self.words:
- not bool(Page.UNITTESTING) and bar.next()
- word.deleted = False
- for transkription_position in word.transkription_positions:
- word_path = Path.create_path_from_transkription_position(transkription_position,\
- tr_xmin=tr_xmin, tr_ymin=tr_ymin)
- intersecting_paths = [ deletion_path for deletion_path in deletion_paths\
- if do_paths_intersect_saveMode(deletion_path.path, word_path.path) ]
- if len(intersecting_paths) > 0:
- transkription_position.deleted = True
- for deletion_path in intersecting_paths:
- if deletion_path not in self.word_deletion_paths:
- deletion_path.tag = Path.WORD_DELETION_PATH_TAG
- deletion_path.attach_object_to_tree(self.page_tree)
- self.word_deletion_paths.append(deletion_path)
- word.partition_according_to_writing_process_id()
- word.partition_according_to_deletion()
- not bool(Page.UNITTESTING) and bar.finish()
- # return those paths in deletion_paths that are not in self.word_deletion_paths
- return [ word_underline_path for word_underline_path in set(deletion_paths) - set(self.word_deletion_paths) ]
-
- def process_word_boxes(self, box_paths, transkription_field, paths=None, attributes=None, max_line=17):
- """Process word boxes: partition words according to word boxes.
- """
- MAX_HEIGHT_LINES = 1
- if not Page.UNITTESTING:
- bar = Bar('process word boxes', max=len(self.words))
- svg_tree = ET.parse(self.source)
- namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
- allpaths_on_margin_field = []
- if paths is None or attributes is None:
- paths, attributes = svg_to_paths.svg2paths(self.source)
- for index in range(0, len(paths)):
- path = paths[index]
- xmin, xmax, ymin, ymax = path.bbox()
- attribute = attributes[index]
- if len(path) > 0\
- and path != transkription_field.path\
- and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
- or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
- and abs(ymax-ymin) < max_line:
- allpaths_on_margin_field.append(Path(id=index, path=path, style_class=attribute.get('class')))
- box_line_number_dict = {}
- for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
- line_number = self.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin))
- if line_number not in box_line_number_dict.keys():
- box_line_number_dict.update({ line_number: [ box_path ]})
- else:
- box_line_number_dict.get(line_number).append(box_path)
- boxes = []
- for line_number in box_line_number_dict.keys():
- box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
- margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
- if self.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\
- key=lambda path: path.get_x())
- threshold = 3 if line_number % 2 == 0 else 1.5
- for box_path in box_paths_on_line:
- box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
- transkription_field=transkription_field, namespaces=namespaces, threshold=threshold)
- if box is not None:
- boxes.append(box)
- for word in self.words:
- not bool(Page.UNITTESTING) and bar.next()
- word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
- not bool(Page.UNITTESTING) and bar.finish()
def unlock(self):
"""Lock tree such that ids of words etc. correspond to ids
in reference_file, optionally add a message that will be shown.
"""
if self.is_locked():
lock = self.page_tree.xpath('//metadata/lock')[0]
lock.getparent().remove(lock)
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
"""Update word ids and attach them to page.page_tree.
"""
if not self.is_locked():
update_function_on_word = [ update_function_on_word ]\
if type(update_function_on_word) != list\
else update_function_on_word
for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
node.getparent().remove(node)
for index, word in enumerate(self.words):
word.id = index
for func in update_function_on_word:
if callable(func):
func(word)
word.attach_word_to_tree(self.page_tree)
for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
mark_foreign_hands.id = index
if MarkForeignHands in include_special_words_of_type:
for func in update_function_on_word:
if callable(func):
func(mark_foreign_hands)
mark_foreign_hands.attach_word_to_tree(self.page_tree)
for index, text_connection_mark in enumerate(self.text_connection_marks):
text_connection_mark.id = index
if TextConnectionMark in include_special_words_of_type:
for func in update_function_on_word:
if callable(func):
func(text_connection_mark)
text_connection_mark.attach_word_to_tree(self.page_tree)
else:
print('locked')
+ def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
+ """Update the data source of page.
+ """
+ if faksimile_svgFile is not None:
+ self.faksimile_svgFile = faksimile_svgFile
+ data_node = self.page_tree.xpath('.//data-source')[0]\
+ if len(self.page_tree.xpath('.//data-source')) > 0\
+ else ET.SubElement(self.page_tree.getroot(), 'data-source')
+ data_node.set('file', self.faksimile_svgFile)
+ if xml_correction_file is not None:
+ data_node.set('xml-corrected-words', xml_correction_file)
+
def update_line_number_area(self, transkription_field, svg_tree=None):
"""Determines the width of the area where the line numbers are written in the page.source file.
"""
THRESHOLD = 0.4
if svg_tree is None:
svg_tree = ET.parse(self.source)
if len(self.line_numbers) > 1:
line_number = self.line_numbers[9]\
if transkription_field.is_page_verso() and len(self.line_numbers) > 8\
else self.line_numbers[1]
ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
and LineNumber.IS_A_LINE_NUMBER(item)\
and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ]
if len(ln_nodes) > 0:
matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
if transkription_field.is_page_verso():
transkription_field.add_line_number_area_width(matrix.getX())
elif self.svg_file is not None and isfile(self.svg_file):
svg_path_tree = ET.parse(self.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
svg_x = matrix.getX()
svg_y = self.line_numbers[1].bottom + transkription_field.ymin
use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
.format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
if len(use_nodes) > 0:
symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
if len(d_strings) > 0 and d_strings[0] != '':
path = parse_path(d_strings[0])
xmin, xmax, ymin, ymax = path.bbox()
width = xmax - xmin
transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
"""Adds a source to page and attaches it to page_tree.
"""
if transkription_field is None:
if self.source is None or not isfile(self.source):
raise FileNotFoundError('Page does not have a source!')
transkription_field = TranskriptionField(self.source)
self.page_type = Page.PAGE_VERSO\
if transkription_field.is_page_verso()\
else Page.PAGE_RECTO
self.page_tree.getroot().set('pageType', self.page_type)
-
-def do_paths_intersect_saveMode(path1, path2):
- """Returns true if paths intersect, false if not or if there was an exception.
- """
- try:
- return path1.intersect(path2, justonemode=True)
- except AssertionError:
- return False
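
A minimal usage sketch of the revised Page API (the module path and file names below are
hypothetical illustrations, not part of this revision):

    from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK

    # Select only those pages of a manuscript file whose status already contains
    # 'faksimile merged' but does not yet contain 'words processed'.
    pages = Page.get_pages_from_xml_file('xml/manuscript.xml',
                                         status_contains=STATUS_MERGED_OK,
                                         status_not_contain=STATUS_POSTMERGED_OK)
    for page in pages:
        # Record the faksimile SVG in the new <data-source/> element
        # (which replaces the former <faksimile-svg/> element).
        page.update_data_source(faksimile_svgFile='faksimile/some_page.svg')
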
Index: svgscripts/datatypes/word_insertion_mark.py
===================================================================
--- svgscripts/datatypes/word_insertion_mark.py (revision 71)
+++ svgscripts/datatypes/word_insertion_mark.py (revision 72)
@@ -1,138 +1,138 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word insertion mark.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from svgpathtools.parser import parse_path
import warnings
from .line import Line
from .positional_object import PositionalObject
from .word import Word
class WordInsertionMark(PositionalObject):
"""
This class represents a word insertion mark.
Args:
wim_node (etree.Element): element that contains information about a word_insertion_mark.
OR
id (int): word id
x (float)
y (float)
height (float)
width (float)
previous_word_id (int): id of the word to which word insertion mark is attached
inserted_words: Array->Word of inserted words marked by the word insertion mark.
"""
WARN_NO_GLYPH_ID = 'No glyph_id found'
XML_TAG = 'word-insertion-mark'
extraStringKeys = [ 'mark_type', 'symbol_id' ]
def __init__(self, wim_node=None, id=0, x=-1.0, y=-1.0, height=0, width=0, previous_word_id=-1, next_word_id=-1, line_number=-1, symbol_id=None, inserted_words=[], inserted_word_id=-1, mark_type='A'):
super(WordInsertionMark, self).__init__(id=id, node=wim_node, height=height, width=width, x=x, y=y, tag=WordInsertionMark.XML_TAG)
self.stringKeys += [ 'mark_type', 'symbol_id' ]
self.intKeys += [ 'line_number', 'next_word_id', 'previous_word_id' ]
self.symbol_id = symbol_id
self.mark_type = mark_type
self.line_number = line_number
self.line = None
self.previous_word_id = previous_word_id
self.next_word_id = next_word_id
if wim_node is not None:
self.mark_type = wim_node.get('mark-type')
self.line_number = int(wim_node.get('line-number')) if bool(wim_node.get('line-number')) else -1
self.previous_word_id = int(wim_node.get('previous-word-id')) if bool(wim_node.get('previous-word-id')) else -1
self.next_word_id = int(wim_node.get('next-word-id')) if bool(wim_node.get('next-word-id')) else -1
def init_inserted_words(self, inserted_words=[], wim_node=None, inserted_word_id_string=None):
if wim_node is not None and inserted_word_id_string is not None:
ids = inserted_word_id_string.split(' ')
inserted_words = [ Word.CREATE_WORD(word_node=word_node) for word_node in wim_node.getroottree().getroot().xpath('.//word[@id>="{0}" and @id<="{1}"]'.format(ids[0], ids[len(ids)-1])) ]
if len(inserted_words) > 0:
inserted_words[0].is_head_of_inserted_words = True
inserted_words[len(inserted_words)-1].is_tail_of_inserted_words = True
for word in inserted_words:
word.set_word_insertion_mark(self)
return inserted_words
def attach_and_update_word_if_involved(self, word):
if word.id == self.previous_word_id:
word.is_before_inserted_words = True
word.word_insertion_mark = self
elif word.id == self.next_word_id:
word.is_after_inserted_words = True
word.word_insertion_mark = self
elif word.id in [ inserted.id for inserted in self.inserted_words ]:
word = [ inserted for inserted in self.inserted_words if inserted.id == word.id ][0]
return word
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = super(cls,cls).get_semantic_dictionary()
word_dicts = { key: { 'class': Word, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality',\
'label': 'has {} word'.format(key.replace('_word_id','')),\
'name': 'has{}'.format(key.title().replace('_Id','').replace('_','')) }\
for key in [ 'previous_word_id', 'next_word_id' ] }
dictionary['properties'].update(word_dicts)
dictionary['properties'].update({'line': {'class': Line, 'cardinality': 1,\
'name': 'wordInsertionMarkBelongsToLine', 'label': 'word insertion mark belongs to a specific line'}})
- for extraStringKey in cls.extraStringKeys:
- dictionary['properties'].update(cls.create_semantic_property_dictionary(extraStringKey, str, cardinality=1))
+ dictionary['properties'].update(cls.create_semantic_property_dictionary('mark_type', str, cardinality=1))
+ dictionary['properties'].update(cls.create_semantic_property_dictionary('symbol_id', str, cardinality=1, cardinality_restriction='maxCardinality'))
return dictionary
@staticmethod
def CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=0, x=0.0, y=0.0, xmin=0.0, ymin=0.0, line_number=-1, mark_type='A'):
"""Creates a (datatypes.word_insertion_mark) WordInsertionMark
using a (lxml.ElementTree) svg_path_tree and the corresponding namespaces.
"""
THRESHOLD = 0.4
svg_x = x + xmin
svg_y = y + ymin
use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
.format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
if len(use_nodes) > 0:
symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
height = 0.0
width = 0.0
if len(d_strings) > 0 and d_strings[0] != '':
path = parse_path(d_strings[0])
xmin, xmax, ymin, ymax = path.bbox()
width = xmax - xmin
height = ymax - ymin
return WordInsertionMark(id=id, x=x, y=y-height, height=height, width=width, line_number=line_number,\
mark_type=mark_type, symbol_id=symbol_id)
else:
warnings.warn('{} for word insertion mark {} on line {}'.format(WordInsertionMark.WARN_NO_GLYPH_ID, id, line_number))
return WordInsertionMark(id=id, x=x, y=y, line_number=line_number, mark_type=mark_type)
Index: svgscripts/datatypes/image.py
===================================================================
--- svgscripts/datatypes/image.py (revision 71)
+++ svgscripts/datatypes/image.py (revision 72)
@@ -1,116 +1,116 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent all image types.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
import sys
from .attachable_object import AttachableObject
from .text_field import TextField
sys.path.append('py2ttl')
from class_spec import SemanticClass
class Image(AttachableObject,SemanticClass):
"""
This super class represents all types of images.
Args:
file_name (str): name of the image file.
node (lxml.etree.Element) node, containing information
URL (str): URL of image file.
height (float): height of image
width (float): width of image
text_field (.text_field.TextField) text_field on image representation
"""
stringKeys = [ 'file_name', 'URL', 'local_path' ]
floatKeys = [ 'height', 'width' ]
XML_TAG = 'image'
def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
self.text_field = text_field
if node is not None:
self.file_name = node.get('file-name')
self.local_path = node.get('local-path')
self.URL = node.get('URL')
self.height = float(node.get('height'))
self.width = float(node.get('width'))
if len(node.findall(TextField.XML_TAG)) > 0:
self.text_field = TextField(node=node.find(TextField.XML_TAG))
else:
self.tag = tag
self.file_name = file_name
self.local_path = local_path
self.URL = URL
self.height = height
self.width = width
def attach_object_to_tree(self, target_tree):
"""Attach object to tree.
"""
obj_node = target_tree.getroot().find('.//' + self.tag) \
if(len(target_tree.getroot().findall('.//' + self.tag)) > 0) \
else ET.SubElement(target_tree.getroot(), self.tag)
for key in self.floatKeys:
if self.__dict__[key] is not None:
obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3)))
for key in self.stringKeys:
if self.__dict__[key] is not None:
obj_node.set(key.replace('_','-'), self.__dict__[key])
if self.text_field is not None:
self.text_field.attach_object_to_tree(obj_node)
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = {}
for floatKey in Image.floatKeys:
- properties.update(cls.create_semantic_property_dictionary(floatKey, float))
+ properties.update(cls.create_semantic_property_dictionary(floatKey, float, cardinality=1))
properties.update(cls.create_semantic_property_dictionary('file_name', str, cardinality=1))
- properties.update(cls.create_semantic_property_dictionary('URL', str))
+ #properties.update(cls.create_semantic_property_dictionary('URL', str))
dictionary.update({'class': class_dict})
dictionary.update({'properties': properties})
return dictionary
class SVGImage(Image):
"""This class represents a svg image.
"""
XML_TAG = 'svg-image'
def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
if node is not None and node.tag != self.XML_TAG:
file_name = node.get('file')
height = float(node.get('height')) if bool(node.get('height')) else 0.0
width = float(node.get('width')) if bool(node.get('width')) else 0.0
node = None
super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\
height=height, width=width, text_field=text_field, tag=self.XML_TAG)
Index: svgscripts/datatypes/simple_word.py
===================================================================
--- svgscripts/datatypes/simple_word.py (revision 71)
+++ svgscripts/datatypes/simple_word.py (revision 72)
@@ -1,118 +1,121 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent a simple word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import abc
from lxml import etree as ET
import sys
from .line import Line
from .faksimile_position import FaksimilePosition
from .transkription_position import TranskriptionPosition
from .word_position import WordPosition
sys.path.append('py2ttl')
from class_spec import SemanticClass
class SimpleWord(SemanticClass, metaclass=abc.ABCMeta):
"""
This class represents a simple word.
"""
XML_TAG = 'simple-word'
XML_SUB_TAG = 'content'
def __init__(self, id=0, line_number=-1, line=None, text='', deleted=False, transkription_positions=None, faksimile_positions=None):
self.id = id
self.text = text
self.line_number = line_number
self.lines = []
if line is not None:
self.lines.append(line)
self.transkription_positions = transkription_positions if transkription_positions is not None else []
self.faksimile_positions = faksimile_positions if faksimile_positions is not None else []
def attach_word_to_tree(self, target_tree):
"""Attaches word to tree target_tree.
"""
if target_tree.__class__.__name__ == '_ElementTree':
target_tree = target_tree.getroot()
if len(target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)) > 0:
word_node = target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)[0]
word_node.getparent().remove(word_node)
word_node = ET.SubElement(target_tree, self.XML_TAG, attrib={'id': str(self.id)})
word_node.set('text', self.text)
if self.line_number > -1:
word_node.set('line-number', str(self.line_number))
for transkription_position in self.transkription_positions:
transkription_position.attach_object_to_tree(word_node)
for faksimile_position in self.faksimile_positions:
faksimile_position.attach_object_to_tree(word_node)
return word_node
@classmethod
def create_cls(cls, word_node):
"""Creates a cls from a (lxml.Element) node.
[:return:] cls
"""
if word_node is not None: # init word from xml node
id = int(word_node.get('id'))
line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else -1
text = word_node.get('text')
transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
return cls(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
faksimile_positions=faksimile_positions)
else:
error_msg = 'word_node has not been defined'
raise Exception('Error: {}'.format(error_msg))
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
- properties = { 'lines': {'class': Line, 'cardinality': 1,\
+ properties = { 'lines': {'class': Line,\
+ 'cardinality': 1,\
'cardinality_restriction': 'minCardinality',\
'name': 'wordBelongsToLine',\
'label': 'word belongs to a line',\
'comment': 'Relating a word to a line.'}}
properties.update(cls.create_semantic_property_dictionary('transkription_positions', list, cardinality=1, cardinality_restriction='minCardinality'))
properties.update(cls.create_semantic_property_dictionary('faksimile_positions', list, cardinality=1, cardinality_restriction='minCardinality'))
properties.update(cls.create_semantic_property_dictionary('text', str, cardinality=1))
dictionary.update({'class': class_dict})
dictionary.update({'properties': properties})
return dictionary
def init_word(self, page):
"""Initialize word with objects from page.
"""
+ for transkription_position in self.transkription_positions:
+ transkription_position.svg_image = page.svg_image
self.faksimile_positions = FaksimilePosition.create_list_of_cls(self.faksimile_positions, page.faksimile_image, page.text_field)
if self.line_number > -1:
self.lines += [ line for line in page.lines if line.id == self.line_number ]
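
The two lines added to init_word propagate the page's SVG image to every transkription
position of a word. A schematic sketch of the effect (import path and file name assumed;
note that Page only calls init_word when the page has both a faksimile image and a text field):

    from datatypes.page import Page  # import path assumed

    page = Page(xml_source_file='xml/some_page.xml')
    for word in page.words:
        for tp in word.transkription_positions:
            assert tp.svg_image is page.svg_image
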
Index: svgscripts/datatypes/faksimile_image.py
===================================================================
--- svgscripts/datatypes/faksimile_image.py (revision 71)
+++ svgscripts/datatypes/faksimile_image.py (revision 72)
@@ -1,102 +1,103 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent faksimile images.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import fnmatch
from lxml import etree as ET
import os
from os.path import basename, dirname, isfile, realpath, sep
import sys
from .image import Image
from .text_field import TextField
sys.path.append('svgscripts')
from local_config import FAKSIMILE_LOCATION
class FaksimileImage(Image):
"""
This class represents a faksimile image.
Args:
file_name (str): name of the image file.
node (lxml.etree.Element) node, containing information
URL (str): URL of image file.
height (float): height of image
width (float): width of image
x (float): x
y (float): y
"""
XML_TAG = 'faksimile-image'
NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/'
def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, text_field=None):
super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL, local_path=local_path,\
height=height, width=width, text_field=text_field, tag=self.XML_TAG)
self.x = x
self.y = y
def get_image_joined_with_text_field(self, text_field):
"""Returns a new instance of itself that has a text_field (text_field.TextField).
"""
return FaksimileImage(file_name=self.file_name, local_path=self.local_path, URL=self.URL, height=self.height,\
width=self.width, x=self.x, y=self.y, text_field=text_field)
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = super(FaksimileImage,cls).get_semantic_dictionary()
dictionary['properties'].update(cls.create_semantic_property_dictionary('text_field', TextField))
+ dictionary['properties'].update(cls.create_semantic_property_dictionary('URL', str, cardinality=1))
return dictionary
@staticmethod
def CREATE_IMAGE(image_node, source_file=None):
"""Instantiates a FaksimileImage from a (lxml.etree.Element) image_node.
"""
namespaces = image_node.nsmap
if len(namespaces) == 0:
namespaces = { 'xlink': '' }
local_path = image_node.get('{%s}href' % namespaces['xlink'])
file_name = basename(local_path)
if file_name != local_path and source_file is not None:
local_path = realpath(dirname(source_file)) + sep + local_path
local_path = realpath(local_path)
if not isfile(local_path):
local_path = None
for path, dirs, files in os.walk(os.path.abspath(FAKSIMILE_LOCATION)):
for filename in fnmatch.filter(files, file_name):
local_path = os.path.join(path, filename)
break
URL = FaksimileImage.NIETZSCHE_SOURCES_URL + file_name.replace('.jpg','')
height = float(image_node.get('height')) if bool(image_node.get('height')) else 0.0
width = float(image_node.get('width')) if bool(image_node.get('width')) else 0.0
x = float(image_node.get('x')) if bool(image_node.get('x')) else 0.0
y = float(image_node.get('y')) if bool(image_node.get('y')) else 0.0
return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y)
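
Taken together, the image.py and faksimile_image.py changes declare the float properties of
Image with cardinality 1 and move the 'URL' property from the generic Image semantics to
FaksimileImage. A small sketch of the resulting property dictionaries (import paths assumed):

    from datatypes.image import Image
    from datatypes.faksimile_image import FaksimileImage

    image_props = Image.get_semantic_dictionary()['properties']
    faksimile_props = FaksimileImage.get_semantic_dictionary()['properties']

    assert 'URL' not in image_props       # commented out in Image.get_semantic_dictionary
    assert 'URL' in faksimile_props       # added in FaksimileImage.get_semantic_dictionary
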
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py (revision 71)
+++ svgscripts/datatypes/word.py (revision 72)
@@ -1,509 +1,555 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from operator import attrgetter
import sys
import warnings
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess
sys.path.append('py2ttl')
from class_spec import SemanticClass
+def execute_function_on_parts(word_parts, func_name):
+ """Execute function on parts and add those parts instead of original word to word_parts.
+
+ :return: new word_parts, output from func
+ """
+ copy_parts = word_parts[:]
+ for word in word_parts:
+ output = eval('word.{0}()'.format(func_name))
+ if len(word.word_parts) > 0:
+ for part_word in word.word_parts:
+ copy_parts.insert(copy_parts.index(word), part_word)
+ copy_parts.remove(word)
+ word.word_parts = []
+ return copy_parts, output
+
+def update_transkription_position_ids(word):
+ """Update transkription_position' ids according to index.
+ """
+ for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))):
+ transkription_position.id = index
+
class Word(SimpleWord):
"""
This class represents a word.
"""
DATA = 'debug-data'
XML_TAG = 'word'
XML_EARLIER_VERSION = 'earlier-version'
def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None):
super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
faksimile_positions=faksimile_positions)
self.deleted = deleted
self.debug_container = {}
if len([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ]) > len(self.text):
self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ])
self.word_part_objs = word_part_objs if word_part_objs is not None else []
self.is_head_of_inserted_words = False
self.is_tail_of_inserted_words = False
self.is_before_inserted_words = False
self.is_after_inserted_words = False
self.word_insertion_mark = None
self.debug_msg = None
self.writing_process_id = writing_process_id
self.writing_processes = []
self.word_parts = word_parts if word_parts is not None else []
self.earlier_version = earlier_version
self.box_paths = box_paths if box_paths is not None else []
def attach_word_to_tree(self, target_tree):
"""Attaches word to tree target_tree.
"""
word_node = super(Word,self).attach_word_to_tree(target_tree)
if self.deleted is not None:
word_node.set('deleted', str(self.deleted).lower())
if self.writing_process_id > -1:
word_node.set('writing-process-id', str(self.writing_process_id))
for word_part in self.word_parts:
word_part.attach_word_to_tree(word_node)
if self.earlier_version is not None:
earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
self.earlier_version.attach_word_to_tree(earlier_node)
for index, box_path in enumerate(self.box_paths):
box_path.id = index
box_path.attach_object_to_tree(word_node)
return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
"""Returns true if transkription_positions belong to different WritingProcesses.
"""
if len(self.word_parts) > 0 and include_parts:
return len(set(word.writing_process_id for word in self.word_parts)) > 1
return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1
+ @classmethod
+ def create_cls(cls, word_node):
+ """Creates a word from a (lxml.Element) node.
+
+ [:return:] Word
+ """
+ cls = super(Word,cls).create_cls(word_node)
+ cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
+ cls.split_strings = None
+ if bool(word_node.get('split')):
+ cls.split_strings = word_node.get('split').split(' ')
+ if ''.join(cls.split_strings) != cls.text:
+ error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
+ format(word_node.getroottree().docinfo.URL, str(cls.id))\
+ + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
+ + 'Text attribute: "{0}".\n'.format(cls.text)
+ raise Exception(error_msg)
+ cls.deleted = word_node.get('deleted') == 'true'\
+ if bool(word_node.get('deleted')) else None
+ cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_TAG) ]
+ cls.box_paths = [ Path(node=node) for node in word_node.xpath('.//' + Path.BOX_TAG ) ]
+ earlier_versions = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ]
+ if len(earlier_versions) > 0:
+ cls.earlier_version = earlier_versions[0]
+ return cls
+
+ @staticmethod
+ def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
+ """Creates a word from a (lxml.Element) node or word_part_objs.
+
+ [:return:] Word
+ """
+ if word_node is not None: # init word from xml node
+ id = int(word_node.get('id'))
+ line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
+ text = word_node.get('text')
+ deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
+ transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
+ faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
+ word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
+ if len(word_node.findall('.//' + Word.DATA)) > 0\
+ else [ item.attrib for item in word_node.findall('.//part')]
+ return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
+ faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
+ elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
+ WIDTH = 5
+ TOPCORRECTION = 2.0
+ FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
+ height = height
+ x = round(float(word_part_objs[0]['x']), 3)
+ if(page is not None and bool(page.style_dict)):
+ HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
+ style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
+ biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
+ height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
+ TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
+ if endSign is not None and '%' in endSign:
+ lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
+ for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
+ if bool(page.style_dict[key].get('font-size'))]
+ lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
+ endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
+ elif endSign is not None and '%' in endSign:
+ endX = float(endX) + WIDTH
+ bottom = round(float(word_part_objs[0]['y']), 3)
+ y = round(bottom - height + TOPCORRECTION, 3)
+ width = round(float(endX) - x, 3)
+ transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
+ text = ''.join([ dict['text'] for dict in word_part_objs])
+ line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
+ word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
+ word.debug_msg = debug_msg
+ return word
+ else:
+ error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
+ raise Exception('Error: {}'.format(error_msg))
+
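A hedged usage sketch of the word_node branch of CREATE_WORD; the attribute values and the nested part element are made up, and the import assumes the svgscripts directory (where datatypes lives) is on sys.path:

    from lxml import etree as ET
    from datatypes.word import Word  # assumes svgscripts is the working directory

    word_node = ET.fromstring(
        '<word id="3" text="und" line-number="7" deleted="false">'
        '<part text="und" x="12.5" y="40.2" class="st5"/>'
        '</word>')
    word = Word.CREATE_WORD(word_node=word_node)
    print(word.id, word.text, word.line_number)  # 3 und 7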
+ @classmethod
+ def get_semantic_dictionary(cls):
+ """ Creates and returns a semantic dictionary as specified by SemanticClass.
+ """
+ dictionary = super(Word,cls).get_semantic_dictionary()
+ dictionary['properties'].update(cls.create_semantic_property_dictionary('deleted', bool,\
+ name='isWordDeleted', label='has word been deleted'))
+ #dictionary['properties'].update(cls.create_semantic_property_dictionary('writing_processes', WritingProcess, cardinality=1,\
+ # cardinality_restriction='minCardinality', name='wordBelongsToWritingProcess', label='word has been written in a specific writing process'))
+ # TODO: change me after fixing word box issue!!!!
+ dictionary['properties'].update(cls.create_semantic_property_dictionary('writing_processes', WritingProcess,\
+ name='wordBelongsToWritingProcess', label='word has been written in a specific writing process'))
+ #dictionary['properties'].update(cls.create_semantic_property_dictionary('word_parts', list,\
+ # name='wordHasWordParts', label='word has word parts', comment='word consists of a list of words'))
+ return dictionary
+
def get_partial_word_over_box(self):
"""Partition a word according to its transkription_positions' has_box
->split word and add partial words as its parts.
:return: word over box or self
"""
word_over_box = self
if self.has_mixed_status('has_box'):
transkription_positions = []
last_status = None
for transkription_position in self.transkription_positions:
if transkription_position.has_box != last_status\
and len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
self.word_parts.append(newWord)
if last_status:
word_over_box = newWord
transkription_positions = []
transkription_positions.append(transkription_position)
last_status = transkription_position.has_box
if len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
self.word_parts.append(newWord)
if last_status:
word_over_box = newWord
self.transkription_positions = []
self.line_number = -1
elif len(self.word_parts) > 0:
self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, 'get_partial_word_over_box')
return word_over_box
def has_mixed_status(self, property_key, include_parts=False):
"""Returns true if transkription_positions belong to different WritingProcesses.
"""
if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions):
return False
if len(self.word_parts) > 0 and include_parts:
if False in set(property_key in word.__dict__.keys() for word in self.word_parts):
return False
            return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1
return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1
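has_mixed_status boils down to two checks: every position carries the attribute at all, and the set of its values has more than one member. On plain dictionaries:

    positions = [{'deleted': False}, {'deleted': True}, {'deleted': True}]
    has_key_everywhere = all('deleted' in pos for pos in positions)
    mixed = has_key_everywhere and len({pos['deleted'] for pos in positions}) > 1
    print(mixed)  # True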
def init_word(self, page):
"""Initialize word with objects from page.
"""
super(Word,self).init_word(page)
if self.writing_process_id > -1:
self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ]
writing_processes = self.writing_processes
for word_part in self.word_parts:
word_part.init_word(page)
self.lines += word_part.lines
- self.writing_processes + word_part.writing_processes
+ self.writing_processes += word_part.writing_processes
self.lines = [ line for line in set(self.lines) ]
self.writing_processes = [ wp for wp in set(self.writing_processes)]
-
+
+ def join(self, other_word, append_at_end_of_new_word=True):
+ """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions.
+ """
+ if append_at_end_of_new_word:
+ self.text = self.text + other_word.text
+ for position in other_word.transkription_positions:
+ position.id = str(len(self.transkription_positions))
+ self.transkription_positions.append(position)
+ else:
+ self.text = other_word.text + self.text
+ index = 0
+ for position in other_word.transkription_positions:
+ self.transkription_positions.insert(index, position)
+ index += 1
+ while index < len(self.transkription_positions):
+ self.transkription_positions[index].id = str(index)
+ index += 1
+ self.simplify_transkription_positions()
+
def partition_according_to_deletion(self):
"""Partition a word according to its transkription_positions' deletion status
->split word and add partial words as its parts.
"""
if self.has_mixed_status('deleted'):
transkription_positions = []
last_status = None
for transkription_position in self.transkription_positions:
if transkription_position.deleted != last_status\
and len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
self.word_parts.append(newWord)
transkription_positions = []
transkription_positions.append(transkription_position)
last_status = transkription_position.deleted
if len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
self.word_parts.append(newWord)
self.transkription_positions = []
self.line_number = -1
self.deleted = False
elif len(self.word_parts) > 0:
self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
elif not self.deleted\
and len(self.transkription_positions) > 0\
and self.transkription_positions[0].deleted:
self.deleted = True
def partition_according_to_writing_process_id(self):
"""Partition a word according to its transkription_positions' writing_process_ids
->split word and add partial words as its parts.
"""
if self.belongs_to_multiple_writing_processes():
last_writing_process_id = -1
transkription_positions = []
for transkription_position in self.transkription_positions:
if transkription_position.writing_process_id != last_writing_process_id\
and len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
self.word_parts.append(newWord)
transkription_positions = []
transkription_positions.append(transkription_position)
last_writing_process_id = transkription_position.writing_process_id
if len(transkription_positions) > 0:
newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
self.word_parts.append(newWord)
self.transkription_positions = []
- self.line_number = -1
elif len(self.word_parts) > 0:
self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
if self.belongs_to_multiple_writing_processes(include_parts=True):
self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0]
elif len(self.transkription_positions) > 0:
self.writing_process_id = self.transkription_positions[0].writing_process_id
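Both partition methods above perform the same run-length grouping: walk the transkription positions in order and open a new part whenever the tracked value changes. A project-independent sketch of that grouping with itertools.groupby (plain tuples stand in for TranskriptionPosition objects):

    from itertools import groupby

    # (text, deleted) tuples standing in for transkription positions
    positions = [('a', False), ('b', False), ('c', True), ('d', True), ('e', False)]

    # consecutive positions with the same deletion status become one part,
    # as in partition_according_to_deletion
    parts = [(deleted, [text for text, _ in group])
             for deleted, group in groupby(positions, key=lambda pos: pos[1])]
    print(parts)  # [(False, ['a', 'b']), (True, ['c', 'd']), (False, ['e'])]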
def process_boxes(self, box_paths, parent_word=None, tr_xmin=0.0, tr_ymin=0.0):
"""Determines whether word is over a word box.
"""
test_case = len(box_paths) == 1
later_version_word = None
if len(self.word_parts) > 0:
for word in self.word_parts:
later_version = word.process_boxes(box_paths, parent_word=self, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
- if later_version is not None and later_version.earlier_version is not None:
+ if later_version_word is None and later_version is not None and later_version.earlier_version is not None:
later_version_word = later_version
else:
new_tp_dict = {}
for transkription_position in self.transkription_positions:
word_path = Path.create_path_from_transkription_position(transkription_position,\
tr_xmin=tr_xmin, tr_ymin=tr_ymin)
containing_boxes = [ box_path for box_path in box_paths\
if word_path.is_partially_contained_by(box_path)\
or box_path.do_paths_intersect(word_path) ]
+ #containing_boxes = [ box_path for box_path in box_paths\
+ # if box_path.contains_start_of_path(word_path) ]
+ if len(containing_boxes) > 0:
+ box_path = containing_boxes[0]
+ if box_path.contains_path(word_path):
+ transkription_position.has_box = box_path
+ elif box_path.contains_start_of_path(word_path):
+ split_position = box_path.path.bbox()[1] - tr_xmin
+ new_tps = transkription_position.split(split_position)
+ if len(new_tps) == 2:
+ new_tps[0].has_box = box_path
+ new_tp_dict.update({ transkription_position: new_tps })
+ else:
+ transkription_position.has_box = box_path
+ elif box_path.contains_end_of_path(word_path):
+ split_position = box_path.path.bbox()[0] - tr_xmin
+ new_tps = transkription_position.split(split_position)
+ if len(new_tps) == 2:
+ new_tps[1].has_box = box_path
+ new_tp_dict.update({ transkription_position: new_tps })
+ else:
+ transkription_position.has_box = box_path
+ else:
+ split_position1 = box_path.path.bbox()[0] - tr_xmin
+ split_position2 = box_path.path.bbox()[1] - tr_xmin
+ new_tps = transkription_position.split(split_position1, split_position2)
+ if len(new_tps) >= 2:
+ new_tps[1].has_box = box_path
+ new_tp_dict.update({ transkription_position: new_tps })
+ else:
+ transkription_position.has_box = box_path
+ for replace_tp in new_tp_dict.keys():
for tp in new_tp_dict.get(replace_tp):
self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
self.transkription_positions.remove(replace_tp)
update_transkription_position_ids(self)
later_version_word = self.get_partial_word_over_box()
if len(later_version_word.transkription_positions) > 0\
and later_version_word.transkription_positions[0].has_box is not None:
box_holder = self if parent_word is None else parent_word
box_holder.box_paths.append(later_version_word.transkription_positions[0].has_box)
box_text = later_version_word.transkription_positions[0].has_box.earlier_text
transkription_positions = TranskriptionPosition.copy_list_of_cls(later_version_word.transkription_positions)
later_version_word.earlier_version = Word(text=box_text, transkription_positions=transkription_positions)
#print(later_version_word.text, later_version_word.earlier_version.text)
return later_version_word
return later_version_word
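process_boxes distinguishes four cases per transkription position: the box covers it completely, covers only its start, covers only its end, or sits in its middle, and the position is split at the box's x-extent accordingly. A stand-alone sketch of that case analysis on plain (left, right) intervals (classify_overlap is a hypothetical helper, not project code):

    def classify_overlap(word_interval, box_interval):
        """Return how box_interval covers word_interval plus the split points.
        Intervals are (left, right) tuples; purely illustrative."""
        w_left, w_right = word_interval
        b_left, b_right = box_interval
        if b_left <= w_left and b_right >= w_right:
            return 'contains', []
        if b_left <= w_left < b_right:
            return 'contains_start', [b_right]
        if b_left < w_right <= b_right:
            return 'contains_end', [b_left]
        if w_left < b_left and b_right < w_right:
            return 'contains_middle', [b_left, b_right]
        return 'no_overlap', []

    print(classify_overlap((0, 10), (4, 6)))   # ('contains_middle', [4, 6])
    print(classify_overlap((0, 10), (-2, 3)))  # ('contains_start', [3])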
+ def set_word_insertion_mark(self, word_insertion_mark):
+ """Sets word_insertion_mark
+ """
+ self.word_insertion_mark = word_insertion_mark
+
+ def set_writing_process_id_to_transkription_positions(self, page):
+ """Determines the writing process id of the transkription_positions.
+ """
+ for transkription_position in self.transkription_positions:
+ if len(transkription_position.positional_word_parts) > 0:
+ for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
+ if font_key in page.fontsizekey2stage_mapping.keys():
+ transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
+
+
+ def simplify_transkription_positions(self):
+ """Merge transkription_positions if possible.
+ """
+ index = len(self.transkription_positions)-1
+ while index > 0\
+ and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
+ current_tp = self.transkription_positions[index]
+ index -= 1
+ previous_tp = self.transkription_positions[index]
+ if previous_tp.writing_process_id == current_tp.writing_process_id:
+ positional_word_parts = previous_tp.positional_word_parts
+ positional_word_parts += current_tp.positional_word_parts
+ transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
+ positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
+ if len(transkription_positions) == 1:
+ transkription_positions[0].writing_process_id = previous_tp.writing_process_id
+ self.transkription_positions.pop(index+1)
+ self.transkription_positions[index] = transkription_positions[0]
+ #print(self.text, len(self.transkription_positions))
+
def split(self, split_string, start_id=0):
"""Splits the word and returns an 3-tuple of new words.
"""
previousString, currentString, nextString = self.text.partition(split_string)
currentWord = None
previousWord = None
nextWord = None
previousIndex = 0
current_id = start_id
all_positional_word_parts = []
for position in self.transkription_positions:
all_positional_word_parts += position.positional_word_parts
if len(all_positional_word_parts) == 0:
warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
if len(previousString) > 0:
previous_pwps = []
while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
previous_pwps.append(all_positional_word_parts[previousIndex])
previousIndex += 1
if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
else:
previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
current_id += 1
all_positional_word_parts = all_positional_word_parts[previousIndex:]
if len(nextString) > 0:
tmp_pwps = []
index = 0
while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
tmp_pwps.append(all_positional_word_parts[index])
index += 1
if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
else:
next_pwps = all_positional_word_parts[index:]
next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
next_text = ''.join([ pwp.text for pwp in next_pwps ])
nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
all_positional_word_parts = all_positional_word_parts[:index]
current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
return previousWord, currentWord, nextWord
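Word.split relies on str.partition to cut the text into a before/match/after triple and then consumes positional word parts until each piece is covered; the string-level mechanics in isolation:

    text = 'Nachmittag'
    previous_string, current_string, next_string = text.partition('mit')
    print(previous_string, current_string, next_string)  # Nach mit tag

    # consume per-character parts until they spell the leading piece,
    # mirroring how split() collects positional word parts
    chars = list(text)
    consumed = []
    while ''.join(consumed) != previous_string:
        consumed.append(chars.pop(0))
    print(consumed, chars)  # ['N', 'a', 'c', 'h'] ['m', 'i', 't', 't', 'a', 'g']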
def split_according_to_status(self, status):
"""Split a word according to its transkription_positions' text.
:return: a list of new word.Word
"""
new_words = []
if self.has_mixed_status(status):
last_status = None
transkription_positions = []
copy_keys = [ 'line_number', 'text', 'deleted', 'writing_process_id' ]
for transkription_position in self.transkription_positions:
if transkription_position.__dict__[status] != last_status\
and len(transkription_positions) > 0:
newWord = Word(id=self.id+len(new_words), transkription_positions=transkription_positions)
for key in copy_keys:
if key != status and key in self.__dict__.keys():
newWord.__dict__[key] = self.__dict__[key]
newWord.__dict__[status] = transkription_positions[0].__dict__[status]
new_words.append(newWord)
transkription_positions = []
transkription_positions.append(transkription_position)
last_status = transkription_position.__dict__[status]
if len(transkription_positions) > 0:
newWord = Word(id=self.id+len(new_words), transkription_positions=transkription_positions)
for key in copy_keys:
if key != status and key in self.__dict__.keys():
newWord.__dict__[key] = self.__dict__[key]
newWord.__dict__[status] = transkription_positions[0].__dict__[status]
new_words.append(newWord)
return new_words
-
- def join(self, other_word, append_at_end_of_new_word=True):
- """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions.
- """
- if append_at_end_of_new_word:
- self.text = self.text + other_word.text
- for position in other_word.transkription_positions:
- position.id = str(len(self.transkription_positions))
- self.transkription_positions.append(position)
- else:
- self.text = other_word.text + self.text
- index = 0
- for position in other_word.transkription_positions:
- self.transkription_positions.insert(index, position)
- index += 1
- while index < len(self.transkription_positions):
- self.transkription_positions[index].id = str(index)
- index += 1
- self.simplify_transkription_positions()
-
- def set_word_insertion_mark(self, word_insertion_mark):
- """Sets word_insertion_mark
- """
- self.word_insertion_mark = word_insertion_mark
- def simplify_transkription_positions(self):
- """Merge transkription_positions if possible.
- """
- index = len(self.transkription_positions)-1
- while index > 0\
- and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
- current_tp = self.transkription_positions[index]
- index -= 1
- previous_tp = self.transkription_positions[index]
- if previous_tp.writing_process_id == current_tp.writing_process_id:
- positional_word_parts = previous_tp.positional_word_parts
- positional_word_parts += current_tp.positional_word_parts
- transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
- positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
- if len(transkription_positions) == 1:
- transkription_positions[0].writing_process_id = previous_tp.writing_process_id
- self.transkription_positions.pop(index+1)
- self.transkription_positions[index] = transkription_positions[0]
- #print(self.text, len(self.transkription_positions))
-
- @classmethod
- def create_cls(cls, word_node):
- """Creates a word from a (lxml.Element) node.
- [:return:] Word
- """
- cls = super(Word,cls).create_cls(word_node)
- cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
- cls.split_strings = None
- if bool(word_node.get('split')):
- cls.split_strings = word_node.get('split').split(' ')
- if ''.join(cls.split_strings) != cls.text:
- error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
- format(word_node.getroottree().docinfo.URL, str(cls.id))\
- + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
- + 'Text attribute: "{0}".\n'.format(cls.text)
- raise Exception(error_msg)
- cls.deleted = word_node.get('deleted') == 'true'\
- if bool(word_node.get('deleted')) else None
- cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_TAG) ]
- cls.box_paths = [ Path(node=node) for node in word_node.xpath('.//' + Path.BOX_TAG ) ]
- earlier_versions = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ]
- if len(earlier_versions) > 0:
- cls.earlier_version = earlier_versions[0]
- return cls
-
- @staticmethod
- def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
- """Creates a word from a (lxml.Element) node or word_part_objs.
-
- [:return:] Word
- """
- if word_node is not None: # init word from xml node
- id = int(word_node.get('id'))
- line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
- text = word_node.get('text')
- deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
- transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
- faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
- word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
- if len(word_node.findall('.//' + Word.DATA)) > 0\
- else [ item.attrib for item in word_node.findall('.//part')]
- return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
- faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
- elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
- WIDTH = 5
- TOPCORRECTION = 2.0
- FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
- height = height
- x = round(float(word_part_objs[0]['x']), 3)
- if(page is not None and bool(page.style_dict)):
- HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
- style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
- biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
- height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
- TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
- if endSign is not None and '%' in endSign:
- lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
- for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
- if bool(page.style_dict[key].get('font-size'))]
- lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
- endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
- elif endSign is not None and '%' in endSign:
- endX = float(endX) + WIDTH
- bottom = round(float(word_part_objs[0]['y']), 3)
- y = round(bottom - height + TOPCORRECTION, 3)
- width = round(float(endX) - x, 3)
- transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
- text = ''.join([ dict['text'] for dict in word_part_objs])
- line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
- word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
- word.debug_msg = debug_msg
- return word
- else:
- error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
- raise Exception('Error: {}'.format(error_msg))
-
- @classmethod
- def get_semantic_dictionary(cls):
- """ Creates and returns a semantic dictionary as specified by SemanticClass.
- """
- dictionary = super(Word,cls).get_semantic_dictionary()
- dictionary['properties'].update(cls.create_semantic_property_dictionary('deleted', bool,\
- name='isWordDeleted', label='has word been deleted'))
- dictionary['properties'].update(cls.create_semantic_property_dictionary('writing_processes', WritingProcess, cardinality=1,\
- cardinality_restriction='minCardinality', name='wordBelongsToWritingProcess', label='word has been written in a specific writing process'))
- dictionary['properties'].update(cls.create_semantic_property_dictionary('word_parts', list,\
- name='wordHasWordParts', label='word has word parts', comment='word consists of a list of words'))
- return dictionary
-
-def execute_function_on_parts(word_parts, func_name):
- """Execute function on parts and add those parts instead of original word to word_parts.
-
- :return: new word_parts, output from func
- """
- copy_parts = word_parts[:]
- for word in word_parts:
- output = eval('word.{0}()'.format(func_name))
- if len(word.word_parts) > 0:
- for part_word in word.word_parts:
- copy_parts.insert(copy_parts.index(word), part_word)
- copy_parts.remove(word)
- word.word_parts = []
- return copy_parts, output
-
-def update_transkription_position_ids(word):
- """Update transkription_position' ids according to index.
- """
- for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))):
- transkription_position.id = index
Index: svgscripts/process_words_post_merging.py
===================================================================
--- svgscripts/process_words_post_merging.py (revision 0)
+++ svgscripts/process_words_post_merging.py (revision 72)
@@ -0,0 +1,326 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This program can be used to process words after they have been merged with faksimile data.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
+
+from colorama import Fore, Style
+from deprecated import deprecated
+from functools import cmp_to_key
+import getopt
+import inspect
+import lxml.etree as ET
+import re
+import shutil
+import string
+from svgpathtools import svg2paths2, svg_to_paths
+import sys
+import tempfile
+from operator import attrgetter
+import os
+from os import listdir, sep, path, setpgrp, devnull
+from os.path import exists, isfile, isdir, dirname, basename
+from progress.bar import Bar
+import warnings
+
+if dirname(__file__) not in sys.path:
+ sys.path.append(dirname(__file__))
+
+from datatypes.box import Box
+from datatypes.mark_foreign_hands import MarkForeignHands
+from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
+from datatypes.path import Path
+from datatypes.text_connection_mark import TextConnectionMark
+from datatypes.transkriptionField import TranskriptionField
+from util import back_up
+from process_files import update_svgposfile_status
+
+sys.path.append('shared_util')
+from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+UNITTESTING = False
+
+def categorize_paths(page, transkription_field=None):
+ """Categorize all paths that are part of the transkription field.
+
+    :return: a dictionary containing a list for each category of path.
+ """
+ if page.source is not None and isfile(page.source):
+ MAX_HEIGHT_LINES = 1
+ max_line = sorted(\
+ [line_number.bottom-line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0],\
+ reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17
+ tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
+ tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
+ paths, attributes = svg_to_paths.svg2paths(page.source)
+ allpaths_on_tf = []
+ allpaths_outside_tf = []
+ attributes_outside_tf = []
+ if transkription_field is None:
+ transkription_field = TranskriptionField(page.source)
+ for index in range(0, len(paths)):
+ path = paths[index]
+ attribute = attributes[index]
+ if len(path) > 0\
+ and path != transkription_field.path\
+ and path.bbox()[0] > tr_xmin\
+ and path.bbox()[1] < transkription_field.xmax:
+ allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class')))
+ elif len(path) > 0\
+ and path != transkription_field.path:
+ allpaths_outside_tf.append(path)
+ attributes_outside_tf.append(attribute)
+ path_dict = { 'text_area_deletion_paths': [],\
+ 'deletion_or_underline_paths': [],\
+ 'box_paths': [],\
+ 'dots_paths': [],\
+ 'word_connector_paths': [],\
+ 'uncategorized_paths': [] }
+ for mypath in allpaths_on_tf:
+ xmin, xmax, ymin, ymax = mypath.path.bbox()
+ start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin)
+ if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1:
+ path_dict.get('dots_paths').append(mypath)
+ elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
+ path_dict.get('box_paths').append(mypath)
+ elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
+ path_dict.get('word_connector_paths').append(mypath)
+ elif abs(ymax-ymin) < MAX_HEIGHT_LINES:
+ path_dict.get('deletion_or_underline_paths').append(mypath)
+ elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin):
+ path_dict.get('text_area_deletion_paths').append(mypath)
+ else:
+ path_dict.get('uncategorized_paths').append(mypath)
+ underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin)
+ path_dict.update({'underline_path': underline_path})
+ process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\
+ paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)
+ return path_dict
+ elif not UNITTESTING:
+ error_msg = 'Svg source file {} does not exist!'.format(page.source)\
+ if page.source is not None else 'Page does not contain a source file!'
+ raise FileNotFoundError(error_msg)
+ return {}
+
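categorize_paths sorts each path by the size and shape of its bounding box: sub-pixel boxes are dots, closed continuous paths up to roughly a line's height are word boxes, taller open ones are word connectors, flat ones are deletions or underlines. The same geometric tests on a hand-written path, using svgpathtools directly (the thresholds mirror MAX_HEIGHT_LINES and the fallback max_line of 17 above):

    from svgpathtools.parser import parse_path

    MAX_HEIGHT_LINES = 1
    max_line = 17

    # a small closed rectangle, roughly what a word box looks like
    box = parse_path('M 10,10 L 60,10 L 60,22 L 10,22 Z')
    xmin, xmax, ymin, ymax = box.bbox()
    height = abs(ymax - ymin)
    is_box = (height > MAX_HEIGHT_LINES and height < max_line
              and box.iscontinuous() and box.isclosed())
    print(is_box)  # True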
+def do_paths_intersect_saveMode(path1, path2):
+ """Returns true if paths intersect, false if not or if there was an exception.
+ """
+ try:
+ return path1.intersect(path2, justonemode=True)
+ except AssertionError:
+ return False
+
+def find_special_words(page, transkription_field=None):
+ """Find special words, remove them from words, process their content.
+ """
+ if page.source is None or not isfile(page.source):
+ raise FileNotFoundError('Page does not have a source!')
+ if transkription_field is None:
+ transkription_field = TranskriptionField(page.source)
+ special_char_list = MarkForeignHands.get_special_char_list()
+ special_char_list += TextConnectionMark.get_special_char_list()
+ single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ]
+ if not UNITTESTING:
+ bar = Bar('find special words', max=len(single_char_words))
+ for word in single_char_words:
+ not bool(UNITTESTING) and bar.next()
+ if word.text == MarkForeignHands.CLASS_MARK:
+ id = len(page.mark_foreign_hands)
+ page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id))
+ page.words.remove(word)
+ elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\
+ or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\
+ and any(style in page.sonderzeichen_list for style\
+ in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))):
+ id = len(page.text_connection_marks)
+ page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id))
+ page.words.remove(word)
+ not bool(UNITTESTING) and bar.finish()
+ svg_tree = ET.parse(page.source)
+ page.update_page_type(transkription_field=transkription_field)
+ page.update_line_number_area(transkription_field, svg_tree=svg_tree)
+ italic_classes = [ key for key in page.style_dict\
+ if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ]
+ if len(page.mark_foreign_hands) > 0:
+ MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\
+ SonderzeichenList=page.sonderzeichen_list)
+ if len(page.text_connection_marks) > 0:
+ TextConnectionMark.find_content_in_footnotes(page.text_connection_marks, transkription_field, svg_tree,\
+ title=page.title, page_number=page.number)
+
+def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
+ """Marks all words that intersect with deletion paths as deleted
+ and adds these paths to word_deletion_paths.
+
+ [:return:] list of .path.Path that might be word_underline_paths
+ """
+ if not UNITTESTING:
+ bar = Bar('mark words that intersect with deletion paths', max=len(page.words))
+ for word in page.words:
+ not bool(UNITTESTING) and bar.next()
+ word.deleted = False
+ for transkription_position in word.transkription_positions:
+ word_path = Path.create_path_from_transkription_position(transkription_position,\
+ tr_xmin=tr_xmin, tr_ymin=tr_ymin)
+ intersecting_paths = [ deletion_path for deletion_path in deletion_paths\
+ if do_paths_intersect_saveMode(deletion_path.path, word_path.path) ]
+ if len(intersecting_paths) > 0:
+ transkription_position.deleted = True
+ for deletion_path in intersecting_paths:
+ if deletion_path not in page.word_deletion_paths:
+ deletion_path.tag = Path.WORD_DELETION_PATH_TAG
+ deletion_path.attach_object_to_tree(page.page_tree)
+ page.word_deletion_paths.append(deletion_path)
+ word.partition_according_to_deletion()
+ not bool(UNITTESTING) and bar.finish()
+ # return those paths in deletion_paths that are not in page.word_deletion_paths
+ return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ]
+
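do_paths_intersect_saveMode guards svgpathtools' Path.intersect, which can raise an AssertionError on degenerate input; the guarded call on two simple strokes, one horizontal strike-through and one vertical edge, looks like this:

    from svgpathtools.parser import parse_path

    def do_paths_intersect_safe(path1, path2):
        # same guard as do_paths_intersect_saveMode above
        try:
            return path1.intersect(path2, justonemode=True)
        except AssertionError:
            return False

    strike_through = parse_path('M 0,5 L 20,5')   # horizontal deletion stroke
    word_edge = parse_path('M 5,0 L 5,10')        # vertical edge of a word outline
    print(bool(do_paths_intersect_safe(strike_through, word_edge)))  # True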
+def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None):
+ """Process words after merging with faksimile word positions.
+ """
+ if page is None and svg_pos_file is None:
+ raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!')
+ if page is None:
+ page = Page(xml_source_file=svg_pos_file)
+ if page.source is None or not isfile(page.source):
+ raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file))
+ if svg_pos_file is None:
+ svg_pos_file = page.page_tree.docinfo.URL
+ if new_words is not None:
+ page.words = sorted(new_words, key=attrgetter('id'))
+ for word_node in page.page_tree.xpath('.//word'):
+ word_node.getparent().remove(word_node)
+ transkription_field = TranskriptionField(page.source)
+ find_special_words(page, transkription_field=transkription_field)
+ update_writing_process_ids(page)
+ #TODO: find_hyphenated_words(page)
+ categorize_paths(page, transkription_field=transkription_field)
+ page.update_and_attach_words2tree()
+ if not UNITTESTING:
+ if target_svg_pos_file is None:
+ target_svg_pos_file = svg_pos_file
+ status = STATUS_MERGED_OK + ":" + STATUS_POSTMERGED_OK
+ update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status)
+ write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
+
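A hedged usage sketch of the entry point above; the file name is hypothetical and stands for an svg word position file whose faksimile merge has already been done:

    # hypothetical file name; any existing svg word position xml whose page
    # carries a valid 'source' svg would do
    from process_words_post_merging import post_merging_processing_and_saving

    post_merging_processing_and_saving(svg_pos_file='xml/some_page.xml')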
+def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17):
+ """Process word boxes: partition words according to word boxes.
+ """
+ MAX_HEIGHT_LINES = 1
+ if not UNITTESTING:
+ bar = Bar('process word boxes', max=len(page.words))
+ svg_tree = ET.parse(page.source)
+ namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
+ allpaths_on_margin_field = []
+ if paths is None or attributes is None:
+ paths, attributes = svg_to_paths.svg2paths(page.source)
+ for index in range(0, len(paths)):
+ path = paths[index]
+ xmin, xmax, ymin, ymax = path.bbox()
+ attribute = attributes[index]
+ if len(path) > 0\
+ and path != transkription_field.path\
+ and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
+ or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
+ and abs(ymax-ymin) < max_line:
+ allpaths_on_margin_field.append(Path(id=index, path=path, style_class=attribute.get('class')))
+ box_line_number_dict = {}
+ for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
+ line_number = page.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin))
+ if line_number not in box_line_number_dict.keys():
+ box_line_number_dict.update({ line_number: [ box_path ]})
+ else:
+ box_line_number_dict.get(line_number).append(box_path)
+ boxes = []
+ for line_number in box_line_number_dict.keys():
+ box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
+ margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
+ if page.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\
+ key=lambda path: path.get_x())
+ threshold = 3 if line_number % 2 == 0 else 1.5
+ for box_path in box_paths_on_line:
+ box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
+ transkription_field=transkription_field, namespaces=namespaces, threshold=threshold)
+ if box is not None:
+ boxes.append(box)
+ for word in page.words:
+ not bool(UNITTESTING) and bar.next()
+ word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
+ not bool(UNITTESTING) and bar.finish()
+
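process_word_boxes first buckets the box paths by the line they sit on before matching them with the margin paths of that line; the bucketing itself is a plain dictionary-of-lists build:

    # (box_name, line_number) pairs standing in for box paths and get_line_number()
    boxes = [('box_a', 3), ('box_b', 5), ('box_c', 3)]

    box_line_number_dict = {}
    for name, line_number in boxes:
        box_line_number_dict.setdefault(line_number, []).append(name)
    print(box_line_number_dict)  # {3: ['box_a', 'box_c'], 5: ['box_b']}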
+def update_writing_process_ids(page):
+ """Update the writing_process_ids of the words and split accordingly.
+ """
+ for word in page.words:
+ word.set_writing_process_id_to_transkription_positions(page)
+ word.partition_according_to_writing_process_id()
+
+def usage():
+ """prints information on how to use the script
+ """
+ print(main.__doc__)
+
+def main(argv):
+ """This program can be used to process words after they have been merged with faksimile data.
+
+ svgscripts/process_words_post_merging.py [OPTIONS]
+
+        an xml file about a manuscript, containing information about its pages.
+        an xml file about a page, containing information about svg word positions.
+
+ OPTIONS:
+ -h|--help: show help
+
+ :return: exit code (int)
+ """
+ try:
+ opts, args = getopt.getopt(argv, "h", ["help"])
+ except getopt.GetoptError:
+ usage()
+ return 2
+ for opt, arg in opts:
+ if opt in ('-h', '--help'):
+ usage()
+ return 0
+ if len(args) < 1:
+ usage()
+ return 2
+ exit_status = 0
+ file_a = args[0]
+ if isfile(file_a):
+ manuscript_file = file_a\
+ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
+ else None
+ for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=STATUS_POSTMERGED_OK):
+ back_up(page, page.page_tree.docinfo.URL)
+ post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file)
+ else:
+ raise FileNotFoundError('File {} does not exist!'.format(file_a))
+ return exit_status
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py (revision 71)
+++ svgscripts/extractWordPosition.py (revision 72)
@@ -1,590 +1,586 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
import inspect
import getopt
from lxml import etree as ET
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from progress.bar import Bar
import re
import sys
import warnings
from datatypes.lineNumber import LineNumber
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.pdf import PDFText
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from datatypes.word_insertion_mark import WordInsertionMark
sys.path.append('shared_util')
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Extractor:
"""
This class can be used to extract the word positions in a svg file and write it to a xml file.
Args:
[xml_dir (str): target directory]
[title (str): title of document]
    [manuscript_file (str): xml file containing information about the archival unit to which the current page belongs]
    [extract_transkription_field_only (Boolean): if true, extract_word_position will only extract word positions that
    are part of the transkription field.]
"""
UNITTESTING = False
SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]
def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False, compare2pdf=False):
if bool(xml_dir):
self.xml_dir = xml_dir
not isdir(self.xml_dir) and mkdir(self.xml_dir)
else:
self.xml_dir = 'xml' if(isdir('xml')) else ''
self.latest_status = None
self.compare2pdf = compare2pdf
self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
self.title = title
self.manuscript_file = manuscript_file
self.extract_transkription_field_only = extract_transkription_field_only
self.manuscript_tree = None
if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
self.manuscript_tree = ET.parse(self.manuscript_file)
self.title = self.manuscript_tree.getroot().get('title')
elif bool(self.manuscript_file):
raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
elif bool(self.title):
self.update_title_and_manuscript(self.title, False)
def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None):
"""Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).
If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.
:returns: the new word counter (int)
"""
break_points = []
if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
for Sonderzeichen in self.SONDERZEICHEN_LIST:
contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
if True in contains_Sonderzeichen:
break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]]
for sz_point in [i for i, e in break_points]:
wim_index = len(page.word_insertion_marks)
x = float(word_part_objs[sz_point]['x'])
y = float(word_part_objs[sz_point]['y'])
if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None:
svg_path_tree = ET.parse(page.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
xmin = transkription_field.xmin
ymin = transkription_field.ymin
wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
page.word_insertion_marks.append(wim)
if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points
THRESHOLDX = 20 # Threshold between line number and text
last_x = -1
for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
if(last_x > -1 and (x - last_x > THRESHOLDX)):
break_points.append((i, i))
last_x = x
if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
from_index = 0
for end_point, next_from_index in break_points:
new_word_part_objs = word_part_objs[from_index:end_point]
new_endX = word_part_objs[end_point]['x']
from_index = next_from_index
index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
if from_index > 0 and from_index < len(word_part_objs):
new_word_part_objs = word_part_objs[from_index:]
index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
return index
else:
if len(word_part_objs) > 0:
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
debug_msg_string=debug_msg, transkription_field=transkription_field)
text = self.get_word_from_part_obj(word_part_objs)
line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
if line_number == -1:
if len(page.words) > 0:
lastWord = page.words[len(page.words)-1]
lastWord_lastTP = lastWord.transkription_positions[len(lastWord.transkription_positions)-1]
lastTP = transkription_positions[len(transkription_positions)-1]
if transkription_positions[0].left > lastWord_lastTP.left\
and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2:
line_number = lastWord.line_number
else:
line_number = lastWord.line_number+1
newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
page.words.append(newWord)
return int(index) + 1
else:
return int(index)
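add_word marks a break whenever the x-gap between consecutive glyphs exceeds THRESHOLDX, which is what separates line-number digits from the first word of a line. The detection in isolation, on made-up (text, x) pairs:

    THRESHOLDX = 20  # threshold between line number and text, as above

    glyphs = [('1', 10.0), ('2', 15.0), ('D', 52.0), ('a', 58.0), ('s', 64.0)]

    break_points = []
    last_x = -1
    for i, (_, x) in enumerate(glyphs):
        if last_x > -1 and x - last_x > THRESHOLDX:
            break_points.append((i, i))
        last_x = x
    print(break_points)  # [(2, 2)]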
def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default'):
"""Extracts information about positions of text elements and writes them to a xml file.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
exit_status = 0
with warnings.catch_warnings(record=record_warnings) as w:
warnings.simplefilter(warning_filter)
page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile)
status_message = 'OK'
if w is not None and len(w) > 0:
status_message = 'with warnings'
if True in [ str(warn.message).startswith(Page.WARNING_MISSING_USE_NODE4PWP) for warn in w ]:
status_message += ':{}:'.format(Page.WARNING_MISSING_USE_NODE4PWP.lower())
if True in [ str(warn.message).startswith(Page.WARNING_MISSING_GLYPH_ID4WIM) for warn in w ]:
status_message += ':{}:'.format(Page.WARNING_MISSING_GLYPH_ID4WIM.lower())
self.latest_status = status_message
exit_status = 1
else:
self.latest_status = None
page.page_tree.getroot().set('status', status_message)
write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition')
return exit_status
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None):
"""Extracts information about positions of text elements.
[:returns:] (datatypes.page) the Page containing all information.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
transkription_field = TranskriptionField(file_name) if bool(self.extract_transkription_field_only) else None
svg_tree = ET.parse(file_name)
page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\
svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only)
page.add_source(file_name)
sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot())
page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
if transkription_field is not None:
page.init_line_numbers(self.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax)
self.extract_word_position(svg_tree, page, transkription_field=transkription_field)
- #if page.pdfFile is not None and isfile(page.pdfFile):
- # pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST)
- # pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field, split_wrongly_concatenated_words=self.compare2pdf)
page.create_writing_processes_and_attach2tree()
- #page.categorize_paths(transkription_field=transkription_field)
page.update_and_attach_words2tree()
for word_insertion_mark in page.word_insertion_marks:
# it is not clear if we really need to know this alternative word ordering. See 'TODO.md'
#word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark)
word_insertion_mark.attach_object_to_tree(page.page_tree)
return page
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def extract_line_numbers(self, svg_tree, transkription_field):
"""Extracts line numbers and write them to a xml file.
"""
nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
line_numbers = [ LineNumber(raw_text_node=item, transkription_field=transkription_field)\
for item in filter(lambda x: LineNumber.IS_A_LINE_NUMBER(x), nodes_near_tf)]
if len(line_numbers) > 0:
MINABOVE = 3
last_to_position = transkription_field.ymin
for line_number in line_numbers:
above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE
bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom)
last_to_position = above_current_line_bottom
if len(bottoms) > 0:
current_line_top = float(bottoms[len(bottoms)-1]) - transkription_field.ymin + MINABOVE
line_number.setTop(current_line_top)
return line_numbers
def extract_word_position(self, svg_tree, page, transkription_field=None):
"""Extracts word positions.
"""
counter = 0
word_part_obj = []
endSign = '%'
last_matrix = None
MAXBOTTOMDIFF = 5
MAXXDIFF = 6
if not Extractor.UNITTESTING:
bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
# check for line breaks
if (last_matrix is not None and len(word_part_obj) > 0 and (\
Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
endSign = '%'
if(self.get_word_from_part_obj(word_part_obj) != ''):
debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
word_part_obj = []
endX = current_matrix.getX()
if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: