Page MenuHomec4science

faksimile.py
No OneTemporary

File Metadata

Created
Wed, May 1, 03:24

faksimile.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a faksimile page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import re
from lxml import etree as ET
from os import path
from os.path import isdir, isfile, sep, basename
from svgpathtools.parser import parse_path
from .faksimile_image import FaksimileImage
from .matrix import Matrix
from .text_field import TextField
from .word_position import WordPosition
class FaksimilePage:
    """
    This class represents a faksimile page.

    Args:
        xml_source_file (str): name of the xml file that will be instantiated.
        xml_target_file (str): name of the xml file to which page info will be written.
        title (str): title that is written to the 'title' attribute of the root node.
        page_number: page number, written as a string to the 'page-number' attribute.
        svg_source_file (str): written to the 'svg-source-file' attribute of the root node.
        faksimile_image (FaksimileImage): image that is attached to the page tree.
        text_field (TextField): text field that is attached to the page tree.
    """
    XML_TAG = 'faksimile-page'

    def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None):
        """Loads the page from an existing xml file or creates an empty page tree.

        xml_source_file takes precedence over xml_target_file when both are given.
        If xml_target_file is given, all existing faksimile word positions
        are removed from the page tree.
        """
        xml_file = xml_source_file if xml_source_file is not None else xml_target_file
        self.title = title
        self.page_number = page_number
        self.xml_file = xml_file
        # Always define width and height, even when no xml file is read.
        self.width = 0.0
        self.height = 0.0
        if xml_file is not None and isfile(xml_file):
            parser = ET.XMLParser(remove_blank_text=True)
            self.page_tree = ET.parse(xml_file, parser)
            root = self.page_tree.getroot()
            # NOTE(review): the values stored in the file win over the arguments
            # for self.title/self.page_number, while the arguments are still
            # written to the tree below -- confirm that this is intended.
            self.title = root.get('title')
            self.page_number = root.get('page-number')
            if root.get('width'):
                self.width = float(root.get('width'))
            if root.get('height'):
                self.height = float(root.get('height'))
        else:
            self.page_tree = ET.ElementTree(ET.Element(self.XML_TAG))
            root = self.page_tree.getroot()
        if title is not None:
            root.set('title', title)
        if page_number is not None:
            root.set('page-number', str(page_number))
        if xml_target_file is not None:
            self.remove_tags_from_page_tree([WordPosition.FAKSIMILE])
        if svg_source_file is not None:
            root.set('svg-source-file', svg_source_file)
        if faksimile_image is not None:
            faksimile_image.attach_object_to_tree(self.page_tree)
        if text_field is not None:
            text_field.attach_object_to_tree(self.page_tree)
        # The object state is (re-)read from the tree, so that loaded and
        # freshly attached nodes are treated alike.
        self.svg_source_file = root.get('svg-source-file')
        image_node = root.find('.//' + FaksimileImage.XML_TAG)
        self.faksimile_image = FaksimileImage(node=image_node) if image_node is not None else None
        text_field_node = root.find('.//' + TextField.XML_TAG)
        self.text_field = TextField(node=text_field_node) if text_field_node is not None else None
        self.word_positions = [ WordPosition(node=node) for node in root.findall('.//' + WordPosition.FAKSIMILE) ]

    def append_word_position(self, word_position):
        """Appends word_position to word_positions and attaches it to page_tree.
        """
        self.word_positions.append(word_position)
        word_position.attach_object_to_tree(self.page_tree)

    @classmethod
    def get_faksimile_pages(cls, svg_file, page_number='') -> list:
        """Creates and returns the faksimile pages contained in a svg_file as a list.

        The file is parsed and handed over to GET_FAKSIMILEPAGES; the default
        namespace of the svg root is made available under the prefix 'ns'.
        """
        svg_tree = ET.parse(svg_file)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        return cls.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces, page_number=page_number)

    @staticmethod
    def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='') -> list:
        """Creates and returns the faksimile pages contained in a svg_tree as a list.

        Every rect whose id starts with the title string derived from the svg
        file name and ends with page_number is treated as a text field and
        yields one FaksimilePage. Titled rects and paths that
        get_paths_inside_rect reports for this text field become the word
        positions of that page.

        Args:
            svg_tree: parsed svg document (its docinfo.URL is used as source file name).
            namespaces (dict): prefix -> namespace mapping for xpath queries;
                               derived from the svg root if None.
            page_number: only text fields whose id ends with this value are used
                         (the default '' matches every id).
        """
        THRESHOLD_X = 10
        root = svg_tree.getroot()
        nsmap = root.nsmap
        if namespaces is None:
            namespaces = { k if k is not None else 'ns': v for k, v in nsmap.items() }
        source_file_name = svg_tree.docinfo.URL
        image = FaksimileImage.CREATE_IMAGE(root.find('.//image', nsmap), source_file_name)
        xml_dir = '.{}xml'.format(sep)
        faksimile_pages = list()
        title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name))
        if re.match(r'.*-\d+[a-z]$', title_string):
            title_string = re.sub(r'-\d+[a-z]$', '', title_string)
        title = title_string.replace('-', ' ')
        id_suffix = str(page_number)
        # The second argument of Element.get is a default value, not a namespace
        # map: rects without an id are skipped instead of raising an error.
        rect_list = []
        for rect in root.findall('.//rect', nsmap):
            rect_id = rect.get('id')
            if rect_id is not None and rect_id.startswith(title_string) and rect_id.endswith(id_suffix):
                rect_list.append(rect)
        for text_field_rect in rect_list:
            # The text field is positioned relative to the image.
            tf_x = float(text_field_rect.get('x')) - image.x
            tf_y = float(text_field_rect.get('y')) - image.y
            tf_width = float(text_field_rect.get('width'))
            tf_height = float(text_field_rect.get('height'))
            tf_matrix = Matrix(transform_matrix_string=text_field_rect.get('transform'))\
                    if text_field_rect.get('transform')\
                    else None
            text_field_id = text_field_rect.get('id')
            target_file_name = xml_dir + sep + text_field_id + '.xml' if isdir(xml_dir) else text_field_id + '.xml'
            # The page number is the part of the id after the last ',' or '_',
            # without leading zeros.
            field_page_number = re.sub(r'.*[,_]', '', text_field_id).lstrip('0')
            text_field = TextField(id=text_field_id, width=tf_width, height=tf_height, x=tf_x, y=tf_y, matrix=tf_matrix)
            faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\
                    title=title, page_number=field_page_number, faksimile_image=image, text_field=text_field)
            # Limits of the text field in the coordinates of the svg.
            x_min = text_field.xmin + image.x
            x_max = text_field.xmax + image.x - THRESHOLD_X
            y_min = text_field.ymin + image.y
            y_max = text_field.ymax + image.y
            rect_titles = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', x_min, x_max,\
                    y_min, y_max, text_field.id, namespaces=namespaces)
            rect_titles += get_paths_inside_rect(svg_tree, '//ns:path/ns:title', x_min, x_max,\
                    y_min, y_max, text_field.id, namespaces=namespaces)
            for rect_title in rect_titles:
                shape = rect_title.getparent()
                if shape.tag.endswith('path'):
                    # For a path the bounding box provides position and size.
                    x, xmax, y, ymax = parse_path(shape.get('d')).bbox()
                    width = xmax - x
                    height = ymax - y
                else:
                    x = float(shape.get('x'))
                    y = float(shape.get('y'))
                    height = float(shape.get('height'))
                    width = float(shape.get('width'))
                matrix = Matrix(transform_matrix_string=shape.get('transform'))\
                        if shape.get('transform')\
                        else None
                # Remove whitespace that precedes or follows a punctuation mark.
                text = re.sub(r'(\s(?=[-;:.,?!’–])|(?<=[-;:.,?!’–])\s)', '', rect_title.text)
                faksimile_page.append_word_position(\
                        WordPosition(id=shape.get('id'), text=text, height=height,\
                        width=width, x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE))
            faksimile_pages.append(faksimile_page)
        return faksimile_pages

    def remove_tags_from_page_tree(self, list_of_tags_to_remove):
        """Removes the tags specified in the list from the target tree.
        """
        for xpath2remove in list_of_tags_to_remove:
            for node in self.page_tree.xpath('//' + xpath2remove):
                node.getparent().remove(node)
def get_paths_inside_rect(svg_tree, xpath, x_min, x_max, y_min, y_max, not_id, namespaces=None):
    """Returns a list of all nodes selected by xpath that are inside x_min, x_max, y_min, y_max and do not have id == not_id.

    A selected node that is neither a rect nor a path (e.g. a title) is
    represented by its parent for the test, but the selected node itself is
    returned. A node counts as inside if the top left corner of its shape
    (x/y of a rect, minimum of the bounding box of a path) lies strictly
    between the limits. Paths without a 'd' attribute are ignored.

    Args:
        svg_tree: tree that is queried by means of xpath.
        xpath (str): xpath expression that selects the candidate nodes.
        x_min, x_max, y_min, y_max (float): exclusive limits for the corner.
        not_id (str): nodes whose shape has this id are skipped.
        namespaces (dict): prefix -> namespace mapping for the xpath query;
                           if empty or None it is derived from the svg root,
                           the default namespace getting the prefix 'ns'.
    """
    if not namespaces:
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    paths = []
    for selected_node in svg_tree.xpath(xpath, namespaces=namespaces):
        shape_node = selected_node
        if not shape_node.tag.endswith(('path', 'rect')):
            shape_node = shape_node.getparent()
        if shape_node.tag.endswith('rect'):
            # A missing coordinate is represented by -1.
            x = float(shape_node.get('x')) if shape_node.get('x') else -1
            y = float(shape_node.get('y')) if shape_node.get('y') else -1
        elif shape_node.tag.endswith('path') and shape_node.get('d'):
            x, _, y, _ = parse_path(shape_node.get('d')).bbox()
        else:
            continue
        if shape_node.get('transform'):
            matrix = Matrix(transform_matrix_string=shape_node.get('transform'))
            # Both new coordinates are computed from the untransformed point
            # (the right hand side is evaluated before x is rebound).
            # NOTE(review): assumes that get_new_x and get_new_y both expect
            # the original coordinates -- confirm against Matrix.
            x, y = matrix.get_new_x(x=x, y=y), matrix.get_new_y(x=x, y=y)
        if x_min < x < x_max\
        and y_min < y < y_max\
        and shape_node.get('id') != not_id:
            paths.append(selected_node)
    return paths

Event Timeline