Page MenuHomec4science

box.py
No OneTemporary

File Metadata

Created
Tue, May 7, 00:44
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent svg paths of type 'box'.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from svgpathtools.parser import parse_path
import warnings
from .matrix import Matrix
from .path import Path
from .transkriptionField import TranskriptionField
class Box(Path):
    """
    This represents box svg paths.

    Besides the attributes handled by Path, a Box stores the text that the box
    replaces ('earlier_text') and the style classes of that text
    ('text_style_class').

    Args:
        id (int): id passed on to Path.
        node (lxml.etree.Element): node, containing information. Non-empty
            'earlier-text' and 'text-style-class' attributes of the node take
            precedence over the corresponding keyword arguments.
        path (svgpathtools.path.Path): svg path representation.
        d_string (str): passed on to Path.
        style_class (str): passed on to Path.
        earlier_text (str): text associated with this box.
        text_style_class (str): space separated style classes of earlier_text.
        earlier_version (bool): stored unchanged on the instance.
    """
    XML_TAG = 'box-path'

    def __init__(self, id=0, node=None, path=None, d_string=None, style_class='', earlier_text='', text_style_class='', earlier_version=False):
        super(Box, self).__init__(id=id, node=node, path=path, d_string=d_string, style_class=style_class, tag=Box.XML_TAG)
        self.stringKeys += ['earlier_text', 'text_style_class']
        self.earlier_text = earlier_text
        self.text_style_class = text_style_class
        self.earlier_version = earlier_version
        if node is not None:
            # Only non-empty attribute values override the keyword arguments.
            node_earlier_text = node.get('earlier-text')
            if node_earlier_text:
                self.earlier_text = node_earlier_text
            node_text_style_class = node.get('text-style-class')
            if node_text_style_class:
                self.text_style_class = node_text_style_class

    @staticmethod
    def _join_style_classes(text_styles):
        """Merge class attribute values into one space separated string of unique class names.

        Missing class attributes (None) are skipped and the first-seen order of
        the class names is kept, so the result is deterministic.
        """
        unique_classes = {}
        for style in text_styles:
            if style:
                for item in style.split():
                    unique_classes.setdefault(item, None)
        return ' '.join(unique_classes)

    @classmethod
    def create_box(cls, path, margin_boxes_on_line, svg_source=None, svg_tree=None, transkription_field=None, namespaces=None, threshold=1.5):
        """Create a Box from a path and find its corresponding earlier_text outside of transkription_field.

        A margin box matches if its median y differs from the median y of path
        by less than threshold. The first matching margin box is removed from
        margin_boxes_on_line (the list is modified in place) and the text found
        inside its bounding box becomes the earlier_text of the new Box, with
        all spaces removed.

        Args:
            path: path object providing get_median_y(), path.bbox(), id and style_class.
            margin_boxes_on_line (list): candidate margin boxes; a used box is removed.
            svg_source: source of the svg file; if given, it is parsed and replaces svg_tree.
            svg_tree: parsed svg tree, used if svg_source is None.
            transkription_field: created from the svg if None.
            namespaces (dict): xpath namespaces; derived from the svg root if empty or None.
            threshold (float): maximal distance between the median y values.

        A warning is issued if no margin box matches or if no text is found
        inside the matching margin box.

        :return: box.Box, or None if no margin box matches
        """
        if svg_source is not None:
            svg_tree = ET.parse(svg_source)
        if not namespaces:
            # lxml uses None as key of the default namespace; xpath needs a real prefix.
            namespaces = {k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items()}
        if transkription_field is None:
            # NOTE(review): transkription_field is not used further down in this
            # method; the construction is kept so that behaviour stays unchanged.
            transkription_field = TranskriptionField(svg_source) if svg_source is not None\
                    else TranskriptionField(svg_tree.docinfo.URL)
        path_median_y = path.get_median_y()
        matching_boxes = [margin_box for margin_box in margin_boxes_on_line
                          if abs(margin_box.get_median_y() - path_median_y) < threshold]
        if not matching_boxes:
            warnings.warn(f'No margin box found for box with bbox: {path.path.bbox()}, {margin_boxes_on_line} {threshold}')
            return None
        matching_box = matching_boxes[0]
        margin_boxes_on_line.remove(matching_box)
        xmin, xmax, ymin, ymax = matching_box.path.bbox()
        if ymin == ymax:
            # The margin box has no height: use the vertical extent of the path instead.
            path_bbox = path.path.bbox()
            ymin, ymax = path_bbox[2], path_bbox[3]
        text_nodes = [text_node for text_node in svg_tree.xpath('//ns:text', namespaces=namespaces)
                      if text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax)]
        text_parts = []
        text_styles = []
        if text_nodes:
            text_nodes.sort(key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX())
            for text_node in text_nodes:
                child_tspans = text_node.xpath('./ns:tspan', namespaces=namespaces)
                if not child_tspans:
                    text_styles.append(text_node.get('class'))
                    # lxml reports a missing text as None.
                    text_parts.append(text_node.text or '')
                else:
                    matrix = Matrix(transform_matrix_string=text_node.get('transform'))
                    for tspan_node in child_tspans:
                        if matrix.add2X(add_to_x=tspan_node.get('x')) < xmax:
                            text_styles.append(tspan_node.get('class'))
                            text_parts.append(tspan_node.text or '')
        else:
            # Fall back to tspans lying inside the box although their text node does not.
            tspan_nodes = [tspan_node for tspan_node in svg_tree.xpath('//ns:text/ns:tspan', namespaces=namespaces)
                           if tspan_node_is_inside_match_box(tspan_node, xmin, xmax, ymin, ymax)]
            if tspan_nodes:
                for tspan_node in tspan_nodes:
                    text_styles.append(tspan_node.get('class'))
                    text_parts.append(tspan_node.text or '')
            else:
                warnings.warn('No text_node found for xmin, xmax, ymin, ymax: {0} {1} {2} {3}'.format(xmin, xmax, ymin, ymax))
        box_text = ''.join(text_parts)
        text_style_class = cls._join_style_classes(text_styles)
        return Box(id=path.id, path=path.path, style_class=path.style_class,
                   earlier_text=box_text.replace(' ', ''), text_style_class=text_style_class)

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.

        The dictionary of the super class is extended by the string property 'earlier_text'.
        """
        dictionary = super(Box, cls).get_semantic_dictionary()
        dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_text', str))
        return cls.return_dictionary_after_updating_super_classes(dictionary)
def text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax):
    """Tell whether text_node lies strictly inside the box xmin, xmax, ymin, ymax.

    The position is taken from the Matrix built from the 'transform'
    attribute of text_node. A node without (or with an empty) 'transform'
    attribute is never inside.
    """
    transform = text_node.get('transform')
    if not transform:
        return False
    position = Matrix(transform_matrix_string=transform)
    # Check the vertical position first, then the horizontal one.
    if not ymin < position.getY() < ymax:
        return False
    return xmin < position.getX() < xmax
def tspan_node_is_inside_match_box(tspan_node, xmin, xmax, ymin, ymax):
    """Tell whether tspan_node lies strictly inside the box xmin, xmax, ymin, ymax.

    The y value comes from the Matrix built from the 'transform' attribute of
    the parent of tspan_node; the x value is what that Matrix returns from
    add2X for the 'x' attribute of tspan_node. A tspan whose parent has no
    (or an empty) 'transform' attribute is never inside.
    """
    parent_transform = tspan_node.getparent().get('transform')
    if not parent_transform:
        return False
    parent_matrix = Matrix(transform_matrix_string=parent_transform)
    tspan_x = parent_matrix.add2X(add_to_x=tspan_node.get('x'))
    # Check the vertical position first, then the horizontal one.
    if not ymin < parent_matrix.getY() < ymax:
        return False
    return xmin < tspan_x < xmax

Event Timeline