Page MenuHomec4science

extract_line_continuation.py
No OneTemporary

File Metadata

Created
Fri, Apr 26, 09:26

extract_line_continuation.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract line continuations.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import warnings
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from datatypes.box import text_node_is_inside_match_box, tspan_node_is_inside_match_box
from datatypes.line import Line
from datatypes.line_continuation import LineContinuation
from datatypes.matrix import Matrix
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.reference import Reference
from datatypes.transkriptionField import TranskriptionField
from util import back_up
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
# When True, this module skips its console output, the back up of page files
# and the writing of the resulting page tree (presumably set by unit tests).
UNITTESTING = False
# When True, extract_line_continuations prints the style key used to identify arrows.
DEBUG = False
def extract_line_continuations(page: Page, svg_file=None, warning_message='WARNING'):
    """Extract line continuations from a svg file and attach them to the lines of page.

    Arrow nodes (see _extract_arrow_nodes) are looked up with the style class whose
    font-family is 'Frutiger-Europeen'. For every arrow that can be assigned to a line
    of the page, a LineContinuation is appended to that line's editor_comments.
    Finally all lines are attached to the page tree and, unless UNITTESTING is set,
    the page tree is written back to its file.

    :param page: the page to update; the editor_comments of all its lines are reset first.
    :param svg_file: path of the svg file; defaults to page.source.
    :param warning_message: prefix of the warning issued if no line is found for an arrow.
    :raises Exception: if svg_file is None and page.source is not an existing file.
    """
    if svg_file is None:
        if page.source is None or not isfile(page.source):
            raise Exception('Function "extract_line_continuations" needs a page with a valid source or a svg_file!')
        svg_file = page.source
    if not UNITTESTING:
        print(Fore.CYAN + f'Extracting line continuations on {page.title}, {page.number} ...' + Style.RESET_ALL)
    svg_tree = ET.parse(svg_file)
    transkription_field = TranskriptionField(svg_file, multipage_index=page.multipage_index)
    # Without a text field on the page's svg image, y values are made relative
    # to the top of the transkription field.
    set_to_text_field_zero = (page.svg_image is None or page.svg_image.text_field is None)
    tr_ymin = transkription_field.ymin if set_to_text_field_zero else 0
    page.update_line_number_area(transkription_field, svg_tree=svg_tree, set_to_text_field_zero=set_to_text_field_zero)
    for line in page.lines:
        line.editor_comments = []
    # lxml maps the default namespace to None; xpath needs a named prefix.
    namespaces = {k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items()}
    # First style key that uses the arrow font, or None if the page has no such style.
    arrow_style_key = next((key for key, value in page.style_dict.items()
                            if value.get('font-family') == 'Frutiger-Europeen'), None)
    if arrow_style_key is not None:
        if DEBUG:
            print(arrow_style_key)
        for arrow in _extract_arrow_nodes(svg_tree, arrow_style_key, transkription_field, namespaces):
            # A tspan carries no transform of its own: use the one of its parent node.
            transform_node = arrow.getparent() if arrow.tag.endswith('tspan') else arrow
            matrix = Matrix(transform_matrix_string=transform_node.get('transform'))
            # Pass the matrix on so that it is not parsed a second time.
            line = _get_line_of_arrow(arrow, page, tr_ymin, matrix=matrix)
            if line is None:
                # Report the same y that was used for the line lookup.
                y = round(_get_arrow_y(arrow, matrix=matrix) - tr_ymin, 2)
                warnings.warn(f'{warning_message}: There is no line for {y}')
                continue
            # Look for a reference text on the "from" side first, then on the "to" side.
            for to_reference in (False, True):
                reference = _get_reference(svg_tree, arrow, matrix, transkription_field, namespaces,
                                           is_from_reference=not to_reference)
                if reference is not None:
                    break
            else:
                # No reference text on either side: use an empty reference and
                # derive the direction from the arrow's position.
                reference = Reference()
                to_reference = (matrix.getX() > transkription_field.xmax)
            line.editor_comments.append(LineContinuation(reference=reference, to_reference=to_reference))
    for line in page.lines:
        line.attach_object_to_tree(page.page_tree)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,
                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def _extract_arrow_nodes(svg_tree: ET.ElementTree, arrow_style_key: str, transkription_field=None, namespaces=None) ->list:
    """Extract arrow nodes from svg_tree.

    An arrow node is a text or tspan node that has arrow_style_key as one of its
    classes, whose text is ')' and that is on the marginalia (see node_is_on_marginalia).

    :param svg_tree: the parsed svg file.
    :param arrow_style_key: the class name that identifies the arrow style.
    :param transkription_field: defaults to a TranskriptionField created from the url of svg_tree.
    :param namespaces: prefix to namespace map for xpath; defaults to the nsmap of the
        root node with the default namespace mapped to 'ns'.
    :return: list of matching nodes
    """
    if transkription_field is None:
        transkription_field = TranskriptionField(svg_tree.docinfo.URL)
    if namespaces is None:
        namespaces = {k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items()}
    # Match the key as a whole whitespace separated class token: a plain
    # contains(@class, key) would also accept classes that merely contain the
    # key as a substring (e.g. key "st1" and class "st10").
    class_test = 'contains(concat(" ", normalize-space(@class), " "), " {0} ")'.format(arrow_style_key)
    candidates = svg_tree.xpath('//ns:text[{0}]|//ns:tspan[{0}]'.format(class_test), namespaces=namespaces)
    return [node for node in candidates
            if node.text == ')' and node_is_on_marginalia(node, transkription_field)]
def _get_arrow_y(arrow: ET.Element, matrix=None) ->float:
    """Return the y value of an arrow node.

    :param arrow: a text or tspan node.
    :param matrix: the matrix to use; if None, it is created from the 'transform'
        attribute of arrow or, if arrow is a tspan, of the parent of arrow.
    :return: matrix.getY() for a text node; for a tspan the result of adding the
        'y' attribute of the tspan by means of matrix.add2Y.
    """
    is_tspan = arrow.tag.endswith('tspan')
    if matrix is None:
        transform_node = arrow.getparent() if is_tspan else arrow
        matrix = Matrix(transform_matrix_string=transform_node.get('transform'))
    if not is_tspan:
        return matrix.getY()
    return matrix.add2Y(add_to_y=arrow.get('y'))
def _get_line_of_arrow(arrow: ET.Element, page: Page, tr_ymin: float, matrix=None) ->Line:
    """Return the Line next to arrow, or None if page has no such line.

    :param arrow: a text or tspan node.
    :param page: the page that provides get_line_number and the lines.
    :param tr_ymin: the value that is subtracted from the y of the arrow.
    :param matrix: optional matrix of the arrow, passed on to _get_arrow_y.
    """
    # The lookup uses the rounded relative y of the arrow, shifted up by 0.5.
    lookup_y = round(_get_arrow_y(arrow, matrix=matrix) - tr_ymin, 2) - .5
    wanted_id = page.get_line_number(lookup_y)
    for candidate in page.lines:
        if candidate.id == wanted_id:
            return candidate
    return None
def _get_reference(svg_tree: ET.ElementTree, arrow: ET.Element, arrow_matrix: Matrix, transkription_field: TranskriptionField, namespaces: dict, is_from_reference=True) ->Reference:
    """Return the reference that is written next to arrow, or None.

    The text of all text nodes inside a match box next to the arrow is joined,
    ordered by the x value of the nodes, and passed to Reference.create_cls.
    The box spans 5 units above and below the y of the arrow. If is_from_reference
    is True, the box ends at the x of the arrow and starts at 0 (arrow left of the
    transkription field) or at the right end of the field plus the line number
    area width. Otherwise the box starts at the x of the arrow and ends at the
    left end of the field minus the line number area width (arrow left of the
    field) or at the document width plus the line number area width.

    :param svg_tree: the parsed svg file.
    :param arrow: the arrow node, a text or tspan node.
    :param arrow_matrix: the matrix that applies to arrow.
    :param transkription_field: provides xmin, xmax, documentWidth and line_number_area_width.
    :param namespaces: prefix to namespace map for xpath, using the prefix 'ns'.
    :param is_from_reference: which side of the arrow is searched, see above.
    :return: the reference, or None if there is no text in the box or if the
        text could not be turned into a reference.
    """
    if arrow.tag.endswith('tspan'):
        arrow_left = arrow_matrix.add2X(add_to_x=arrow.get('x'))
    else:
        arrow_left = arrow_matrix.getX()
    arrow_y = _get_arrow_y(arrow, matrix=arrow_matrix)
    arrow_is_left_of_field = arrow_left < transkription_field.xmin
    if is_from_reference:
        xmin = 0 if arrow_is_left_of_field\
                else transkription_field.xmax + transkription_field.line_number_area_width
        xmax = arrow_left
    else:
        xmin = arrow_left
        xmax = transkription_field.xmin - transkription_field.line_number_area_width\
                if arrow_is_left_of_field\
                else transkription_field.documentWidth + transkription_field.line_number_area_width
    ymin = arrow_y - 5
    ymax = arrow_y + 5
    # NOTE(review): only arrow itself is excluded; if arrow is a tspan, its parent
    # text node is still a candidate -- confirm that this is intended.
    text_nodes_on_arrow_line = sorted(
        [text_node for text_node in svg_tree.xpath('//ns:text', namespaces=namespaces)
         if text_node != arrow and text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax)],
        key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX())
    text_parts = []
    for text_node in text_nodes_on_arrow_line:
        children = list(text_node)
        # A node without text has text None: treat it as empty instead of
        # failing with a TypeError while joining.
        if children:
            text_parts.extend(child.text or '' for child in children)
        else:
            text_parts.append(text_node.text or '')
    reference_string = ''.join(text_parts)
    if reference_string == '':
        return None
    try:
        return Reference.create_cls(reference_string=reference_string)
    except Exception:
        # Best effort: show the text that could not be parsed and go on without a reference.
        print(reference_string)
        return None
def node_is_on_marginalia(node: ET.Element, transkription_field: TranskriptionField) ->bool:
    """Return true if node is on marginalia.

    The node is tested against two match boxes that share the y range ymin to ymax
    of the transkription field: one with x from 0 to xmin of the field, one with
    x from xmax of the field to documentWidth. For tspan nodes the test is
    tspan_node_is_inside_match_box, for all other nodes text_node_is_inside_match_box.
    """
    if node.tag.endswith('tspan'):
        is_inside_match_box = tspan_node_is_inside_match_box
    else:
        is_inside_match_box = text_node_is_inside_match_box
    field = transkription_field
    is_on_left_margin = is_inside_match_box(node, 0, field.xmin, field.ymin, field.ymax)
    return is_on_left_margin\
            or is_inside_match_box(node, field.xmax, field.documentWidth, field.ymin, field.ymax)
def usage():
    """Print the docstring of main, which describes how to call this script.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to extract the line continuations.
    svgscripts/extract_line_continuation.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>
    <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
    <svg_pos_file> a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help show help
    :return: exit code (int)
    """
    # The docstring above is printed by usage(): keep its wording stable.
    try:
        opts, args = getopt.getopt(argv, "h", ["help" ])
    except getopt.GetoptError:
        usage()
        return 2
    if any(opt in ('-h', '--help') for opt, _ in opts):
        usage()
        return 0
    if not args:
        # The file argument is mandatory.
        usage()
        return 2
    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # NOTE(review): manuscript_file is not used below; the call is kept because
    # the effects of xml_has_type are not visible from here.
    if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a):
        manuscript_file = file_a
    else:
        manuscript_file = None
    processed_pages = 0
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
        if not UNITTESTING:
            # Back up the page file before it gets overwritten.
            back_up(page, page.xml_file)
        extract_line_continuations(page)
        processed_pages += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{processed_pages} pages processed]')
    return 0
# Script entry point: call main with the command line arguments (without the
# script name) and use its return value as exit code of the process.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

Event Timeline