extract_line_continuation.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Apr 26, 09:26

extract_line_continuation.py
View Options

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	""" This program can be used to extract line continuations.
	"""
	# Copyright (C) University of Basel 2019 {{{1
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}

	from colorama import Fore, Style
	import getopt
	import lxml.etree as ET
	import re
	import sys
	from os import listdir, sep, path
	from os.path import isfile, isdir, dirname
	import warnings



	# Module metadata: authorship, contact, development status, license and version.
	__author__ = "Christian Steiner"
	__maintainer__ = __author__
	__copyright__ = 'University of Basel'
	__email__ = "christian.steiner@unibas.ch"
	__status__ = "Development"
	__license__ = "GPL v3"
	__version__ = "0.0.1"

	from datatypes.box import text_node_is_inside_match_box, tspan_node_is_inside_match_box
	from datatypes.line import Line
	from datatypes.line_continuation import LineContinuation
	from datatypes.matrix import Matrix
	from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
	from datatypes.reference import Reference
	from datatypes.transkriptionField import TranskriptionField

	from util import back_up
	sys.path.append('shared_util')
	from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT

	# When True, console output, page back-ups and writing the result file are skipped.
	UNITTESTING = False
	# When True, the detected arrow style key is printed during extraction.
	DEBUG = False

	def extract_line_continuations(page: Page, svg_file=None, warning_message='WARNING'):
	    """Extract line continuations and attach them to the lines of page.

	    Every arrow node returned by _extract_arrow_nodes is assigned to a line
	    of the page. The text beside the arrow is looked up with _get_reference
	    and stored as a LineContinuation in the ``editor_comments`` of that line.
	    Previously stored ``editor_comments`` of all lines are discarded.

	    :param page: the page whose lines receive the line continuations.
	    :param svg_file: the svg file to parse; defaults to ``page.source``.
	    :param warning_message: prefix of the warning issued if no line
	        corresponds to an arrow.
	    :raises Exception: if svg_file is None and page.source is not an
	        existing file.
	    """
	    if svg_file is None:
	        if page.source is None or not isfile(page.source):
	            raise Exception('Function "extract_line_continuations" needs a page with a valid source or a svg_file!')
	        svg_file = page.source
	    if not UNITTESTING:
	        print(Fore.CYAN + f'Extracting line continuations on {page.title}, {page.number} ...' + Style.RESET_ALL)
	    svg_tree = ET.parse(svg_file)
	    transkription_field = TranskriptionField(svg_file, multipage_index=page.multipage_index)
	    # Without a text field on the svg image, coordinates are made relative to
	    # the transkription field; otherwise they are used as they are.
	    set_to_text_field_zero = (page.svg_image is None or page.svg_image.text_field is None)
	    tr_ymin = transkription_field.ymin if set_to_text_field_zero else 0
	    page.update_line_number_area(transkription_field, svg_tree=svg_tree, set_to_text_field_zero=set_to_text_field_zero)
	    for line in page.lines:
	        line.editor_comments = []
	    # The default namespace (key None) cannot be used in XPath, hence 'ns'.
	    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
	    # Arrows are identified by the first style that uses this font family.
	    arrow_style_key = next((key for key, value in page.style_dict.items()
	                            if value.get('font-family') == 'Frutiger-Europeen'), None)
	    if arrow_style_key is not None:
	        if DEBUG:
	            print(arrow_style_key)
	        for arrow in _extract_arrow_nodes(svg_tree, arrow_style_key, transkription_field, namespaces):
	            # A tspan carries no transform of its own: use the parent's.
	            transform_node = arrow.getparent() if arrow.tag.endswith('tspan') else arrow
	            matrix = Matrix(transform_matrix_string=transform_node.get('transform'))
	            line = _get_line_of_arrow(arrow, page, tr_ymin, matrix=matrix)
	            if line is None:
	                # NOTE(review): for tspan arrows this y ignores the tspan's
	                # own 'y' attribute, unlike the line lookup -- confirm.
	                y = round(matrix.getY() - tr_ymin, 2)
	                warnings.warn(f'{warning_message}: There is no line for {y}')
	                continue
	            # First search for a "from" reference; if there is none,
	            # search for a "to" reference.
	            reference = None
	            for is_from_reference in (True, False):
	                reference = _get_reference(svg_tree, arrow, matrix, transkription_field, namespaces, is_from_reference=is_from_reference)
	                if reference is not None:
	                    break
	            if reference is not None:
	                line.editor_comments.append(LineContinuation(reference=reference, to_reference=not is_from_reference))
	            else:
	                # No reference text found: store an empty reference and
	                # derive the direction from the side the arrow is on.
	                to_reference = (matrix.getX() > transkription_field.xmax)
	                line.editor_comments.append(LineContinuation(reference=Reference(), to_reference=to_reference))
	    # NOTE(review): attaching and writing happen for every page, also when
	    # no arrow style was found -- confirm this is the intended nesting.
	    for line in page.lines:
	        line.attach_object_to_tree(page.page_tree)
	    if not UNITTESTING:
	        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,
	                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)

	def _extract_arrow_nodes(svg_tree: ET.ElementTree, arrow_style_key: str, transkription_field=None, namespaces=None) ->list:
	    """Extract arrow nodes from svg_tree.

	    :param svg_tree: the parsed svg document.
	    :param arrow_style_key: class name that marks arrow nodes.
	    :param transkription_field: defaults to a TranskriptionField created
	        from the URL of svg_tree.
	    :param namespaces: prefix -> namespace map for XPath; defaults to the
	        nsmap of the root node with the default namespace named 'ns'.
	    :return: the text and tspan nodes whose class attribute contains
	        arrow_style_key, whose text is ')' and that are on the marginalia.
	    """
	    if transkription_field is None:
	        transkription_field = TranskriptionField(svg_tree.docinfo.URL)
	    if namespaces is None:
	        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
	    # '|' is the XPath union operator: match text nodes as well as tspan nodes.
	    xpath = '//ns:text[contains(@class, "{0}")]|//ns:tspan[contains(@class, "{0}")]'.format(arrow_style_key)
	    return [ arrow for arrow in svg_tree.xpath(xpath, namespaces=namespaces)
	             if arrow.text == ')' and node_is_on_marginalia(arrow, transkription_field) ]

	def _get_arrow_y(arrow: ET.Element, matrix=None) ->float:
	    """Return the y coordinate of an arrow node.

	    If no matrix is passed, it is created from the 'transform' attribute of
	    the arrow, or of its parent node if the arrow is a tspan. For a tspan
	    the 'y' attribute of the arrow is added to the y of the matrix.
	    """
	    is_tspan = arrow.tag.endswith('tspan')
	    if matrix is None:
	        transform_source = arrow.getparent() if is_tspan else arrow
	        matrix = Matrix(transform_matrix_string=transform_source.get('transform'))
	    if not is_tspan:
	        return matrix.getY()
	    return matrix.add2Y(add_to_y=arrow.get('y'))

	def _get_line_of_arrow(arrow: ET.Element, page: Page, tr_ymin: float, matrix=None) ->Line:
	    """Return the Line next to arrow, or None if the page has no such line.

	    The line number is requested from the page for the y of the arrow,
	    taken relative to tr_ymin, rounded to two digits and reduced by 0.5.
	    """
	    relative_y = round(_get_arrow_y(arrow, matrix=matrix) - tr_ymin, 2)
	    wanted_id = page.get_line_number(relative_y - .5)
	    for candidate in page.lines:
	        if candidate.id == wanted_id:
	            return candidate
	    return None

	def _get_reference(svg_tree: ET.ElementTree, arrow: ET.Element, arrow_matrix: Matrix, transkription_field: TranskriptionField, namespaces: dict, is_from_reference=True) ->Reference:
	    """Return the reference written beside arrow, or None.

	    The text of all text nodes inside a search box is concatenated from
	    left to right and passed to Reference.create_cls. The box reaches 5
	    units above and below the y of the arrow. Horizontally it lies before
	    the arrow if is_from_reference is True and behind the arrow otherwise.

	    :param svg_tree: the parsed svg document.
	    :param arrow: the arrow node (text or tspan).
	    :param arrow_matrix: the transform matrix that applies to arrow.
	    :param transkription_field: supplies the horizontal limits of the box.
	    :param namespaces: prefix -> namespace map for XPath, using 'ns'.
	    :param is_from_reference: search before (True) or behind (False) the arrow.
	    :return: the reference, or None if no text was found or if the text
	        could not be converted to a reference.
	    """
	    is_tspan = arrow.tag.endswith('tspan')
	    arrow_left = arrow_matrix.add2X(add_to_x=arrow.get('x')) if is_tspan else arrow_matrix.getX()
	    arrow_y = _get_arrow_y(arrow, matrix=arrow_matrix)
	    arrow_is_left_of_field = arrow_left < transkription_field.xmin
	    if is_from_reference:
	        xmin = 0\
	                if arrow_is_left_of_field\
	                else transkription_field.xmax + transkription_field.line_number_area_width
	        xmax = arrow_left
	    else:
	        xmin = arrow_left
	        xmax = transkription_field.xmin - transkription_field.line_number_area_width\
	                if arrow_is_left_of_field\
	                else transkription_field.documentWidth + transkription_field.line_number_area_width
	    ymin = arrow_y - 5
	    ymax = arrow_y + 5
	    text_nodes_on_arrow_line = sorted(
	        [ text_node for text_node in svg_tree.xpath('//ns:text', namespaces=namespaces)
	          if text_node != arrow and text_node_is_inside_match_box(text_node, xmin, xmax, ymin, ymax) ],
	        key=lambda node: Matrix(transform_matrix_string=node.get('transform')).getX())
	    text_parts = []
	    for text_node in text_nodes_on_arrow_line:
	        children = text_node.getchildren()
	        # A node without text has text None: treat it as an empty string
	        # instead of failing with a TypeError.
	        if len(children) > 0:
	            text_parts.extend(child.text or '' for child in children)
	        else:
	            text_parts.append(text_node.text or '')
	    reference_string = ''.join(text_parts)
	    if reference_string == '':
	        return None
	    try:
	        return Reference.create_cls(reference_string=reference_string)
	    except Exception:
	        # Best effort: show the text that could not be converted and
	        # report that there is no reference.
	        print(reference_string)
	        return None

	def node_is_on_marginalia(node: ET.Element, transkription_field: TranskriptionField) ->bool:
	    """Return true if node is left or right of the transkription field.

	    The node is tested against the box between x=0 and the xmin of the
	    transkription field and against the box between its xmax and its
	    documentWidth, both spanning ymin to ymax of the transkription field.
	    """
	    # tspan nodes and text nodes have their own box test.
	    is_inside_box = tspan_node_is_inside_match_box\
	            if node.tag.endswith('tspan')\
	            else text_node_is_inside_match_box
	    field = transkription_field
	    in_left_margin = is_inside_box(node, 0, field.xmin, field.ymin, field.ymax)
	    if in_left_margin:
	        return in_left_margin
	    return is_inside_box(node, field.xmax, field.documentWidth, field.ymin, field.ymax)

	def usage():
	    """Print the help text of the script, i.e. the docstring of main.
	    """
	    help_text = main.__doc__
	    print(help_text)

	def main(argv):
	    """This program can be used to extract the line continuations.

	    svgscripts/extract_line_continuation.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

	    <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
	    <svg_pos_file> a xml file about a page, containing information about svg word positions.

	    OPTIONS:
	    -h|--help show help

	    :return: exit code (int)
	    """
	    # The docstring above is the help text printed by usage(), so
	    # implementation notes are kept in comments:
	    # - returns 2 on invalid options or a missing file argument, 0 otherwise;
	    # - raises FileNotFoundError if the file argument does not exist.
	    try:
	        opts, args = getopt.getopt(argv, "h", ["help" ])
	    except getopt.GetoptError:
	        usage()
	        return 2
	    for opt, _ in opts:
	        if opt in ('-h', '--help'):
	            usage()
	            return 0
	    if len(args) < 1:
	        usage()
	        return 2
	    file_a = args[0]
	    if not isfile(file_a):
	        raise FileNotFoundError('File {} does not exist!'.format(file_a))
	    counter = 0
	    # Only pages whose status contains STATUS_MERGED_OK are processed.
	    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
	        if not UNITTESTING:
	            back_up(page, page.xml_file)
	        extract_line_continuations(page)
	        counter += 1
	    if not UNITTESTING:
	        print(Style.RESET_ALL + f'[{counter} pages processed]')
	    return 0

	# Run main with the command line arguments and use its return value as exit code.
	if __name__ == "__main__":
	    raise SystemExit(main(sys.argv[1:]))

extract_line_continuation.pyNo OneTemporaryActions

File Metadata

extract_line_continuation.pyView Options

Event Timeline

extract_line_continuation.py
No OneTemporary
Actions

extract_line_continuation.py
View Options