extractFaksimilePosition.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Jul 4, 14:31

extractFaksimilePosition.py
View Options

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	""" This program can be used to extract the position of the word hovers in a faksimile svg file and write them to a xml file.
	"""
	# Copyright (C) University of Basel 2019 {{{1
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}

	import re
	import getopt
	import sys
	from os import sep, listdir, mkdir, path
	from os.path import exists, isfile, isdir
	from datetime import datetime
	from lxml import etree as ET
	from svgpathtools import svg2paths2

	from myxmlwriter import write_pretty
	from datatypes.faksimile import Faksimile
	from datatypes.faksimile_image import FaksimileImage
	from datatypes.matrix import Matrix
	from datatypes.transkriptionField import TranskriptionField
	from datatypes.word import Word

	# Module metadata: authorship, contact, license and version of this script.
	__author__ = "Christian Steiner"
	__maintainer__ = __author__
	__copyright__ = 'University of Basel'
	__email__ = "christian.steiner@unibas.ch"
	__status__ = "Development"
	__license__ = "GPL v3"
	__version__ = "0.0.1"

	class Extractor:
		"""Extract the word hover positions of a faksimile svg file and write them to a xml file.

		Args:
			xml_dir (str): target directory; it is created if it does not exist.
				If omitted, an existing directory 'xml' in the current working
				directory is used, otherwise no directory prefix is applied.
			title (str): title of the document, used to build target file names.
			manuscript_file (str): stored on the instance; not consulted by the
				code of this class.
			extract_transkription_field_only (bool): stored on the instance; not
				consulted by the code of this class.

		NOTE(review): get_text_items, get_word_object_multi_char_x,
		get_word_from_part_obj and add_word are called on self but are not
		defined in this class -- they have to be supplied before
		extract_faksimile_word_position can run.

		TODO change everything!!!
		"""
		def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False):
			if xml_dir:
				self.xml_dir = xml_dir
				if not isdir(self.xml_dir):
					mkdir(self.xml_dir)
			else:
				self.xml_dir = 'xml' if isdir('xml') else ''
			# A non-empty xml_dir always ends with a separator; get_file_name relies
			# on this when it takes path.dirname(self.xml_dir).
			if self.xml_dir:
				self.xml_dir = self.xml_dir + sep
			self.title = title
			self.manuscript_file = manuscript_file
			self.extract_transkription_field_only = extract_transkription_field_only

		def get_page_number(self, file_name, page_number=None):
			"""Returns the page number as a string, left-padded with zeros to at least three characters.

			If page_number is empty, the last run of digits in file_name is used.
			Returns '' if no page number can be determined.
			"""
			if not page_number:
				digit_groups = re.findall(r'\d+', file_name)
				if digit_groups:
					page_number = digit_groups[-1]
			if not page_number:
				return ''
			# Convert before padding so that an int page number is accepted as well.
			return str(page_number).rjust(3, '0')

		def get_file_name(self, file_name, page_number=None):
			"""Returns the file_name of the target xml file.

			With a title the name is '<title>_page<number>.xml' (blanks in the title
			become underscores), otherwise the base name of file_name with '.svg'
			replaced by '.xml'. The name is prefixed with the xml directory, if any.
			"""
			dir_name = path.dirname(self.xml_dir) + sep if self.xml_dir else ''
			if self.title:
				return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml'
			return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml'))

		def _get_target_file_name(self, file_name, page_number=None, xml_target_file=None):
			"""Returns xml_target_file, or a generated name if it is empty, located in the xml directory."""
			if not xml_target_file:
				xml_target_file = self.get_file_name(file_name, page_number)
			# Only names without any directory part are moved into the xml directory,
			# so applying this method to its own result changes nothing.
			if self.xml_dir and not path.dirname(xml_target_file):
				xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
			return xml_target_file

		def extract_faksimile_word_position(self, svg_tree, page, transkription_field=None):
			"""Extracts faksimile word hover positions.

			Iterates over the text items of svg_tree, collects their text parts and
			hands every completed group of parts to self.add_word together with page.

			Args:
				svg_tree: parsed svg tree; its root and the root's nsmap are used.
				page: object passed to add_word; its letterspacing_list is read.
				transkription_field: passed on to get_text_items and Matrix.
			"""
			counter = 0
			word_part_obj = []
			endSign = '%'
			endX = None
			current_matrix = None
			last_matrix = None
			# Thresholds for the y and x distance above which a text item is no
			# longer treated as a continuation of the collected parts.
			MAXBOTTOMDIFF = 5
			MAXXDIFF = 6
			root = svg_tree.getroot()
			for text_item in self.get_text_items(root, transkription_field=transkription_field):
				current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
				# check for line breaks
				starts_new_word = bool(word_part_obj) and (
					(last_matrix is not None and (
						Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)
						or abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF
						or abs(current_matrix.getX() - word_part_obj[-1]['x']) > MAXXDIFF))
					or self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX())
				if starts_new_word:
					endSign = '%'
					if self.get_word_from_part_obj(word_part_obj) != '':
						debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(
							abs(current_matrix.getX() - word_part_obj[-1]['x']),
							abs(current_matrix.getY() - last_matrix.getY()),
							str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
						counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix.clone_transformation_matrix(), debug_msg=debug_msg)
					word_part_obj = []
				endX = current_matrix.getX()
				tspan_items = text_item.findall(".//tspan", root.nsmap)
				if len(tspan_items) < 1: # case: <svg><text>TEXT
					if text_item.text and not re.search(r'^\s*$', text_item.text):
						word_part_obj.append({ "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class') })
					else:
						# empty or whitespace-only text closes the collected parts
						endSign = text_item.text
						if self.get_word_from_part_obj(word_part_obj) != '':
							counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix.clone_transformation_matrix(), debug_msg=r'svg/text/\s')
						word_part_obj = []
						endSign = '%'
				for tspan_item in tspan_items: # case: <svg><text><tspan>TEXT
					endX = current_matrix.add2X(tspan_item.get('x'))
					if tspan_item.text and not re.search(r'^\s*$', tspan_item.text):
						y = current_matrix.add2Y(tspan_item.get('y'))
						word_part_obj.append({ "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class') })
						# A tspan without class attribute has no letterspacing class.
						tspan_classes = (tspan_item.get('class') or '').split(' ')
						if len(set(page.letterspacing_list) & set(tspan_classes)) > 0: # tspan has letterspacing class
							endSign = '%'
							if self.get_word_from_part_obj(word_part_obj) != '':
								counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix.clone_transformation_matrix(), debug_msg='tspan with letterspacing')
							word_part_obj = []
					else:
						# empty or whitespace-only text closes the collected parts
						endSign = tspan_item.text
						if self.get_word_from_part_obj(word_part_obj) != '':
							counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix.clone_transformation_matrix(), debug_msg=r'svg/text/tspan/\s')
						word_part_obj = []
						endSign = '%'
				last_matrix = current_matrix
			# Flush the parts that are still pending; without any text item there is
			# neither a matrix nor anything to flush.
			if current_matrix is not None and self.get_word_from_part_obj(word_part_obj) != '':
				self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix.clone_transformation_matrix(), debug_msg='end of loop')

		def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None):
			"""Extracts information about positions of text elements.

			Args:
				file_name (str): svg file to parse.
				page_number: passed on to the Page and used for the target file name.
				xml_target_file (str): target file; generated from file_name if empty.
				svg_file: accepted but not used here.

			[:returns:] (svgscripts.Page) the page containing all information.

			Raises:
				FileNotFoundError: if file_name is not an existing file.
			"""
			if not isfile(file_name):
				raise FileNotFoundError('"{}" is not an existing file!'.format(file_name))
			xml_target_file = self._get_target_file_name(file_name, page_number, xml_target_file)
			transkription_field = TranskriptionField(file_name)
			svg_tree = ET.parse(file_name)
			# NOTE(review): Page is not imported by this file -- the import has to be
			# added at the top of the file before this call can succeed.
			page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number)
			self.extract_faksimile_word_position(svg_tree, page, transkription_field=transkription_field)
			return page

		def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None):
			"""Extracts faksimile word positions for a faksimile page and writes them to a xml file.

			Args:
				file_name (str): svg file to parse.
				page_number: page number of the page, see extract_information.
				xml_target_file (str): target file; generated from file_name if empty.
				svg_file: passed on to extract_information.

			[:returns:] (int) 0 on success.

			Raises:
				FileNotFoundError: if file_name is not an existing file.
			"""
			if not isfile(file_name):
				raise FileNotFoundError('"{}" is not an existing file!'.format(file_name))
			#TODO
			# Resolve the target here as well: write_pretty needs the final file name.
			xml_target_file = self._get_target_file_name(file_name, page_number, xml_target_file)
			faksimile = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file)
			write_pretty(xml_element_tree=faksimile.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition')
			return 0

	def usage():
		"""Print the help text of this script, i.e. the docstring of main."""
		help_text = main.__doc__
		print(help_text)

	def main(argv):
		"""This program can be used to extract the position of the word hovers in a faksimile svg file and write them to a xml file.

		svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file, -o|--only-transkription-field, -p|--page=pageNumber, -t|--title=title, -x|--xml-target-file=xmlOutputFile ] <file>
		svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file, -o|--only-transkription-field, -t|--title=title] <file|dir> ...
		svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file -o|--only-transkription-field]

		-h|--help: show help
		-d|--xml-dir=xmlDir: target directory for the xml output file(s)
		-m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s)
		-o|--only-transkription-field: extract only words that are part of the transkription field.
		-p|--page=pageNumber: page number of the current page. For use with _one_ file only.
		-x|--xml-target-file=xmlOutputFile: target file
		-t|--title=title: title of the manuscript to which the current page(s) belong(s)

		:return: exit code (int)
		"""
		# NOTE: usage() prints this docstring, so the text above is user-facing.
		xml_dir = ".{}xml".format(sep)
		title = None
		page_number = None
		xml_target_file = None
		manuscript_file = None
		extract_transkription_field_only = False

		try:
			opts, args = getopt.getopt(argv, "hod:m:t:p:x:", ["help", "only-transkription-field", "xml-dir=", "manuscript-file=", "title=", "page=", "xml-target-file="])
		except getopt.GetoptError:
			usage()
			return 2

		for opt, arg in opts:
			# Options without any input file are answered with the help text.
			if opt in ('-h', '--help') or not args:
				usage()
				return 0
			elif opt in ('-o', '--only-transkription-field'):
				extract_transkription_field_only = True
			elif opt in ('-d', '--xml-dir'):
				xml_dir = arg
			elif opt in ('-m', '--manuscript-file'):
				manuscript_file = arg
			elif opt in ('-t', '--title'):
				title = arg
			elif opt in ('-p', '--page'):
				page_number = str(arg)
			elif opt in ('-x', '--xml-target-file'):
				xml_target_file = str(arg)

		files_to_process = []
		for arg in args:
			if isfile(arg):
				files_to_process.append(arg)
			elif isdir(arg):
				# listdir yields bare names: join them with the directory so that the
				# files are found regardless of the current working directory.
				files_to_process.extend(path.join(arg, entry) for entry in listdir(arg) if entry.endswith('.svg'))
			else:
				print("'{}' does not exist!".format(arg))
				return 2

		if len(files_to_process) < 1:
			usage()
			return 2
		if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file)):
			print("ERROR: too many input files: option --page and --xml-target-file presuppose one input file!")
			usage()
			return 2

		extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, extract_transkription_field_only=extract_transkription_field_only)
		for file in files_to_process:
			extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file)

		return 0

	# Script entry point: run main with the command line arguments and use its
	# return value as the exit code of the process.
	if __name__ == "__main__":
		exit_code = main(sys.argv[1:])
		sys.exit(exit_code)

extractFaksimilePosition.pyNo OneTemporaryActions

File Metadata

extractFaksimilePosition.pyView Options

Event Timeline

extractFaksimilePosition.py
No OneTemporary
Actions

extractFaksimilePosition.py
View Options