extractWordPosition.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, Apr 30, 13:28

extractWordPosition.py
View Options

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	"""This program can be used to extract the positions of the words in an svg file and write them to an xml file.
	"""
	# Copyright (C) University of Basel 2019 {{{1
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}

	import inspect
	import getopt
	from lxml import etree as ET
	from os import sep, listdir, mkdir, path
	from os.path import exists, isfile, isdir
	from progress.bar import Bar
	import re
	import sys
	import warnings

	from datatypes.lineNumber import LineNumber
	from datatypes.matrix import Matrix
	from datatypes.page import Page
	from datatypes.pdf import PDFText
	from datatypes.transkriptionField import TranskriptionField
	from datatypes.transkription_position import TranskriptionPosition
	from datatypes.word import Word
	from datatypes.word_insertion_mark import WordInsertionMark

	sys.path.append('shared_util')
	from myxmlwriter import write_pretty

	__author__ = "Christian Steiner"
	__maintainer__ = __author__
	__copyright__ = 'University of Basel'
	__email__ = "christian.steiner@unibas.ch"
	__status__ = "Development"
	__license__ = "GPL v3"
	__version__ = "0.0.1"

	class Extractor:
	"""
	This class can be used to extract the word positions in a svg file and write it to a xml file.

	Args:
	[xml_dir (str): target directory; it is created if it does not exist yet]
	[title (str): title of document]
	[manuscript_file (str): xml file containing information about the archival unity to which the current page belongs]
	[extract_transkription_field_only (Boolean): if true extract_word_position will extract word positions only that
	are part of the transkription field.]
	[compare2pdf (Boolean): stored on the instance; the pdf comparison that refers to it is currently commented out]
	"""
	# If True, extract_word_position neither creates nor advances a progress bar.
	UNITTESTING = False
	# Characters at which add_word splits a word and records a word insertion mark,
	# provided the class of the word part contains one of page.sonderzeichen_list.
	SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]

	def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False, compare2pdf=False):
	if bool(xml_dir):
	self.xml_dir = xml_dir
	not isdir(self.xml_dir) and mkdir(self.xml_dir)
	else:
	self.xml_dir = 'xml' if(isdir('xml')) else ''
	self.latest_status = None
	self.compare2pdf = compare2pdf
	self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
	self.title = title
	self.manuscript_file = manuscript_file
	self.extract_transkription_field_only = extract_transkription_field_only
	self.manuscript_tree = None
	if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
	self.manuscript_tree = ET.parse(self.manuscript_file)
	self.title = self.manuscript_tree.getroot().get('title')
	elif bool(self.manuscript_file):
	raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
	elif bool(self.title):
	self.update_title_and_manuscript(self.title, False)

	def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None):
	"""Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).
	If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.
	The split is done by calling add_word again on each slice of word_part_objs between two break points.

	:param page: the page; its sonderzeichen_list, word_insertion_marks, words, svg_file and get_line_number are used
	:param index: id of the next new word
	:param word_part_objs: list of dictionaries; the keys 'text', 'x', 'y' and 'class' are read here
	:param endSign: only passed on to the call that handles the rest after the last break point
	:param endX: only passed on to the call that handles the rest after the last break point
	:param matrix: passed on to TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST
	:param debug_msg: passed on as debug_msg_string
	:param transkription_field: its xmin/ymin are used for word insertion marks
	:returns: the new word counter (int)
	"""
	# NOTE(review): the block indentation of this method is not visible in this copy of the file;
	# confirm the nesting (especially of the word insertion mark section) against version control.
	# break_points holds pairs (end_point, next_from_index): one word ends before end_point,
	# the next word starts at next_from_index.
	break_points = []
	if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
	for Sonderzeichen in self.SONDERZEICHEN_LIST:
	# one boolean per word part: text equals the Sonderzeichen and its class contains a Sonderzeichen class of the page
	contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
	if True in contains_Sonderzeichen:
	# (i, i + 1): the part at i is the Sonderzeichen itself and belongs to neither neighbouring word
	break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]]
	# NOTE(review): this iterates over all break points collected so far, not only those of the current Sonderzeichen -- confirm intent.
	for sz_point in [i for i, e in break_points]:
	wim_index = len(page.word_insertion_marks)
	x = float(word_part_objs[sz_point]['x'])
	y = float(word_part_objs[sz_point]['y'])
	if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None:
	svg_path_tree = ET.parse(page.svg_file)
	# the default namespace has the key None in nsmap; it is renamed to 'ns'
	namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
	xmin = transkription_field.xmin
	ymin = transkription_field.ymin
	wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
	line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
	page.word_insertion_marks.append(wim)
	if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points
	THRESHOLDX = 20 # Threshold between line number and text
	last_x = -1
	for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
	if(last_x > -1 and (x - last_x > THRESHOLDX)):
	# (i, i): the part at i starts the next word, nothing is left out
	break_points.append((i, i))
	last_x = x
	if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
	from_index = 0
	for end_point, next_from_index in break_points:
	new_word_part_objs = word_part_objs[from_index:end_point]
	new_endX = word_part_objs[end_point]['x']
	from_index = next_from_index
	index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
	# the parts after the last break point form the last word
	if from_index > 0 and from_index < len(word_part_objs):
	new_word_part_objs = word_part_objs[from_index:]
	index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
	return index
	else:
	if len(word_part_objs) > 0:
	transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
	debug_msg_string=debug_msg, transkription_field=transkription_field)
	text = self.get_word_from_part_obj(word_part_objs)
	# look up the line number at the vertical middle of the first position
	line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
	if line_number == -1:
	# no line number found: derive it from the last word of the page
	if len(page.words) > 0:
	lastWord = page.words[len(page.words)-1]
	lastWord_lastTP = lastWord.transkription_positions[len(lastWord.transkription_positions)-1]
	lastTP = transkription_positions[len(transkription_positions)-1]
	# same line if this word starts right of the last word and the bottoms differ by less than half a height
	if transkription_positions[0].left > lastWord_lastTP.left\
	and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2:
	line_number = lastWord.line_number
	else:
	line_number = lastWord.line_number+1
	newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
	page.words.append(newWord)
	return int(index) + 1
	else:
	# nothing to add: the counter stays as it is
	return int(index)

	def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default'):
		"""Extract the positions of the text elements of file_name and write them to a xml file.

		The status message ('OK' or 'with warnings', followed by the lower-cased types of known warnings)
		is set as attribute 'status' on the root of the page tree. self.latest_status holds the
		message if warnings have been recorded, otherwise None.

		:raises FileNotFoundError: if file_name is not an existing file
		:returns: exit status (int): 1 if warnings have been recorded, otherwise 0
		"""
		if not isfile(file_name):
			raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
		if not bool(xml_target_file):
			xml_target_file = self.get_file_name(file_name, page_number)
		if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
			xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
		exit_status = 0
		with warnings.catch_warnings(record=record_warnings) as recorded:
			warnings.simplefilter(warning_filter)
			page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile)
			# recorded is None unless record_warnings is true
			if recorded is not None and len(recorded) > 0:
				messages = [ str(warn.message) for warn in recorded ]
				status_message = 'with warnings'
				for warning_type in (Page.WARNING_MISSING_USE_NODE4PWP, Page.WARNING_MISSING_GLYPH_ID4WIM):
					if any(message.startswith(warning_type) for message in messages):
						status_message += ':{}:'.format(warning_type.lower())
				self.latest_status = status_message
				exit_status = 1
			else:
				status_message = 'OK'
				self.latest_status = None
			page.page_tree.getroot().set('status', status_message)
			write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition')
		return exit_status

	def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None):
		"""Extract the positions of the text elements of the svg file file_name.

		:raises FileNotFoundError: if file_name is not an existing file
		:returns: (datatypes.page) the Page containing all information
		"""
		if not isfile(file_name):
			raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
		if not bool(xml_target_file):
			xml_target_file = self.get_file_name(file_name, page_number)
		if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
			xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
		transkription_field = None
		if bool(self.extract_transkription_field_only):
			transkription_field = TranskriptionField(file_name)
		svg_tree = ET.parse(file_name)
		page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\
				svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only)
		page.add_source(file_name)
		sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot())
		page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
		# line numbers are extracted only if there is a transkription field
		if transkription_field is not None:
			line_numbers = self.extract_line_numbers(svg_tree, transkription_field)
			page.init_line_numbers(line_numbers, transkription_field.ymax)
		self.extract_word_position(svg_tree, page, transkription_field=transkription_field)
		# currently disabled: comparison of the words with the pdf text
		#if page.pdfFile is not None and isfile(page.pdfFile):
		# pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST)
		# pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field, split_wrongly_concatenated_words=self.compare2pdf)
		page.create_writing_processes_and_attach2tree()
		# currently disabled:
		#page.categorize_paths(transkription_field=transkription_field)
		page.update_and_attach_words2tree()
		for word_insertion_mark in page.word_insertion_marks:
			# it is not clear if we really need to know this alternative word ordering. See 'TODO.md'
			#word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark)
			word_insertion_mark.attach_object_to_tree(page.page_tree)
		return page

	def extract_line_numbers(self, svg_tree, transkription_field):
		"""Create a LineNumber for each text node near the transkription field that is a line number.

		For each line number the top is set from the last bottom value (see get_bottoms) that lies
		between the previous line number and MINABOVE above the bottom of the current one.

		:returns: list of LineNumber
		"""
		nodes_near_tf = [ text_node for text_node in svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap)\
				if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(text_node.get('transform'), transkription_field) ]
		line_numbers = [ LineNumber(raw_text_node=text_node, transkription_field=transkription_field)\
				for text_node in nodes_near_tf if LineNumber.IS_A_LINE_NUMBER(text_node) ]
		if len(line_numbers) == 0:
			return line_numbers
		MINABOVE = 3
		last_to_position = transkription_field.ymin
		for line_number in line_numbers:
			# line_number.bottom is relative to the field, get_bottoms works on svg coordinates
			above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE
			bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom)
			last_to_position = above_current_line_bottom
			if len(bottoms) > 0:
				line_number.setTop(float(bottoms[-1]) - transkription_field.ymin + MINABOVE)
		return line_numbers

	def extract_word_position(self, svg_tree, page, transkription_field=None):
	"""Extracts word positions.

	Walks over the text items returned by get_text_items, collects the parts of a word
	(dictionaries with the keys 'text', 'x', 'y', 'class' and 'matrix') and hands them to
	add_word whenever a word ends: at a line break, at a text or tspan that is empty or
	contains only whitespace, at a tspan with a letterspacing class, and at the end of the loop.

	:param svg_tree: the parsed svg file
	:param page: the page that add_word appends the words to
	:param transkription_field: if given, only text items inside of it are processed
	"""
	# NOTE(review): the block indentation of this method is not visible in this copy of the file;
	# confirm the nesting (especially of the two 'else' branches below) against version control.
	counter = 0 # id of the next word, updated by add_word
	word_part_obj = [] # parts of the word that is currently collected
	endSign = '%'
	last_matrix = None
	MAXBOTTOMDIFF = 5 # maximal difference of y inside a line
	MAXXDIFF = 6 # maximal difference of x to the last word part inside a line
	if not Extractor.UNITTESTING:
	# get_text_items is called a second time here just to count the items for the progress bar
	bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
	for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
	current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
	# check for line breaks
	# i.e. the conversion factors differ, y or x differ more than the thresholds,
	# or the estimated x of the first collected part is right of the current x
	if (last_matrix is not None and len(word_part_obj) > 0 and (\
	Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
	(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
	(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
	or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
	endSign = '%'
	if(self.get_word_from_part_obj(word_part_obj) != ''):
	debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
	round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
	str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
	# endX still holds the value of an earlier iteration here
	counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
	word_part_obj = []
	endX = current_matrix.getX()
	if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: <svg><text>TEXT
	if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))):
	word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} )
	else:
	# text is missing or whitespace only: the collected word ends here
	endSign = text_item.text
	if(self.get_word_from_part_obj(word_part_obj) != ''):
	counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field)
	word_part_obj = []
	endSign = '%'
	for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: <svg><text><tspan>TEXT
	endX = current_matrix.add2X(tspan_item.get('x'))
	if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))):
	y = current_matrix.add2Y(tspan_item.get('y'))
	word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix })
	# NOTE(review): presumably every such tspan has a 'class' attribute; otherwise .split would fail on None -- confirm.
	if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0:
	"""text_item has letterspacing class
	(set s & set t = new set with elements common to s and t)
	"""
	endSign = '%'
	if(self.get_word_from_part_obj(word_part_obj) != ''):
	counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
	debug_msg='tspan with letterspacing', transkription_field=transkription_field)
	word_part_obj = []
	else:
	# text is missing or whitespace only: the collected word ends here
	endSign = tspan_item.text
	if(self.get_word_from_part_obj(word_part_obj) != ''):
	counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
	debug_msg='svg/text/tspan/\s', transkription_field=transkription_field)
	word_part_obj = []
	endSign = '%'
	last_matrix = current_matrix
	not bool(Extractor.UNITTESTING) and bar.next()
	# add the word that is still being collected when the loop ends
	if(self.get_word_from_part_obj(word_part_obj) != ''):
	counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\
	transkription_field=transkription_field)
	word_part_obj = []
	endSign = '%'
	not bool(Extractor.UNITTESTING) and bar.finish()

	def find_inserted_words_by_position(self, target_tree, x, y):
		"""Return a list with the words that are inserted above the x, y position or [] if not found.

		The search window above the position is enlarged step by step until a word is found or
		a limit is reached; then words on the same bottom to the right of the result are added
		as long as the gap to the last word is small enough. Always warns that the function does not work.
		"""
		warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.')
		MINY = 31.0
		MAXY = 10.0
		DIFFX = 9.0
		if len(target_tree.getroot().xpath('//word[@id]')) == 0:
			return []
		found_words = []
		left_offset = 20.0
		top_offset = 19.0
		while len(found_words) == 0 and top_offset < MINY and left_offset > DIFFX:
			window_query = '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - top_offset, y - MAXY, x - left_offset, x + DIFFX)
			found_words = [ Word.CREATE_WORD(node) for node in target_tree.getroot().xpath(window_query) ]
			left_offset -= 1
			top_offset += 1
		if len(found_words) > 0:
			result_bottom = found_words[-1].bottom
			result_left_min = found_words[-1].left + found_words[-1].width
			for node in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)):
				# the right end of the last collected word moves on with every appended word
				last_word = found_words[-1]
				result_left_max = last_word.left + last_word.width + DIFFX
				if float(node.get('left')) - result_left_max < DIFFX:
					found_words.append(Word.CREATE_WORD(node))
				else:
					break
		return found_words

	def find_inserted_words(self, target_tree, word_insertion_mark):
	"""Returns an Array with the words that are inserted above/underneath the word_insertion_mark.

	Always warns that the function does not work.

	:param target_tree: tree that is queried for 'word' nodes
	:param word_insertion_mark: its line_number, x, y and mark_type are used
	:returns: list of words, [] if the tree contains no word with an id
	"""
	# NOTE(review): the block indentation of this method is not visible in this copy of the file;
	# confirm the nesting of the loops and of the 'break' statements against version control.
	warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.')
	# line numbers below 2 and odd line numbers are handled by the search by position
	if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1:
	return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y)
	if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
	MINY = 31.0
	MAXY = 10.0
	DIFFX = 9.0
	result_list = []
	x = word_insertion_mark.x
	y = word_insertion_mark.y
	if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line
	line_number = word_insertion_mark.line_number - 1
	words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
	'//word[@line-number={0}]'.format(line_number)) ]
	if len(words_on_line) > 0:
	# lower the threshold for 'top' step by step until a word within DIFFX of x is found
	minus2top = 1.0
	while len(result_list) == 0 and minus2top < MINY:
	for word in words_on_line:
	for transkription_position in word.transkription_positions:
	if transkription_position.top > y - minus2top\
	and transkription_position.left > x - DIFFX\
	and transkription_position.left < x + DIFFX:
	result_list.append(word)
	break
	minus2top += 1
	elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line
	line_number = word_insertion_mark.line_number + 1
	words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
	'//word[@line-number={0}]'.format(line_number)) ]
	if len(words_on_line) > 0:
	# here the threshold for 'top' is raised with every step
	plus2top = 1.0
	while len(result_list) == 0 and plus2top < MINY :
	for word in words_on_line:
	for transkription_position in word.transkription_positions:
	if transkription_position.top > y + plus2top\
	and transkription_position.left > x - DIFFX\
	and transkription_position.left < x + DIFFX:
	result_list.append(word)
	break
	plus2top += 1
	if len(result_list) > 0: # now, collect more words that are right of already collected words
	result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom
	result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
	+ result_list[len(result_list)-1].transkription_positions[0].width
	# candidates: words of the same line with a bottom within 5 of the result and left of them right of the result
	for item in target_tree.getroot().xpath(\
	'//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)):
	result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
	+ result_list[len(result_list)-1].transkription_positions[0].width
	result_left_max = result_left_min + DIFFX
	if float(item.get('left')) - result_left_max < DIFFX:
	result_list.append(Word.CREATE_WORD(item))
	else:
	break
	return result_list
	else:
	return []

	def get_bottoms(self, tree_root, from_position=-1.0, to_position=-1.0, transkription_field=None):
		"""Return the unique bottom values of all text nodes as a list of strings, sorted by their float value.

		The bottom value of a text node is the sixth space separated item of its attribute
		'transform' without ')'. If transkription_field is given, its ymin and ymax replace
		from_position and to_position. Only if both limits are greater than 0.0 the result is
		restricted to the values that lie strictly between them.
		"""
		unique_bottoms = set()
		for text_node in tree_root.findall(".//text", tree_root.nsmap):
			unique_bottoms.add(text_node.get('transform').split(' ')[5].replace(')',''))
		bottom_list = sorted(unique_bottoms, key=float)
		if transkription_field is not None:
			from_position = transkription_field.ymin
			to_position = transkription_field.ymax
		if not (from_position > 0.0 and to_position > 0.0):
			return bottom_list
		return [ bottom for bottom in bottom_list if float(bottom) > from_position and float(bottom) < to_position ]

	def get_file_name(self, file_name, page_number=None):
	"""Returns the file_name of the target xml file.
	"""
	dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else ''
	if bool(self.title):
	return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml'
	else:
	return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml'))

	def get_page_number(self, file_name, page_number=None):
		"""Return the page number as a string, padded with leading zeros to at least three characters.

		:param file_name: if page_number is empty, the last run of digits in file_name is used as page number
		:param page_number: the page number as str or int
		:returns: the padded page number, or '' if there is neither a page_number nor a digit in file_name
		"""
		if not bool(page_number) and bool(re.search(r'\d', file_name)):
			# file_name contains at least one digit, so findall returns at least one item
			page_number = re.findall(r'\d+', file_name)[-1]
		if not bool(page_number):
			return ''
		# convert first: an int page number has no len()
		return str(page_number).rjust(3, '0')

	def get_style(self, etree_root):
		"""Return the style specification found in the 'style' child of etree_root.

		The text of the style node is split at newline + tab into rules of the form
		'.class{property:value;...}'. Chunks without '{' and declarations without ':' are skipped.

		:returns:
		sonderzeichen_list: list of keys for classes that are 'Sonderzeichen', i.e. their font-family contains 'Sonderzeichen'
		letterspacing_list: list of keys for classes that specify a letter-spacing
		style_dict: dictionary: key = class name (str), value = style specification (dictionary)
		"""
		style_dict = {}
		sonderzeichen_list = []
		letterspacing_list = []
		style = etree_root.find('style', etree_root.nsmap)
		# an empty style node has no text
		if style is None or not style.text:
			return sonderzeichen_list, letterspacing_list, style_dict
		for style_item in style.text.split("\n\t"):
			selector, brace, declarations = style_item.partition('{')
			if not brace:
				continue
			style_key = selector.replace('.', '')
			style_value_dict = {}
			for declaration in declarations.replace('}', '').replace('\n', '').split(';'):
				# split at the first ':' only, so that values containing ':' are kept complete
				property_name, colon, value = declaration.partition(':')
				if colon:
					style_value_dict[property_name] = value.replace('\'', '')
			style_dict[style_key] = style_value_dict
			font_family = style_value_dict.get('font-family')
			if bool(font_family) and 'Sonderzeichen' in font_family:
				sonderzeichen_list.append(style_key)
			if bool(style_value_dict.get('letter-spacing')):
				letterspacing_list.append(style_key)
		return sonderzeichen_list, letterspacing_list, style_dict

	def get_text_items(self, tree_root, transkription_field=None):
		"""Return an iterator over the text elements of tree_root.

		If transkription_field is specified, only those text elements are returned for which
		Matrix.IS_PART_OF_TRANSKRIPTION_FIELD is true.
		"""
		if transkription_field is None:
			return tree_root.iterfind(".//text", tree_root.nsmap)

		def is_part_of_field(text_node):
			return Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=text_node)

		return filter(is_part_of_field, tree_root.iterfind(".//text", tree_root.nsmap))

	def get_word_from_part_obj(self, word_part_obj):
		"""Concatenate the 'text' of all dictionaries in word_part_obj to one string.
		"""
		text_parts = []
		for part in word_part_obj:
			text_parts.append(part['text'])
		return ''.join(text_parts)

	def get_word_object_multi_char_x(self, word_part_obj_dict):
		"""Return an estimate for the x of the last char of word_part_obj_dict.

		For texts with less than two chars this is 'x' itself, otherwise WIDTHFACTOR is added for each char.

		TODO: get real widths from svg_file!!!
		"""
		WIDTHFACTOR = 2.6
		number_of_chars = len(word_part_obj_dict['text'])
		if number_of_chars < 2:
			return word_part_obj_dict['x']
		return word_part_obj_dict['x'] + number_of_chars * WIDTHFACTOR

	def update_title_and_manuscript(self, title, update_manuscript=True):
	"""Updates title and manuscript.

	:param title: the new title
	:param update_manuscript: if true, or if there is no manuscript file yet, the manuscript file name
	is derived from self.xml_dir and the title (blanks replaced by underscores)
	"""
	# NOTE(review): the block indentation is not visible in this copy of the file;
	# confirm against version control whether the second 'if' is nested in the first one.
	self.title = title
	if update_manuscript or not bool(self.manuscript_file):
	self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml'
	# a manuscript file that does not exist yet is created with the title as its only content
	if not isfile(self.manuscript_file):
	self.manuscript_tree = ET.ElementTree(ET.Element('manuscript', attrib={"title": self.title}))
	write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile')

	def usage():
		"""Print the help text of the script, i.e. the docstring of main.
		"""
		help_text = main.__doc__
		print(help_text)

	def main(argv):
		"""This program can be used to extract the position of the words in a svg file and write them to a xml file.

		svgscripts/extractWordPosition.py [OPTIONS] <file|dir>

		<file> svg file OR xml target file containing file name of svg file as "/page/@source".
		<dir> directory containing svg files

		OPTIONS:
		-h|--help: show help
		-c|--compare-to-pdf compare words to pdf and autocorrect
		-d|--xml-dir=xmlDir: target directory for the xml output file(s)
		-m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s)
		-o|--only-transkription-field: extract only words that are part of the transkription field.
		-p|--page=pageNumber: page number of the current page. For use with _one_ file only.
		-P|--PDF=pdfFile: pdf file - used for word correction
		-s|--svg=svgFile: svg web file
		-t|--title=title: title of the manuscript to which the current page(s) belong(s)
		-x|--xml-target-file=xmlOutputFile: xml target file

		:return: exit code (int)
		"""
		# NOTE(review): both flags are already True by default, so the options -c and -o have no effect -- confirm intent.
		compare2pdf = True
		extract_transkription_field_only = True
		manuscript_file = None
		page_number = None
		pdfFile = None
		svg_file = None
		title = None
		xml_target_file = None
		xml_dir = ".{}xml".format(sep)

		try:
			opts, args = getopt.getopt(argv, "hocd:m:t:p:s:x:P:", ["help", "only-transkription-field", "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
		except getopt.GetoptError:
			usage()
			return 2

		for opt, arg in opts:
			if opt in ('-h', '--help') or not args:
				usage()
				return 0
			elif opt in ('-c', '--compare-to-pdf'):
				compare2pdf = True
			elif opt in ('-o', '--only-transkription-field'):
				extract_transkription_field_only = True
			elif opt in ('-d', '--xml-dir'):
				xml_dir = arg
			elif opt in ('-m', '--manuscript-file'):
				manuscript_file = arg
			elif opt in ('-t', '--title'):
				title = arg
			elif opt in ('-p', '--page'):
				page_number = str(arg)
			elif opt in ('-s', '--svg'):
				svg_file = arg
			elif opt in ('-P', '--PDF'):
				pdfFile = arg
			elif opt in ('-x', '--xml-target-file'):
				xml_target_file = str(arg)
		files_to_process = list()
		for arg in args:
			if isfile(arg):
				files_to_process.append(arg)
			elif isdir(arg):
				# listdir returns bare file names: join them with the directory so that they
				# can be opened from any working directory, and accept real '.svg' files only
				files_to_process += [ path.join(arg, name) for name in sorted(listdir(arg)) if name.endswith('.svg') ]
			else:
				print("'{}' does not exist!".format(arg))
				return 2

		# case: no svg file given, or the first argument is a xml target file -> get the svg file from the xml file
		if len(files_to_process) < 1 or args[0].endswith('xml'):
			if xml_target_file is None:
				xml_target_file = args[0] if len(args) > 0 else None
			if xml_target_file is not None and isfile(xml_target_file):
				target_file_tree = ET.parse(xml_target_file)
				target_root = target_file_tree.getroot()
				file_name = target_root.get('source')
				if file_name is None:
					# without "/page/@source" there is no svg file to process
					print("'{}' does not specify a source file!".format(xml_target_file))
					return 2
				title = target_root.get('title') if title is None else title
				page_number = target_root.get('number') if page_number is None else page_number
				# a missing attribute 'transkription-field-only' counts as false
				extract_transkription_field_only = (target_root.get('transkription-field-only') == 'true')
				if svg_file is None:
					if len(target_file_tree.xpath('//svg-image')) > 0:
						svg_file_names = target_file_tree.xpath('.//svg-image/@file-name')
					else:
						svg_file_names = target_file_tree.xpath('.//svg/@file')
					svg_file = svg_file_names[0] if len(svg_file_names) > 0 else None
				files_to_process.insert(0, file_name)
				if xml_target_file in files_to_process:
					files_to_process.remove(xml_target_file)
			else:
				usage()
				return 2
		if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)):
			print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!")
			usage()
			return 2

		extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, extract_transkription_field_only=extract_transkription_field_only, compare2pdf=compare2pdf)
		for file in files_to_process:
			extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file)
		return 0

	if __name__ == "__main__":
		# hand the command line arguments (without the script name) to main and exit with its return code
		exit_code = main(sys.argv[1:])
		sys.exit(exit_code)

extractWordPosition.pyNo OneTemporaryActions

File Metadata

extractWordPosition.pyView Options

Event Timeline

extractWordPosition.py
No OneTemporary
Actions

extractWordPosition.py
View Options