Page MenuHomec4science

extractFaksimilePosition.py
No OneTemporary

File Metadata

Created
Tue, Jun 4, 11:50

extractFaksimilePosition.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the word hovers in a faksimile svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import re
import getopt
import sys
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from datetime import datetime
from lxml import etree as ET
from svgpathtools import svg2paths2
from myxmlwriter import write_pretty
from datatypes.faksimile import Faksimile
from datatypes.faksimile_image import FaksimileImage
from datatypes.matrix import Matrix
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word
# Module metadata: authorship, contact, licensing and release information.
__author__ = 'Christian Steiner'
__maintainer__ = __author__  # the author also maintains this script
__copyright__ = "University of Basel"
__email__ = 'christian.steiner@unibas.ch'
__status__ = 'Development'
__license__ = 'GPL v3'
__version__ = '0.0.1'
class Extractor:
    """
    This class can be used to extract the word hover positions in a faksimile svg file and write it to a xml file.

    Args:
        [xml_dir (str): target directory; it is created if it does not exist yet]
        [title (str): title of document]
        [manuscript_file (str): stored on the instance; not used by any method of this class]
        [extract_transkription_field_only (bool): stored on the instance; not used by any method of this class]

    TODO change everything!!!

    NOTE(review): this class calls self.get_text_items, self.add_word,
    self.get_word_from_part_obj and self.get_word_object_multi_char_x, none of
    which is defined in the class as shown here -- presumably they still have
    to be added or inherited; confirm before use.
    """

    def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False):
        if xml_dir:
            self.xml_dir = xml_dir
            if not isdir(self.xml_dir):
                mkdir(self.xml_dir)
        else:
            # Fall back to a local 'xml' directory if there is one, else to the current directory.
            self.xml_dir = 'xml' if isdir('xml') else ''
        # A non-empty xml_dir always carries a trailing separator.
        if self.xml_dir:
            self.xml_dir = self.xml_dir + sep
        self.title = title
        self.manuscript_file = manuscript_file
        self.extract_transkription_field_only = extract_transkription_field_only

    def get_page_number(self, file_name, page_number=None):
        """ Returns page number as a string (with leading zero(s) if len(page_number) < 3).

        If no page_number is given and file_name contains digits, the last
        group of consecutive digits of file_name is used as page number.
        Returns an empty string if no page number can be determined.
        """
        if not page_number:
            digit_groups = re.findall(r'\d+', file_name)
            if digit_groups:
                page_number = digit_groups[-1]
        if not page_number:
            return ''
        # str() tolerates numeric page numbers; rjust pads to three characters.
        return str(page_number).rjust(3, '0')

    def get_file_name(self, file_name, page_number=None):
        """Returns the file_name of the target xml file.

        With a title the name is '<title>_page<page number>.xml' (blanks in the
        title replaced by underscores), otherwise the base name of file_name
        with '.svg' replaced by '.xml'. The name is prefixed with xml_dir.
        """
        dir_name = path.dirname(self.xml_dir) + sep if self.xml_dir else ''
        if self.title:
            page_part = self.get_page_number(file_name, page_number=page_number)
            return dir_name + self.title.replace(' ', '_') + '_page' + page_part + '.xml'
        return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml'))

    def _resolve_xml_target_file(self, file_name, page_number=None, xml_target_file=None):
        """Returns the path of the xml target file, deriving it from file_name if none is given.
        """
        if not xml_target_file:
            xml_target_file = self.get_file_name(file_name, page_number)
        # A bare file name is placed into xml_dir.
        if self.xml_dir and not path.dirname(xml_target_file):
            xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
        return xml_target_file

    def extract_faksimile_word_position(self, svg_tree, page, transkription_field=None):
        """Extracts faksimile word hover positions.

        Walks over the text items of svg_tree, collects their text parts and
        hands each completed group of parts to self.add_word. A group is closed
        on a detected line break, on a whitespace-only text/tspan, on a tspan
        with a letterspacing class and at the end of the loop.
        """
        counter = 0
        word_part_obj = []
        endSign = '%'
        last_matrix = None
        MAXBOTTOMDIFF = 5
        MAXXDIFF = 6
        svg_root = svg_tree.getroot()
        nsmap = svg_root.nsmap
        for text_item in self.get_text_items(svg_root, transkription_field=transkription_field):
            current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
            # check for line breaks
            is_line_break = (
                (last_matrix is not None and len(word_part_obj) > 0 and (
                    Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)
                    or abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF
                    or abs(current_matrix.getX() - word_part_obj[-1]['x']) > MAXXDIFF))
                or (len(word_part_obj) > 0
                    and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()))
            if is_line_break:
                endSign = '%'
                if self.get_word_from_part_obj(word_part_obj) != '':
                    debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(
                        abs(current_matrix.getX() - word_part_obj[-1]['x']),
                        abs(current_matrix.getY() - last_matrix.getY()),
                        str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
                    counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                            matrix=current_matrix.clone_transformation_matrix(), debug_msg=debug_msg)
                word_part_obj = []
            endX = current_matrix.getX()
            tspan_items = text_item.findall(".//tspan", nsmap)
            if len(tspan_items) < 1:  # case: <svg><text>TEXT
                if bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text)):
                    word_part_obj.append({"text": text_item.text, "x": current_matrix.getX(),
                                          "y": current_matrix.getY(), "class": text_item.get('class')})
                else:
                    # Empty or whitespace-only text closes the current group.
                    endSign = text_item.text
                    if self.get_word_from_part_obj(word_part_obj) != '':
                        counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                                matrix=current_matrix.clone_transformation_matrix(),
                                                debug_msg=r'svg/text/\s')
                    word_part_obj = []
                    endSign = '%'
            for tspan_item in tspan_items:  # case: <svg><text><tspan>TEXT
                endX = current_matrix.add2X(tspan_item.get('x'))
                if bool(tspan_item.text) and not bool(re.search(r'^\s*$', tspan_item.text)):
                    y = current_matrix.add2Y(tspan_item.get('y'))
                    class_attr = tspan_item.get('class')
                    word_part_obj.append({"text": tspan_item.text, "x": endX, "y": y, "class": class_attr})
                    # A tspan without class attribute has no letterspacing class.
                    tspan_classes = class_attr.split(' ') if class_attr else []
                    if len(set(page.letterspacing_list) & set(tspan_classes)) > 0:  # tspan has letterspacing class
                        endSign = '%'
                        if self.get_word_from_part_obj(word_part_obj) != '':
                            counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                                    matrix=current_matrix.clone_transformation_matrix(),
                                                    debug_msg='tspan with letterspacing')
                        word_part_obj = []
                else:
                    # Empty or whitespace-only tspan closes the current group.
                    endSign = tspan_item.text
                    if self.get_word_from_part_obj(word_part_obj) != '':
                        counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                                matrix=current_matrix.clone_transformation_matrix(),
                                                debug_msg=r'svg/text/tspan/\s')
                    word_part_obj = []
                    endSign = '%'
            last_matrix = current_matrix
        # Flush the parts that are still pending after the last text item.
        if self.get_word_from_part_obj(word_part_obj) != '':
            counter = self.add_word(page, counter, word_part_obj, endSign, endX,
                                    matrix=current_matrix.clone_transformation_matrix(), debug_msg='end of loop')

    def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None):
        """Extracts information about positions of text elements.

        The svg_file argument is accepted but not used here.

        [:returns:] (svgscripts.Page) the page containing all information.
        [:raises:] FileNotFoundError if file_name is not an existing file.
        """
        if not isfile(file_name):
            raise FileNotFoundError('"{}" is not an existing file!'.format(file_name))
        xml_target_file = self._resolve_xml_target_file(file_name, page_number, xml_target_file)
        transkription_field = TranskriptionField(file_name)
        svg_tree = ET.parse(file_name)
        # NOTE(review): Page is not imported in this file -- presumably
        # datatypes.page.Page; confirm and add the import.
        page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number)
        self.extract_faksimile_word_position(svg_tree, page, transkription_field=transkription_field)
        return page

    def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None):
        """Extracts faksimile word positions for each faksimile page and writes them to xml files.

        [:returns:] 0 on success.
        [:raises:] FileNotFoundError if file_name is not an existing file.
        """
        if not isfile(file_name):
            raise FileNotFoundError('"{}" is not an existing file!'.format(file_name))
        #TODO
        # Resolve the target here as well, so that write_pretty gets the same file name as the page.
        xml_target_file = self._resolve_xml_target_file(file_name, page_number, xml_target_file)
        faksimile = self.extract_information(file_name, page_number=page_number,
                                             xml_target_file=xml_target_file, svg_file=svg_file)
        write_pretty(xml_element_tree=faksimile.page_tree, file_name=xml_target_file,
                     script_name=__file__, file_type='svgWordPosition')
        return 0
def usage():
    """Print the command line help, i.e. the docstring of main."""
    help_text = main.__doc__
    print(help_text)
def _collect_svg_files(directory):
    """Return the paths (directory included) of all entries of directory whose name contains '.svg'."""
    # listdir yields bare names, so the directory has to be joined back on.
    return [path.join(directory, name) for name in sorted(listdir(directory)) if '.svg' in name]


def main(argv):
    """This program can be used to extract the position of the words in a svg file and write them to a xml file.

    svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file=manuscriptFile, -o|--only-transkription-field, -p|--page=pageNumber, -t|--title=title, -x|--xml-target-file=xmlOutputFile ] <file>
    svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file=manuscriptFile, -o|--only-transkription-field, -t|--title=title] <file|dir> ...
    svgscripts/extractFaksimilePosition.py [-h|--help, -d|--xml-dir=xmlDir, -m|--manuscript-file=manuscriptFile -o|--only-transkription-field]

        -h|--help: show help
        -d|--xml-dir=xmlDir: target directory for the xml output file(s)
        -m|--manuscript-file=manuscriptFile: xml file containing information about the archival order to which the current page(s) belong(s)
        -o|--only-transkription-field: extract only words that are part of the transkription field.
        -p|--page=pageNumber: page number of the current page. For use with _one_ file only.
        -x|--xml-target-file=xmlOutputFile: target file
        -t|--title=title: title of the manuscript to which the current page(s) belong(s)

    :return: exit code (int)
    """
    xml_dir = ".{}xml".format(sep)
    title = None
    page_number = None
    xml_target_file = None
    manuscript_file = None
    extract_transkription_field_only = False

    try:
        opts, args = getopt.getopt(argv, "hod:m:t:p:x:", ["help", "only-transkription-field", "xml-dir=", "manuscript-file=", "title=", "page=", "xml-target-file="])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, arg in opts:
        # Options without any positional argument are answered with the help text, too.
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-o', '--only-transkription-field'):
            extract_transkription_field_only = True
        elif opt in ('-d', '--xml-dir'):
            xml_dir = arg
        elif opt in ('-m', '--manuscript-file'):
            manuscript_file = arg
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-x', '--xml-target-file'):
            xml_target_file = str(arg)

    # Positional arguments are files, or directories that are searched for svg files.
    files_to_process = []
    for input_path in args:
        if isfile(input_path):
            files_to_process.append(input_path)
        elif isdir(input_path):
            files_to_process.extend(_collect_svg_files(input_path))
        else:
            print("'{}' does not exist!".format(input_path))
            return 2
    if len(files_to_process) < 1:
        usage()
        return 2
    if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file)):
        print("ERROR: too many input files: option --page and --xml-target-file presuppose one input file!")
        usage()
        return 2

    extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, extract_transkription_field_only=extract_transkription_field_only)
    for svg_file in files_to_process:
        extractor.extractAndWriteInformation(svg_file, page_number=page_number, xml_target_file=xml_target_file)
    return 0
# Script entry point: hand the command line arguments (without the program
# name) to main and use its return value as the process exit code.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline