Page MenuHomec4science

convert_wordPositions.py
No OneTemporary

File Metadata

Created
Tue, Jun 4, 17:22

convert_wordPositions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import getopt
from lxml.html import builder as E
from lxml.html import open_in_browser
import lxml
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
import re
import sys
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
from datatypes.writing_process import WritingProcess
from datatypes.word import Word
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Converter:
"""The converter super class.
"""
def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
self.page = page
self.non_testing = non_testing
self.show_word_insertion_mark = show_word_insertion_mark
def _get_transkription_positions(self, transkription_positions, stage_version=''):
"""Returns the transkription_positions of the indicated stage_version.
"""
convertable_transkription_positions = transkription_positions
if stage_version != '':
convertable_transkription_positions = []
if re.match(r'^\d$', stage_version):
writing_process_id = int(stage_version)
for transkription_position in transkription_positions:
if transkription_position.writing_process_id == writing_process_id:
convertable_transkription_positions.append(transkription_position)
elif re.match(r'^\d\+$', stage_version):
version_range = [ *range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)) ]
for transkription_position in transkription_positions:
if transkription_position.writing_process_id in version_range:
convertable_transkription_positions.append(transkription_position)
elif re.match(r'^\d\-\d$', stage_version):
start_stop = [ int(i) for i in re.split(r'-', stage_version) ]
version_range = [ *range(start_stop[0], start_stop[1]+1) ]
for transkription_position in transkription_positions:
if transkription_position.writing_process_id in version_range:
convertable_transkription_positions.append(transkription_position)
return convertable_transkription_positions
def convert(self, output_file=None, stage_version=''):
"""Prints all words.
"""
first_word_of_line = None
out = sys.stdout
if output_file is not None:
out = open(output_file, 'w')
for word in self.page.words:
if first_word_of_line is None or first_word_of_line.line_number != word.line_number:
out.write('\n')
first_word_of_line = word
if word.line_number % 2 == 0:
out.write(str(word.line_number).zfill(2) + ' ')
else:
out.write(' ')
if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0:
if word.text is not None:
out.write(word.text + ' ')
out.close()
@classmethod
def CREATE_CONVERTER(cls, page, non_testing=True,converter_type='', show_word_insertion_mark=False):
"""Returns a converter of type converter_type.
[:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None
"""
cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() }
cls_key = converter_type + 'Converter'
if bool(cls_dict.get(cls_key)):
return cls_dict.get(cls_key)(page, non_testing, show_word_insertion_mark)
else:
return Converter(page, non_testing, show_word_insertion_mark)
class SVGConverter(Converter):
"""This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text.
"""
BG_COLOR = 'yellow'
OPACITY = '0.2'
def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY):
Converter.__init__(self, page, non_testing, show_word_insertion_mark)
self.bg_color = bg_color
self.opacity = opacity
def convert(self, output_file=None, stage_version=''):
"""Converts Page to SVG
"""
title = self.page.title if(self.page.title is not None) else 'Test Page'
title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
transkription_field = TranskriptionField(self.page.svg_file)
if bool(transkription_field.get_svg_attributes('xmlns')):
ET.register_namespace('', transkription_field.get_svg_attributes('xmlns'))
if bool(transkription_field.get_svg_attributes('xmlns:xlink')):
ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink'))
svg_tree = ET.parse(self.page.svg_file)
transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'})
colors = [ 'yellow', 'orange' ]
color_index = 0
for word in self.page.words:
for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
rect_node = ET.SubElement(transkription_node, 'rect',\
attrib={'id': str(transkription_position.id), 'x': str(transkription_position.left + transkription_field.xmin),\
'y': str(transkription_position.top + transkription_field.ymin), 'width': str(transkription_position.width),\
'height': str(transkription_position.height), 'fill': colors[color_index], 'opacity': self.opacity})
if transkription_position.transform is not None:
matrix = transkription_position.transform.clone_transformation_matrix()
matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3)
matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3)
rect_node.set('transform', matrix.toString())
rect_node.set('x', str(round(transkription_position.left - transkription_position.transform.matrix[Matrix.XINDEX], 3)))
rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3)))
ET.SubElement(rect_node, 'title').text = word.text
color_index = (color_index + 1) % len(colors)
if output_file is not None:
svg_tree.write(output_file)
class HTMLConverter(Converter):
"""This class can be used to convert a 'svgWordPositions' xml file to a test HTML file.
"""
CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; }
.highlight1 { background-color: pink; opacity: 0.2; }
.word-insertion-mark { background-color: orange; opacity: 0.2; }
.deleted { background-color: grey; opacity: 0.2; }
"""
def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
Converter.__init__(self, page, non_testing, show_word_insertion_mark)
def convert(self, output_file=None, stage_version=''):
"""Converts Page to HTML
"""
title = self.page.title if(self.page.title is not None) else 'Test Page'
title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
if stage_version != '':
title = title + ', Schreibstufe: ' + stage_version
width = self.page.width
height = self.page.height
style_content = ' position: relative; width: {}px; height: {}px; background-image: url({}); background-size: {}px {}px '\
.format(width, height, path.abspath(self.page.svg_file), width, height)
style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS)
head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style)
transkription = E.DIV(id="transkription")
counter = 0
for word in self.page.words:
highlight_class = 'highlight' + str(counter)\
if not word.deleted else 'deleted'
word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text)
for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
transkription_position.top, transkription_position.left, transkription_position.width, transkription_position.height)
if transkription_position.transform is not None:
style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString())
transform_origin_x = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1\
if (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1 < 0 else 0
style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height)
link = E.A(' ', E.CLASS(highlight_class), title=word_title, style=style_content)
transkription.append(link)
counter = (counter + 1) % 2
word_insertion_mark_class = 'word-insertion-mark'
if self.show_word_insertion_mark:
for word_insertion_mark in self.page.word_insertion_marks:
wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content)
transkription.append(link)
html = E.HTML(head,E.BODY(transkription))
bool(self.non_testing) and open_in_browser(html)
if output_file is not None:
with open(output_file, 'wb') as f:
f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))
f.closed
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes.
svgscripts/convert_wordPositions.py OPTIONS <file>
OPTIONS:
-h|--help: show help
-H|--HTML [default] convert to HTML test file
-o|--output=outputFile save output to file outputFile
-S|--SVG convert to SVG test file
-s|--svg=svgFile: svg web file
-T|--TEXT convert to TEXT output
-t|--testing execute in test mode, do not write to file or open browser
-w|--word-insertion-mark show word insertion mark on HTML
-v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. }
:return: exit code (int)
"""
convert_to_type = None
svg_file = None
output_file = None
non_testing = True
show_word_insertion_mark = False
page = None
stage_version = ''
try:
opts, args = getopt.getopt(argv, "htHSTws:o:v:", ["help", "testing", "HTML", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version="])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help') or not args:
usage()
return 0
elif opt in ('-v', '--version'):
if re.match(r'^(\d|\d\+|\d\-\d)$', arg):
stage_version = arg
else:
raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg))
elif opt in ('-w', '--word-insertion-mark'):
show_word_insertion_mark = True
elif opt in ('-S', '--SVG'):
convert_to_type = 'SVG'
elif opt in ('-T', '--TEXT'):
convert_to_type = 'TEXT'
elif opt in ('-H', '--HTML'):
convert_to_type = 'HTML'
elif opt in ('-t', '--testing'):
non_testing = False
elif opt in ('-s', '--svg'):
svg_file = arg
elif opt in ('-o', '--output'):
output_file = arg
if len(args) < 1:
usage()
return 2
if convert_to_type is None:
if output_file is not None and len(re.split(r'\.', output_file)) > 1:
output_file_part_list = re.split(r'\.', output_file)
convert_to_type = output_file_part_list[len(output_file_part_list)-1].upper()
else:
convert_to_type = 'HTML'
for word_position_file in args:
if not isfile(word_position_file):
print("'{}' does not exist!".format(word_position_file))
return 2
if svg_file is not None:
if isfile(svg_file):
page = Page(xml_source_file=word_position_file, svg_file=svg_file)
else:
print("'{}' does not exist!".format(word_position_file))
return 2
else:
page = Page(xml_source_file=word_position_file)
if page.svg_file is None:
print('Please specify a svg file!')
usage()
return 2
converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark)
converter.convert(output_file=output_file, stage_version=stage_version)
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))

Event Timeline