Page MenuHomec4science

convert_wordPositions.py
No OneTemporary

File Metadata

Created
Fri, Apr 26, 11:01

convert_wordPositions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import cairosvg
import getopt
import json
from lxml.html import builder as E
from lxml.html import open_in_browser
import lxml
from pathlib import Path as PathLibPath
from os import sep, listdir, mkdir, path, remove
from os.path import exists, isfile, isdir, dirname
import re
import sys
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.page_creator import PageCreator
from datatypes.transkriptionField import TranskriptionField
from datatypes.text_field import TextField
from datatypes.writing_process import WritingProcess
from datatypes.word import Word
sys.path.append('shared_util')
from main_util import extract_paths_on_tf, get_paths_near_position
# Module metadata (authorship, license and version of this script).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# URL of the eXist-db storage; not referenced by the code visible in this file.
EXIST_DB = 'http://existdb-test.dasch.swiss/exist/rest/db/storage/nietzsche/'
# Base URL that JSONConverter.create_json_dict prepends to "faksimiles/<file name>".
LOCAL_SERVER = 'http://localhost:8000/'
class Converter:
    """The converter super class.

    Writes the words of a page as plain text. The other converters of this
    file derive from it and are looked up by class name in CREATE_CONVERTER.
    """

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        """Store the page and the flags shared by all converters.

        :param page: object whose ``words`` are converted.
        :param non_testing: stored as ``self.non_testing``.
        :param show_word_insertion_mark: stored as ``self.show_word_insertion_mark``.
        """
        self.page = page
        self.non_testing = non_testing
        self.show_word_insertion_mark = show_word_insertion_mark

    def _get_transkription_positions(self, transkription_positions, stage_version=''):
        """Returns the transkription_positions of the indicated stage_version.

        :param transkription_positions: positions exposing ``writing_process_id``.
        :param stage_version: '' (no filtering), a single digit ('1'),
            an open range ('1+') or a closed range ('0-1').
        :return: the list passed in if stage_version is '', otherwise a new
            list with the matching positions (empty for an unknown format).
        """
        if stage_version == '':
            return transkription_positions
        if re.match(r'^\d$', stage_version):
            wanted_ids = [int(stage_version)]
        elif re.match(r'^\d\+$', stage_version):
            # Open range: from the given stage up to the last known writing process.
            wanted_ids = [*range(int(stage_version.replace('+', '')), len(WritingProcess.VERSION_DESCRIPTION))]
        elif re.match(r'^\d\-\d$', stage_version):
            start, stop = (int(i) for i in re.split(r'-', stage_version))
            wanted_ids = [*range(start, stop + 1)]
        else:
            return []
        return [tp for tp in transkription_positions if tp.writing_process_id in wanted_ids]

    def _get_words(self, words, highlighted_words=None):
        """Return the words that will be highlighted.

        :return: highlighted_words if given, otherwise words.
        """
        return highlighted_words if highlighted_words is not None else words

    def _write_words(self, out, number_lines, stage_version):
        """Write all words of the page to the text stream ``out``.

        :param out: writable text stream.
        :param number_lines: if true, prefix every line with its zero padded
            line number (even line numbers) or a blank (odd line numbers).
        :param stage_version: see _get_transkription_positions.
        """
        # NOTE(review): the nesting below was reconstructed from a source
        # without indentation; the line prefix is written once per new line.
        first_word_of_line = None
        for word in self.page.words:
            if first_word_of_line is None or first_word_of_line.line_number != word.line_number:
                out.write('\n')
                first_word_of_line = word
                if number_lines:
                    if word.line_number % 2 == 0:
                        out.write(str(word.line_number).zfill(2) + ' ')
                    else:
                        out.write(' ')
            if stage_version == ''\
               or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0:
                if word.edited_text is not None:
                    out.write(word.edited_text + ' ')
                elif word.text is not None:
                    out.write(word.text + ' ')

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Prints all words.

        :param output_file: file to write to; standard output if None.
        :param stage_version: only print words that have a transkription
            position belonging to this stage version ('' prints all words).
        :param highlighted_words: accepted for interface compatibility with
            the subclasses; not used here.
        :return: 0
        """
        if output_file is None:
            # Write to stdout but never close it: it belongs to the interpreter.
            self._write_words(sys.stdout, True, stage_version)
        else:
            # The context manager closes the file even if writing fails.
            with open(output_file, 'w') as out:
                self._write_words(out, False, stage_version)
        return 0

    @classmethod
    def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False, key=''):
        """Returns a converter of type converter_type.

        The converter class is the direct subclass named
        ``converter_type + 'Converter'``.

        [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None
                   or any type without a matching subclass
        """
        cls_dict = {subclass.__name__: subclass for subclass in cls.__subclasses__()}
        # Treat None like '' so that the documented "Converter for None" holds.
        converter_cls = cls_dict.get((converter_type or '') + 'Converter')
        if converter_cls is None:
            return Converter(page, non_testing, show_word_insertion_mark)
        if converter_cls == JSONConverter:
            # JSONConverter has a different constructor signature.
            return converter_cls(page, non_testing=non_testing, key=key)
        return converter_cls(page, non_testing, show_word_insertion_mark)
class JSONConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a json file.
    """

    def __init__(self, page, faksimile_page=None, non_testing=True, key=''):
        """Initialize the converter.

        :param page: the page to convert.
        :param faksimile_page: optional object providing ``faksimile_image``,
            ``text_field`` and ``word_positions`` (see create_json_dict).
        :param non_testing: passed on to Converter.
        :param key: dictionary key used by add_object2dict for empty lists.
        """
        Converter.__init__(self, page, non_testing, False)
        self.faksimile_page = faksimile_page
        # add_object2dict reads self.key, so it has to be stored here.
        self.key = key

    def _add_word_to_list(self, words, word, text, text_field=None, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1, faksimile_positions=None):
        """Add word to list.

        Appends one dictionary per transkription position of ``word`` to
        ``words``, adds its faksimile positions to ``faksimile_positions``
        (if given) and recurses into ``word.word_parts``.

        :param text: the text reported under 'text' (for word parts the text
            passed down by the parent word).
        :param text_field: if given, its left/top are added to the positions.
        :param parent_id: id of the parent word, -1 for a top level word.
        """
        word_id = word.id if parent_id == -1 else parent_id
        # Values handed down by a parent word take precedence over the word's own.
        if edited_text is None:
            edited_text = word.edited_text
        if earlier_version is None:
            earlier_version = word.earlier_version
        if overwrites_word is None:
            overwrites_word = word.overwrites_word
        line_number = word.line_number
        for tp in word.transkription_positions:
            if parent_id == -1:
                tp_id = f'w{word.id}:tp{tp.id}'
            else:
                tp_id = f'w{parent_id}:w{word.id}:tp{tp.id}'
            left, top = tp.left, tp.top
            if text_field is not None:
                left = tp.left + text_field.left
                top = tp.top + text_field.top
            word_dict = { 'id': word_id, 'text': text, 'left': left, 'top': top,\
                    'width': tp.width, 'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted }
            if tp.transform is not None:
                if text_field is not None:
                    # Shift the translation part of the matrix by the text field
                    # and express left/top relative to the transformed origin.
                    matrix = tp.transform.clone_transformation_matrix()
                    matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + text_field.left, 3)
                    matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + text_field.top, 3)
                    word_dict['transform'] = matrix.toString()
                    if tp.left > 0:
                        word_dict['left'] = round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)
                    else:
                        word_dict['left'] = 0
                    word_dict['top'] = round((tp.height-1.5)*-1, 3)
                else:
                    word_dict['transform'] = tp.transform.toString()
            if edited_text is not None:
                word_dict['edited_text'] = edited_text
            if earlier_version is not None:
                word_dict['earlier_version'] = earlier_version.text
            if overwrites_word is not None:
                word_dict['overwrites_word'] = overwrites_word.text
            if parent_id > -1:
                word_dict['part_text'] = word.text
            # NOTE(review): the nesting of this section was reconstructed from a
            # source without indentation - confirm against the repository.
            if len(word.deletion_paths) > 0:
                for dp in word.deletion_paths:
                    # One entry per deletion path: copy before overwriting the
                    # path of an entry that has already been appended.
                    if bool(word_dict.get('deletion_path')):
                        word_dict = word_dict.copy()
                    word_dict['deletion_path'] = dp.d_attribute
                    words.append(word_dict)
                if len(word.deletion_paths_near_word) > 0:
                    word_dict['paths_near_word'] = word.deletion_paths_near_word
                    words.append(word_dict)
            else:
                words.append(word_dict)
        if faksimile_positions is not None:
            for fp in word.faksimile_positions:
                self._add_faksimile_to_list(word_id, line_number, fp, word.deleted, faksimile_positions, text, edited_text=edited_text,\
                        earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=parent_id, word_text=word.text)
        for wp in word.word_parts:
            self._add_word_to_list(words, wp, text, text_field=text_field, edited_text=edited_text,\
                    earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=word.id, faksimile_positions=faksimile_positions)

    def _add_faksimile_to_list(self, id, line_number, fp, deleted, faksimile_positions, text, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1, word_text='') -> None:
        """Create a json dictionary for the faksimile position fp and append it to faksimile_positions.
        """
        faksimile_dict = { 'id': id, 'text': text, 'left': fp.left, 'top': fp.top,\
                'width': fp.width, 'height': fp.height, 'line': line_number, 'fp_id': fp.id, 'deleted': deleted }
        if fp.transform is not None:
            faksimile_dict['transform'] = fp.transform.toString()
        if edited_text is not None:
            faksimile_dict['edited_text'] = edited_text
        if earlier_version is not None:
            faksimile_dict['earlier_version'] = earlier_version.text
        if overwrites_word is not None:
            faksimile_dict['overwrites_word'] = overwrites_word.text
        if parent_id > -1:
            faksimile_dict['part_text'] = word_text
        faksimile_positions.append(faksimile_dict)

    def create_json_dict(self) ->dict:
        """Create and return a json dictionary.

        :return: dictionary with the keys 'title', 'number', 'words', 'svg',
            'lines', 'faksimile', 'faksimile_positions' and 'faksimile_lines'.
        """
        words = []
        faksimile_positions = []
        text_field = None
        if self.page.svg_image is not None:
            if self.page.svg_image.text_field is None:
                # The local text_field is only set when it had to be created here;
                # positions are shifted by it in _add_word_to_list.
                text_field = self.page.svg_image.text_field = TranskriptionField(self.page.svg_image.file_name).convert_to_text_field()
        for word in self.page.words:
            self._add_word_to_list(words, word, word.text, text_field=text_field, faksimile_positions=faksimile_positions)
        lines = []
        faksimile_lines = []
        offset = 0 if text_field is None else text_field.ymin
        svg_image = self.add_object2dict(self.page.svg_image)
        if self.faksimile_page is not None:
            if self.page.faksimile_image is None:
                if self.faksimile_page.faksimile_image.text_field is None\
                   and self.faksimile_page.text_field is not None:
                    self.faksimile_page.faksimile_image.text_field = self.faksimile_page.text_field
                self.page.faksimile_image = self.faksimile_page.faksimile_image
            # Add the faksimile positions that are not attached to any word.
            known_fp_ids = { f_dict.get('fp_id') for f_dict in faksimile_positions }
            for fp in self.faksimile_page.word_positions:
                if fp.id not in known_fp_ids:
                    self._add_faksimile_to_list(fp.id, -1, fp, False, faksimile_positions, fp.text)
                    known_fp_ids.add(fp.id)
        faksimile_image = self.add_object2dict(self.page.faksimile_image)
        if svg_image is not None:
            svg_image.update({ 'URL': self.page.svg_image.primaryURL })
            svg_image.update({ 'x': self.page.svg_image.text_field.left })
            svg_image.update({ 'y': self.page.svg_image.text_field.top })
        if faksimile_image is not None:
            if bool(faksimile_image.get('transform_string')):
                faksimile_image.update({ 'transform': faksimile_image.get('transform_string') })
            faksimile_image.update({ 'secondaryURL': LOCAL_SERVER + "faksimiles/" + self.page.faksimile_image.file_name })
            faksimile_image.update({ 'x': 0 })
            faksimile_image.update({ 'y': 0 })
        for line in self.page.lines:
            lines.append({ 'id': line.id, 'number': line.id, 'top': line.top + offset, 'bottom': line.bottom })
            faksimile_lines.append({ 'id': line.id, 'number': line.id, 'top': line.faksimile_inner_top, 'bottom': line.faksimile_inner_bottom })
        return { 'title': self.page.title, 'number': self.page.number, 'words': words, 'svg': svg_image, 'lines': lines,\
                'faksimile': faksimile_image, 'faksimile_positions': faksimile_positions, 'faksimile_lines': faksimile_lines }

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to JSON.

        :param output_file: target file, 'output.json' if None.
        :param stage_version: not used by this converter.
        :param highlighted_words: not used by this converter.
        :return: 0
        :raises Exception: 'Error in json.dump' if creating or dumping the
            dictionary fails; the original error is chained as its cause.
        """
        if output_file is None:
            output_file = 'output.json'
        # The context manager closes the file even if dumping fails.
        with open(output_file, "w+") as json_file:
            try:
                json.dump(self.create_json_dict(), json_file)
            except Exception as error:
                raise Exception('Error in json.dump') from error
        return 0

    def add_object2dict(self, object_instance):
        """Add an object to json_dict and generate json data and interfaces.

        Builtin values are returned unchanged, lists are converted item by
        item, all other objects are converted by means of the 'properties' of
        their class's ``get_semantic_dictionary()``.

        [:return:] json dict or object_instance
        """
        json_dict = {}
        object_type = type(object_instance)
        if object_type.__module__ == 'builtins':
            if object_type is not list:
                return object_instance
            items = [ self.add_object2dict(item) for item in object_instance ]
            if len(items) > 0:
                return items
            return { self.key: [] }
        semantic_dictionary = object_type.get_semantic_dictionary()
        for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]:
            content = object_instance.__dict__.get(key)
            if content_type == list\
               and content is not None\
               and len(content) > 0\
               and type(content[0]).__module__ != 'builtins':
                # List of project objects: convert every item.
                json_dict.update({key: [ self.add_object2dict(content_item) for content_item in content ]})
            elif content_type.__module__ == 'builtins':
                if content is not None:
                    json_dict.update({key: content})
            elif content is not None:
                if type(content) == list:
                    json_dict.update({key: [ self.add_object2dict(content_item) for content_item in content ]})
                else:
                    json_dict.update({key: self.add_object2dict(content)})
        return json_dict
class oldJSONConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a json file.

    In addition to the json file it writes one TypeScript interface file per
    converted class to the directory 'ts_interfaces' and an import list to
    'ts_imports.ts'.
    """
    # TypeScript type names for the builtin Python types.
    PY2TS_DICT = { float: 'number', int: 'number', bool: 'boolean', str: 'string' }

    def __init__(self, page, non_testing=True, key=''):
        """Initialize the converter and prepare the interface directory.

        Creates 'ts_interfaces' if it is missing, otherwise removes all
        '*.ts' files it contains.

        :param key: if not '', only ``page.__dict__[key]`` is converted.
        """
        Converter.__init__(self, page, non_testing, False)
        self.key = key
        self.interface_output_dir = PathLibPath('ts_interfaces')
        if not self.interface_output_dir.is_dir():
            self.interface_output_dir.mkdir()
        else:
            # Materialize the listing before deleting from the directory.
            for ts_file in list(self.interface_output_dir.glob('*.ts')):
                remove(ts_file)

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to JSON.

        :param output_file: target file, 'output.json' if None.
        :param stage_version: not used by this converter.
        :param highlighted_words: not used by this converter.
        :return: 0 on success, 2 if the page has no object at ``self.key``.
        :raises Exception: 'Error in json.dump' if dumping fails; the
            original error is chained as its cause.
        """
        if output_file is None:
            output_file = 'output.json'
        class_dict = {}
        if self.key != '':
            object_instance = self.page.__dict__.get(self.key)
            if object_instance is None:
                print(f'Page initialized from {self.page.page_tree.docinfo.URL} does not have an object at "{self.key}"!')
                return 2
            json_dict = self.add_object2dict(object_instance, class_dict)
            if type(json_dict) == list:
                json_dict = { self.key : json_dict }
        else:
            json_dict = self.add_object2dict(self.page, class_dict)
        # The context manager closes the file even if dumping fails.
        with open(output_file, "w+") as json_file:
            try:
                json.dump(json_dict, json_file)
            except Exception as error:
                raise Exception('Error in json.dump') from error
        self.create_imports(class_dict)
        return 0

    def add_object2dict(self, object_instance, class_dict):
        """Add an object to json_dict and generate json data and interfaces.

        :param class_dict: maps every converted class to the file of its
            TypeScript interface; it is extended in place.

        [:return:] json dict or object_instance
        """
        json_dict = {}
        interface_list = []
        object_type = type(object_instance)
        if object_type.__module__ == 'builtins':
            if object_type is not list:
                return object_instance
            items = [ self.add_object2dict(item, class_dict) for item in object_instance ]
            if len(items) > 0:
                return { self.key: items }
            return { self.key: 'null' }
        semantic_dictionary = object_type.get_semantic_dictionary()
        for key, content_type in [ (key, content.get('class')) for key, content in semantic_dictionary['properties'].items()]:
            content = object_instance.__dict__.get(key)
            if content_type == list\
               and content is not None\
               and len(content) > 0\
               and type(content[0]).__module__ != 'builtins':
                # List of project objects: the interface refers to the item class.
                json_dict.update({key: [ self.add_object2dict(content_item, class_dict) for content_item in content ]})
                interface_list.append(f'{key}: {type(content[0]).__name__}[];')
            elif content_type.__module__ == 'builtins':
                # NOTE(review): the nesting of the json_dict update was reconstructed
                # from a source without indentation - confirm against the repository.
                if content_type != list:
                    ts_type = self.PY2TS_DICT.get(content_type, 'string')
                    interface_list.append(f'{key}: {ts_type};')
                json_dict.update({key: content})
            elif content is not None and type(content) == list:
                interface_list.append(f'{key}: {content_type.__name__}[];')
                json_dict.update({key: [ self.add_object2dict(content_item, class_dict) for content_item in content ]})
            else:
                interface_list.append(f'{key}: {content_type.__name__};')
                if content is not None:
                    json_dict.update({key: self.add_object2dict(content, class_dict)})
        if object_type not in class_dict:
            class_dict.update({object_type: self.create_interface(object_type.__name__, interface_list)})
        return json_dict

    def create_imports(self, class_dict):
        """Write 'ts_imports.ts' containing one import line per class of class_dict.

        [:return:] file_name of the import file
        """
        ts_file = PathLibPath('ts_imports.ts')
        with open(ts_file, "w+") as file:
            file.write(f'//import all interfaces from {self.interface_output_dir} ' + '\n')
            for interface_class, path_name in class_dict.items():
                file.write('import {' + interface_class.__name__ + '} from \'./' + str(self.interface_output_dir.joinpath(path_name.stem)) + '\';\n')
        return ts_file

    def create_interface(self, class_name, interface_list) -> PathLibPath:
        """Create an ts interface from a list of key and content_types.

        :param class_name: name of the interface; the file is named after its
            lower case form.
        :param interface_list: strings of the form 'key: Type;' or 'key: Type[];'.

        [:return:] file_name of interface
        """
        ts_file = self.interface_output_dir.joinpath(PathLibPath(f'{class_name.lower()}.ts'))
        ts_builtin_types = set(self.PY2TS_DICT.values())
        referenced_types = [ entry.split(': ')[1].replace(';','').replace('[]','') for entry in interface_list ]
        # Sorted so that the generated file does not depend on set iteration order.
        import_names = sorted({ type_name for type_name in referenced_types if type_name not in ts_builtin_types })
        with open(ts_file, "w") as file:
            for import_class_name in import_names:
                file.write('import {' + import_class_name + '} from \'./' + import_class_name.lower() + '\';\n')
            file.write(f'export interface {class_name} ' + '{\n')
            for interface_string in interface_list:
                file.write('\t' + interface_string + '\n')
            file.write('}')
        return ts_file
class SVGConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text.
    """
    # Defaults for the highlighting rectangles.
    BG_COLOR = 'yellow'
    OPACITY = '0.2'

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY):
        """Initialize the converter with the fill color and opacity of the rectangles.
        """
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
        self.bg_color = bg_color
        self.opacity = opacity

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to SVG

        Appends a group 'Transkription' with one rectangle per transkription
        position to the svg tree and writes the tree to output_file (if given).

        :return: 0
        """
        # NOTE(review): title is computed but not used any further in this method.
        title = 'Test Page'
        if self.page.title is not None:
            title = self.page.title
        if self.page.number is not None:
            title = '{}, S. {}'.format(title, self.page.number)
        svg_file = self.page.svg_file
        if svg_file is None:
            if self.page.svg_image is None:
                msg = f'ERROR: xml_source_file {self.page.docinfo.URL} does neither have a svg_file nor a svg_image!'
                raise Exception(msg)
            svg_file = self.page.svg_image.file_name
        transkription_field = TranskriptionField(svg_file)
        # Register the namespaces of the source file before parsing it.
        for prefix, attribute in (('', 'xmlns'), ('xlink', 'xmlns:xlink')):
            if bool(transkription_field.get_svg_attributes(attribute)):
                ET.register_namespace(prefix, transkription_field.get_svg_attributes(attribute))
        svg_tree = ET.parse(svg_file)
        transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'})
        if highlighted_words is not None:
            colors = ['yellow']
        else:
            highlighted_words = []
            if self.bg_color == self.BG_COLOR:
                colors = [ 'yellow', 'orange' ]
            else:
                colors = [ self.bg_color ]
        color_index = 0
        # NOTE(review): nesting reconstructed from a source without indentation;
        # the color is assumed to alternate per word.
        for word in self.page.words:
            word_id = 'word_' + str(word.id)
            positions = self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)
            for position in positions:
                fill = self.bg_color if word in highlighted_words else colors[color_index]
                transform = position.transform
                if transform is None:
                    x_value = str(position.left + transkription_field.xmin)
                    y_value = str(position.top + transkription_field.ymin)
                else:
                    # Transformed rectangles are positioned relative to their matrix.
                    x_value = str(round(position.left - transform.matrix[Matrix.XINDEX], 3))
                    y_value = str(round((position.height-1.5)*-1, 3))
                attributes = {
                    'id': word_id + '_' + str(position.id),
                    'x': x_value,
                    'y': y_value,
                    'width': str(position.width),
                    'height': str(position.height),
                    'fill': fill,
                    'opacity': self.opacity,
                }
                if transform is not None:
                    matrix = transform.clone_transformation_matrix()
                    matrix.matrix[Matrix.XINDEX] = round(transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3)
                    matrix.matrix[Matrix.YINDEX] = round(transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3)
                    attributes['transform'] = matrix.toString()
                rect_node = ET.SubElement(transkription_node, 'rect', attrib=attributes)
                title_node = ET.SubElement(rect_node, 'title')
                title_node.text = word.text
            color_index = (color_index + 1) % len(colors)
        if output_file is not None:
            svg_tree.write(output_file)
        return 0
class HTMLConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file.
    """
    # Style classes of the boxes drawn over the background image.
    CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; }
.highlight1 { background-color: pink; opacity: 0.2; }
.highlight2 { background-color: red; opacity: 0.2; }
.foreign { background-color: blue; opacity: 0.4; }
.overwritten { background-color: green; opacity: 0.4; }
.word-insertion-mark { background-color: orange; opacity: 0.2; }
.deleted { background-color: grey; opacity: 0.2; }
"""

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        """Initialize the converter with a default TextField.
        """
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
        self.text_field = TextField()

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to HTML

        Builds a page that shows the svg file as background image and one
        absolutely positioned link per transkription position on top of it.
        Opens the result in a browser if ``self.non_testing`` is true and
        writes it to output_file if given.

        :return: 0
        :raises Exception: if the page has neither a svg_image nor a svg_file.
        """
        title = self.page.title if self.page.title is not None else 'Test Page'
        if self.page.number is not None:
            title = '{}, S. {}'.format(title, self.page.number)
        if stage_version != '':
            title = title + ', Schreibstufe: ' + stage_version
        if self.page.svg_image is not None:
            width = self.page.svg_image.width
            height = self.page.svg_image.height
            svg_file = self.page.svg_image.file_name
            if self.page.svg_image.text_field is not None:
                self.text_field = self.page.svg_image.text_field
                print('Textfield found ->adjusting data')
        elif self.page.svg_file is not None:
            svg_file = self.page.svg_file
            transkription_field = TranskriptionField(svg_file)
            width = transkription_field.getWidth()
            height = transkription_field.getHeight()
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise Exception('ERROR: page does neither have a svg_file nor a svg_image!')
        style_content = ' position: relative; width: {}px; height: {}px; background-image: url("{}"); background-size: {}px {}px '\
                .format(width, height, path.abspath(svg_file), width, height)
        style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS)
        head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style)
        transkription = E.DIV(id="transkription")
        # counter alternates between 0 and 1 so that neighbouring words differ in color.
        counter = 0
        # NOTE(review): the nesting inside this loop was reconstructed from a source
        # without indentation; overwritten words are assumed to be added once per word.
        for word in self.page.words:
            highlight_class = 'highlight' + str(counter)\
                    if not word.deleted else 'deleted'
            if highlighted_words is not None\
               and word in highlighted_words:
                highlight_class = 'highlight2'
            earlier_text = '' if word.earlier_version is None else word.earlier_version.text
            if earlier_text == '' and len(word.word_parts) > 0:
                # Fall back to the first word part that has an earlier version.
                parts_with_earlier_version = [ part for part in word.word_parts if part.earlier_version is not None ]
                earlier_text = parts_with_earlier_version[0].text if len(parts_with_earlier_version) > 0 else ''
            if earlier_text != '':
                word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(word.id), str(word.line_number), earlier_text, word.text)
            else:
                word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text)
            if word.edited_text is not None:
                word_title += f'\n>{word.edited_text}'
            for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
                self._append2transkription(transkription, highlight_class, word_title, transkription_position)
            if word.overwrites_word is not None:
                overwritten_title = f'{word.text} overwrites {word.overwrites_word.text}'
                for overwritten_transkription_position in word.overwrites_word.transkription_positions:
                    self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
            for part_word in word.word_parts:
                highlight_class = 'highlight' + str(counter)\
                        if not part_word.deleted else 'deleted'
                for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version):
                    self._append2transkription(transkription, highlight_class, word_title, part_transkription_position)
                if part_word.overwrites_word is not None:
                    overwritten_title = f'{word.text} overwrites {part_word.overwrites_word.text}'
                    for overwritten_transkription_position in part_word.overwrites_word.transkription_positions:
                        self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
            counter = (counter + 1) % 2
        for mark_foreign_hands in self.page.mark_foreign_hands:
            foreign_title = 'id: {}/line: {}\n{} <i>{}</i>'.format(str(mark_foreign_hands.id), str(mark_foreign_hands.line_number),\
                    mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen)
            for transkription_position in mark_foreign_hands.transkription_positions:
                self._append2transkription(transkription, 'foreign', foreign_title, transkription_position)
        if self.show_word_insertion_mark:
            for word_insertion_mark in self.page.word_insertion_marks:
                wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
                style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                        word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
                link = E.A(' ', E.CLASS('word-insertion-mark'), title=wim_title, style=style_content)
                transkription.append(link)
        html = E.HTML(head,E.BODY(transkription))
        if self.non_testing:
            open_in_browser(html)
        if output_file is not None:
            with open(output_file, 'wb') as f:
                f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))
        return 0

    def _append2transkription(self, transkription, highlight_class, title, transkription_position):
        """Append content to transkription-div.

        Adds a link with class highlight_class and the given title that is
        positioned at transkription_position relative to ``self.text_field``.
        """
        style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                transkription_position.top - self.text_field.top, transkription_position.left - self.text_field.left,\
                transkription_position.width, transkription_position.height)
        if transkription_position.transform is not None:
            style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString())
            # Horizontal origin: the negated distance to the matrix' x value, capped at 0.
            x_offset = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1
            transform_origin_x = x_offset if x_offset < 0 else 0
            style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height)
        link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content)
        transkription.append(link)
def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR):
    """Creates a pdf file highlighting some words.

    The page is first written to a temporary svg file next to the pdf file,
    then converted with cairosvg; the temporary file is removed afterwards.

    :param xml_source_file: source file, used if page is None.
    :param page: the page to convert.
    :param highlighted_words: words that should be highlighted.
    :param pdf_file_name: target file; '.pdf' is appended if it is missing.
    :param bg_color: fill color passed on to the svg conversion.
    """
    if not pdf_file_name.endswith('.pdf'):
        pdf_file_name = pdf_file_name + '.pdf'
    # Swap only the extension: the temporary name must differ from the pdf
    # name and directory names containing '.pdf' must stay untouched.
    tmp_svg_file = pdf_file_name[:-len('.pdf')] + '.svg'
    create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page, highlighted_words=highlighted_words,\
            svg_file_name=tmp_svg_file, bg_color=bg_color)
    if isfile(tmp_svg_file):
        try:
            cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
        finally:
            # Do not leave the temporary svg file behind if the conversion fails.
            remove(tmp_svg_file)
def create_svg_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, svg_file_name='output.svg', bg_color=SVGConverter.BG_COLOR):
    """Creates a svg file highlighting some words.

    :param xml_source_file: source file, a Page is created from it if page is None.
    :param page: the page to convert.
    :param highlighted_words: words that should be highlighted.
    :param svg_file_name: target file; '.svg' is appended if it is missing.
    :param bg_color: fill color of the highlighting rectangles.
    """
    if page is None and xml_source_file is not None:
        page = Page(xml_source_file)
    converter = SVGConverter(page, bg_color=bg_color)
    # Check for the extension including the dot, so that a name like
    # 'notesvg' still gets its extension.
    if not svg_file_name.endswith('.svg'):
        svg_file_name = svg_file_name + '.svg'
    converter.convert(output_file=svg_file_name, highlighted_words=highlighted_words)
def usage():
    """Print the help text of this script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes.

    svgscripts/convert_wordPositions.py OPTIONS <file>

    OPTIONS:
        -h|--help: show help
        -H|--HTML [default] convert to HTML test file
        -k|--key=key option for json converter:
            only convert object == page.__dict__[key]
        -o|--output=outputFile save output to file outputFile
        -P|--PDF convert to PDF test file
        -S|--SVG convert to SVG test file
        -s|--svg=svgFile: svg web file
        -T|--TEXT convert to TEXT output
        -t|--text=text highlight word
        -w|--word-insertion-mark show word insertion mark on HTML
        -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. }
        -x|--testing execute in test mode, do not write to file or open browser

    :return: exit code (int)
    """
    # The docstring above is the help text printed by usage(); implementation
    # notes therefore live in comments.
    convert_to_type = None
    key = ''
    non_testing = True
    output_file = None
    show_word_insertion_mark = False
    stage_version = ''
    svg_file = None
    text = None

    try:
        opts, args = getopt.getopt(argv, "hk:t:HPSTws:o:v:x", ["help", "key=", "text=", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version=", "testing"])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-v', '--version'):
            if re.match(r'^(\d|\d\+|\d\-\d)$', arg):
                stage_version = arg
            else:
                raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg))
        elif opt in ('-w', '--word-insertion-mark'):
            show_word_insertion_mark = True
        elif opt in ('-P', '--PDF'):
            convert_to_type = 'PDF'
        elif opt in ('-S', '--SVG'):
            convert_to_type = 'SVG'
        elif opt in ('-T', '--TEXT'):
            convert_to_type = 'TEXT'
        elif opt in ('-H', '--HTML'):
            convert_to_type = 'HTML'
        elif opt in ('-x', '--testing'):
            non_testing = False
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-o', '--output'):
            output_file = arg
        elif opt in ('-k', '--key'):
            key = arg
        elif opt in ('-t', '--text'):
            text = arg
            print(arg)

    if len(args) < 1:
        usage()
        return 2

    if convert_to_type is None:
        # Without an explicit type the extension of the output file decides.
        if output_file is not None and '.' in output_file:
            convert_to_type = output_file.rsplit('.', 1)[-1].upper()
        else:
            convert_to_type = 'HTML'

    exit_code = 0
    # NOTE(review): the nesting of this loop was reconstructed from a source
    # without indentation - confirm against the repository.
    for word_position_file in args:
        if not isfile(word_position_file):
            print("'{}' does not exist!".format(word_position_file))
            return 2
        # Never carry a page over from a previous input file.
        page = None
        highlighted_words = None
        if convert_to_type == 'PDF':
            if output_file is None:
                output_file = 'output.pdf'
            if text is not None:
                page = Page(word_position_file)
                highlighted_words = [ word for word in page.words if word.text == text ]
            # Pass the page along: the highlighted words belong to this very
            # Page instance, a freshly parsed one would contain other objects.
            create_pdf_with_highlighted_words(word_position_file, page=page, pdf_file_name=output_file, highlighted_words=highlighted_words)
        else:
            if svg_file is not None:
                if not isfile(svg_file):
                    # Report the file that is actually missing.
                    print("'{}' does not exist!".format(svg_file))
                    return 2
                page = PageCreator(word_position_file, svg_file=svg_file)
            else:
                page = Page(word_position_file)
                if page.svg_file is None:
                    print('Please specify a svg file!')
                    usage()
                    return 2
            if text is not None:
                highlighted_words = [ word for word in page.words if word.text == text ]
                print([ (word.id, word.text) for word in highlighted_words ])
            converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark, key=key)
            exit_code = converter.convert(output_file=output_file, stage_version=stage_version, highlighted_words=highlighted_words)
    return exit_code
if __name__ == "__main__":
    # Run the command line interface and hand its return value to the shell.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)

Event Timeline