Index: tests_svgscripts/test_data/image.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tests_svgscripts/test_data/image.jpg
===================================================================
--- tests_svgscripts/test_data/image.jpg (revision 59)
+++ tests_svgscripts/test_data/image.jpg (revision 60)
Property changes on: tests_svgscripts/test_data/image.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tests_svgscripts/test_join_faksimileAndTranskription.py
===================================================================
--- tests_svgscripts/test_join_faksimileAndTranskription.py (revision 59)
+++ tests_svgscripts/test_join_faksimileAndTranskription.py (revision 60)
@@ -1,95 +1,95 @@
import unittest
from os import sep, path, remove
from os.path import isdir, isfile, dirname
import shutil
import sys
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
import join_faksimileAndTranskription
from datatypes.faksimile import FaksimilePage
from datatypes.page import Page
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.word_position import WordPosition
class TestJoin(unittest.TestCase):
def setUp(self):
join_faksimileAndTranskription.UNITTESTING = True
DATADIR = path.dirname(__file__) + sep + 'test_data'
self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
self.manuscript = DATADIR + sep + 'N_VII_1.xml'
self.manuscript_copy = self.manuscript.replace('.', '_copy.')
self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
self.xml_file = DATADIR + sep + 'N_VII_1_page005.xml'
self.Mp_XIV_1_mytest_421 = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml'
def test_sort_words(self):
page = Page(xml_source_file=self.Mp_XIV_1_mytest_421)
words_line7 = [ word for word in page.words if word.line_number == 7 ]
page.words = words_line7
sorted_words = join_faksimileAndTranskription.sort_words(page)
self.assertEqual(len(sorted_words), len(words_line7))
for index, word in enumerate(words_line7):
self.assertEqual(sorted_words[index], word)
def test_sort_faksimile_positions(self):
faksimile_tree = ET.parse(self.faksimile_file)
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
self.assertEqual(len(faksimile_pages), 2)
svg_pos_file, manuscript_file = join_faksimileAndTranskription.get_svgPosFile_and_manuscriptFile(faksimile_pages[0], manuscript_file=self.manuscript)
sorted_positions = join_faksimileAndTranskription.sort_faksimile_positions(faksimile_pages[0].word_positions)
page = Page(xml_source_file=svg_pos_file)
for index in range(0, 10):
id = sorted_positions[index].id
if len(faksimile_tree.getroot().xpath('//ns:rect[@id="{0}"]/ns:title/text()|//ns:path[@id="{0}"]/ns:title/text()'\
.format(id), namespaces=namespaces)) > 0:
word_text = faksimile_tree.getroot().xpath('//ns:rect[@id="{0}"]/ns:title/text()|//ns:path[@id="{0}"]/ns:title/text()'\
.format(id), namespaces=namespaces)[0]
#print(sorted_positions[index].left, sorted_positions[index].top, word_text, page.words[index].text)
self.assertEqual(word_text, page.words[index].text)
def test_get_filelist_and_manuscript_file(self):
file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(self.faksimile_dir, self.manuscript)
self.assertEqual(len(file_list), 1)
self.assertEqual(file_list[0], self.faksimile_file)
self.assertEqual(manuscript_file, self.manuscript)
file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(self.manuscript, self.faksimile_file)
self.assertEqual(len(file_list), 1)
self.assertEqual(file_list[0], self.faksimile_file)
self.assertEqual(manuscript_file, self.manuscript)
def test_get_svgPosFile_and_manuscriptFile(self):
faksimile_tree = ET.parse(self.faksimile_file)
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree)
self.assertEqual(len(faksimile_pages), 2)
svg_pos_file, manuscript_file = join_faksimileAndTranskription.get_svgPosFile_and_manuscriptFile(faksimile_pages[0], manuscript_file=self.manuscript)
#self.assertEqual(svg_pos_file, self.manuscript.replace('.', '_page00{}.'.format(faksimile_pages[0].page_number)))
self.assertEqual(manuscript_file, self.manuscript)
def test_join_faksimileAndTranskription(self):
self.assertEqual(join_faksimileAndTranskription.join_faksimileAndTranskription(self.faksimile_file, self.manuscript), 0)
#self.assertEqual(join_faksimileAndTranskription.join_faksimileAndTranskription(self.faksimile_file, self.manuscript, test_word_text='gar'), 0)
def testupdate_writing_process(self):
page = Page(xml_source_file=self.xml_file)
word = page.words[12]
self.assertEqual(len(word.faksimile_positions), 1)
self.assertEqual(word.faksimile_positions[0].writing_process_id, -1)
join_faksimileAndTranskription.update_writing_process(word)
self.assertEqual(word.faksimile_positions[0].writing_process_id, 0)
- #@unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover')
- @unittest.skip('test takes too long, has been tested')
+ @unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover')
+ #@unittest.skip('test takes too long, has been tested')
def test_fix_errors(self):
page = Page(xml_source_file=self.xml_file)
word_position = WordPosition(id='rect945', text='Lenken')
join_faksimileAndTranskription.fix_errors(self.faksimile_file, [ word_position], [page.words[12]], xml_source_file=self.xml_file, manuscript_file=self.manuscript )
if __name__ == "__main__":
unittest.main()
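Note on the decorator change above: with @unittest.skipUnless(__name__ == "__main__", ...) the long-running test_fix_errors is only collected when the test module is executed directly and stays skipped under "unittest discover". A minimal, self-contained sketch of that pattern (class and method names here are illustrative, not part of the repository):

import unittest

class DemoLongRunningTest(unittest.TestCase):
    # Collected only when this file is run as the __main__ module,
    # e.g. "python3 test_demo.py"; skipped under "python3 -m unittest discover".
    @unittest.skipUnless(__name__ == "__main__", 'test takes too long, we do not run it with unittest discover')
    def test_slow_interactive_fix(self):
        self.assertTrue(True)

if __name__ == "__main__":
    unittest.main()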
Index: tests_svgscripts/test_copy_faksimile_svg_file.py
===================================================================
--- tests_svgscripts/test_copy_faksimile_svg_file.py (revision 0)
+++ tests_svgscripts/test_copy_faksimile_svg_file.py (revision 60)
@@ -0,0 +1,44 @@
+import unittest
+from os import sep, path, remove
+from os.path import isdir, isfile, dirname, basename
+import shutil
+import sys
+import lxml.etree as ET
+import sys
+import tempfile
+import warnings
+
+sys.path.append('svgscripts')
+
+import copy_faksimile_svg_file
+from datatypes.faksimile import FaksimilePage
+from datatypes.page import Page
+from datatypes.positional_word_part import PositionalWordPart
+from datatypes.transkriptionField import TranskriptionField
+from datatypes.word_position import WordPosition
+
+class TestCopy(unittest.TestCase):
+ def setUp(self):
+ DATADIR = path.dirname(__file__) + sep + 'test_data'
+ self.faksimile_dir = DATADIR + sep + 'faksimile_svg'
+ self.faksimile_file = self.faksimile_dir + sep + 'N-VII-1,5et6.svg'
+ self.image = DATADIR + sep + 'image.jpg'
+
+ def test_copy(self):
+ tmp_dir = tempfile.mkdtemp()
+ tmp_image = tmp_dir + sep + basename(self.image)
+ target_file = 'asdf.svg'
+ shutil.copy(self.image, tmp_dir)
+ copy_faksimile_svg_file.copy_faksimile_svg_file(target_file, faksimile_source_file=self.faksimile_file,\
+ target_directory=tmp_dir, local_image_path=tmp_image)
+ self.assertEqual(isfile(tmp_dir + sep + target_file), True)
+ copy_faksimile_svg_file.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_file,\
+ target_directory=tmp_dir, local_image_path=tmp_image)
+ self.assertEqual(isfile(tmp_dir + sep + basename(self.faksimile_file)), True)
+ with self.assertRaises(Exception):
+ copy_faksimile_svg_file.copy_faksimile_svg_file()
+ with self.assertRaises(Exception):
+ copy_faksimile_svg_file.copy_faksimile_svg_file(faksimile_source_file=self.faksimile_file)
+
+if __name__ == "__main__":
+ unittest.main()
Index: TODO.md
===================================================================
--- TODO.md (revision 59)
+++ TODO.md (revision 60)
@@ -1,42 +1,97 @@
# Word search:
- The word search should be weighted by the proximity of the words to each other.
- Word paths, i.e. sequences of words, should be avoided, since they cannot be generated automatically and
are highly error-prone.
- Therefore the word insertions should also not be used to record alternative text sequences.
-# Problems:
-- How to deal with word insertions in word compounds?
- - Use WritingProcess.
# TODO
-- current task: Reference
+## Faksimile data input
+- word boxes on faksimile by drawing rects with inkscape [IN PROGRESS, see "Leitfaden.pdf"]
+- naming word boxes by using title of rects [IN PROGRESS, see "Leitfaden\_Kontrolle\_und\_Beschriftung\_der\_Wortrahmen.pdf"]
+- splitting word box if a word has parts by drawing a vertical path in rect [TODO]
+## Processing
+### faksimile data input, i.e. svg-file resulting from drawing boxes etc. with inkscape
+- process faksimile words:
+ - join\_faksimileAndTranskription.py [DONE]
+ - create a data input task for words that have parts: [TODO]
+ - create pdf marking relevant words and their parts of transkription
+ - create faksimile svg highlighting relevant rects
+ - copy pdf and svg to designated folder for this task
+
+### transkription, i.e. svg-file resulting from pdf-file -> created with InDesign
+- process text field:
+ - Word [DONE]
+ - SpecialWord
+ - MarkForeignHands [DONE]
+ - TextConnectionMark [DONE]
+ - WordInsertionMark [DONE]
+ - all paths -> page.categorize\_paths [TODO]
+ - word-deletion -> Path [DONE]
+ - make parts of word if only parts of a word are deleted, also introduce earlier version of word [TODO]
+ - word-undeletion (e.g. N VII 1, 18,6 -> "mit")
+ - text-area-deletion
+ - text-connection-lines
+ - underline
+
+- process footnotes:
+ - TextConnectionMark [DONE]
+ - TextConnection with uncertainty [TODO]
+ - "Fortsetzung [0-9]+,[0-9]+?"
+ - "Fortsetzung von [0-9]+,[0-9]+?"
+ - concerning Word:
+ - uncertain transcription: "?"
+ - atypical writing: "¿" and bold word parts
+ - clarification corrections ("Verdeutlichungskorrekturen"): "Vk" and bold word parts
+ - correction: "word>" and ">?" (with uncertainty)
+ - concerning word deletion:
+ - atypical writing: "¿" and "Durchstreichung" (see N VII 1, 11,2)
+
+- process margins:
+ - MarkForeignHands [DONE]
+ - ForeignHandTextAreaDeletion [TODO]
+ - boxes: make earlier version of a word [TODO]
+ - TextConnection [TODO]
+ - from: ([0-9]+,)*[0-9]+ -)
+ - to: -) ([0-9]+,)*[0-9]+
+
+## Datatypes
- make datatypes:
- Page [ok] --> page orientation!!!
- Word [ok] --> deal with non-horizontal text <<<< DONE!
+ --> hyphenation
--> add style info to word: font { German, Latin }
+ --> pen color
--> connect style with character glyph-id from svg path file
--> handle word layers, i.e. later correction of words by insertion
+ --> has parts
+ --> versions: later version of earlier version
- WritingProcess
- correlates with font size:
- biggest font to biggest-1 font: stage 0
- font in between: stage 1
- smallest font to smallest+1 font: stage 2
- Style
- TODO: howto handle style_class in rdf? (as JSON?)
- WordPosition [ok]
- TranskriptionPosition [ok]
- FaksimilePosition [ok]
- - LineNumber [ok]
+ - LineNumber [reDo]
+ - change to Line
- Reference [TODO]+
- - Marginalien
+ - TextConnection
+ - needs change of LineNumber to Line
+ - ForeignHandTextAreaDeletion [TODO]
- Freehand:
- - Deletion
- - MarkForeignHands ("Zeichen für Fremde Hand")
+ - Deletion [DONE]
+ - make parts of word if only parts of a word are deleted, also introduce earlier version of word [TODO]
+ - MarkForeignHands ("Zeichen für Fremde Hand") [DONE]
- isa SpecialWord
- - TextConnectionMark ("Anschlußzeichen") [TODO]+
+ - TextConnectionMark ("Anschlußzeichen") [DONE]
- isa SpecialWord
+ - has a Reference
- WordInsertionMark [reDO]
- Underline [TODO]
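Side note on the TextConnection footnote and margin patterns listed above: they read as plain regular expressions. A hedged sketch, with invented sample strings, of what they are meant to match (only the line references 18,6 and 11,2 come from the TODO itself):

import re

# Footnote patterns for text connections, taken verbatim from the TODO entries above.
FORTSETZUNG = re.compile(r'Fortsetzung [0-9]+,[0-9]+')
FORTSETZUNG_VON = re.compile(r'Fortsetzung von [0-9]+,[0-9]+')
# Margin patterns; the closing parenthesis of the "-)" sign is escaped for Python's re module.
CONNECTION_FROM = re.compile(r'([0-9]+,)*[0-9]+ -\)')   # from: ([0-9]+,)*[0-9]+ -)
CONNECTION_TO = re.compile(r'-\) ([0-9]+,)*[0-9]+')     # to: -) ([0-9]+,)*[0-9]+

# Invented sample strings, only to illustrate the shape of the patterns.
assert FORTSETZUNG.match('Fortsetzung 18,6')
assert FORTSETZUNG_VON.match('Fortsetzung von 11,2')
assert CONNECTION_FROM.search('18,6 -)')
assert CONNECTION_TO.search('-) 11,2')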
Index: svgscripts/copy_faksimile_svg_file.py
===================================================================
--- svgscripts/copy_faksimile_svg_file.py (revision 0)
+++ svgscripts/copy_faksimile_svg_file.py (revision 60)
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This program can be used to copy a faksimile svg file with the option of highlighting some word boxes.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
+
+from colorama import Fore, Style
+from functools import cmp_to_key
+import getopt
+import lxml.etree as ET
+import re
+import shutil
+import signal
+import string
+import subprocess
+from svgpathtools import svg_to_paths
+import sys
+import tempfile
+from operator import attrgetter
+import os
+from os import listdir, sep, path, setpgrp, devnull
+from os.path import exists, isfile, isdir, dirname, basename
+import warnings
+import xml.etree.ElementTree as XET
+
+if dirname(__file__) not in sys.path:
+ sys.path.append(dirname(__file__))
+
+from convert_wordPositions import SVGConverter, create_pdf_with_highlighted_words
+from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
+from datatypes.lineNumber import LineNumber
+from datatypes.mark_foreign_hands import MarkForeignHands
+from datatypes.page import Page
+from datatypes.transkriptionField import TranskriptionField
+from local_config import PDF_READER, SVG_EDITOR
+from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+from process_files import update_svgposfile_status
+
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+UNITTESTING = False
+HIGHLIGHT_COLOR = 'red'
+OPACITY = '0.5'
+
+def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, local_image_path=None):
+ """Copy a faksimile_svg_file to target_file.
+ """
+ if faksimile_source_file is None and faksimile_tree is not None:
+ faksimile_source_file = faksimile_tree.docinfo.URL
+ elif faksimile_source_file is None:
+ raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
+ if target_file is not None and target_directory is not None:
+ target_file = target_directory + sep + target_file
+ elif target_file is None and target_directory is not None:
+ target_file = target_directory + sep + basename(faksimile_source_file)
+ elif target_file is None:
+ raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
+ paths, attributes, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True)
+ if bool(svg_attributes.get('xmlns')):
+ XET.register_namespace('', svg_attributes['xmlns'])
+ for key in [ key for key in svg_attributes.keys() if key.startswith('xmlns:') ]:
+ try:
+ XET.register_namespace(key.replace('xmlns:', ''), svg_attributes[key])
+ except ValueError: pass
+ namespaces = { 'ns': svg_attributes['xmlns'], 'xlink': svg_attributes['xmlns:xlink'] }
+ if faksimile_tree is not None:
+ target_tree = XET.fromstring(ET.tostring(faksimile_tree))\
+ if type(faksimile_tree) == ET._ElementTree\
+ else XET.fromstring(XET.tostring(faksimile_tree.getroot()))
+ else:
+ target_tree = XET.parse(faksimile_source_file)
+ if local_image_path is not None and isfile(local_image_path)\
+ and len(target_tree.findall('.//ns:image', namespaces=namespaces)) > 0:
+ image_node = target_tree.findall('.//ns:image', namespaces=namespaces)[0]
+ image_node.set('{%s}href' % namespaces['xlink'], local_image_path)
+ target_tree.write(target_file)
+
+def highlight_empty_nodes(faksimile_tree, x_min, x_max, y_min, y_max, not_id, namespaces={}):
+ """Highlights rect and path nodes that do not have a title element.
+ :return: a list of corresponding ids
+ """
+ if len(namespaces) == 0:
+ namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
+ empyt_node_ids = []
+ nodes_without_title = faksimile_tree.xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\
+ x_min, x_max, y_min, y_max, not_id), namespaces=namespaces)
+ nodes_without_title += get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]', x_min, x_max, y_min, y_max, not_id, namespaces=namespaces)
+ for node_without_title in nodes_without_title:
+ empyt_node_ids.append(node_without_title.get('id'))
+ highlight_node(node_without_title)
+ return empyt_node_ids
+
+def highlight_node(node):
+ """Highlights a node.
+ """
+ node.set('fill', HIGHLIGHT_COLOR)
+ node.set('opacity', OPACITY)
+ node.set('style', '')
+
+def usage():
+ """prints information on how to use the script
+ """
+ print(main.__doc__)
+
+def main(argv):
+ """This program can be used to copy a faksimile svg file with the option of highlighting some word boxes.
+
+ svgscripts/copy_faksimile_svg_file.py [OPTIONS] <faksimile_svg_file> <target_dir>
+
+ <faksimile_svg_file> a svg file containing information about the word positions on the faksimile.
+ <target_dir> the target directory.
+
+ OPTIONS:
+ -h|--help: show help
+
+ :return: exit code (int)
+ """
+ try:
+ opts, args = getopt.getopt(argv, "h", ["help" ])
+ except getopt.GetoptError:
+ usage()
+ return 2
+ for opt, arg in opts:
+ if opt in ('-h', '--help'):
+ usage()
+ return 0
+ if len(args) < 2:
+ usage()
+ return 2
+ exit_status = 0
+ if exists(args[0]) and exists(args[1]):
+ faksimile_svg_file = args[0] if isfile(args[0]) else args[1]
+ target_dir = args[1] if isdir(args[1]) else args[0]
+ else:
+ file_a = args[0] if not exists(args[0]) else args[1]
+ raise FileNotFoundError('File {} does not exist!'.format(file_a))
+ return exit_status
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
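For orientation, a minimal usage sketch of the new copy_faksimile_svg_file helper, mirroring the calls exercised in tests_svgscripts/test_copy_faksimile_svg_file.py above; the test-data paths are placeholders for an existing faksimile svg file and image, not guaranteed inputs:

import os
import sys
import tempfile

sys.path.append('svgscripts')
import copy_faksimile_svg_file

faksimile_file = 'tests_svgscripts/test_data/faksimile_svg/N-VII-1,5et6.svg'  # placeholder path
local_image = 'tests_svgscripts/test_data/image.jpg'                          # placeholder path
tmp_dir = tempfile.mkdtemp()

# Copy under a new name and point the embedded <image> node at a local image file.
copy_faksimile_svg_file.copy_faksimile_svg_file('copy.svg', faksimile_source_file=faksimile_file,
        target_directory=tmp_dir, local_image_path=local_image)

# Without target_file, the basename of the source file is reused inside target_directory.
copy_faksimile_svg_file.copy_faksimile_svg_file(faksimile_source_file=faksimile_file,
        target_directory=tmp_dir)
print(os.listdir(tmp_dir))  # expected: 'copy.svg' and 'N-VII-1,5et6.svg'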
Index: svgscripts/join_faksimileAndTranskription.py
===================================================================
--- svgscripts/join_faksimileAndTranskription.py (revision 59)
+++ svgscripts/join_faksimileAndTranskription.py (revision 60)
@@ -1,503 +1,493 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of a faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
-import cairosvg
from colorama import Fore, Style
from functools import cmp_to_key
import getopt
import lxml.etree as ET
import re
import shutil
import signal
import string
import subprocess
from svgpathtools import svg_to_paths
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
import warnings
import xml.etree.ElementTree as XET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
-from convert_wordPositions import SVGConverter
+from convert_wordPositions import SVGConverter, create_pdf_with_highlighted_words
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.lineNumber import LineNumber
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
from local_config import PDF_READER, SVG_EDITOR
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from process_files import update_svgposfile_status
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
PUNCTUATION_PATTERN = r"[{}]".format(string.punctuation)
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
SINGLE_PUNCTUATION_PATTERN = r"^[{}]$".format(string.punctuation)
STATUS_MERGED_OK = 'faksimile merged'
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
def create_faksimile_svg_file(faksimile_page, faksimile_tree, unmerged_faksimile_positions, faksimile_has_empty_nodes, file_name, namespaces={}):
"""Creates a faksimile_svg_file highlighting the errors of the original file.
:return: a list of ids of nodes (i.e.: rect, path) that have no title in the original file.
"""
THRESHOLD_X = 10
empyt_node_ids = []
if len(namespaces) == 0:
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
for faksimile_position in unmerged_faksimile_positions:
rect_nodes = faksimile_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(faksimile_position.id), namespaces=namespaces)
if len(rect_nodes) > 0:
highlight_node(rect_nodes[0])
if faksimile_has_empty_nodes:
x_min = faksimile_page.text_field.xmin + faksimile_page.faksimile_image.x
x_max = faksimile_page.text_field.xmax + faksimile_page.faksimile_image.x - THRESHOLD_X
y_min = faksimile_page.text_field.ymin + faksimile_page.faksimile_image.y
y_max = faksimile_page.text_field.ymax + faksimile_page.faksimile_image.y
empyt_node_ids = highlight_empty_nodes(faksimile_tree, x_min, x_max, y_min, y_max,\
faksimile_page.text_field.id, namespaces=namespaces)
if faksimile_page.faksimile_image.local_path is not None and isfile(faksimile_page.faksimile_image.local_path):
image_node = faksimile_tree.getroot().find('.//image', faksimile_tree.getroot().nsmap)
image_node.set('{%s}href' % namespaces['xlink'], faksimile_page.faksimile_image.local_path)
write_pretty(xml_element_tree=faksimile_tree, file_name=file_name)
return empyt_node_ids
-def create_pdf_file(xml_source_file, unmerged_words, pdf_file_name):
- """Creates a pdf file highlighting the place of the error in the transkription.
- """
- page = Page(xml_source_file=xml_source_file)
- converter = SVGConverter(page, bg_color=HIGHLIGHT_COLOR)
- tmp_svg_file = pdf_file_name.replace('.pdf', '.svg')
- converter.convert(output_file=tmp_svg_file, highlighted_words=unmerged_words)
- cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
-
def fix_errors(faksimile_file, unmerged_faksimile_positions, unmerged_words, text_field_id=None, faksimile_page=None, xml_source_file=None, manuscript_file=None, namespaces={}):
"""Creates a faksimile svg file and a pdf file highlighting the positions of the word positions
that could not been merged. After correction, results are inserted into origianl file and processed again.
"""
parser = ET.XMLParser(remove_blank_text=True)
faksimile_tree = ET.parse(faksimile_file, parser)
if len(namespaces) == 0:
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
if faksimile_page is None:
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
if text_field_id is not None\
and text_field_id in [ faksimile_page.text_field.id for faksimile_page in faksimile_pages ]:
faksimile_page = [ faksimile_page for faksimile_page in faksimile_pages if faksimile_page.text_field.id == text_field_id ][0]
else:
faksimile_page = faksimile_pages[0]
if xml_source_file is None or manuscript_file is None:
xml_source_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
tmp_dir = tempfile.mkdtemp()
tmp_pdf_file = tmp_dir + sep + 'output.pdf'
tmp_faksimile = tmp_dir + sep + 'faksimile.svg'
faksimile_has_empty_nodes = len(unmerged_faksimile_positions) < len(unmerged_words)
empyt_node_ids = create_faksimile_svg_file(faksimile_page, faksimile_tree, unmerged_faksimile_positions,\
faksimile_has_empty_nodes, tmp_faksimile, namespaces=namespaces)
- create_pdf_file(xml_source_file, unmerged_words, tmp_pdf_file)
+ create_pdf_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, pdf_file_name=tmp_pdf_file, bg_color=HIGHLIGHT_COLOR)
if isfile(tmp_pdf_file) and isfile(tmp_faksimile):
show_files(tmp_pdf_file, tmp_faksimile)
record_changes(faksimile_file, tmp_faksimile, unmerged_faksimile_positions, empyt_node_ids, parser=parser, namespaces=namespaces)
shutil.rmtree(tmp_dir)
join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False)
def highlight_empty_nodes(faksimile_tree, x_min, x_max, y_min, y_max, not_id, namespaces={}):
"""Highlights rect and path nodes that do not have a title element.
:return: a list of corresponding ids
"""
if len(namespaces) == 0:
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
empyt_node_ids = []
nodes_without_title = faksimile_tree.xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\
x_min, x_max, y_min, y_max, not_id), namespaces=namespaces)
nodes_without_title += get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]', x_min, x_max, y_min, y_max, not_id, namespaces=namespaces)
for node_without_title in nodes_without_title:
empyt_node_ids.append(node_without_title.get('id'))
highlight_node(node_without_title)
return empyt_node_ids
def highlight_node(node):
"""Highlights a node.
"""
node.set('fill', HIGHLIGHT_COLOR)
node.set('opacity', OPACITY)
node.set('style', '')
def get_filelist_and_manuscript_file(file_a, file_b=None):
"""Returns a file list and a manuscript file (or None)
"""
file_list = []
manuscript_file = None
if isfile(file_a) and file_a.endswith('svg'):
file_list.append(file_a)
if file_b is not None and isfile(file_b):
manuscript_file = file_b
elif isfile(file_a) and file_a.endswith('xml'):
manuscript_file = file_a
if file_b is not None and isfile(file_b):
file_list.append(file_b)
elif isdir(file_b):
file_list = [ svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ]
elif isdir(file_a):
file_list = [ file_a + sep + svgfile for svgfile in listdir(file_a) if svgfile.endswith('svg') ]
if file_b is not None and isfile(file_b):
manuscript_file = file_b
return file_list, manuscript_file
def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False):
"""Return svg_pos_file and manuscript_file if they are ready for processing.
"""
svg_pos_file = None
manuscript_tree = None
if manuscript_file is not None\
and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')):
manuscript_tree = ET.parse(manuscript_file)
else:
title_string = faksimile_page.title.replace(' ', '_')
manuscript_file = '.{}xml'.format(sep) + sep + title_string + '.xml'\
if isdir('.{}xml'.format(sep)) else title_string + '.xml'
if isfile(manuscript_file):
manuscript_tree = ET.parse(manuscript_file)
if manuscript_tree is not None:
if redo_ok and len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0:
svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
if len(manuscript_tree.getroot().xpath('//page[@number="%s" and @status="OK"]/@output' % faksimile_page.page_number)) > 0:
svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
else:
if not UNITTESTING:
msg_color = Fore.CYAN if len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0\
else Fore.MAGENTA
msg = 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, faksimile_page.page_number)\
if msg_color == Fore.MAGENTA\
else 'Faksimile already joined!'
print(msg_color + msg, end='')
print(Style.RESET_ALL)
return svg_pos_file, manuscript_file
def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, test_word_text='', do_fix_errors=False, redo_ok=False):
"""Joins the data of a faksimile file with the data of svgposfile.
"""
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
print(Style.RESET_ALL)
faksimile_tree = ET.parse(faksimile_file)
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
exit_status = 0
for faksimile_page in faksimile_pages:
svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)
if svg_pos_file is not None:
if not UNITTESTING:
print(Fore.CYAN + 'joining data with file {} ... '.format(svg_pos_file), end='')
image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field)
page = Page(xml_source_file=svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file)
words = sort_words(page)
faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions)
new_words = []
unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\
key=lambda text: len(text))
for word_text in unique_faksimile_words:
process_word_text(new_words, word_text, faksimile_positions, words)
if False not in [ word.joined for word in words ]\
and False not in [ position.joined for position in faksimile_positions]\
and not UNITTESTING:
post_merging_processing_and_saving(svg_pos_file, new_words, page=page, manuscript_file=manuscript_file)
print(Fore.GREEN + '[OK]')
print(Style.RESET_ALL)
elif not UNITTESTING:
not_joined_fp = [ (position.id, position.text) for position in faksimile_positions if not position.joined ]
plural_fp = '' if len(not_joined_fp) < 2 else 's'
not_joined_tw = [ (word.id, word.text) for word in words if not word.joined ]
plural_tw = '' if len(not_joined_tw) < 2 else 's'
print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp))
print(Fore.MAGENTA + '--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
print(Style.RESET_ALL)
if do_fix_errors:
fix_errors(faksimile_file, [position for position in faksimile_positions if not position.joined],\
[ word for word in words if not word.joined ], text_field_id=faksimile_page.text_field.id,\
faksimile_page=faksimile_page, xml_source_file=svg_pos_file,\
manuscript_file=manuscript_file, namespaces=namespaces)
exit_status = 2
elif False in [ word.joined for word in words ]:
print([ (word.id, word.text) for word in words if not word.joined ])
exit_status = 2
elif test_word_text != '':
print([ (word.id, word.text) for word in new_words if word.text == test_word_text ])
return exit_status
def post_merging_processing_and_saving(svg_pos_file, new_words, page=None, manuscript_file=None, target_svg_pos_file=None):
"""Process words after merging with faksimile word positions.
"""
if page is None:
page = Page(xml_source_file=svg_pos_file)
page.words = sorted(new_words, key=attrgetter('id'))
for word_node in page.page_tree.xpath('//word'):
word_node.getparent().remove(word_node)
if page.source is None or not isfile(page.source):
raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file))
transkription_field = TranskriptionField(page.source)
page.find_special_words(transkription_field=transkription_field)
page.categorize_paths(transkription_field=transkription_field)
page.update_and_attach_words2tree(update_function_on_word=update_writing_process,\
include_special_words_of_type=[])
if target_svg_pos_file is None:
target_svg_pos_file = svg_pos_file
update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=STATUS_MERGED_OK)
write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=''):
"""Joins faksimile_positions with text == word_text with words with text == word_text.
"""
text = word_text if alt_word_text == '' else alt_word_text
fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ]
words4word = [ word for word in words if word.text == word_text and not word.joined ]
if alt_word_text != '':
words4word += [ word for word in words if word.text == text and not word.joined ]
words4word = sorted(words4word, key=attrgetter('id'))
if len(fposition4word) == len(words4word):
for index, faksimile_position in enumerate(fposition4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
words[words4word[index].id].joined = True
new_words.append(words4word[index])
elif len(words4word) < len(fposition4word):
if re.match(r'(.*)ss(.*)', text):
alt_word_text = re.sub(r'ss', 'ß', text)
process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text)
elif re.match(SINGLE_PUNCTUATION_PATTERN, text):
if text == '-':
alt_word_text = text.replace('-', '–')
process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text)
else:
print('single', word_text, len(fposition4word), len(words4word))
elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text):
alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text)
if alt_word_text != '':
pattern = r'(.*){0}(.*)'.format(alt_word_text)
words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ]
if len(words4word) < len(fposition4word):
process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text)
else:
words4word = sorted(words4word, key=attrgetter('id'))
for index, faksimile_position in enumerate(fposition4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\
and words4word[index].id+1 < len(words)\
and words[words4word[index].id+1].text == word_text[len(word_text)-1]:
words4word[index].join(words[words4word[index].id+1])
words[words4word[index].id+1].joined = True
words[words4word[index].id].joined = True
words4word[index].text = word_text
new_words.append(words4word[index])
else:
if len(text) > 1:
new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ]
if len(new_words4word) == 0:
alt_word_text = text[1:]
process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text)
else:
for new_word in new_words4word:
collected_text = new_word.text
current_word = new_word
while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0:
previous_word = words[current_word.id-1]
if word_text.endswith(previous_word.text + collected_text):
words[current_word.id].joined = True
previous_word.join(current_word)
current_word = previous_word
collected_text = current_word.text
else:
collected_text = previous_word.text + collected_text
words4word.append(current_word)
words4word = sorted(words4word, key=attrgetter('id'))
for index, faksimile_position in enumerate(fposition4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
words4word[index].text = word_text
words[words4word[index].id].joined = True
new_words.append(words4word[index])
else:
print('<{0}> {1}/{2}, ids: {3}'.\
format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ]))
else:
print(word_text, len(fposition4word), len(words4word))
def record_changes(original_faksimile_file, tmp_faksimile_file, unmerged_faksimile_positions, empyt_node_ids, parser=None, namespaces={}):
"""Copy changes made to temporary faksimile svg file to original faksimile svg file.
"""
paths, attributes, svg_attributes = svg_to_paths.svg2paths(original_faksimile_file, return_svg_attributes=True)
if bool(svg_attributes.get('xmlns')):
XET.register_namespace('', svg_attributes['xmlns'])
for key in [ key for key in svg_attributes.keys() if key.startswith('xmlns:') ]:
try:
XET.register_namespace(key.replace('xmlns:', ''), svg_attributes[key])
except ValueError: pass
old_tree = XET.parse(original_faksimile_file)
old_namespaces = { 'ns': svg_attributes['xmlns'] }
if parser is None:
parser = ET.XMLParser(remove_blank_text=True)
new_tree = ET.parse(tmp_faksimile_file, parser)
if len(namespaces) == 0:
namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() }
node_ids = [ faksimile_position.id for faksimile_position in unmerged_faksimile_positions ]
for node_id in node_ids:
new_titles = new_tree.xpath('//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=namespaces)
old_titles = old_tree.findall('.//ns:rect[@id="{0}"]/ns:title'.format(node_id), namespaces=old_namespaces)\
if len(old_tree.findall('.//ns:rect[@id="{0}"]/ns:title'.format(node_id), namespaces=old_namespaces)) > 0\
else old_tree.findall('.//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=old_namespaces)
if len(new_titles) > 0 and len(old_titles) > 0:
old_titles[0].text = new_titles[0].text
old_title_id = 0
for node_id in empyt_node_ids:
new_titles = new_tree.xpath('//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=namespaces)
old_no_titles = old_tree.findall('.//ns:rect[@id="{0}"]'.format(node_id), namespaces=old_namespaces)\
if len(old_tree.findall('.//ns:rect[@id="{0}"]'.format(node_id), namespaces=old_namespaces)) > 0\
else old_tree.findall('.//ns:path[@id="{0}"]'.format(node_id), namespaces=old_namespaces)
if len(new_titles) > 0 and len(old_no_titles) > 0:
old_title_id_string = 'mytitle' + str(old_title_id)
old_title_id += 1
old_title = XET.SubElement(old_no_titles[0], 'title', attrib={ 'id': old_title_id_string })
old_title.text = new_titles[0].text
old_tree.write(original_faksimile_file)
def show_files(pdf_file, faksimile_svg_file):
"""Show the files with external programs.
"""
DEVNULL = open(devnull, 'wb')
p = subprocess.Popen([PDF_READER, pdf_file], stdout=DEVNULL, stderr=DEVNULL, preexec_fn=os.setsid)
subprocess.run([SVG_EDITOR, faksimile_svg_file])
os.killpg(os.getpgid(p.pid), signal.SIGTERM)
DEVNULL.close()
def sort_words(page):
"""Returns sorted words (from top left to bottom right).
"""
if -1 in [ word.line_number for word in page.words ]:
warnings.warn('{} in page file {} for word with ids {}'.format(LineNumber.WARN_NO_LINE_NUMBER, page.page_tree.docinfo.URL, page.page_tree.xpath('//word[not(@line-number)]/@id')))
words = []
for line_number in page.line_numbers:
word_on_line = [ word for word in page.words if word.line_number == line_number.id ]
if line_number.id % 2 == 0:
words += sorted(word_on_line, key=lambda word: word.transkription_positions[0].left)
else:
words += sorted(word_on_line, key=cmp_to_key(\
lambda wordA, wordB: -1\
if wordA.transkription_positions[0].left < wordB.transkription_positions[0].left\
and abs(wordA.transkription_positions[0].bottom-wordB.transkription_positions[0].bottom) < wordA.transkription_positions[0].height/2\
else 1))
for index, word in enumerate(words):
words[index].id = index
words[index].joined = False
return words
def sort_faksimile_positions(faksimile_positions):
"""Returns sorted words (from top left to bottom right).
"""
for faksimile_position in faksimile_positions:
faksimile_position.joined = False
return sorted(faksimile_positions, key=cmp_to_key(\
lambda positionA, positionB: -1\
if positionA.left+positionA.width/2 <= positionB.left+positionB.width/2\
and positionA.top+positionA.height/2 <= positionB.top+positionB.height/2\
else 1\
)\
)
def update_writing_process(word):
"""Updates the writing process of the faksimile word position by
synchronizing it with the corresponding transkription word position.
If there are several transkription positions belonging to different writing
processes but just one faksimile position, then we skip the update.
We will fix these faksimile positions by manually adding more word positions
and processing those additions in a later stage.
"""
writing_processes = [ writing_process_id for writing_process_id in set( tp.writing_process_id for tp in word.transkription_positions ) ]
if len(writing_processes) == 1 and len(word.faksimile_positions) > 0:
word.faksimile_positions[0].writing_process_id = writing_processes[0]
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
svgscripts/join_faksimileAndTranskription.py [OPTIONS] <FAKSIMILE_DIR|faksimile_svg_file> [xmlManuscriptFile]
<FAKSIMILE_DIR> a directory containing <faksimile_svg_file>
<faksimile_svg_file> a svg file containing information about the word positions on the faksimile.
<xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
OPTIONS:
-h|--help: show help
-f|--fix-errors: open faksimile svg file if there are errors
-i|--ignore-status-ok ignore status "OK:faksimile merged" in manuscript file and redo merging.
:return: exit code (int)
"""
fix_errors = False
redo_ok = False
try:
opts, args = getopt.getopt(argv, "hfi", ["help", "fix-errors", "ignore-status-ok" ])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
elif opt in ('-f', '--fix-errors'):
fix_errors = True
elif opt in ('-i', '--ignore-status-ok'):
redo_ok = True
if len(args) < 1:
usage()
return 2
exit_status = 0
file_a = args[0]
if exists(file_a):
file_b = None
if len(args) > 1 and exists(args[1]):
file_b = args[1]
file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
for faksimile_file in file_list:
join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=fix_errors, redo_ok=redo_ok)
else:
raise FileNotFoundError('File {} does not exist!'.format(file_a))
return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
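For orientation, a hedged driver sketch equivalent to what main() above does when given a faksimile directory and a manuscript xml file; the paths are the repository's test data and stand in for real input:

import sys

sys.path.append('svgscripts')
import join_faksimileAndTranskription

faksimile_dir = 'tests_svgscripts/test_data/faksimile_svg'  # placeholder path
manuscript = 'tests_svgscripts/test_data/N_VII_1.xml'       # placeholder path

# Resolve the svg file list and the manuscript file, as main() does for its two arguments.
file_list, manuscript_file = join_faksimileAndTranskription.get_filelist_and_manuscript_file(
        faksimile_dir, manuscript)
for faksimile_file in file_list:
    # do_fix_errors=True corresponds to -f|--fix-errors and opens a highlighted pdf and the
    # faksimile svg file whenever some positions could not be merged; redo_ok mirrors -i.
    exit_code = join_faksimileAndTranskription.join_faksimileAndTranskription(
            faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False, redo_ok=False)
    print(faksimile_file, exit_code)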
Index: svgscripts/convert_wordPositions.py
===================================================================
--- svgscripts/convert_wordPositions.py (revision 59)
+++ svgscripts/convert_wordPositions.py (revision 60)
@@ -1,324 +1,345 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
+import cairosvg
import getopt
from lxml.html import builder as E
from lxml.html import open_in_browser
import lxml
-from os import sep, listdir, mkdir, path
+from os import sep, listdir, mkdir, path, remove
from os.path import exists, isfile, isdir
import re
import sys
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
from datatypes.writing_process import WritingProcess
from datatypes.word import Word
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Converter:
"""The converter super class.
"""
def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
self.page = page
self.non_testing = non_testing
self.show_word_insertion_mark = show_word_insertion_mark
def _get_transkription_positions(self, transkription_positions, stage_version=''):
"""Returns the transkription_positions of the indicated stage_version.
"""
convertable_transkription_positions = transkription_positions
if stage_version != '':
convertable_transkription_positions = []
if re.match(r'^\d$', stage_version):
writing_process_id = int(stage_version)
for transkription_position in transkription_positions:
if transkription_position.writing_process_id == writing_process_id:
convertable_transkription_positions.append(transkription_position)
elif re.match(r'^\d\+$', stage_version):
version_range = [ *range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)) ]
for transkription_position in transkription_positions:
if transkription_position.writing_process_id in version_range:
convertable_transkription_positions.append(transkription_position)
elif re.match(r'^\d\-\d$', stage_version):
start_stop = [ int(i) for i in re.split(r'-', stage_version) ]
version_range = [ *range(start_stop[0], start_stop[1]+1) ]
for transkription_position in transkription_positions:
if transkription_position.writing_process_id in version_range:
convertable_transkription_positions.append(transkription_position)
return convertable_transkription_positions
def _get_words(self, words, highlighted_words=[]):
"""Return the words that will be hightlighted.
"""
if len(highlighted_words) == 0:
return words
else:
return highlighted_words
def convert(self, output_file=None, stage_version='', highlighted_words=[]):
"""Prints all words.
"""
first_word_of_line = None
out = sys.stdout
if output_file is not None:
out = open(output_file, 'w')
for word in self.page.words:
if first_word_of_line is None or first_word_of_line.line_number != word.line_number:
out.write('\n')
first_word_of_line = word
if word.line_number % 2 == 0:
out.write(str(word.line_number).zfill(2) + ' ')
else:
out.write(' ')
if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0:
if word.text is not None:
out.write(word.text + ' ')
out.close()
@classmethod
def CREATE_CONVERTER(cls, page, non_testing=True,converter_type='', show_word_insertion_mark=False):
"""Returns a converter of type converter_type.
[:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None
"""
cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() }
cls_key = converter_type + 'Converter'
if bool(cls_dict.get(cls_key)):
return cls_dict.get(cls_key)(page, non_testing, show_word_insertion_mark)
else:
return Converter(page, non_testing, show_word_insertion_mark)
class SVGConverter(Converter):
"""This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text.
"""
BG_COLOR = 'yellow'
OPACITY = '0.2'
def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY):
Converter.__init__(self, page, non_testing, show_word_insertion_mark)
self.bg_color = bg_color
self.opacity = opacity
def convert(self, output_file=None, stage_version='', highlighted_words=[]):
"""Converts Page to SVG
"""
title = self.page.title if(self.page.title is not None) else 'Test Page'
title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
transkription_field = TranskriptionField(self.page.svg_file)
if bool(transkription_field.get_svg_attributes('xmlns')):
ET.register_namespace('', transkription_field.get_svg_attributes('xmlns'))
if bool(transkription_field.get_svg_attributes('xmlns:xlink')):
ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink'))
svg_tree = ET.parse(self.page.svg_file)
transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'})
colors = [ 'yellow', 'orange' ] if self.bg_color == self.BG_COLOR else [ self.bg_color ]
color_index = 0
for word in self._get_words(self.page.words, highlighted_words=highlighted_words):
for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
rect_node = ET.SubElement(transkription_node, 'rect',\
attrib={'id': str(transkription_position.id), 'x': str(transkription_position.left + transkription_field.xmin),\
'y': str(transkription_position.top + transkription_field.ymin), 'width': str(transkription_position.width),\
'height': str(transkription_position.height), 'fill': colors[color_index], 'opacity': self.opacity})
if transkription_position.transform is not None:
matrix = transkription_position.transform.clone_transformation_matrix()
matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3)
matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3)
rect_node.set('transform', matrix.toString())
rect_node.set('x', str(round(transkription_position.left - transkription_position.transform.matrix[Matrix.XINDEX], 3)))
rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3)))
ET.SubElement(rect_node, 'title').text = word.text
color_index = (color_index + 1) % len(colors)
if output_file is not None:
svg_tree.write(output_file)
class HTMLConverter(Converter):
"""This class can be used to convert a 'svgWordPositions' xml file to a test HTML file.
"""
CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; }
.highlight1 { background-color: pink; opacity: 0.2; }
.foreign { background-color: blue; opacity: 0.4; }
.word-insertion-mark { background-color: orange; opacity: 0.2; }
.deleted { background-color: grey; opacity: 0.2; }
"""
def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
Converter.__init__(self, page, non_testing, show_word_insertion_mark)
def convert(self, output_file=None, stage_version='', highlighted_words=[]):
"""Converts Page to HTML
"""
title = self.page.title if(self.page.title is not None) else 'Test Page'
title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
if stage_version != '':
title = title + ', Schreibstufe: ' + stage_version
width = self.page.width
height = self.page.height
style_content = ' position: relative; width: {}px; height: {}px; background-image: url({}); background-size: {}px {}px '\
.format(width, height, path.abspath(self.page.svg_file), width, height)
style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS)
head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style)
transkription = E.DIV(id="transkription")
counter = 0
for word in self.page.words:
highlight_class = 'highlight' + str(counter)\
if not word.deleted else 'deleted'
word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text)
for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
self._append2transkription(transkription, highlight_class, word_title, transkription_position)
counter = (counter + 1) % 2
word_insertion_mark_class = 'word-insertion-mark'
counter = 0
for mark_foreign_hands in self.page.mark_foreign_hands:
highlight_class = 'foreign'
title = 'id: {}/line: {}\n{} <i>{}</i>'.format(str(mark_foreign_hands.id), str(word.line_number),\
mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen)
for transkription_position in mark_foreign_hands.transkription_positions:
self._append2transkription(transkription, highlight_class, title, transkription_position)
if self.show_word_insertion_mark:
for word_insertion_mark in self.page.word_insertion_marks:
wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content)
transkription.append(link)
html = E.HTML(head,E.BODY(transkription))
bool(self.non_testing) and open_in_browser(html)
if output_file is not None:
with open(output_file, 'wb') as f:
f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))
f.closed
def _append2transkription(self, transkription, highlight_class, title, transkription_position):
"""Append content to transkription-div.
"""
style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
transkription_position.top, transkription_position.left, transkription_position.width, transkription_position.height)
if transkription_position.transform is not None:
style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString())
transform_origin_x = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1\
if (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1 < 0 else 0
style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height)
link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content)
transkription.append(link)
+def create_pdf_with_highlighted_words(xml_source_file, highlighted_words=[], pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR):
+ """Creates a pdf file highlighting some words.
+ """
+ page = Page(xml_source_file=xml_source_file)
+ converter = SVGConverter(page, bg_color=bg_color)
+ if not pdf_file_name.endswith('pdf'):
+ pdf_file_name = pdf_file_name + '.pdf'
+ tmp_svg_file = pdf_file_name.replace('.pdf', '.svg')
+ converter.convert(output_file=tmp_svg_file, highlighted_words=highlighted_words)
+ cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
+ isfile(tmp_svg_file) and remove(tmp_svg_file)
+
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes.
svgscripts/convert_wordPositions.py OPTIONS <file>
OPTIONS:
-h|--help: show help
-H|--HTML [default] convert to HTML test file
-o|--output=outputFile save output to file outputFile
+ -P|--PDF convert to PDF test file
-S|--SVG convert to SVG test file
-s|--svg=svgFile: svg web file
-T|--TEXT convert to TEXT output
-t|--testing execute in test mode, do not write to file or open browser
-w|--word-insertion-mark show word insertion mark on HTML
-v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. }
:return: exit code (int)
"""
convert_to_type = None
svg_file = None
output_file = None
non_testing = True
show_word_insertion_mark = False
page = None
stage_version = ''
try:
- opts, args = getopt.getopt(argv, "htHSTws:o:v:", ["help", "testing", "HTML", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version="])
+ opts, args = getopt.getopt(argv, "htHPSTws:o:v:", ["help", "testing", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version="])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help') or not args:
usage()
return 0
elif opt in ('-v', '--version'):
if re.match(r'^(\d|\d\+|\d\-\d)$', arg):
stage_version = arg
else:
raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg))
elif opt in ('-w', '--word-insertion-mark'):
show_word_insertion_mark = True
+ elif opt in ('-P', '--PDF'):
+ convert_to_type = 'PDF'
elif opt in ('-S', '--SVG'):
convert_to_type = 'SVG'
elif opt in ('-T', '--TEXT'):
convert_to_type = 'TEXT'
elif opt in ('-H', '--HTML'):
convert_to_type = 'HTML'
elif opt in ('-t', '--testing'):
non_testing = False
elif opt in ('-s', '--svg'):
svg_file = arg
elif opt in ('-o', '--output'):
output_file = arg
if len(args) < 1:
usage()
return 2
if convert_to_type is None:
if output_file is not None and len(re.split(r'\.', output_file)) > 1:
output_file_part_list = re.split(r'\.', output_file)
convert_to_type = output_file_part_list[len(output_file_part_list)-1].upper()
else:
convert_to_type = 'HTML'
for word_position_file in args:
if not isfile(word_position_file):
print("'{}' does not exist!".format(word_position_file))
return 2
- if svg_file is not None:
- if isfile(svg_file):
- page = Page(xml_source_file=word_position_file, svg_file=svg_file)
- else:
- print("'{}' does not exist!".format(word_position_file))
- return 2
+ if convert_to_type == 'PDF':
+ if output_file is None:
+ output_file = 'output.pdf'
+ create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file)
else:
- page = Page(xml_source_file=word_position_file)
- if page.svg_file is None:
- print('Please specify a svg file!')
- usage()
- return 2
- converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark)
- converter.convert(output_file=output_file, stage_version=stage_version)
+ if svg_file is not None:
+ if isfile(svg_file):
+ page = Page(xml_source_file=word_position_file, svg_file=svg_file)
+ else:
+ print("'{}' does not exist!".format(word_position_file))
+ return 2
+ else:
+ page = Page(xml_source_file=word_position_file)
+ if page.svg_file is None:
+ print('Please specify a svg file!')
+ usage()
+ return 2
+ converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark)
+ converter.convert(output_file=output_file, stage_version=stage_version)
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
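Finally, a hedged sketch of the pdf export added above (create_pdf_with_highlighted_words and the new -P|--PDF option); the xml path is the repository's test data and stands in for any FILE_TYPE_SVG_WORD_POSITION file whose svg source is available:

import sys

sys.path.append('svgscripts')
from convert_wordPositions import create_pdf_with_highlighted_words

xml_source_file = 'tests_svgscripts/test_data/N_VII_1_page005.xml'  # placeholder path

# Renders the page to a temporary svg via SVGConverter, converts that svg to pdf
# with cairosvg and removes the temporary svg file again.
create_pdf_with_highlighted_words(xml_source_file, pdf_file_name='page005.pdf')

# Equivalent command line call using the new -P|--PDF option:
#   python3 svgscripts/convert_wordPositions.py -P -o page005.pdf tests_svgscripts/test_data/N_VII_1_page005.xml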
