Index: tests_svgscripts/test_py2ttl.py
===================================================================
--- tests_svgscripts/test_py2ttl.py (revision 49)
+++ tests_svgscripts/test_py2ttl.py (revision 50)
@@ -1,89 +0,0 @@
-import unittest
-import lxml.etree as ET
-from os import sep, path, remove
-from os.path import isfile, dirname
-from rdflib import Graph, URIRef
-import sys
-
-sys.path.append('py2ttl')
-import py2ttl
-try:
- from py2ttl import Py2TTLConverter
-except ImportError:
- from py2ttl.py2ttl import Py2TTLConverter
-from config import PROJECT_NAME, PROJECT_ONTOLOGY_FILE
-
-if dirname(dirname(__file__)) not in sys.path:
- sys.path.append(dirname(dirname(__file__)))
-
-from svgscripts.datatypes.word import Word
-from svgscripts.datatypes.word_position import WordPosition
-
-class TestPy2TTL(unittest.TestCase):
- """This is the unittest for py2ttl.py2ttl.
- @label unittest
- """
- def setUp(self):
- self.ttl_target = __file__ + 'test.ttl'
-
- def test_main(self):
- Py2TTLConverter.UNITTESTING = True
- argv = ['-t', self.ttl_target ]
- try:
- self.assertEqual(py2ttl.main(argv), 0)
- except AttributeError:
- self.assertEqual(py2ttl.py2ttl.main(argv), 0)
-
- def test_init(self):
- converter = Py2TTLConverter(project_ontology_file=PROJECT_ONTOLOGY_FILE)
- self.assertEqual(converter.project_name, PROJECT_NAME)
-
- def test_get_semantic_classes(self):
- converter = Py2TTLConverter()
- classes = converter.get_semantic_classes('svgscripts/datatypes')
- self.assertEqual('FaksimileImage' in [ cls.__name__ for cls in classes ], True)
- self.assertEqual('Image' in [ cls.__name__ for cls in classes ], True)
- self.assertEqual('SemanticClass' in [ cls.__name__ for cls in classes ], False)
-
-
- def test_createProperty(self):
- converter = Py2TTLConverter(project_ontology_file=PROJECT_ONTOLOGY_FILE)
- converter.createProperty(converter.base_uriref + "#Test", 'test', str, 1)
- name_uri = converter.base_uriref + '#hasTest'
- self.assertEqual((name_uri, None, None) in converter.project_graph, True)
-
- def test_createPropertyName(self):
- converter = Py2TTLConverter()
- name = converter.createPropertyName(property_name='test_asdf_asdf')
- self.assertEqual(name, 'hasTestAsdfAsdf')
- name = converter.createPropertyName(object_uri=converter.base_uriref + '#Asdf')
- self.assertEqual(name, 'hasAsdf')
- name = converter.createPropertyName(subject_uri=converter.base_uriref + '#Test',object_uri=converter.base_uriref + '#Asdf')
- self.assertEqual(name, 'testBelongsToAsdf')
- name = converter.createPropertyName(subject_uri=converter.base_uriref + '#Test')
- self.assertEqual(name, 'testBelongsTo')
-
- def test_get_comment_label(self):
- converter = Py2TTLConverter()
- comment, label = converter.get_comment_label(TestPy2TTL)
- self.assertEqual(label, 'unittest')
- self.assertEqual(comment, self.__doc__.split('\n')[0].lstrip())
-
- def test_get_builtin_cls_keys(self):
- dictionary = WordPosition.get_semantic_dictionary()
- converter = Py2TTLConverter()
- builtin_cls_keys = converter._get_builtin_cls_keys(dictionary['properties'])
- self.assertEqual('width' in builtin_cls_keys, True)
- self.assertEqual('height' in builtin_cls_keys, True)
-
- def test_get_semantic_dictionary_keys_super_first(self):
- dict = Word.get_semantic_dictionary()
- converter = Py2TTLConverter()
- keys = converter._get_semantic_dictionary_keys_super_first(dict['properties'])
- self.assertEqual(keys.index('faksimile_positions') < keys.index('transkription_positions'), True)
-
- def tearDown(self):
- isfile(self.ttl_target) and remove(self.ttl_target)
-
-if __name__ == "__main__":
- unittest.main()
Index: tests_svgscripts/test_faksimile_image.py
===================================================================
--- tests_svgscripts/test_faksimile_image.py (revision 49)
+++ tests_svgscripts/test_faksimile_image.py (revision 50)
@@ -1,59 +1,68 @@
import unittest
from os import sep, path
from os.path import isdir, dirname, basename
import lxml.etree as ET
import sys
import sys
sys.path.append('svgscripts')
from datatypes.faksimile_image import FaksimileImage
from datatypes.image import Image
+from datatypes.text_field import TextField
class TestFaksimileImage(unittest.TestCase):
def setUp(self):
DATADIR = dirname(__file__) + sep + 'test_data'
if not isdir(DATADIR):
DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
self.svg_file = DATADIR + sep + 'W-II-1,49et50.svg'
def test_init(self):
image = FaksimileImage(file_name='test.jpg', height=10, width=10)
self.assertEqual(image.tag, FaksimileImage.XML_TAG)
self.assertEqual(image.width, 10)
def test_attach_object_to_tree(self):
image = FaksimileImage(file_name='test.jpg', height=10, width=10, x=-100, y=-200)
empty_tree = ET.ElementTree(ET.Element('faksimile'))
image.attach_object_to_tree(empty_tree)
self.assertEqual(image.tag, FaksimileImage.XML_TAG)
for node in empty_tree.getroot().xpath('//' + image.tag):
self.assertEqual(node.get('file-name'), 'test.jpg')
self.assertEqual(node.get('height'), '10')
self.assertEqual(node.get('width'), '10')
def test_CREATE_IMAGE(self):
svg_tree = ET.parse(self.svg_file)
image_node = svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap)
- image = FaksimileImage.CREATE_IMAGE(image_node)
+ image = FaksimileImage.CREATE_IMAGE(image_node, self.svg_file)
self.assertEqual(image.file_name, 'W-II-1,49et50.jpg')
image_node = ET.Element('image')
- file_name = 'test.jpg'
+ file_name = 'W-II-1,49et50.jpg'
width = 10
height = 10
image_node.set('href', file_name)
image_node.set('height', str(height))
image_node.set('width', str(width))
image = FaksimileImage.CREATE_IMAGE(image_node)
self.assertEqual(image.height, height)
self.assertEqual(image.width, width)
self.assertEqual(image.file_name, file_name)
def test_get_semantic_dict(self):
image = FaksimileImage(file_name='test.jpg', URL=FaksimileImage.NIETZSCHE_SOURCES_URL + "N-II-1,2et3", height=10, width=10)
self.assertEqual(FaksimileImage.get_semantic_dictionary()['class'].get('this'), FaksimileImage)
- #print(Image.get_semantic_dictionary())
+ #print(FaksimileImage.get_semantic_dictionary())
+
+ def test_get_image_joined_with_text_field(self):  # renamed from "text_...": unittest only collects methods whose names start with "test"
+ tf = TextField()
+ orig_image = FaksimileImage(file_name='test.jpg', URL=FaksimileImage.NIETZSCHE_SOURCES_URL + "N-II-1,2et3", height=10, width=10)
+ copy_image = orig_image.get_image_joined_with_text_field(tf)
+ self.assertEqual(copy_image.text_field.width, tf.width)
+ self.assertEqual(copy_image.id, orig_image.id)
+ self.assertEqual(copy_image.file_name, orig_image.file_name)
if __name__ == "__main__":
unittest.main()
Index: tests_svgscripts/test_image.py
===================================================================
--- tests_svgscripts/test_image.py (revision 49)
+++ tests_svgscripts/test_image.py (revision 50)
@@ -1,38 +1,50 @@
import unittest
from os import sep, path
from os.path import isdir, dirname, basename
import lxml.etree as ET
import sys
import sys
sys.path.append('svgscripts')
-from datatypes.image import Image
+from datatypes.image import Image, SVGImage
+from datatypes.text_field import TextField
class TestImage(unittest.TestCase):
def test_init(self):
- image = Image(file_name='test.jpg', height=10, width=10)
+ tf = TextField()
+ image = Image(file_name='test.jpg', height=10, width=10, text_field=tf)
self.assertEqual(image.tag, 'image')
self.assertEqual(image.width, 10)
+ self.assertEqual(image.text_field.width, 0)
+ node = ET.Element('svg', attrib={'file': 'test.svg', 'height': '10', 'width': '10'})
+ image = SVGImage(node=node)
+ self.assertEqual(image.tag, 'svg-image')
+ self.assertEqual(image.width, 10)
+ self.assertEqual(image.file_name, 'test.svg')
def test_attach_object_to_tree(self):
tag = 'faksimile-image'
- image = Image(file_name='test.jpg', height=10, width=10, tag=tag)
+ tf = TextField()
+ image = Image(file_name='test.jpg', URL='https://www.google.com', height=10, width=10, text_field=tf, tag=tag)
empty_tree = ET.ElementTree(ET.Element('faksimile'))
image.attach_object_to_tree(empty_tree)
self.assertEqual(image.tag, tag)
for node in empty_tree.getroot().xpath('//' + image.tag):
self.assertEqual(node.get('file-name'), 'test.jpg')
self.assertEqual(node.get('height'), '10')
self.assertEqual(node.get('width'), '10')
+ self.assertEqual(len(node.findall(TextField.XML_TAG)), 1)
def test_get_semantic_dict(self):
- image = Image(file_name='test.jpg', height=10, width=10)
- #print(Image.get_semantic_dictionary())
+ #tf = TextField()
+ #image = Image(file_name='test.jpg', height=10, width=10, text_field=tf)
+ pass
+ #print(SVGImage.get_semantic_dictionary())
#self.assertEqual(image.get_data_dictionary()['body'].get('height'), 10)
#self.assertEqual(image.get_data_dictionary()['body'].get('width'), 10)
if __name__ == "__main__":
unittest.main()
Index: svgscripts/join_faksimileAndTranskription.py
===================================================================
--- svgscripts/join_faksimileAndTranskription.py (revision 49)
+++ svgscripts/join_faksimileAndTranskription.py (revision 50)
@@ -1,312 +1,313 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of a faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
from functools import cmp_to_key
import getopt
import lxml.etree as ET
import re
import string
import sys
from operator import attrgetter
from os import listdir, sep, path
from os.path import exists, isfile, isdir, dirname, basename
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.faksimile import FaksimilePage
from datatypes.lineNumber import LineNumber
from datatypes.page import Page
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from process_files import update_svgposfile_status
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# When True, the functions of this module skip their coloured status output and
# join_faksimileAndTranskription does not write the joined words back to the svg pos file.
# NOTE(review): presumably switched on by the unit tests -- confirm against the test module.
UNITTESTING = False
# Character class built from every character in string.punctuation.
PUNCTUATION_PATTERN = r"[{}]".format(string.punctuation)
# One or more word characters followed by a double quote at the end of the text.
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
# A text that consists of exactly one character from string.punctuation.
SINGLE_PUNCTUATION_PATTERN = r"^[{}]$".format(string.punctuation)
def get_filelist_and_manuscript_file(file_a, file_b=None):
    """Return the list of faksimile svg files and the manuscript file (or None).

    The two arguments may come in either order:

    * file_a is a svg file:  it is the only list entry, file_b (if it is a file)
      is the manuscript file.
    * file_a is a xml file:  it is the manuscript file; file_b is either a single
      faksimile file or a directory whose svg files are collected.
    * file_a is a directory: its svg files are collected, file_b (if it is a file)
      is the manuscript file.

    Args:
        file_a (str): path of a svg file, a xml file or a directory.
        file_b (str): optional path of the complementary file or directory.

    Returns:
        tuple: (file_list, manuscript_file) where file_list is a list of paths and
        manuscript_file is a path or None.
    """
    file_list = []
    manuscript_file = None
    file_b_is_file = file_b is not None and isfile(file_b)
    if isfile(file_a) and file_a.endswith('svg'):
        file_list.append(file_a)
        if file_b_is_file:
            manuscript_file = file_b
    elif isfile(file_a) and file_a.endswith('xml'):
        manuscript_file = file_a
        if file_b_is_file:
            file_list.append(file_b)
        # Guard against file_b being None: os.path.isdir(None) raises a TypeError.
        elif file_b is not None and isdir(file_b):
            # Prefix the directory so that the entries are usable paths,
            # exactly as in the "file_a is a directory" branch below.
            file_list = [ file_b + sep + svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ]
    elif isdir(file_a):
        file_list = [ file_a + sep + svgfile for svgfile in listdir(file_a) if svgfile.endswith('svg') ]
        if file_b_is_file:
            manuscript_file = file_b
    return file_list, manuscript_file
def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None):
    """Look up the svg pos file that belongs to faksimile_page in a manuscript file.

    If manuscript_file is missing or its basename does not start with the page title
    (blanks replaced by underscores), the file '<title>.xml' is used instead; it is
    looked for in './xml' if that directory exists, else in the current directory.

    Args:
        faksimile_page: object providing the attributes title and page_number.
        manuscript_file (str): optional path of the xml manuscript file.

    Returns:
        tuple: (svg_pos_file, manuscript_file). svg_pos_file is the @output of the page
        with the matching number if the manuscript lists that number with status "OK",
        otherwise None. manuscript_file is the path that was actually consulted.
    """
    title_string = faksimile_page.title.replace(' ', '_')
    manuscript_tree = None
    if manuscript_file is None or not basename(manuscript_file).startswith(title_string):
        # Fall back to the manuscript file that is named after the page title.
        xml_dir = '.{}xml'.format(sep)
        if isdir(xml_dir):
            manuscript_file = xml_dir + sep + title_string + '.xml'
        else:
            manuscript_file = title_string + '.xml'
        if isfile(manuscript_file):
            manuscript_tree = ET.parse(manuscript_file)
    else:
        manuscript_tree = ET.parse(manuscript_file)
    if manuscript_tree is None:
        return None, manuscript_file
    root = manuscript_tree.getroot()
    page_number = faksimile_page.page_number
    if root.xpath('//page[@number="%s" and @status="OK"]/@output' % page_number):
        return root.xpath('//page[@number="%s"]/@output' % page_number)[0], manuscript_file
    if not UNITTESTING:
        # A status that merely contains "OK" is reported as already joined.
        if len(root.xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % page_number)) > 0:
            msg_color = Fore.CYAN
            msg = 'Faksimile already joined!'
        else:
            msg_color = Fore.MAGENTA
            msg = 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, page_number)
        print(msg_color + msg, end='')
        print(Style.RESET_ALL)
    return None, manuscript_file
def process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=''):
"""Joins faksimile_positions with text == word_text with words with text == word_text.

If there are fewer matching words than faksimile positions, the function retries
with alternative spellings (passed on recursively as alt_word_text) or collects
words whose text is only a part of word_text.

Args:
    new_words (list): output list; every word that got a faksimile position is appended.
    word_text (str): text of the faksimile positions that should be joined.
    faksimile_positions (list): positions providing the attributes text, joined and id.
    words (list): words of the transkription; a word's id is used as its index in this list.
    alt_word_text (str): alternative text that words are additionally compared with.

Returns nothing: the result is written to new_words and to the attributes joined,
faksimile_positions and text of the objects involved.

NOTE(review): indentation was lost in this listing; the branch nesting described in
the comments below is inferred from the order of if/elif/else -- confirm against the file.
"""
# 'text' is what the words are compared with; the positions are always selected by word_text.
text = word_text if alt_word_text == '' else alt_word_text
fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ]
words4word = [ word for word in words if word.text == word_text and not word.joined ]
if alt_word_text != '':
words4word += [ word for word in words if word.text == text and not word.joined ]
words4word = sorted(words4word, key=attrgetter('id'))
# Same number of positions and words: pair them one to one in order.
if len(fposition4word) == len(words4word):
for index, faksimile_position in enumerate(fposition4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
words[words4word[index].id].joined = True
new_words.append(words4word[index])
# Fewer words than positions: try alternative spellings or partial words.
elif len(words4word) < len(fposition4word):
# Retry with 'ss' replaced by 'ß'.
if re.match(r'(.*)ss(.*)', text):
alt_word_text = re.sub(r'ss', 'ß', text)
process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text)
# A single punctuation character: only the hyphen has an alternative (the en dash).
elif re.match(SINGLE_PUNCTUATION_PATTERN, text):
if text == '-':
alt_word_text = text.replace('-', '–')
process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text)
else:
print('single', word_text, len(fposition4word), len(words4word))
# Text starting with punctuation or ending with a double quote: compare without punctuation.
elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text):
alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text)
if alt_word_text != '':
# Also accept unjoined words that contain the text without punctuation.
pattern = r'(.*){0}(.*)'.format(alt_word_text)
words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ]
if len(words4word) < len(fposition4word):
process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text)
else:
words4word = sorted(words4word, key=attrgetter('id'))
for index, faksimile_position in enumerate(fposition4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
# If the word lacks the last character of word_text and the following word
# consists of exactly that character, merge the following word into this one.
if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\
and words4word[index].id+1 < len(words)\
and words[words4word[index].id+1].text == word_text[len(word_text)-1]:
words4word[index].join(words[words4word[index].id+1])
words[words4word[index].id+1].joined = True
words[words4word[index].id].joined = True
words4word[index].text = word_text
new_words.append(words4word[index])
else:
if len(text) > 1:
# Look for unjoined words that equal the text without its first character.
new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ]
if len(new_words4word) == 0:
alt_word_text = text[1:]
process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text)
else:
for new_word in new_words4word:
collected_text = new_word.text
current_word = new_word
# Walk backwards through the preceding words while the collected text
# is still a proper suffix of word_text.
while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0:
previous_word = words[current_word.id-1]
if word_text.endswith(previous_word.text + collected_text):
words[current_word.id].joined = True
previous_word.join(current_word)
current_word = previous_word
collected_text = current_word.text
else:
collected_text = previous_word.text + collected_text
words4word.append(current_word)
words4word = sorted(words4word, key=attrgetter('id'))
for index, faksimile_position in enumerate(fposition4word):
faksimile_position.joined = True
words4word[index].faksimile_positions = [ faksimile_position ]
words4word[index].text = word_text
words[words4word[index].id].joined = True
new_words.append(words4word[index])
else:
# Diagnostic output: text, number of positions / number of words, ids of the positions.
print('<{0}> {1}/{2}, ids: {3}'.\
format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ]))
else:
# More words than positions: nothing is joined, only the counts are printed.
print(word_text, len(fposition4word), len(words4word))
def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, test_word_text=''):
"""Joins the data of a faksimile file with the data of svgposfile.

Args:
    faksimile_file (str): svg file from which the faksimile pages are read.
    manuscript_file (str): optional xml manuscript file, passed to
        get_svgPosFile_and_manuscriptFile for every faksimile page.
    test_word_text (str): if set and UNITTESTING is on, the joined words with this
        text are printed.

Returns:
    int: 0, or 2 if a page has words or faksimile positions that could not be joined.
"""
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
print(Style.RESET_ALL)
faksimile_tree = ET.parse(faksimile_file)
# The default namespace has the key None in nsmap; it is renamed to 'ns'.
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
exit_status = 0
for faksimile_page in faksimile_pages:
# manuscript_file is overwritten with the file that was actually consulted for this page.
svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
if svg_pos_file is not None:
if not UNITTESTING:
print(Fore.CYAN + 'joining data with file {} ... '.format(svg_pos_file), end='')
- page = Page(xml_source_file=svg_pos_file)
+ image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field)
+ page = Page(xml_source_file=svg_pos_file, faksimile_image=image4page)
words = sort_words(page)
faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions)
new_words = []
# Process the distinct faksimile texts ordered by length, shortest first.
unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\
key=lambda text: len(text))
for word_text in unique_faksimile_words:
process_word_text(new_words, word_text, faksimile_positions, words)
# Write the result only if every word and every position was joined (and never while unittesting).
if False not in [ word.joined for word in words ]\
and False not in [ position.joined for position in faksimile_positions]\
and not UNITTESTING:
page.words = sorted(new_words, key=attrgetter('id'))
# Replace all word nodes of the page tree by the joined words.
for word_node in page.page_tree.xpath('//word'):
word_node.getparent().remove(word_node)
for word in page.words:
word.attach_word_to_tree(page.page_tree)
write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
print(Fore.GREEN + '[OK]')
print(Style.RESET_ALL)
elif not UNITTESTING:
# Report what could not be joined.
not_joined_fp = [ (position.id, position.text) for position in faksimile_positions if not position.joined ]
plural_fp = '' if len(not_joined_fp) < 2 else 's'
not_joined_tw = [ (word.id, word.text) for word in words if not word.joined ]
plural_tw = '' if len(not_joined_tw) < 2 else 's'
print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp))
print(Fore.MAGENTA + '--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
print(Style.RESET_ALL)
exit_status = 2
# The remaining branches are only reached when UNITTESTING is on.
elif False in [ word.joined for word in words ]:
print([ (word.id, word.text) for word in words if not word.joined ])
exit_status = 2
elif test_word_text != '':
print([ (word.id, word.text) for word in new_words if word.text == test_word_text ])
return exit_status
def sort_words(page):
    """Return the words of page ordered line by line and renumber them.

    The lines are visited in the order of page.line_numbers. Words on a line with an
    even line number id are ordered by the left value of their first transkription
    position; words on a line with an odd id are ordered by a comparison that
    additionally requires the bottoms of both words to be closer than half the
    height of the first one.

    Every returned word gets its index in the result as id and joined = False.
    A warning is issued if a word has the line number -1; words whose line number
    matches none of page.line_numbers are not part of the result.
    """
    if any(word.line_number == -1 for word in page.words):
        warnings.warn('{} in page file {} for word with ids {}'.format(LineNumber.WARN_NO_LINE_NUMBER, page.page_tree.docinfo.URL, page.page_tree.xpath('//word[not(@line-number)]/@id')))

    def _left(word):
        return word.transkription_positions[0].left

    def _compare_on_odd_line(wordA, wordB):
        positionA = wordA.transkription_positions[0]
        positionB = wordB.transkription_positions[0]
        if positionA.left < positionB.left\
           and abs(positionA.bottom-positionB.bottom) < positionA.height/2:
            return -1
        return 1

    words = []
    for line_number in page.line_numbers:
        word_on_line = [ word for word in page.words if word.line_number == line_number.id ]
        if line_number.id % 2 == 0:
            word_on_line.sort(key=_left)
        else:
            word_on_line.sort(key=cmp_to_key(_compare_on_odd_line))
        words.extend(word_on_line)
    for index, word in enumerate(words):
        word.id = index
        word.joined = False
    return words
def sort_faksimile_positions(faksimile_positions):
    """Return the faksimile positions sorted (from top left to bottom right).

    Every position is marked as not joined first. A position is ordered before
    another one if its center is neither further right nor further down.
    The input is not reordered; a new list is returned.
    """
    def _compare_centers(positionA, positionB):
        if positionA.left+positionA.width/2 <= positionB.left+positionB.width/2\
           and positionA.top+positionA.height/2 <= positionB.top+positionB.height/2:
            return -1
        return 1

    for position in faksimile_positions:
        position.joined = False
    return sorted(faksimile_positions, key=cmp_to_key(_compare_centers))
def usage():
    """Print the help text of this script, which is the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to join the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
    svgscripts/join_faksimileAndTranskription.py [OPTIONS] [xmlManuscriptFile]
    a directory containing
    a svg file containing information about the word positions on the faksimile.
    a xml file about a manuscript, containing information about its pages.
    OPTIONS:
    -h|--help: show help
    :return: exit code (int)
    """
    # The docstring above is printed by usage(), so its wording is left untouched.
    # argv holds the command line arguments without the program name.
    # Raises FileNotFoundError if the first positional argument does not exist.
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
    if len(args) < 1:
        usage()
        return 2
    file_a = args[0]
    if not exists(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # A second argument is only taken into account if it exists.
    file_b = args[1] if len(args) > 1 and exists(args[1]) else None
    file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
    exit_status = 0
    for faksimile_file in file_list:
        # Keep the worst status, so that a failed join is reflected in the exit code
        # instead of being discarded.
        status = join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file)
        exit_status = max(exit_status, status)
    return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: svgscripts/local_config.py
===================================================================
--- svgscripts/local_config.py (revision 0)
+++ svgscripts/local_config.py (revision 50)
@@ -0,0 +1,3 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+FAKSIMILE_LOCATION = '/home/knister0/ownCloud/nietzscheDE/Bearbeitung_Faksimile' # location of faksimiles
Index: svgscripts/datatypes/page.py
===================================================================
--- svgscripts/datatypes/page.py (revision 49)
+++ svgscripts/datatypes/page.py (revision 50)
@@ -1,380 +1,403 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
from .class_spec import SemanticClass
-from .image import Image
+from .image import Image, SVGImage
+from .faksimile_image import FaksimileImage
from .lineNumber import LineNumber
from .path import Path
from .positional_word_part import PositionalWordPart
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_insertion_mark import WordInsertionMark
class Page(SemanticClass):
"""
This class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
xml_target_file (str): name of the xml file to which page info will be written.
"""
UNITTESTING = False
WARNING_MISSING_USE_NODE4PWP = PositionalWordPart.WARN_NO_USE_NODE_FOUND
WARNING_MISSING_GLYPH_ID4WIM = WordInsertionMark.WARN_NO_GLYPH_ID
- def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, pdfFile=None, svg_file=None, orientation='North', extract_transkription_field_only=False):
+ def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, faksimile_image=None, pdfFile=None, svg_file=None, orientation='North', extract_transkription_field_only=False):
self.title = title
self.line_numbers = []
self.style_dict = {}
self.sonderzeichen_list = []
self.svg_file = None
+ self.svg_image = None
self.pdfFile = None
self.source = None
self.number = page_number if page_number is not None else -1
self.orientation = orientation
self.word_deletion_paths = []
+ self.faksimile_image = faksimile_image
if xml_source_file is not None:
if isfile(xml_source_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(xml_source_file, parser)
self.title = self.page_tree.getroot().get('title')
self.number = self.page_tree.getroot().get('number')
self.source = self.page_tree.getroot().get('source')
self.orientation = self.page_tree.getroot().get('orientation')
self.init_words()
self.add_style(style_node=self.page_tree.getroot().find('.//style'))
+ self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
+ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
+ self.svg_image = SVGImage(node=self.page_tree.xpath('.//' + SVGImage.XML_TAG)[0])\
+ if len(self.page_tree.xpath('.//' + SVGImage.XML_TAG)) > 0 else None
+ self.faksimile_image = FaksimileImage(node=self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)[0])\
+ if len(self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)) > 0 else None
self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\
if len(self.page_tree.xpath('.//svg/@file')) > 0 else None
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
- self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
- if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
if pdfFile is not None and self.pdfFile is None:
self.pdfFile = pdfFile
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
+ if faksimile_image is not None:
+ self.faksimile_image = faksimile_image
+ self.faksimile_image.attach_object_to_tree(self.page_tree)
if svg_file is not None and self.svg_file is None:
self.svg_file = svg_file
tf = TranskriptionField(svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
- ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
+ self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
+ self.svg_image.attach_object_to_tree(self.page_tree)
+ if self.svg_image is not None and self.svg_file is None:
+ self.svg_file = self.svg_image.file_name
+ if self.svg_image is not None and self.width == 0.0:
+ self.width = self.svg_image.width
+ if self.svg_image is not None and self.height == 0.0:
+ self.height = self.svg_image.height
else:
raise Exception('File "{}" does not exist!'.format(xml_source_file))
elif xml_target_file is not None:
self.word_insertion_marks = []
self.words = []
self.writing_processes = []
self.svg_file = svg_file
self.pdfFile = pdfFile
if isfile(xml_target_file):
parser = ET.XMLParser(remove_blank_text=True)
self.page_tree = ET.parse(xml_target_file, parser)
self.source = self.page_tree.getroot().get('source')
if bool(self.page_tree.getroot().get('orientation')):
self.orientation = self.page_tree.getroot().get('orientation')
elif orientation is not None:
self.page_tree.getroot().set('orientation', orientation)
if bool(self.page_tree.getroot().get('title')):
self.title = self.page_tree.getroot().get('title')
elif title is not None:
self.page_tree.getroot().set('title', title)
if self.svg_file is None:
self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\
if len(self.page_tree.xpath('.//svg/@file')) > 0 else None
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
elif len(self.page_tree.xpath('.//svg/@file')) == 0:
tf = TranskriptionField(svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
- ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
+ self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
+ self.svg_image.attach_object_to_tree(self.page_tree)
+ #ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
else:
self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
if self.pdfFile is None:
self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
elif len(self.page_tree.xpath('.//pdf/@file')) == 0:
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG,\
WritingProcess.XML_TAG, Path.WORD_DELETION_PATH_TAG ]:
for node in self.page_tree.xpath('//' + xpath2remove):
node.getparent().remove(node)
else:
self.page_tree = ET.ElementTree(ET.Element('page'))
self.pdfFile = pdfFile
self.svg_file = svg_file
if title is not None:
self.page_tree.getroot().set('title', title)
if orientation is not None:
self.page_tree.getroot().set('orientation', orientation)
self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower())
if page_number is not None:
self.page_tree.getroot().set('number', str(page_number))
if self.pdfFile is not None:
ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
if self.svg_file is not None:
tf = TranskriptionField(self.svg_file)
self.width = round(tf.documentWidth, 3)
self.height = round(tf.documentHeight, 3)
- ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
- self.svg_image = Image(file_name=self.svg_file, height=self.height, width=self.width) if self.svg_file is not None\
- else None
+ self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
+ self.svg_image.attach_object_to_tree(self.page_tree)
+ #ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
+ if self.svg_image is None and self.svg_file is not None:
+ self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
+ self.svg_image.attach_object_to_tree(self.page_tree)
def categorize_paths(self, transkription_field=None):
    """Categorize all paths that are part of the transkription field.

    The paths of the svg file ``self.source`` that start right of the left
    border and end left of the right border of ``transkription_field`` are
    sorted into categories by the size and shape of their bounding box.
    The deletion/underline candidates are then passed to
    ``mark_words_intersecting_with_paths_as_deleted``.

    Args:
        transkription_field: object providing ``xmin``, ``xmax``, ``ymin``
            and ``path``.  Without it no path is selected.

    Raises:
        FileNotFoundError: if ``self.source`` is unset or not an existing
            file and ``Page.UNITTESTING`` is false.
    """
    if self.source is None or not isfile(self.source):
        if not Page.UNITTESTING:
            error_msg = 'Svg source file {} does not exist!'.format(self.source)\
                    if self.source is not None else 'Page does not contain a source file!'
            raise FileNotFoundError(error_msg)
        return
    MAX_HEIGHT_LINES = 1
    DEFAULT_MAX_LINE = 17
    # Height of the tallest line with an even id, plus a margin of 2.  Falls
    # back to the default when there is no such line (formerly an IndexError
    # if line numbers existed but none of them had an even id).
    even_line_heights = [line_number.bottom - line_number.top
                         for line_number in self.line_numbers if line_number.id % 2 == 0]
    max_line = max(even_line_heights) + 2 if even_line_heights else DEFAULT_MAX_LINE
    tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
    tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
    paths, attributes = svg_to_paths.svg2paths(self.source)
    allpaths_on_tf = []
    if transkription_field is not None:
        for index, (path, attribute) in enumerate(zip(paths, attributes)):
            if len(path) > 0\
               and path != transkription_field.path\
               and path.start.real > tr_xmin\
               and path.end.real < transkription_field.xmax:
                allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class')))
    # Only deletion_or_underline_paths is consumed below; the other lists
    # record the remaining categories.
    text_area_deletion_paths = []
    deletion_or_underline_paths = []
    box_paths = []
    dots_paths = []
    word_connector_paths = []
    uncategorized_paths = []
    for mypath in allpaths_on_tf:
        xmin, xmax, ymin, ymax = mypath.path.bbox()
        width = abs(xmax - xmin)
        height = abs(ymax - ymin)
        start_line_number = self.get_line_number(mypath.path.start.imag - tr_ymin)
        if width < 1 and height < 1:
            dots_paths.append(mypath)
        elif height > MAX_HEIGHT_LINES and height < max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
            deletion_or_underline_paths.append(mypath)
        elif height > MAX_HEIGHT_LINES and height < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
            box_paths.append(mypath)
        elif height > MAX_HEIGHT_LINES and height > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
            word_connector_paths.append(mypath)
        elif height < MAX_HEIGHT_LINES:
            deletion_or_underline_paths.append(mypath)
        elif start_line_number != -1 and start_line_number != self.get_line_number(mypath.path.end.imag - tr_ymin):
            # the path starts and ends on different lines
            text_area_deletion_paths.append(mypath)
        else:
            uncategorized_paths.append(mypath)
    self.mark_words_intersecting_with_paths_as_deleted(deletion_or_underline_paths, tr_xmin, tr_ymin)
def init_line_numbers(self, line_numbers, document_bottom):
    """Build the complete list of line numbers and attach it to the page tree.

    The given ``line_numbers`` are interleaved with generated LineNumber
    objects that cover the gaps: one from the top (0) to the first given
    line, one between each pair of consecutive given lines and one from the
    last given line down to ``document_bottom``.  Nothing is created for an
    empty ``line_numbers``.
    """
    MINABOVE = 1
    self.line_numbers = []
    if not line_numbers:
        return
    first = line_numbers[0]
    self.line_numbers.append(LineNumber(id=1, top=0, bottom=first.top - MINABOVE))
    self.line_numbers.append(first)
    for previous, current in zip(line_numbers, line_numbers[1:]):
        gap = LineNumber(id=current.id - 1,
                         top=previous.bottom + MINABOVE,
                         bottom=current.top - MINABOVE)
        self.line_numbers.append(gap)
        self.line_numbers.append(current)
    last = line_numbers[-1]
    self.line_numbers.append(LineNumber(id=last.id + 1,
                                        top=last.bottom + MINABOVE,
                                        bottom=document_bottom))
    for line_number in self.line_numbers:
        line_number.attach_object_to_tree(self.page_tree)
def init_words(self):
    """Instantiate the page's objects from the nodes of ``self.page_tree``.

    Sets word_insertion_marks, words, line_numbers, writing_processes and
    word_deletion_paths, in this order (the writing processes are created
    with the words that were just read).
    """
    tree = self.page_tree
    wim_nodes = tree.getroot().xpath('//' + WordInsertionMark.XML_TAG)
    self.word_insertion_marks = [WordInsertionMark(wim_node=wim_node) for wim_node in wim_nodes]
    word_nodes = tree.getroot().xpath('//word')
    self.words = [Word.CREATE_WORD(word_node=word_node) for word_node in word_nodes]
    line_number_nodes = tree.getroot().xpath('//' + LineNumber.XML_TAG)
    self.line_numbers = [LineNumber(xml_text_node=line_number_node) for line_number_node in line_number_nodes]
    writing_process_nodes = tree.xpath('//' + WritingProcess.XML_TAG)
    self.writing_processes = [WritingProcess.create_writing_process_from_xml(node, self.words)
                              for node in writing_process_nodes]
    deletion_path_nodes = tree.xpath('//' + Path.WORD_DELETION_PATH_TAG)
    self.word_deletion_paths = [Path(node=node) for node in deletion_path_nodes]
    # NOTE: a former step that passed every word through
    # WordInsertionMark.attach_and_update_word_if_involved is disabled.
def create_writing_processes_and_attach2tree(self):
    """Creates three stages of Nietzsche's process of writing.

    The three WritingProcess objects are attached to the page tree.  Each
    transkription position of each word then gets the writing process id
    that ``self.fontsizekey2stage_mapping`` assigns to a style class of the
    position's first positional word part.
    """
    versions = (WritingProcess.FIRST_VERSION,
                WritingProcess.INSERTION_AND_ADDITION,
                WritingProcess.LATER_INSERTION_AND_ADDITION)
    self.writing_processes = [WritingProcess(version=version) for version in versions]
    for writing_process in self.writing_processes:
        writing_process.attach_object_to_tree(self.page_tree)
    for word in self.words:
        for position in word.transkription_positions:
            font_keys = position.positional_word_parts[0].style_class.split(' ')
            for font_key in font_keys:
                # when several keys are mapped, the last one wins
                if font_key in self.fontsizekey2stage_mapping:
                    position.writing_process_id = self.fontsizekey2stage_mapping[font_key]
def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
    """Adds a list of classes that are sonderzeichen and a style dictionary to page.

    Either reads the style information from ``style_node`` (its ``class``
    descendants) or, if no node is given and ``style_dict`` is not empty,
    writes the given information to a new ``style`` child of the page root.
    Finally ``self.fontsizekey2stage_mapping`` is rebuilt from the font
    sizes found in the style dictionary.

    Args:
        sonderzeichen_list (list): names of classes that are sonderzeichen.
        letterspacing_list (list): names of classes with letterspacing.
        style_dict (dict): class name -> dict of style attributes.
        style_node: xml node with ``class`` descendants to read from.
    """
    # Fresh containers per call: the former mutable defaults ([] / {}) were
    # stored on the instance and therefore shared between pages.
    self.sonderzeichen_list = [] if sonderzeichen_list is None else sonderzeichen_list
    self.letterspacing_list = [] if letterspacing_list is None else letterspacing_list
    self.style_dict = {} if style_dict is None else style_dict
    if style_node is not None:
        class_nodes = style_node.findall('.//class')
        self.style_dict = {
            item.get('name'): {key: value for key, value in item.attrib.items() if key != 'name'}
            for item in class_nodes
        }
        self.sonderzeichen_list = [
            item.get('name') for item in class_nodes
            if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family')
        ]
        # NOTE(review): 'letterspacing-list' is written below as an attribute
        # of the <style> node, but read here from the <class> nodes - confirm.
        self.letterspacing_list = [
            item.get('name') for item in class_nodes
            if bool(item.get('letterspacing-list'))
        ]
    elif self.style_dict:
        style_node = ET.SubElement(self.page_tree.getroot(), 'style')
        if self.sonderzeichen_list:
            style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
        if self.letterspacing_list:
            style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
        for key, class_style in self.style_dict.items():
            # the name is stored in the entry itself, as before
            class_style['name'] = key
            ET.SubElement(style_node, 'class', attrib=class_style)
    fontsize_dict = {
        key: float(value.get('font-size').replace('px', ''))
        for key, value in self.style_dict.items() if 'font-size' in value
    }
    # create a mapping between fontsizes and word stages
    self.fontsizekey2stage_mapping = {}
    if fontsize_dict:
        biggest = max(fontsize_dict.values())
        smallest = min(fontsize_dict.values())
        for fontsize_key, value in fontsize_dict.items():
            if value >= biggest - 1:
                stage = WritingProcess.FIRST_VERSION
            elif value <= smallest + 1:
                stage = WritingProcess.LATER_INSERTION_AND_ADDITION
            else:
                stage = WritingProcess.INSERTION_AND_ADDITION
            self.fontsizekey2stage_mapping[fontsize_key] = stage
def add_source(self, source):
    """Remember source on the page and store it as the 'source' attribute
    of the root node of page_tree.
    """
    self.source = source
    root = self.page_tree.getroot()
    root.set('source', self.source)
def get_biggest_fontSize4styles(self, style_set=None):
    """Returns biggest font size from style_dict for a set of style class names.

    Class names that are unknown to style_dict or that have no 'font-size'
    are ignored (an unknown name formerly raised a KeyError).

    [:returns:] (float) biggest font size OR 1 if style_dict is empty or
    none of the classes has a font size
    """
    if not self.style_dict or not style_set:
        return 1
    styles = (self.style_dict.get(key) for key in style_set)
    return max(
        (float(style['font-size'].replace('px', '')) for style in styles if style and style.get('font-size')),
        default=1,
    )
def get_line_number(self, y):
    """Return the id of the line whose vertical range contains y.

    [:return:] (int) id of the first line number with top <= y <= bottom,
    or -1 if there is no such line number
    """
    for candidate in self.line_numbers:
        if candidate.top <= y <= candidate.bottom:
            return candidate.id
    return -1
# Diff hunk (revision 49 -> 50): lines starting with '+' are added by
# revision 50, lines starting with '-' are removed by it.
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
# The result has two entries: 'class' (the result of
# cls.get_class_dictionary()) and 'properties' (the mapping built below).
dictionary = {}
class_dict = cls.get_class_dictionary()
# Each property is described either by a tuple (class, cardinality, xpath)
# or by a dict with the keys 'class', 'cardinality' and 'xpath'.
# Revision 50 adds 'image' (class Image, xpath built from
# FaksimileImage.XML_TAG) and replaces the tuple (Image, 1, '/page/svg') of
# 'svg_image' by a dict using SVGImage and an xpath built from SVGImage.XML_TAG.
properties = { 'title': (str, 1, '/page/@title'), 'number': (str, 1, '/page/@number'),\
+ 'image': { 'class': Image, 'cardinality': 1, 'xpath': '/page/{}'.format(FaksimileImage.XML_TAG)},\
'line_numbers': (LineNumber, SemanticClass.LIST, '/page/@number|/page/@title'),\
'orientation': { 'class': str, 'cardinality': 1, 'xpath': '/page/@orientation'},\
'words': (Word, SemanticClass.LIST, '/page/@number|/page/@title'),\
- 'svg_image': (Image, 1, '/page/svg'),\
+ 'svg_image': { 'class': SVGImage, 'cardinality': 1, 'xpath': '/page/{}'.format(SVGImage.XML_TAG)},\
'writing_processes': (WritingProcess, SemanticClass.LIST, '/page/@number|/page/@title'),\
'word_deletion_paths': (Path, SemanticClass.LIST, '/page/@number|/page/@title'),\
'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST, '/page/@number|/page/@title')}
dictionary.update({'class': class_dict})
dictionary.update({'properties': properties})
return dictionary
def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Marks all words that intersect with deletion paths as deleted
    and adds these paths to word_deletion_paths.

    For every transkription position of every word a closed rectangle is
    built from the bounding box of its positional word parts (shifted by
    tr_xmin/tr_ymin) and tested against each deletion path.  Positions
    without positional word parts are skipped.

    Args:
        deletion_paths (list): .path.Path objects that may delete words.
        tr_xmin (float): x offset added to the word coordinates.
        tr_ymin (float): y offset added to the word coordinates.

    [:return:] list of .path.Path that might be word_underline_paths
    """
    # The progress bar is only shown outside of unit tests.
    bar = None
    if not Page.UNITTESTING:
        bar = Bar('mark words that intersect with deletion paths', max=len(self.words))
    for word in self.words:
        if bar is not None:
            bar.next()
        for transkription_position in word.transkription_positions:
            word_parts = transkription_position.positional_word_parts
            if not word_parts:
                # no bounding box can be built without word parts
                continue
            first_pwp, last_pwp = word_parts[0], word_parts[-1]
            xmin = tr_xmin + first_pwp.left
            xmax = tr_xmin + last_pwp.left + last_pwp.width
            ymin = tr_ymin + min(pwp.top for pwp in word_parts)
            ymax = tr_ymin + max(pwp.bottom for pwp in word_parts)
            word_path = parse_path('M {}, {} L {}, {} L {}, {} L {}, {} z'.format(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax))
            intersecting_paths = [deletion_path for deletion_path in deletion_paths
                                  if do_paths_intersect_saveMode(deletion_path.path, word_path)]
            if intersecting_paths:
                word.deleted = True
                for deletion_path in intersecting_paths:
                    if deletion_path not in self.word_deletion_paths:
                        deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                        deletion_path.attach_object_to_tree(self.page_tree)
                        self.word_deletion_paths.append(deletion_path)
    if bar is not None:
        bar.finish()
    # return those paths in deletion_paths that are not in self.word_deletion_paths
    return list(set(deletion_paths) - set(self.word_deletion_paths))
def do_paths_intersect_saveMode(path1, path2):
    """Return the result of path1.intersect(path2, justonemode=True),
    or False if that call fails with an AssertionError.
    """
    try:
        intersection = path1.intersect(path2, justonemode=True)
    except AssertionError:
        intersection = False
    return intersection
Index: svgscripts/datatypes/image.py
===================================================================
--- svgscripts/datatypes/image.py (revision 49)
+++ svgscripts/datatypes/image.py (revision 50)
@@ -1,89 +1,113 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent all image types.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from .attachable_object import AttachableObject
from .class_spec import SemanticClass
+from .text_field import TextField
# Diff hunk (revision 49 -> 50): lines starting with '+' are added by
# revision 50, lines starting with '-' are removed by it.  Revision 50 gives
# the image a local_path and an optional text_field.
class Image(AttachableObject,SemanticClass):
"""
This super class represents all types of images.
Args:
- file_name (str): name of the image file.
- node (lxml.etree.Element) node, containing information
- URL (str): URL of image file.
- height (float): height of image
- width (float): width of image
+ file_name (str): name of the image file.
+ node (lxml.etree.Element) node, containing information
+ URL (str): URL of image file.
+ height (float): height of image
+ width (float): width of image
+ text_field (.text_field.TextField) text_field on image representation
"""
# Names of the instance attributes that attach_object_to_tree writes as xml
# attributes: stringKeys as they are, floatKeys rounded to 3 decimals.
- stringKeys = [ 'file_name', 'URL' ]
+ stringKeys = [ 'file_name', 'URL', 'local_path' ]
floatKeys = [ 'height', 'width' ]
XML_TAG = 'image'
# The state is taken either from the attributes of `node` or, if node is
# None, from the keyword arguments.
- def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, tag=XML_TAG):
+ def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
+ self.text_field = text_field
if node is not None:
# Revision 50 reads the URL from the attribute 'URL' (formerly
# 'absolute-path') and additionally reads 'local-path'.
# NOTE(review): self.tag seems to be assigned only in the else branch below
# (indentation is lost in this view); if so, an instance created from a node
# has no tag when attach_object_to_tree is called - confirm.
self.file_name = node.get('file-name')
- self.URL = node.get('absolute-path')
+ self.local_path = node.get('local-path')
+ self.URL = node.get('URL')
self.height = float(node.get('height'))
self.width = float(node.get('width'))
# A TextField child of node replaces the text_field argument.
+ if len(node.findall(TextField.XML_TAG)) > 0:
+ self.text_field = TextField(node=node.find(TextField.XML_TAG))
else:
self.tag = tag
self.file_name = file_name
+ self.local_path = local_path
self.URL = URL
self.height = height
self.width = width
def attach_object_to_tree(self, target_tree):
"""Attach object to tree.
"""
# Reuse the first element named self.tag below the root of target_tree, or
# create a new child of the root if there is none.
obj_node = target_tree.getroot().find('.//' + self.tag) \
if(len(target_tree.getroot().findall('.//' + self.tag)) > 0) \
else ET.SubElement(target_tree.getroot(), self.tag)
# Attribute names are the key names with '_' replaced by '-'; keys whose
# value is None are not written.
for key in self.floatKeys:
if self.__dict__[key] is not None:
obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3)))
for key in self.stringKeys:
if self.__dict__[key] is not None:
obj_node.set(key.replace('_','-'), self.__dict__[key])
# NOTE(review): obj_node is an element, while this method itself calls
# target_tree.getroot(); whether TextField.attach_object_to_tree accepts an
# element cannot be seen here - confirm.
+ if self.text_field is not None:
+ self.text_field.attach_object_to_tree(obj_node)
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = {}
# One (float, 1, xpath) entry per key in Image.floatKeys; the xpaths use the
# XML_TAG of the class the method is called on.
properties.update(dict(zip(Image.floatKeys, [ (float, 1, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in Image.floatKeys])))
properties.update({'file_name': (str, 1, '{}/@file-name'.format(cls.XML_TAG))})
# NOTE(review): this xpath still points at '@absolute-path', although
# revision 50 reads the URL from the attribute 'URL' and
# attach_object_to_tree writes it as 'URL' - confirm whether '@URL' is meant.
# 'local_path' has no entry here.
properties.update({'URL': (str, 0, '{}/@absolute-path'.format(cls.XML_TAG))})
+ properties.update({'text_field': (TextField, 0, '{}/{}'.format(cls.XML_TAG, TextField.XML_TAG))})
dictionary.update({'class': class_dict})
dictionary.update({'properties': properties})
return dictionary
# Diff hunk: the whole class is added by revision 50 (every line starts with '+').
+class SVGImage(Image):
+ """This class represents a svg image.
+ """
+ XML_TAG = 'svg-image'
+
# The `tag` argument is accepted but not used: the parent constructor is
# always called with tag=self.XML_TAG.  No local_path is passed on.
+ def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
# A node whose tag differs from XML_TAG is converted: file name, height and
# width are read from its 'file', 'height' and 'width' attributes (0.0 for a
# missing or empty size) and node is reset, so that the parent constructor
# uses the keyword arguments instead of the node.
+ if node is not None and node.tag != self.XML_TAG:
+ file_name = node.get('file')
+ height = float(node.get('height')) if bool(node.get('height')) else 0.0
+ width = float(node.get('width')) if bool(node.get('width')) else 0.0
+ node = None
+ super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\
+ height=height, width=width, text_field=text_field, tag=self.XML_TAG)
+
Index: svgscripts/datatypes/faksimile.py
===================================================================
--- svgscripts/datatypes/faksimile.py (revision 49)
+++ svgscripts/datatypes/faksimile.py (revision 50)
@@ -1,135 +1,135 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a faksimile page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import re
from lxml import etree as ET
from os import path
from os.path import isdir, isfile, sep, basename
from svgpathtools.parser import parse_path
from .faksimile_image import FaksimileImage
from .matrix import Matrix
from .super_page import SuperPage
from .text_field import TextField
from .word_position import WordPosition
class FaksimilePage(SuperPage):
"""
This class represents a faksimile page.
Args:
xml_target_file (str): name of the xml file to which page info will be written.
xml_source_file (str): name of the xml file that will be instantiated.
"""
XML_TAG = 'faksimile-page'
def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None):
    """Set up the page tree and read the faksimile data from it.

    xml_source_file takes precedence over xml_target_file as the xml file of
    the page.  With a target file, the existing faksimile word positions are
    removed from the tree.  svg_source_file, faksimile_image and text_field
    are written to the tree if given; the instance attributes are then read
    back from the tree.
    """
    xml_file = xml_target_file if xml_source_file is None else xml_source_file
    super(FaksimilePage, self).__init__(xml_file=xml_file, title=title, page_number=page_number, tag=self.XML_TAG)
    if xml_target_file is not None:
        self.remove_tags_from_page_tree([WordPosition.FAKSIMILE])
    if svg_source_file is not None:
        self.page_tree.getroot().set('svg-source-file', svg_source_file)
    for attachable in (faksimile_image, text_field):
        if attachable is not None:
            attachable.attach_object_to_tree(self.page_tree)
    root = self.page_tree.getroot()
    self.svg_source_file = root.get('svg-source-file')
    image_node = root.find('.//' + FaksimileImage.XML_TAG)
    self.faksimile_image = None if image_node is None else FaksimileImage(node=image_node)
    text_field_node = root.find('.//' + TextField.XML_TAG)
    self.text_field = None if text_field_node is None else TextField(node=text_field_node)
    self.word_positions = [WordPosition(node=node) for node in root.findall('.//' + WordPosition.FAKSIMILE)]
def append_word_position(self, word_position):
    """Add word_position to the page's word_positions, then attach it to page_tree.
    """
    positions = self.word_positions
    positions.append(word_position)
    word_position.attach_object_to_tree(self.page_tree)
# Diff hunk (revision 49 -> 50): lines starting with '+' are added by
# revision 50, lines starting with '-' are removed by it.
@staticmethod
def GET_FAKSIMILEPAGES(svg_tree, namespaces=None):
"""Creates and returns the faksimile pages contained in a svg_file as a list.
"""
# Horizontal margin subtracted from the right border of a text field when
# word rects/paths are selected.
THRESHOLD_X = 10
if namespaces is None:
# The default namespace (key None) is registered under the prefix 'ns' that
# the xpath expressions below use.
namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
# Revision 50 creates the image after source_file_name is known and passes
# the file name on to CREATE_IMAGE.
- image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap))
source_file_name = svg_tree.docinfo.URL
+ image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap), source_file_name)
xml_dir = '.{}xml'.format(sep)
faksimile_pages = list()
# The title is the base name of the svg file without the part that starts at
# the first ',' or '_' followed by digits; '-' becomes ' '.
title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name))
title = title_string.replace('-', ' ')
# Every rect whose id starts with title_string is treated as a text field.
# NOTE(review): the second argument of .get(...) is presumably the default
# value, so a missing attribute would yield the nsmap dict instead of a
# string - confirm against the lxml API.
rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap) if rect.get('id', svg_tree.getroot().nsmap).startswith(title_string) ]
for text_field_rect in rect_list:
# Text field coordinates are stored relative to the position of the image.
tf_x = float(text_field_rect.get('x', svg_tree.getroot().nsmap)) - image.x
tf_y = float(text_field_rect.get('y', svg_tree.getroot().nsmap)) - image.y
tf_width = float(text_field_rect.get('width', svg_tree.getroot().nsmap))
tf_height = float(text_field_rect.get('height', svg_tree.getroot().nsmap))
id = text_field_rect.get('id', svg_tree.getroot().nsmap)
# The xml file is written to ./xml if that directory exists, otherwise to
# the current directory.
target_file_name = xml_dir + sep + id + '.xml' if isdir(xml_dir) else id + '.xml'
# The page number is the part of the id after its last ',' or '_'.
page_number = re.sub(r'.*[,_]', '', id)
text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y)
faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\
title=title, page_number=page_number, faksimile_image=image, text_field=text_field)
x_min = text_field.xmin + image.x
y_min = text_field.ymin + image.y
# Titles of all rects inside the text field (except the text field rect
# itself), followed by the titles of all paths of the document; the paths
# are checked against the text field in the loop below.
rect_titles = svg_tree.getroot().xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}"]/ns:title'.format(\
x_min, text_field.xmax + image.x - THRESHOLD_X, y_min, text_field.ymax + image.y, text_field.id), namespaces=namespaces)
rect_titles += svg_tree.getroot().xpath('//ns:path/ns:title', namespaces=namespaces)
for rect_title in rect_titles:
rect = rect_title.getparent()
x, y, height, width = 0.0, 0.0, 0.0, 0.0
# NOTE(review): rect.get('d') yields a string or None, so `!= 0` is always
# true; a path without a 'd' attribute would reach parse_path(None) - confirm
# the intended check.
if rect.tag.endswith('path') and rect.get('d') != 0:
path = parse_path(rect.get('d'))
x, xmax, y, ymax = path.bbox()
width = xmax - x
height = ymax - y
# NOTE(review): `break` ends the whole loop over rect_titles, so all later
# titles are dropped too; if only this path is to be skipped, `continue`
# was presumably intended - confirm.
if x < x_min or x > text_field.xmax + image.x - THRESHOLD_X\
or y < y_min or y > text_field.ymax + image.y\
or rect.get('id') == text_field.id:
break
else:
x = float(rect.get('x', svg_tree.getroot().nsmap))
y = float(rect.get('y', svg_tree.getroot().nsmap))
height = float(rect.get('height', svg_tree.getroot().nsmap))
# the chained assignment binds width twice; a single assignment would do
width = width=float(rect.get('width', svg_tree.getroot().nsmap))
matrix = None
if bool(rect.get('transform')):
matrix = Matrix(transform_matrix_string=rect.get('transform'))
# Word positions are stored relative to the upper left corner of the text field.
faksimile_page.append_word_position(\
WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=rect_title.text, height=height,\
width=width, x=x-x_min, y=y-y_min, matrix=matrix, tag=WordPosition.FAKSIMILE))
faksimile_pages.append(faksimile_page)
return faksimile_pages
Index: svgscripts/datatypes/faksimile_image.py
===================================================================
--- svgscripts/datatypes/faksimile_image.py (revision 49)
+++ svgscripts/datatypes/faksimile_image.py (revision 50)
@@ -1,71 +1,92 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent faksimile images.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
+import fnmatch
from lxml import etree as ET
-from os.path import isfile
+import os
+from os.path import basename, dirname, isfile, realpath, sep
+import sys
from .image import Image
+sys.path.append('svgscripts')
+from local_config import FAKSIMILE_LOCATION
# Diff hunk (revision 49 -> 50): lines starting with '+' are added by
# revision 50, lines starting with '-' are removed by it.  Revision 50 adds
# local_path and text_field to the image and resolves the local image file.
class FaksimileImage(Image):
"""
This class represents a faksimile image.
Args:
file_name (str): name of the image file.
node (lxml.etree.Element) node, containing information
URL (str): URL of image file.
height (float): height of image
width (float): width of image
x (float): x
y (float): y
"""
XML_TAG = 'faksimile-image'
# CREATE_IMAGE appends the file name without '.jpg' to this prefix.
NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/page/download/'
# NOTE(review): x and y always come from the keyword arguments; they are not
# read from `node`, so an instance created from a node gets the defaults
# unless they are passed as well - confirm.
- def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0):
- super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL,\
- height=height, width=width, tag=self.XML_TAG)
+ def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, x=0.0, y=0.0, text_field=None):
+ super(FaksimileImage, self).__init__(node=node, file_name=file_name, URL=URL, local_path=local_path,\
+ height=height, width=width, text_field=text_field, tag=self.XML_TAG)
self.x = x
self.y = y
# The copy is built from the attribute values of this instance; only
# text_field differs.
+ def get_image_joined_with_text_field(self, text_field):
+ """Returns a new instance of itself that has a text_field (text_field.TextField).
+ """
+ return FaksimileImage(file_name=self.file_name, local_path=self.local_path, URL=self.URL, height=self.height,\
+ width=self.width, x=self.x, y=self.y, text_field=text_field)
+
@staticmethod
- def CREATE_IMAGE(image_node):
+ def CREATE_IMAGE(image_node, source_file=None):
"""Instantiates a FaksimileImage from a (lxml.etree.Element) image_node.
"""
# Without a namespace map the href attribute is looked up with an empty
# namespace, i.e. as '{}href'.
namespaces = image_node.nsmap
if len(namespaces) == 0:
namespaces = { 'xlink': '' }
# Revision 50 keeps the href value as local_path and uses only its base name
# as file_name.  If href has a directory part and source_file is given, the
# path is taken relative to the directory of source_file.  If no file exists
# at local_path, the tree below FAKSIMILE_LOCATION is searched for a file
# named file_name.
# NOTE(review): a missing href attribute would pass None to basename - confirm
# that image nodes always carry it.
# NOTE(review): indentation is lost in this view; if `break` belongs to the
# inner loop, os.walk goes on and a later directory with the same file name
# overrides local_path - confirm.
- file_name = image_node.get('{%s}href' % namespaces['xlink'])
+ local_path = image_node.get('{%s}href' % namespaces['xlink'])
+ file_name = basename(local_path)
+ if file_name != local_path and source_file is not None:
+ local_path = realpath(dirname(source_file)) + sep + local_path
+ local_path = realpath(local_path)
+ if not isfile(local_path):
+ local_path = None
+ for path, dirs, files in os.walk(os.path.abspath(FAKSIMILE_LOCATION)):
+ for filename in fnmatch.filter(files, file_name):
+ local_path = os.path.join(path, filename)
+ break
URL = FaksimileImage.NIETZSCHE_SOURCES_URL + file_name.replace('.jpg','')
# Missing or empty size/position attributes default to 0.0.
height = float(image_node.get('height')) if bool(image_node.get('height')) else 0.0
width = float(image_node.get('width')) if bool(image_node.get('width')) else 0.0
x = float(image_node.get('x')) if bool(image_node.get('x')) else 0.0
y = float(image_node.get('y')) if bool(image_node.get('y')) else 0.0
- return FaksimileImage(file_name=file_name, URL=URL, height=height, width=width, x=x, y=y)
+ return FaksimileImage(file_name=file_name, local_path=local_path, URL=URL, height=height, width=width, x=x, y=y)