Index: shared_util/myxmlwriter.py
===================================================================
--- shared_util/myxmlwriter.py (revision 0)
+++ shared_util/myxmlwriter.py (revision 66)
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This program can be used to pretty-write a xml string to a xml file.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
+
+import inspect
+import xml.dom.minidom as MD
+import xml.etree.ElementTree as ET
+import lxml.etree as LET
+from datetime import datetime
+from rdflib import URIRef
+import sys
+import warnings
+
+sys.path.append('svgscripts')
+from datatypes.page import FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
# Re-export the file-type constants imported from datatypes.page so callers
# can import every FILE_TYPE_* name from this module; FILE_TYPE_XML_DICT is
# native to this module (used by dict2xml/xml2dict).
FILE_TYPE_SVG_WORD_POSITION = FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = FILE_TYPE_XML_MANUSCRIPT
FILE_TYPE_XML_DICT = 'xml-dictionary'
+
def attach_dict_to_xml_node(dictionary, xml_node):
    """Recursively attach the contents of ``dictionary`` to ``xml_node``.

    Scalar values become leaf elements whose ``type`` attribute records the
    Python type name and whose text is ``str(value)``; nested dicts (including
    dict subclasses such as OrderedDict) recurse into a child element named
    after their key.

    Children are created through the node's own ``makeelement`` factory, so
    this works with both lxml and xml.etree elements.

    :param dictionary: the dict to serialize
    :param xml_node: the element that receives the children (mutated in place)
    """
    for key, value in dictionary.items():
        if isinstance(value, dict):
            # Container: plain child element, recurse into it.
            child = xml_node.makeelement(key, {})
            xml_node.append(child)
            attach_dict_to_xml_node(value, child)
        else:
            # Leaf: remember the type name so xml2dict can reconstruct it.
            leaf = xml_node.makeelement(key, {'type': type(value).__name__})
            leaf.text = str(value)
            xml_node.append(leaf)
+
def dict2xml(dictionary, target_file_name):
    """Serialize ``dictionary`` to ``target_file_name`` as a pretty-printed
    XML file of type FILE_TYPE_XML_DICT (structure: /root/dict/...).
    """
    root = LET.Element('root')
    xml_tree = LET.ElementTree(root)
    attach_dict_to_xml_node(dictionary, LET.SubElement(root, 'dict'))
    # Record this function's own name as the creating script in the metadata.
    write_pretty(xml_element_tree=xml_tree, file_name=target_file_name,
                 script_name=inspect.currentframe().f_code.co_name,
                 file_type=FILE_TYPE_XML_DICT)
+
def get_dictionary_from_node(node):
    """Return the dictionary encoded by ``node``.

    Inverse of attach_dict_to_xml_node: an element with children becomes a
    nested dict; a leaf is reconstructed from its ``type`` attribute and text.

    Leaf types are resolved through a whitelist of constructors instead of
    ``eval`` — evaluating the ``type`` attribute of an arbitrary XML file can
    execute attacker-controlled code. Unknown type names (e.g. ``URIRef``)
    are looked up in this module's globals and fall back to ``str``.

    :return: dict
    """
    safe_constructors = {'str': str, 'int': int, 'float': float, 'complex': complex}
    children = list(node)  # list(node) instead of deprecated getchildren()
    if children:
        sub_dict = {}
        for child_node in children:
            sub_dict.update(get_dictionary_from_node(child_node))
        return {node.tag: sub_dict}
    type_name = node.get('type')
    if type_name == 'bool':
        # bool('False') is True, so booleans must be compared against their repr.
        value = (node.text == 'True') if bool(node.text) else None
    else:
        elem_cls = safe_constructors.get(type_name) if bool(type_name) else str
        if elem_cls is None:
            # Non-builtin type name, e.g. rdflib.URIRef imported at module level.
            elem_cls = globals().get(type_name, str)
        value = elem_cls(node.text) if bool(node.text) else None
    return {node.tag: value}
+
def lock_xml_tree(xml_element_tree, **locker_dict):
    """Mark ``xml_element_tree`` as locked.

    Adds a <lock> element (with a <reference-file> child and an optional
    <message> child taken from ``locker_dict``) under ./metadata, creating
    the metadata element if necessary. Does nothing when the tree is None
    or already locked.
    """
    if xml_element_tree is None or test_lock(xml_element_tree, silent=True):
        return
    message = locker_dict.get('message') or ''
    reference_file = locker_dict.get('reference_file') or ''
    existing_metadata = xml_element_tree.xpath('./metadata')
    metadata = existing_metadata[0] if len(existing_metadata) > 0\
            else LET.SubElement(xml_element_tree.getroot(), 'metadata')
    lock = LET.SubElement(metadata, 'lock')
    LET.SubElement(lock, 'reference-file').text = reference_file
    if message != '':
        LET.SubElement(lock, 'message').text = message
+
def parse_xml_of_type(xml_source_file, file_type):
    """Parse ``xml_source_file`` and return its tree.

    :raise Exception: when the file's metadata type is not ``file_type``.
    :return: lxml.etree.ElementTree
    """
    parsed_tree = LET.parse(xml_source_file)
    if xml_has_type(file_type, xml_tree=parsed_tree):
        return parsed_tree
    raise Exception('File {} is not of type {}!'.format(xml_source_file, file_type))
+
def test_lock(xml_element_tree=None, silent=False):
    """Test if xml_element_tree is locked and print a message.

    A tree counts as locked when it contains a ./metadata/lock element
    (as written by lock_xml_tree). Unless silent is True, a warning is
    emitted naming the reference file and message when present.

    :return: True if locked
    """
    if xml_element_tree is None:
        return False
    if len(xml_element_tree.findall('./metadata/lock')) > 0:
        reference_file = xml_element_tree.findall('./metadata/lock/reference-file')
        message = xml_element_tree.findall('./metadata/lock/message')
        if not silent:
            # docinfo.URL is lxml-specific — assumes the tree was parsed from
            # a file; a tree built in memory has docinfo.URL == None.
            warning_msg = 'File {0} is locked!'.format(xml_element_tree.docinfo.URL)
            if len(reference_file) > 0:
                # Turn "...locked!" into "...locked on <file>." when a
                # reference file was recorded.
                warning_msg = warning_msg.replace('!', ' ') + 'on {0}.'.format(reference_file[0].text)
            if len(message) > 0:
                warning_msg = warning_msg + '\n{0}'.format(message[0].text)
            warnings.warn(warning_msg)
        return True
    return False
+
def update_metadata(xml_element_tree, script_name, file_type=None):
    """Record creation/modification information in the tree's <metadata>.

    When a <metadata> element exists, a <modifiedBy script="..."> entry for
    ``script_name`` is created (once) and its text set to the current time.
    Otherwise <metadata> is created with an optional <type> and a
    <createdBy> element carrying script name and date.
    """
    root = xml_element_tree.getroot()
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    metadata = root.find('./metadata')
    if metadata is not None:
        modified_query = './modifiedBy[@script="{}"]'.format(script_name)
        matches = metadata.findall(modified_query)
        modified_node = matches[0] if len(matches) > 0\
                else LET.SubElement(metadata, 'modifiedBy', attrib={'script': script_name})
        modified_node.text = timestamp
    else:
        metadata = LET.SubElement(root, 'metadata')
        if file_type is not None:
            LET.SubElement(metadata, 'type').text = file_type
        created_by = LET.SubElement(metadata, 'createdBy')
        LET.SubElement(created_by, 'script').text = script_name
        LET.SubElement(created_by, 'date').text = timestamp
+
def write_pretty(xml_string=None, xml_element_tree=None, file_name=None, script_name=None, file_type=None, **locker_dict):
    """Pretty-print an XML string or element tree to a file.

    :param xml_string: serialized XML (used in preference to the tree)
    :param xml_element_tree: tree to write; also receives metadata/lock updates
    :param file_name: target file; defaults to the tree's docinfo.URL
    :param script_name: when given, recorded via update_metadata
    :param file_type: metadata type recorded on first creation
    :param locker_dict: 'reference_file'/'message' — locks the tree when given
    :raise Exception: when neither source nor a usable file name is available
    """
    if not bool(xml_string) and not bool(xml_element_tree):
        raise Exception("write_pretty needs a string or a xml.ElementTree!")
    if not test_lock(xml_element_tree):
        # Only an unlocked tree may be locked / have its metadata updated.
        if len(locker_dict) > 0 and bool(locker_dict.get('reference_file')):
            lock_xml_tree(xml_element_tree, **locker_dict)
        if script_name is not None and xml_element_tree is not None:
            update_metadata(xml_element_tree, script_name, file_type=file_type)
    if file_name is None and xml_element_tree is not None\
    and xml_element_tree.docinfo is not None and xml_element_tree.docinfo.URL is not None:
        file_name = xml_element_tree.docinfo.URL
    if file_name is None:
        raise Exception("write_pretty needs a file_name or a xml.ElementTree with a docinfo.URL!")
    dom = MD.parseString(xml_string) if bool(xml_string) else MD.parseString(ET.tostring(xml_element_tree.getroot()))
    # Open explicitly as UTF-8: writexml declares encoding="utf-8" in the XML
    # header but does not encode itself, so the file's real encoding must
    # match regardless of the locale. The with-statement guarantees the
    # handle is closed even when writexml raises.
    with open(file_name, "w", encoding="utf-8") as output_file:
        dom.writexml(output_file, addindent="\t", newl='\n', encoding='utf-8')
+
def xml2dict(xml_source_file):
    """Create dict from xml_source_file of Type FILE_TYPE_XML_DICT.

    Inverse of dict2xml: reads the children of /root/dict and reconstructs
    the dictionary via get_dictionary_from_node.

    :raise Exception: when the file is not of type FILE_TYPE_XML_DICT
    :return: dict
    """
    xml_tree = LET.parse(xml_source_file)
    dict_nodes = xml_tree.xpath('/root/dict')
    if not xml_has_type(FILE_TYPE_XML_DICT, xml_tree=xml_tree) or len(dict_nodes) == 0:
        msg = 'File {} is not of type {}!'.format(xml_source_file, FILE_TYPE_XML_DICT)
        raise Exception(msg)
    new_dict = {}
    # Iterate the element directly: getchildren() is deprecated in lxml and
    # was removed from xml.etree in Python 3.9.
    for node in dict_nodes[0]:
        new_dict.update(get_dictionary_from_node(node))
    return new_dict
+
def xml_has_type(file_type, xml_source_file=None, xml_tree=None):
    """Return True if xml_source_file/xml_tree has file type == file_type.

    :param file_type: expected //metadata/type text
    :param xml_source_file: path to parse when no tree is given
    :param xml_tree: already-parsed tree (takes precedence)
    :return: bool (False when neither source is given or no type is recorded)
    """
    if xml_tree is None and xml_source_file is None:
        return False
    if xml_tree is None:
        xml_tree = LET.parse(xml_source_file)
    # Evaluate the XPath once instead of twice.
    type_texts = xml_tree.xpath('//metadata/type/text()')
    if len(type_texts) < 1:
        return False
    return type_texts[0] == file_type
Index: tests_shared_util/test_data/N_VII_1_page001.xml
===================================================================
--- tests_shared_util/test_data/N_VII_1_page001.xml (revision 0)
+++ tests_shared_util/test_data/N_VII_1_page001.xml (revision 66)
@@ -0,0 +1,1369 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ svgWordPosition
+
+
+ 2019-08-02 15:26:13
+
+ 2019-08-02 15:26:13
+
+
Index: tests_shared_util/test_myxmlwriter.py
===================================================================
--- tests_shared_util/test_myxmlwriter.py (revision 0)
+++ tests_shared_util/test_myxmlwriter.py (revision 66)
@@ -0,0 +1,103 @@
+import unittest
+import os
+from os.path import isfile, isdir, dirname, sep, realpath
+from datetime import datetime
+import shutil
+import tempfile
+import xml.etree.ElementTree as ET
+import lxml.etree as LET
+from rdflib import Graph, URIRef, Literal, BNode, OWL, RDF, RDFS, XSD
+from xmldiff import main
+import sys
+
+sys.path.append('svgscripts')
+from datatypes.page import Page
+
+sys.path.append('shared_util')
+try:
+ from myxmlwriter import attach_dict_to_xml_node, dict2xml, lock_xml_tree, update_metadata, write_pretty, test_lock, xml_has_type,\
+ FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_DICT, get_dictionary_from_node, xml2dict, parse_xml_of_type
+except ImportError:
+ sys.path.append(dirname(dirname(realpath(__file__))))
+ from shared_util.myxmlwriter import attach_dict_to_xml_node, dict2xml, lock_xml_tree, update_metadata, write_pretty, test_lock, xml_has_type,\
+ FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_DICT, get_dictionary_from_node, xml2dict, parse_xml_of_type
+
class TestPrettyWriter(unittest.TestCase):
    """Unit tests for shared_util.myxmlwriter."""
    def setUp(self):
        # Throw-away directory for the files the tests write (removed in tearDown).
        self.test_dir = tempfile.mkdtemp()
        self.title = 'ASDF'
        DATADIR = dirname(__file__) + sep + 'test_data'
        self.page = DATADIR + sep + 'N_VII_1_page001.xml'
        # Nested sample dict covering int, str and rdflib.URIRef leaves.
        self.mydict = { 'asdf': { 'b': { 'a': 1, 'b': 'c' , 'c': URIRef('adf')}},\
                'str': 'test' }

    def test_attach_dict_to_xml_node(self):
        # Leaf elements must carry the Python type name in their @type attribute.
        xml_tree = LET.Element('root')
        attach_dict_to_xml_node(self.mydict, LET.SubElement(xml_tree, 'dict'))
        #print(LET.dump(xml_tree))
        self.assertEqual(xml_tree.xpath('//asdf/b/a/@type')[0], 'int')
        self.assertEqual(xml_tree.xpath('//asdf/b/b/@type')[0], 'str')
        self.assertEqual(xml_tree.xpath('//asdf/b/c/@type')[0], URIRef.__name__)

    def test_dict2xml(self):
        # dict2xml must create the target file.
        test_file = self.test_dir + sep + 'new_test.xml'
        dict2xml(self.mydict, test_file)
        self.assertEqual(isfile(test_file), True)

    def test_get_dictionary_from_node(self):
        # Round-trip: every top-level key written by dict2xml must be recoverable.
        test_file = self.test_dir + sep + 'source.xml'
        dict2xml(self.mydict, test_file)
        xml_tree = LET.parse(test_file)
        self.assertEqual(len(xml_tree.xpath('/root/dict')[0].getchildren()), len(self.mydict.keys()))
        for index, key in enumerate(self.mydict.keys()):
            mydict = get_dictionary_from_node(xml_tree.xpath('/root/dict')[0].getchildren()[index])
            self.assertEqual(key in mydict.keys(), True)
            if type(self.mydict[key]) == dict:
                self.assertEqual(mydict[key].keys(), self.mydict[key].keys())

    def test_update_metadata(self):
        # First call creates <createdBy>; repeated calls keep exactly one
        # <modifiedBy> entry per script name.
        test_tree = LET.ElementTree(LET.Element('page', attrib={"title": self.title}))
        update_metadata(test_tree, __file__)
        self.assertEqual(test_tree.find('./metadata').find('./createdBy').find('./script').text, __file__)
        update_metadata(test_tree, __file__)
        self.assertEqual(len(test_tree.find('./metadata').findall('./modifiedBy[@script="{}"]'.format(__file__))), 1)
        update_metadata(test_tree, __file__)
        self.assertEqual(len(test_tree.find('./metadata').findall('./modifiedBy[@script="{}"]'.format(__file__))), 1)

    def test_write_pretty(self):
        # Output written via write_pretty must be semantically identical
        # (xmldiff) to the tree serialized by ElementTree itself.
        et_file = self.test_dir + os.sep + 'et_file.xml'
        pretty_file = self.test_dir + os.sep + 'pretty_file.xml'
        manuscript_tree = ET.ElementTree(ET.Element('page', attrib={"title": self.title}))
        metadata = ET.SubElement(manuscript_tree.getroot(), 'metadata')
        ET.SubElement(metadata, 'type').text = 'xmlManuscriptFile'
        createdBy = ET.SubElement(metadata, 'createdBy')
        manuscript_tree.write(et_file, xml_declaration=True, encoding='utf-8')
        write_pretty(xml_string=ET.tostring(manuscript_tree.getroot()), file_name=pretty_file)
        self.assertEqual(main.diff_files(et_file, pretty_file), [])
        write_pretty(xml_element_tree=manuscript_tree, file_name=pretty_file)
        self.assertEqual(main.diff_files(et_file, pretty_file), [])

    def test_lock(self):
        # Locking the page tree must be visible through Page.is_locked().
        page = Page(xml_source_file=self.page)
        locker_dict = { 'reference_file': 'asdf.txt', 'message': 'locked on this file'}
        lock_xml_tree(page.page_tree, **locker_dict)
        self.assertEqual(page.is_locked(), True)
        #test_lock(page.page_tree)

    def test_xml2dict(self):
        # Full round-trip: dict -> xml file -> dict must be lossless.
        test_file = self.test_dir + sep + 'source.xml'
        dict2xml(self.mydict, test_file)
        mydict = xml2dict(test_file)
        self.assertEqual(mydict, self.mydict)

    def test_xml_has_type(self):
        # The fixture page is a svg-word-position file, not a dict file;
        # parse_xml_of_type must therefore raise for the dict type.
        self.assertEqual(xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=self.page), True)
        self.assertEqual(xml_has_type(FILE_TYPE_XML_DICT, xml_source_file=self.page), False)
        with self.assertRaises(Exception):
            parse_xml_of_type(self.page, FILE_TYPE_XML_DICT)

    def tearDown(self):
        isdir(self.test_dir) and shutil.rmtree(self.test_dir)
+
# Allow running this test module directly (python3 test_myxmlwriter.py).
if __name__ == "__main__":
    unittest.main()
Index: svgscripts/myxmlwriter.py
===================================================================
--- svgscripts/myxmlwriter.py (revision 65)
+++ svgscripts/myxmlwriter.py (revision 66)
@@ -1,112 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-""" This program can be used to pretty-write a xml string to a xml file.
-"""
-# Copyright (C) University of Basel 2019 {{{1
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see 1}}}
-
-import xml.dom.minidom as MD
-import xml.etree.ElementTree as ET
-import lxml.etree as LET
-from datetime import datetime
-import sys
-import warnings
-
-sys.path.append('svgscripts')
-
-from datatypes.page import FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
-
-__author__ = "Christian Steiner"
-__maintainer__ = __author__
-__copyright__ = 'University of Basel'
-__email__ = "christian.steiner@unibas.ch"
-__status__ = "Development"
-__license__ = "GPL v3"
-__version__ = "0.0.1"
-
-FILE_TYPE_SVG_WORD_POSITION = FILE_TYPE_SVG_WORD_POSITION
-FILE_TYPE_XML_MANUSCRIPT = FILE_TYPE_XML_MANUSCRIPT
-
-def lock_xml_tree(xml_element_tree, **locker_dict):
- """Lock xml_element_tree.
- """
- if xml_element_tree is not None and not test_lock(xml_element_tree, silent=True):
- message = locker_dict.get('message') if bool(locker_dict.get('message')) else ''
- reference_file = locker_dict.get('reference_file') if bool(locker_dict.get('reference_file')) else ''
- metadata = xml_element_tree.xpath('./metadata')[0]\
- if len(xml_element_tree.xpath('./metadata')) > 0\
- else LET.SubElement(xml_element_tree.getroot(), 'metadata')
- lock = LET.SubElement(metadata, 'lock')
- LET.SubElement(lock, 'reference-file').text = reference_file
- if message != '':
- LET.SubElement(lock, 'message').text = message
-
-def test_lock(xml_element_tree=None, silent=False):
- """Test if xml_element_tree is locked and print a message.
-
- :return: True if locked
- """
- if xml_element_tree is None:
- return False
- if len(xml_element_tree.findall('./metadata/lock')) > 0:
- reference_file = xml_element_tree.findall('./metadata/lock/reference-file')
- message = xml_element_tree.findall('./metadata/lock/message')
- if not silent:
- warning_msg = 'File {0} is locked!'.format(xml_element_tree.docinfo.URL)
- if len(reference_file) > 0:
- warning_msg = warning_msg.replace('!', ' ') + 'on {0}.'.format(reference_file[0].text)
- if len(message) > 0:
- warning_msg = warning_msg + '\n{0}'.format(message[0].text)
- warnings.warn(warning_msg)
- return True
- return False
-
-def update_metadata(xml_element_tree, script_name, file_type=None):
- """Updates metadata of xml tree.
- """
- if len(xml_element_tree.getroot().findall('./metadata')) > 0:
- if len(xml_element_tree.getroot().find('./metadata').findall('./modifiedBy[@script="{}"]'.format(script_name))) == 0:
- LET.SubElement(xml_element_tree.getroot().find('./metadata'), 'modifiedBy', attrib={'script': script_name})
- xml_element_tree.getroot().find('./metadata').findall('./modifiedBy[@script="{}"]'.format(script_name))[0].text = \
- datetime.now().strftime('%Y-%m-%d %H:%M:%S')
- else:
- metadata = LET.SubElement(xml_element_tree.getroot(), 'metadata')
- if file_type is not None:
- LET.SubElement(metadata, 'type').text = file_type
- createdBy = LET.SubElement(metadata, 'createdBy')
- LET.SubElement(createdBy, 'script').text = script_name
- LET.SubElement(createdBy, 'date').text = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-
-def write_pretty(xml_string=None, xml_element_tree=None, file_name=None, script_name=None, file_type=None, **locker_dict):
- """Writes a xml string pretty to a file.
- """
- if not bool(xml_string) and not bool(xml_element_tree):
- raise Exception("write_pretty needs a string or a xml.ElementTree!")
- if not test_lock(xml_element_tree):
- if len(locker_dict) > 0 and bool(locker_dict.get('reference_file')):
- lock_xml_tree(xml_element_tree, **locker_dict)
- if script_name is not None and xml_element_tree is not None:
- update_metadata(xml_element_tree, script_name, file_type=file_type)
- if file_name is None and xml_element_tree is not None\
- and xml_element_tree.docinfo is not None and xml_element_tree.docinfo.URL is not None:
- file_name = xml_element_tree.docinfo.URL
- if file_name is None:
- raise Exception("write_pretty needs a file_name or a xml.ElementTree with a docinfo.URL!")
- dom = MD.parseString(xml_string) if(bool(xml_string)) else MD.parseString(ET.tostring(xml_element_tree.getroot()))
- f = open(file_name, "w")
- dom.writexml(f, addindent="\t", newl='\n', encoding='utf-8')
- f.close()
-
Index: svgscripts/extractAndConvert.py
===================================================================
--- svgscripts/extractAndConvert.py (revision 65)
+++ svgscripts/extractAndConvert.py (revision 66)
@@ -1,136 +1,138 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to start extraction of word position and showing the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import getopt
import re
import sys
from os import sep, path
from os.path import isfile
import lxml.etree as ET
from extractWordPosition import Extractor
from convert_wordPositions import HTMLConverter
+
+sys.path.append('shared_util')
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
def usage():
    """Print the command-line help text (main's docstring).
    """
    print(main.__doc__)
def main(argv):
    """This program can be used to start extraction of word position and showing the word positions to HTML for testing purposes.

    svgscripts/extractAndConvert.py [OPTIONS] svg file OR xml target file containing file name of svg file as "/page/@source".

    OPTIONS:
        -h|--help: show help
        -s|--svg=svgFile: svg web file
        -H|--HTML [default] convert to HTML test file
        -x|--xml-target-file=xmlOutputFile: target file
        -p|--page=pageNumber: page number of the current page. For use with _one_ file only.
        -P|--PDF=pdfFile: pdf file - used for word correction
        -t|--title=title: title of the manuscript to which the current page(s) belong(s)

    :return: exit code (int)
    """
    convert_to_type = 'HTML'
    file_name = None
    non_testing = True
    page = None
    page_number = None
    pdfFile = None
    svg_file = None
    title = None
    xml_dir = ".{}xml".format(sep)
    xml_target_file = None
    try:
        opts, args = getopt.getopt(argv, "hTHt:p:s:x:P:", ["help", "Testing", "HTML", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-T', '--Testing'):
            non_testing = False
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-P', '--PDF'):
            pdfFile = arg
        elif opt in ('-x', '--xml-target-file'):
            xml_target_file = str(arg)
    # Positional argument is either an xml target file or the svg source file.
    if len(args) < 1 or args[0].endswith('xml'):
        if xml_target_file is None:
            xml_target_file = args[0] if len(args) > 0 else None
        if xml_target_file is not None and isfile(xml_target_file):
            # Derive source svg, title and page number from the target file's
            # attributes when they were not given on the command line.
            target_file_tree = ET.parse(xml_target_file)
            file_name = target_file_tree.getroot().get('source')
            title = target_file_tree.getroot().get('title') if title is None else title
            page_number = target_file_tree.getroot().get('number') if page_number is None else page_number
            if svg_file is None:
                # Newer files record the web svg as <svg-image @file-name>,
                # older ones as <svg @file>.
                if len(target_file_tree.xpath('//svg-image')) > 0:
                    svg_file = target_file_tree.xpath('.//svg-image/@file-name')[0]\
                            if len(target_file_tree.xpath('.//svg-image/@file-name')) > 0 else None
                else:
                    svg_file = target_file_tree.xpath('.//svg/@file')[0]\
                            if len(target_file_tree.xpath('.//svg/@file')) > 0 else None
    else:
        file_name = args[0]
    if file_name is None or not isfile(file_name):
        print("'{}' does not exist!".format(file_name)) if (file_name is not None) else usage()
        return 2
    extractor = Extractor(xml_dir=xml_dir, title=title, extract_transkription_field_only=True)
    page = extractor.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file)
    if page.svg_file is None:
        print('Please specify a svg file!')
        usage()
        return 2
    page.init_words()
    converter = HTMLConverter(page, non_testing=non_testing)
    converter.convert()
    if xml_target_file is not None:
        # Persist results next to the other xml files and remember the source svg.
        xml_target_file = xml_dir + sep + path.basename(xml_target_file)
        page.page_tree.getroot().set('source', file_name)
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition')
    return 0
Index: svgscripts/join_faksimileAndTranskription.py
===================================================================
--- svgscripts/join_faksimileAndTranskription.py (revision 65)
+++ svgscripts/join_faksimileAndTranskription.py (revision 66)
@@ -1,562 +1,564 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of a faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convert_wordPositions import create_pdf_with_highlighted_words, create_svg_with_highlighted_words
from create_task import CorrectWords
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.lineNumber import LineNumber
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
-from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from process_files import update_svgposfile_status
from util import ExternalViewer, create_highlighted_svg_file, get_empty_node_ids, record_changes, record_changes_on_svg_file_to_page
+sys.path.append('shared_util')
+from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# Set by the test suite to suppress interactive behaviour.
UNITTESTING = False
PUNCTUATION_PATTERN = r"[{}]".format(string.punctuation)
# NOTE(review): despite its name this only matches a trailing double quote,
# not general punctuation — confirm whether more characters were intended.
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
SINGLE_PUNCTUATION_PATTERN = r"^[{}]$".format(string.punctuation)
SINGLE_WORD_PATTERN = r"^[\w{}]$".format(string.punctuation)
STATUS_MERGED_OK = 'faksimile merged'
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
def create_task_correct_words(target_dir, xml_source_file=None, source_svg_file=None, page=None, unmatched_word_ids=None, unmatched_node_ids=None):
    """Create a task CorrectWords or process corrected files.

    When neither a finished nor an unfinished transkription file exists yet,
    the source xml is locked and a correction task is created; when finished
    files exist, their corrections are recorded and the join is re-run.

    :return: exit status (int)
    """
    exit_status = 0
    if xml_source_file is None or source_svg_file is None:
        # Fall back to the page's own file locations when not given explicitly.
        if xml_source_file is None and page is not None and isfile(page.page_tree.docinfo.URL):
            xml_source_file = page.page_tree.docinfo.URL if xml_source_file is None else xml_source_file
        elif xml_source_file is None:
            raise Exception('create_task_correct_words needs a xml_source_file or a page that has a valid tree source!')
        if source_svg_file is None and page is not None and isfile(page.faksimile_svgFile):
            source_svg_file = page.faksimile_svgFile if source_svg_file is None else source_svg_file
        elif source_svg_file is None:
            raise Exception('create_task_correct_words needs a source_svg_file or a page that has a faksimile_svgFile!')
    correct_words = CorrectWords(xml_source_file, source_svg_file, target_dir,\
            unmatched_word_ids=unmatched_word_ids, unmatched_node_ids=unmatched_node_ids)
    finished_faksimile_file = correct_words.get_target_filepath(finished_dir=True)
    finished_transkription_file = correct_words.get_target_filepath(get_faksimile=False, finished_dir=True)
    unfinished_transkription_file = correct_words.get_target_filepath(get_faksimile=False)
    if page is not None:
        if not isfile(finished_transkription_file) and not isfile(unfinished_transkription_file):
            # Lock the source file and tell the user how to resume.
            lock_dict = { 'reference_file': finished_transkription_file,\
                    'message': 'Run:$python3 {0} -c {1} {2}'.format(__file__, target_dir, source_svg_file)}
            write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file, script_name=__file__,\
                    file_type=FILE_TYPE_SVG_WORD_POSITION, **lock_dict)
    if isfile(finished_transkription_file) or isfile(finished_faksimile_file):
        if isfile(finished_transkription_file):
            page = record_changes_on_svg_file_to_page(xml_source_file, finished_transkription_file, word_ids=unmatched_word_ids)
        faksimile_file = finished_faksimile_file if isfile(finished_faksimile_file) else source_svg_file
        # Bug fix: join on the fallback computed above — previously
        # finished_faksimile_file was passed even when it does not exist.
        exit_status = join_faksimileAndTranskription(faksimile_file)
    elif not isfile(correct_words.get_target_filepath()):
        correct_words.create()
    return exit_status
def debug_function(words, input=''):
    """Print (in red) the words whose debug_container marks them, plus *input*.

    Note: the parameter name ``input`` shadows the builtin but is kept for
    caller compatibility.
    """
    marked_words = [ w for w in words if w.debug_container.get('marked') ]
    if marked_words:
        print(Fore.RED + 'marked word(s): {}'.format([ w.text for w in marked_words ]))
    if input != '':
        print('input: {}'.format(input))
    print(Fore.RESET)
def fix_errors(faksimile_file, unmerged_faksimile_positions, unmerged_words, text_field_id=None, faksimile_page=None, xml_source_file=None, manuscript_file=None, namespaces={}):
    """Creates a faksimile svg file and a svg file highlighting the positions of the word positions
    that could not be merged. After correction, results are inserted into the original file and processed again.

    NOTE(review): ``namespaces={}`` is a mutable default; it is only rebound
    here (never mutated in place), but callees receive it — verify they do
    not mutate it.

    :return: exit status (int)
    """
    parser = ET.XMLParser(remove_blank_text=True)
    faksimile_tree = ET.parse(faksimile_file, parser)
    if len(namespaces) == 0:
        # Map the svg file's own namespace prefixes; the default namespace
        # (prefix None) is exposed as 'ns' for XPath use.
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    if faksimile_page is None:
        faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
        if text_field_id is not None\
        and text_field_id in [ faksimile_page.text_field.id for faksimile_page in faksimile_pages ]:
            faksimile_page = [ faksimile_page for faksimile_page in faksimile_pages if faksimile_page.text_field.id == text_field_id ][0]
        else:
            faksimile_page = faksimile_pages[0]
    if xml_source_file is None or manuscript_file is None:
        xml_source_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
    tmp_dir = tempfile.mkdtemp()
    tmp_pdf_file = tmp_dir + sep + 'output.pdf'
    tmp_svg_file = tmp_dir + sep + 'output.svg'
    tmp_faksimile = tmp_dir + sep + 'faksimile.svg'
    # When there are fewer faksimile positions than words, empty faksimile
    # nodes are candidates for the missing positions and get highlighted too.
    empyt_node_ids = get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)\
    if len(unmerged_faksimile_positions) < len(unmerged_words) else []
    highlight_node_ids = [ faksimile_position.id for faksimile_position in unmerged_faksimile_positions ]
    highlight_node_ids += empyt_node_ids
    create_highlighted_svg_file(faksimile_tree, highlight_node_ids, target_file=tmp_faksimile,
            local_image_path=faksimile_page.faksimile_image.local_path, namespaces=namespaces, highlight_color=HIGHLIGHT_COLOR)
    #create_pdf_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, pdf_file_name=tmp_pdf_file, bg_color=HIGHLIGHT_COLOR)
    create_svg_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, svg_file_name=tmp_svg_file, bg_color=HIGHLIGHT_COLOR)
    exit_status = 2
    if isfile(tmp_svg_file) and isfile(tmp_faksimile):
        # Let the user correct both files interactively, then record the
        # changes back into the sources and re-run the merge once.
        ExternalViewer.show_files(list_of_files=[tmp_svg_file, tmp_faksimile])
        record_changes_on_svg_file_to_page(xml_source_file, tmp_svg_file, word_ids=[ word.id for word in unmerged_words ])
        record_changes(faksimile_file, tmp_faksimile, highlight_node_ids, namespaces=namespaces)
        shutil.rmtree(tmp_dir)
        exit_status = join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False, join_single_char_words=True)
    return exit_status
def get_filelist_and_manuscript_file(file_a, file_b=None):
    """Return a file list of svg files and a manuscript file (or None).

    Accepted combinations:
    - file_a is a svg file (file_b optionally the manuscript xml file),
    - file_a is a manuscript xml file (file_b optionally a svg file or a
      directory containing svg files),
    - file_a is a directory containing svg files (file_b optionally the
      manuscript xml file).

    :param file_a: path to a svg file, a manuscript xml file or a directory
    :param file_b: optional second path (see above)
    :return: (list of svg file paths, manuscript file path or None)
    """
    file_list = []
    manuscript_file = None
    if isfile(file_a) and file_a.endswith('svg'):
        file_list.append(file_a)
        if file_b is not None and isfile(file_b):
            manuscript_file = file_b
    elif isfile(file_a) and file_a.endswith('xml'):
        manuscript_file = file_a
        if file_b is not None and isfile(file_b):
            file_list.append(file_b)
        # Guard file_b against None: isdir(None) raises TypeError.
        elif file_b is not None and isdir(file_b):
            # Prefix the directory so the list contains usable paths
            # (consistent with the directory branch below).
            file_list = [ file_b + sep + svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ]
    elif isdir(file_a):
        file_list = [ file_a + sep + svgfile for svgfile in listdir(file_a) if svgfile.endswith('svg') ]
        if file_b is not None and isfile(file_b):
            manuscript_file = file_b
    return file_list, manuscript_file
def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False):
    """Return svg_pos_file and manuscript_file if they are ready for processing.

    :param faksimile_page: page whose title/page number identify the manuscript page
    :param manuscript_file: manuscript xml file; guessed from the page title if it
        does not match the title
    :param redo_ok: if True, also accept pages whose status merely *contains* "OK"
        (e.g. "OK:faksimile merged") instead of being exactly "OK"
    :return: (svg_pos_file or None, manuscript_file)
    """
    svg_pos_file = None
    manuscript_tree = None
    if manuscript_file is not None\
    and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')):
        manuscript_tree = ET.parse(manuscript_file)
    else:
        # Derive the manuscript file name from the page title; prefer the ./xml directory.
        title_string = faksimile_page.title.replace(' ', '_')
        manuscript_file = '.{}xml'.format(sep) + sep + title_string + '.xml'\
        if isdir('.{}xml'.format(sep)) else title_string + '.xml'
        if isfile(manuscript_file):
            manuscript_tree = ET.parse(manuscript_file)
    if manuscript_tree is not None:
        # With redo_ok any status containing "OK" is accepted; the second check
        # only accepts the exact status "OK".
        if redo_ok and len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0:
            svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
        if len(manuscript_tree.getroot().xpath('//page[@number="%s" and @status="OK"]/@output' % faksimile_page.page_number)) > 0:
            svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
        else:
            if not UNITTESTING:
                # CYAN: the page was already joined; MAGENTA: the page is not ready.
                msg_color = Fore.CYAN if len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0\
                else Fore.MAGENTA
                msg = 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, faksimile_page.page_number)\
                if msg_color == Fore.MAGENTA\
                else 'Faksimile already joined!'
                print(msg_color + msg, end='')
                print(Style.RESET_ALL)
    return svg_pos_file, manuscript_file
def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, do_fix_errors=False, redo_ok=False, debug_word_text='', **kwargs):
    """Joins the data of a faksimile file with the data of svgposfile.

    :param faksimile_file: svg file containing the faksimile word positions
    :param manuscript_file: manuscript xml file (looked up per page if None)
    :param do_fix_errors: interactively fix unjoined words/positions (see fix_errors)
    :param redo_ok: also process pages whose status contains "OK"
    :param debug_word_text: mark words with this text for debug output
    Recognized kwargs: 'do_fix_errors', 'redo_ok', 'debug_word_text' (fallbacks for
    the named parameters), 'do_not_join_single_char_words', 'correct_words'
    (target directory for a "CorrectWords" task).
    :return: exit status (int; 0 = ok, 2 = join incomplete)
    """
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
        print(Style.RESET_ALL)
    # Named parameters may alternatively be supplied through **kwargs.
    if not do_fix_errors and 'do_fix_errors' in kwargs.keys():
        do_fix_errors = kwargs.get('do_fix_errors')
    if not redo_ok and 'redo_ok' in kwargs.keys():
        redo_ok = kwargs.get('redo_ok')
    if debug_word_text == '' and 'debug_word_text' in kwargs.keys():
        debug_word_text = kwargs.get('debug_word_text')
    faksimile_tree = ET.parse(faksimile_file)
    # Map the anonymous default namespace to the prefix 'ns' so it is usable in xpath.
    namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
    exit_status = 0
    for faksimile_page in faksimile_pages:
        svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)
        if svg_pos_file is not None:
            if not UNITTESTING:
                print(Fore.CYAN + 'joining data with file {} ... '.format(svg_pos_file), end='')
            image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field)
            page = Page(xml_source_file=svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file)
            words = sort_words(page)
            # Mark the words matching debug_word_text so debug_function can report them.
            if debug_word_text != '' and len([ word for word in words if word.text == debug_word_text ]) > 0:
                for word in words:
                    if word.text == debug_word_text:
                        word.debug_container.update({'marked': True})
            if not bool(kwargs.get('do_not_join_single_char_words')):
                removed_words = join_single_char_words(words)
                page.words = words
                page.update_and_attach_words2tree()
                #print([ word.text for word in page.words if word in removed_words ])
            else:
                print('not joining single char words!')
            faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions)
            new_words = []
            # Process shorter texts first; longer words may have to be split on them.
            unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\
                    key=lambda text: len(text))
            for word_text in unique_faksimile_words:
                process_word_text(new_words, word_text, faksimile_positions, words)
            if False not in [ word.joined for word in words ]\
            and False not in [ position.joined for position in faksimile_positions]\
            and not UNITTESTING:
                # Everything was joined: persist the merge result.
                if page.is_locked():
                    page.unlock()
                post_merging_processing_and_saving(svg_pos_file, new_words, page=page, manuscript_file=manuscript_file)
                print(Fore.GREEN + '[OK]')
                print(Style.RESET_ALL)
            elif not UNITTESTING:
                # Report what could not be joined and decide how to continue.
                not_joined_fp = [ (position.id, position.text) for position in faksimile_positions if not position.joined ]
                plural_fp = '' if len(not_joined_fp) < 2 else 's'
                not_joined_tw = [ (word.id, word.line_number, word.text) for word in words if not word.joined ]
                plural_tw = '' if len(not_joined_tw) < 2 else 's'
                print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp))
                print('--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
                debug_function(new_words, input='new_words')
                debug_function(words, input='words')
                print(Style.RESET_ALL)
                if kwargs.get('correct_words') is not None:
                    unmatched_node_ids = [ position.id for position in faksimile_positions if not position.joined]
                    unmatched_node_ids += get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)
                    exit_status = create_task_correct_words(kwargs.get('correct_words'), page=page, source_svg_file=faksimile_file,\
                            unmatched_word_ids=[ word.id for word in words if not word.joined ],\
                            unmatched_node_ids=unmatched_node_ids)
                elif do_fix_errors:
                    exit_status = fix_errors(faksimile_file, [position for position in faksimile_positions if not position.joined],\
                            [ word for word in words if not word.joined ], text_field_id=faksimile_page.text_field.id,\
                            faksimile_page=faksimile_page, xml_source_file=svg_pos_file,\
                            manuscript_file=manuscript_file, namespaces=namespaces)
                else:
                    exit_status = 2
            elif False in [ word.joined for word in words ]:
                # While unit testing: just report the unjoined words.
                print([ (word.id, word.text) for word in words if not word.joined ])
                exit_status = 2
    return exit_status
def join_single_char_words(words, threshold_x=5, threshold_y=5):
    """Join single char words with a neighbouring word on the same line.

    Mutates ``words`` in place: each joined single char word is merged into its
    predecessor and removed from the list.

    :param words: list of words with consecutive ids (see sort_words)
    :param threshold_x: maximal horizontal distance for joining
    :param threshold_y: maximal vertical distance for joining
    :return: a list of removed words
    """
    #all_single_char_words = [ word for word in words if re.match(r'^\w$', word.text) ]
    removed_words = []
    all_single_char_words = [ word for word in words if re.match(SINGLE_WORD_PATTERN, word.text) ]
    if not UNITTESTING:
        bar = Bar('Joining single char words', max=len(all_single_char_words))
    line_numbers = sorted(set(word.line_number for word in all_single_char_words))
    for line_number in line_numbers:
        single_char_words = [ word for word in all_single_char_words if word.line_number == line_number ]
        # Walk backwards so popping from ``words`` does not disturb earlier candidates.
        index = len(single_char_words)
        while index > 0:
            index -= 1
            word = None
            not UNITTESTING and bar.next()
            if single_char_words[index] in words:
                single_char_word_index = words.index(single_char_words[index])
                # Single punctuation is joined to the preceding word with wider,
                # fixed thresholds (15/12) instead of the given ones.
                if re.match(SINGLE_PUNCTUATION_PATTERN, single_char_words[index].text)\
                and words_close_enough(words[single_char_word_index-1], single_char_words[index], 15, 12):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
                    #print('{0} -> {1}, {2}'.format(word.text, words[single_char_word_index-1].text))
                elif index > 0\
                and words_close_enough(single_char_words[index-1], single_char_words[index], threshold_x, threshold_y):
                    # NOTE(review): proximity is tested against the previous *single char*
                    # word (single_char_words[index-1]) but the join target is the previous
                    # word in ``words`` — verify these are intended to be the same word.
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
                elif single_char_word_index > 0\
                and words[single_char_word_index-1].line_number == line_number\
                and words_close_enough(words[single_char_word_index-1], single_char_words[index], threshold_x, threshold_y):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
    not UNITTESTING and bar.finish()
    return removed_words
def post_merging_processing_and_saving(svg_pos_file, new_words, page=None, manuscript_file=None, target_svg_pos_file=None):
    """Post-process a page after merging words with faksimile word positions.

    Replaces the page's words with the merged ones, re-runs the special-word and
    path categorization steps, marks the svg position file as merged and writes
    the result out (to ``target_svg_pos_file``, or ``svg_pos_file`` by default).
    Raises FileNotFoundError if the page has no existing source file.
    """
    if page is None:
        page = Page(xml_source_file=svg_pos_file)
    # Drop all existing word nodes; they are re-attached from page.words below.
    for stale_node in page.page_tree.xpath('.//word'):
        stale_node.getparent().remove(stale_node)
    page.words = sorted(new_words, key=attrgetter('id'))
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file))
    tfield = TranskriptionField(page.source)
    page.find_special_words(transkription_field=tfield)
    #TODO: page.find_hyphenated_words()
    page.categorize_paths(transkription_field=tfield)
    page.update_and_attach_words2tree()
    if target_svg_pos_file is None:
        target_svg_pos_file = svg_pos_file
    update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=STATUS_MERGED_OK)
    write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file,
                 script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text='', min_length_split=5):
    """Joins faksimile_positions with text == word_text with words with text == word_text.

    When the counts do not match, several fallbacks are tried (recursively via
    ``alt_word_text``): 'ss'→'ß' substitution, punctuation variants, splitting
    longer words that start/end with the text, and collecting adjacent words
    whose concatenation yields the text. Matched words are appended to
    ``new_words`` and flagged ``joined`` in both lists (in-place mutation).

    :param new_words: output list of successfully joined words
    :param word_text: text of the faksimile positions to join
    :param faksimile_positions: all faksimile positions
    :param words: all transkription words
    :param alt_word_text: alternative spelling tried instead of word_text
    :param min_length_split: minimal text length for the split fallbacks
    """
    text = word_text if alt_word_text == '' else alt_word_text
    fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ]
    words4word = [ word for word in words if word.text == word_text and not word.joined ]
    if alt_word_text != '':
        # Also accept words spelled with the alternative text.
        words4word += [ word for word in words if word.text == text and not word.joined ]
        words4word = sorted(words4word, key=attrgetter('id'))
    if len(fposition4word) == len(words4word):
        # 1:1 correspondence: join positions and words pairwise by order.
        for index, faksimile_position in enumerate(fposition4word):
            faksimile_position.joined = True
            words4word[index].faksimile_positions = [ faksimile_position ]
            words[words.index(words4word[index])].joined = True
            new_words.append(words4word[index])
    elif len(words4word) < len(fposition4word):
        # Not enough matching words: try alternative spellings / splits.
        if re.match(r'(.*)ss(.*)', text):
            # Try the 'ß' spelling.
            alt_word_text = re.sub(r'ss', 'ß', text)
            process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
        elif re.match(SINGLE_PUNCTUATION_PATTERN, text):
            if text == '-':
                # Try the en-dash variant.
                alt_word_text = text.replace('-', '–')
                process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
            else:
                print('single', word_text, len(fposition4word), len(words4word))
        elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text):
            # Strip punctuation and look for words containing the remainder.
            alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text)
            debug_function(words4word, input='elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text) text {0}'.format(text))
            if alt_word_text != '':
                pattern = r'(.*){0}(.*)'.format(alt_word_text)
                words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ]
                if len(words4word) < len(fposition4word):
                    process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
                else:
                    words4word = sorted(words4word, key=attrgetter('id'))
                    for index, faksimile_position in enumerate(fposition4word):
                        faksimile_position.joined = True
                        words4word[index].faksimile_positions = [ faksimile_position ]
                        # If the word lacks the trailing punctuation and the next word
                        # is exactly that punctuation, merge it in.
                        if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\
                        and words.index(words4word[index])+1 < len(words)\
                        and words[words.index(words4word[index])+1].text == word_text[len(word_text)-1]:
                            words4word[index].join(words[words.index(words4word[index])+1])
                            words[words.index(words4word[index])+1].joined = True
                        words[words.index(words4word[index])].joined = True
                        words4word[index].text = word_text
                        new_words.append(words4word[index])
        elif len(text) >= min_length_split and len([ word for word in words if word.text.startswith(text) and not word.joined ]) == len(fposition4word):
            # Split words that *start* with the text; the remainder becomes a new word.
            new_words4word = [ word for word in words if word.text.startswith(text) and not word.joined ]
            debug_function(new_words4word, input='word.startswith {}'.format(text))
            for index, fposition in enumerate(fposition4word):
                old_word = new_words4word[index]
                none_word, new_word, next_word = old_word.split(text, start_id=old_word.id)
                fposition4word[index].joined = True
                new_word.faksimile_positions = [ fposition4word[index] ]
                words[words.index(old_word)] = new_word
                if next_word is not None:
                    next_word.id = len(words)
                    next_word.joined = False
                    words.append(next_word)
                new_word.joined = True
                new_words.append(new_word)
        elif len(text) >= min_length_split and len([ word for word in words if word.text.endswith(text) and not word.joined ]) == len(fposition4word):
            # Split words that *end* with the text; the prefix becomes a new word.
            new_words4word = [ word for word in words if word.text.endswith(text) and not word.joined ]
            debug_function(new_words4word, input='word.endswith {}'.format(text))
            for index, fposition in enumerate(fposition4word):
                old_word = new_words4word[index]
                before_word, new_word, none_word = old_word.split(text, start_id=old_word.id)
                fposition4word[index].joined = True
                new_word.faksimile_positions = [ fposition4word[index] ]
                words[words.index(old_word)] = new_word
                if before_word is not None:
                    before_word.id = len(words)
                    before_word.joined = False
                    words.append(before_word)
                new_word.joined = True
                new_words.append(new_word)
        else:
            if len(text) > 1:
                # Look for words matching the text minus its first character and
                # try to reassemble the full text from preceding words.
                new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ]
                debug_function(new_words4word, input='else text {0}'.format(text))
                if len(new_words4word) == 0:
                    alt_word_text = text[1:]
                    process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
                else:
                    for new_word in new_words4word:
                        collected_text = new_word.text
                        current_word = new_word
                        # Walk backwards, collecting preceding words while the
                        # accumulated text is still a suffix of word_text.
                        while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0:
                            previous_word = words[current_word.id-1]
                            if word_text.endswith(previous_word.text + collected_text):
                                words[current_word.id].joined = True
                                previous_word.join(current_word)
                                current_word = previous_word
                                collected_text = current_word.text
                            else:
                                collected_text = previous_word.text + collected_text
                        words4word.append(current_word)
                    words4word = sorted(words4word, key=attrgetter('id'))
                    for index, faksimile_position in enumerate(fposition4word):
                        faksimile_position.joined = True
                        words4word[index].faksimile_positions = [ faksimile_position ]
                        words4word[index].text = word_text
                        words[words.index(words4word[index])].joined = True
                        new_words.append(words4word[index])
            else:
                print('<{0}> f{1}/t{2}, ids: {3}'.\
                        format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ]))
    else:
        print('<{0}> f{1}/t{2}'.format(word_text, len(fposition4word), len(words4word)))
def sort_words(page):
    """Return the page's words in reading order (top left to bottom right).

    Words are grouped by line; even line numbers are sorted by their left edge,
    odd line numbers by a comparator that also requires the baselines to be
    close. Word ids are renumbered consecutively and ``joined`` is reset.
    """
    if any(w.line_number == -1 for w in page.words):
        warnings.warn('{} in page file {} for word with ids {}'.format(LineNumber.WARN_NO_LINE_NUMBER, page.page_tree.docinfo.URL, page.page_tree.xpath('./word[not(@line-number)]/@id')))

    def _reading_order(word_a, word_b):
        # word_a precedes word_b only if it starts further left AND the two
        # bottoms differ by less than half of word_a's height.
        pos_a = word_a.transkription_positions[0]
        pos_b = word_b.transkription_positions[0]
        if pos_a.left < pos_b.left and abs(pos_a.bottom - pos_b.bottom) < pos_a.height / 2:
            return -1
        return 1

    ordered = []
    for line in page.line_numbers:
        on_line = [w for w in page.words if w.line_number == line.id]
        if line.id % 2 == 0:
            ordered.extend(sorted(on_line, key=lambda w: w.transkription_positions[0].left))
        else:
            ordered.extend(sorted(on_line, key=cmp_to_key(_reading_order)))
    for new_id, word in enumerate(ordered):
        word.id = new_id
        word.joined = False
    return ordered
def sort_faksimile_positions(faksimile_positions):
    """Return the faksimile positions in reading order (top left to bottom right).

    Also resets ``joined`` to False on every position.
    """
    for position in faksimile_positions:
        position.joined = False

    def _by_center(pos_a, pos_b):
        # pos_a precedes pos_b only if its center point is both left of
        # and above pos_b's center point.
        if pos_a.left + pos_a.width / 2 <= pos_b.left + pos_b.width / 2 \
                and pos_a.top + pos_a.height / 2 <= pos_b.top + pos_b.height / 2:
            return -1
        return 1

    return sorted(faksimile_positions, key=cmp_to_key(_by_center))
@deprecated(reason="Writing process id is now set to word not word_position, TODO: check faksimile_positions for split candidates!")
def update_writing_process(word):
    """Synchronize the faksimile word position's writing process id with the
    transkription word positions.

    If the transkription positions belong to several writing processes but there
    is just one faksimile position, the update is skipped; such faksimile
    positions are fixed manually in a later stage.
    """
    process_ids = list({tp.writing_process_id for tp in word.transkription_positions})
    if len(process_ids) == 1 and len(word.faksimile_positions) > 0:
        word.faksimile_positions[0].writing_process_id = process_ids[0]
def words_close_enough(wordA, wordB, threshold_x=10, threshold_y=5):
    """Return True if wordA's last transkription position and wordB's first
    transkription position lie within the given horizontal/vertical thresholds
    (comparing left edges and bottoms).
    """
    last_a = wordA.transkription_positions[-1]
    first_b = wordB.transkription_positions[0]
    horizontally_close = abs(last_a.left - first_b.left) < threshold_x
    vertically_close = abs(last_a.bottom - first_b.bottom) < threshold_y
    return horizontally_close and vertically_close
def usage():
    """Print usage information (main's docstring) to stdout."""
    print(main.__doc__)
def main(argv):
    """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.

    svgscripts/join_faksimileAndTranskription.py [OPTIONS] <file|directory> [xmlManuscriptFile]

        a directory containing a svg file containing information about the word positions on the faksimile.
        a xml file about a manuscript, containing information about its pages.

    OPTIONS:
        -h|--help: show help
        -c|--correct-words=DIR create a task "CorrectWords" in target dir DIR
        -d|--debug-word=WORD show debug information for word == WORD
        -f|--fix-errors: open faksimile svg file if there are errors
        -i|--ignore-status-ok ignore status "OK:faksimile merged" in manuscript file and redo merging.

    :return: exit code (int)
    """
    # Collected options are forwarded to join_faksimileAndTranskription as kwargs.
    commando_dict = { 'do_fix_errors': False, 'redo_ok': False, 'debug_word_text': '', 'correct_words': None }
    try:
        opts, args = getopt.getopt(argv, "hc:d:fi", ["help", "correct-words=", "debug-word=", "fix-errors", "ignore-status-ok" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-c', '--correct-words'):
            commando_dict['correct_words'] = arg
        elif opt in ('-d', '--debug-word'):
            commando_dict['debug_word_text'] = arg
        elif opt in ('-f', '--fix-errors'):
            commando_dict['do_fix_errors'] = True
        elif opt in ('-i', '--ignore-status-ok'):
            commando_dict['redo_ok'] = True
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if exists(file_a):
        # Optional second positional argument (manuscript file or svg file/dir).
        file_b = None
        if len(args) > 1 and exists(args[1]):
            file_b = args[1]
        file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
        for faksimile_file in file_list:
            # NOTE(review): the return value of join_faksimileAndTranskription is
            # discarded here, so exit_status is always 0 on this path — confirm
            # whether the join's exit status should be propagated.
            join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, **commando_dict)
    else:
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    return exit_status
# Script entry point: forward the command line arguments (without the program
# name) to main() and use its return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/process_files.py
===================================================================
--- svgscripts/process_files.py (revision 65)
+++ svgscripts/process_files.py (revision 66)
@@ -1,356 +1,358 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract information from all text svg files in directory.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convertPDF2SVG4Web import Converter
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
from extractWordPosition import Extractor
+
+sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# When True, console/progress output is suppressed throughout this module;
# presumably toggled by the unit tests — verify.
UNITTESTING = False
class MyErrorHandler:
    """This class can be used to handle errors executing extractWordPosition.Extractor.extractAndWriteInformation.

    Errors are persisted to ERROR_LOG as xml and can be reprocessed later via run().
    """
    ERROR_LOG = 'error_log.xml'  # xml file in which error cases are recorded

    def __init__(self):
        # Start with an empty <error-log/>; load the existing log if present.
        self.tree = ET.ElementTree(ET.Element('error-log'))
        if isfile(MyErrorHandler.ERROR_LOG):
            parser = ET.XMLParser(remove_blank_text=True)
            self.tree = ET.parse(MyErrorHandler.ERROR_LOG, parser)

    def record_error(self, svgfile, pdffile, title, page_number, error=None):
        """Records an error.

        Reuses the <error> node for (title, page_number) if one already exists;
        appends the svg/pdf file names and, if given, the error type and message.
        """
        if len(self.tree.xpath('//error[@title="{0}" and @number="{1}"]'.format(title, page_number))) > 0:
            error_node = self.tree.xpath('//error[@title="{0}" and @number="{1}"]'.format(title, page_number))[0]
        else:
            error_node = ET.SubElement(self.tree.getroot(), 'error', attrib={'title': title, 'number': page_number})
        ET.SubElement(error_node, 'svgfile').text = svgfile
        ET.SubElement(error_node, 'pdffile').text = pdffile
        if error is not None:
            error_node.set('type', str(type(error).__name__))
            if str(error) != '':
                error_msg = ET.SubElement(error_node, 'error-msg')
                error_msg.text = str(error)
                if str(type(error).__name__) == 'ExpatError':
                    # An ExpatError indicates the svg file could not be parsed at all.
                    error_msg.text += '->svgfile is empty!'

    def run(self, title=None, page_number=None, error_type=None):
        """Run all or some errors.

        Reprocesses the recorded error cases (optionally filtered by title, page
        number and/or error type) and removes the cases that now succeed.
        [:return:] exit status (int)
        """
        # Build an xpath filter from the given criteria.
        xpath = '//error'
        if title is not None and page_number is not None:
            xpath = '//error[@title="{0}" and @number="{1}"]'.format(title, page_number)
        elif title is not None:
            xpath = '//error[@title="{0}"]'.format(title)
        elif page_number is not None:
            xpath = '//error[@number="{0}"]'.format(page_number)
        if error_type is not None:
            # Either start a fresh predicate or extend the existing one with the type test.
            xpath = xpath + '[@type="{0}"]'.format(error_type)\
                    if title is None and page_number is None\
                    else xpath.replace(']', ' ') + 'and @type="{0}"]'.format(error_type)
        exit_status = 0
        for error in self.tree.xpath(xpath):
            title = error.get('title')
            page_number = error.get('number')
            svgfile = error.xpath('./svgfile/text()')[0]\
                    if len(error.xpath('./svgfile/text()')) > 0 else None
            pdffile = error.xpath('./pdffile/text()')[0]\
                    if len(error.xpath('./pdffile/text()')) > 0 else None
            if svgfile is not None:
                converter = Converter(title=title)
                extractor = Extractor(title=title, extract_transkription_field_only=True, compare2pdf=True)
                status = process_file(converter, extractor, svgfile, pdffile, page_number)
                if status > 0:
                    exit_status = status
                if status < 2:
                    # Status 0/1 means the case could be processed: drop it from the log.
                    error.getparent().remove(error)
        self.write()
        return exit_status

    def write(self):
        """Writes error log.
        """
        write_pretty(xml_element_tree=self.tree, file_name=MyErrorHandler.ERROR_LOG, script_name=__file__, file_type='xmlErrorLog')
def is_page_ok(manuscript_file=None, page_number=None):
    """Return True if the page's status is exactly 'OK' and its output file exists.

    :param manuscript_file: path to the manuscript xml file
    :param page_number: number of the page to check
    :return: bool (False when the manuscript file or page cannot be found)
    """
    if manuscript_file is not None and isfile(manuscript_file):
        manuscript_tree = ET.parse(manuscript_file)
        page_nodes = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)
        if page_number is not None and len(page_nodes) > 0:
            # Guard against a missing @output attribute: isfile(None) raises TypeError.
            output_file = page_nodes[0].get('output')
            return page_nodes[0].get('status') == 'OK'\
                    and output_file is not None and isfile(output_file)
    return False
def is_svg_ok(manuscript_file=None, page_number=None):
    """Return True if the page's output file names an existing svg graphic file.

    :param manuscript_file: path to the manuscript xml file
    :param page_number: number of the page to check
    :return: bool (False when the manuscript file, page or output cannot be found)
    """
    if manuscript_file is not None and isfile(manuscript_file):
        manuscript_tree = ET.parse(manuscript_file)
        page_nodes = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)
        if page_number is not None and len(page_nodes) > 0:
            # Guard against a missing @output attribute: isfile(None) raises TypeError.
            output_file = page_nodes[0].get('output')
            if output_file is not None and isfile(output_file):
                xml_source_tree = ET.parse(output_file)
                svg_files = xml_source_tree.xpath('//svg/@file')
                return len(svg_files) > 0 and isfile(svg_files[0])
    return False
def process_file(converter, extractor, svgfile, pdffile, page_number):
    """Processes file.

    Converts the pdf page to a graphical svg file, shrinks it to the
    transkription field and extracts the word position information from the
    text svg file, updating the svg position file's status afterwards.

    :param converter: convertPDF2SVG4Web.Converter
    :param extractor: extractWordPosition.Extractor
    :param svgfile: text svg file to extract word positions from
    :param pdffile: pdf file to convert
    :param page_number: number of the page being processed
    [:return:] exit status (int; 1 when extraction finished with warnings)
    """
    exit_status = 0
    path_svg_file = converter.get_file_name(pdffile, page_number=page_number)
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Processing file {} ...'.format(svgfile))
        print(Style.RESET_ALL)
    if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=path_svg_file) == 0:
        transkriptionField = TranskriptionField(path_svg_file)
        transkriptionField.shrink_svg_to_transkription_field()
        xml_target_file = extractor.get_file_name(svgfile, page_number)
        extraction_status = extractor.extractAndWriteInformation(svgfile, xml_target_file=xml_target_file,\
                page_number=page_number, pdfFile=pdffile, svg_file=path_svg_file, record_warnings=True)
        if extraction_status < 2 and extractor.manuscript_file is not None:
            status = 'OK'
            if extraction_status == 1:
                # Extraction produced warnings: record the extractor's latest status.
                status = extractor.latest_status
                exit_status = 1
            #update_manuscript_file(extractor.manuscript_file, page_number, xml_target_file, status=status)
            update_svgposfile_status(xml_target_file, manuscript_file=extractor.manuscript_file, status=status)
    return exit_status
def update_graphical_svg(converter, svgfile, pdffile, page_number, xml_source_file):
    """Create a new graphical svg file and update xml output file.

    Converts the pdf page to a svg graphic, shrinks it to the transkription
    field and records the new graphic on the page in ``xml_source_file``.
    [:return:] exit status (int; 2 when xml_source_file does not exist)
    """
    # Guard clause: nothing to update without an existing xml source file.
    if not isfile(xml_source_file):
        return 2
    target_svg = converter.get_file_name(pdffile, page_number=page_number)
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Creating file {} ...'.format(svgfile))
        print(Style.RESET_ALL)
    if converter.pdf2svg(pdffile, page_number=page_number, svg_file_name=target_svg) == 0:
        field = TranskriptionField(target_svg)
        field.shrink_svg_to_transkription_field()
        page = Page(xml_source_file=xml_source_file, svg_file=target_svg)
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,
                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return 0
def update_manuscript_file(manuscript_file, page_number, file_name, status='changed', append=True):
    """Updates manuscript file: adds status information about page.

    Creates the <page> node (inside a <pages> node) if none exists for the
    given page number. A status containing 'OK' is never replaced; with
    ``append`` a not-yet-recorded status is appended colon-separated.

    :param manuscript_file: path to the manuscript xml file
    :param page_number: page number to update
    :param file_name: output file recorded on the page node
    :param status: status string to set/append
    :param append: append to an 'OK...' status instead of leaving it unchanged
    """
    if isfile(manuscript_file):
        parser = ET.XMLParser(remove_blank_text=True)
        manuscript_tree = ET.parse(manuscript_file, parser)
        page_nodes = manuscript_tree.getroot().xpath('//page[@number="%s"]' % page_number)
        if len(page_nodes) > 0:
            node = page_nodes[0]
            old_status = node.get('status')
            if old_status is None or 'OK' not in old_status.split(':'):
                node.set('status', status)
            elif append and status not in old_status.split(':'):
                # Append the new status; if it is already recorded, leave the node
                # unchanged (the previous code re-set an undefined variable here,
                # raising NameError).
                node.set('status', old_status + ':' + status)
            if not bool(node.get('output')):
                node.set('output', file_name)
        else:
            pages_node = manuscript_tree.getroot().find('pages')\
                    if manuscript_tree.getroot().find('pages') is not None\
                    else ET.SubElement(manuscript_tree.getroot(), 'pages')
            new_id = len(pages_node.findall('page')) + 1
            ET.SubElement(pages_node, 'page', attrib={'id': str(new_id), 'number': str(page_number), 'status': status, 'output': file_name})
        write_pretty(xml_element_tree=manuscript_tree, file_name=manuscript_file, script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT)
def update_svgposfile_status(file_name, manuscript_file=None, status='changed', append=True):
    """Updates svg position file's status.

    A status containing 'OK' is never replaced; with ``append`` a not-yet-recorded
    status is appended colon-separated. If a manuscript file is given, the
    corresponding page entry is updated as well.

    :param file_name: path of the svg position file
    :param manuscript_file: manuscript xml file to update too (optional)
    :param status: status string to set/append
    :param append: append to an 'OK...' status instead of leaving it unchanged
    """
    if isfile(file_name):
        parser = ET.XMLParser(remove_blank_text=True)
        file_tree = ET.parse(file_name, parser)
        old_status = file_tree.getroot().get('status')
        if old_status is None or 'OK' not in old_status.split(':'):
            file_tree.getroot().set('status', status)
        elif append and status not in old_status.split(':'):
            # Append the new status; if it is already recorded, leave the attribute
            # unchanged (the previous code re-set an undefined variable here,
            # raising NameError).
            file_tree.getroot().set('status', old_status + ':' + status)
        write_pretty(xml_element_tree=file_tree, file_name=file_name, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
        if manuscript_file is not None and isfile(manuscript_file):
            page_number = file_tree.getroot().get('number')
            update_manuscript_file(manuscript_file, page_number, file_name, status=status)
def usage():
    """Print usage information (main's docstring) to stdout."""
    print(main.__doc__)
def main(argv):
    """This program can be used to extract information from all text svg files in a directory.
    svgscripts/process_files.py [OPTIONS]
    svgscripts/process_files.py [OPTIONS] Directory containing pdfs corresponding to svg files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
    Directory containing svg files corresponding to pdf files (i.e. PDFDIR/NAME.pdf <-> TEXT_SVG_DIR/NAME.svg).
    OPTIONS:
    -h|--help: show help
    -e|--run-error Rerun error cases.
    -g|--check-graphic-svg Check that graphical svg file exists or generate a new svg file.
    -n|--number=pageNumber Use this with OPTION -e|--run-error in order to specify an error case.
    -t|--title=title: title of the manuscript to which all files belong.
    -T|--error-type: error type, use this with OPTION -e|--run-error in order to specify an error case.
    -s|--svg-target-dir=svg-target-dir target directory for path svg files, i.e. svg files that can be displayed on the web.
    -x|--xml-target-dir=xml-target-dir target directory for xml files.
    :return: exit code (int)
    """
    # Defaults for the command line options.
    title = None
    xml_target_dir = ".{}xml".format(sep)
    svg_target_dir = ".{}svg".format(sep)
    error_handler = MyErrorHandler()
    number = None
    rerun_errors = False
    error_type = None
    check_graphic_svg_exists = False
    # Parse the command line.
    try:
        opts, args = getopt.getopt(argv, "hegn:s:t:T:x:", ["help", "run-error", "check-graphic-svg", "number=", "svg-target-dir=", "title=", "error-type=", "xml-target-dir="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-e', '--run-error'):
            rerun_errors = True
        elif opt in ('-g', '--check-graphic-svg'):
            check_graphic_svg_exists = True
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-T', '--error-type'):
            error_type = arg
        elif opt in ('-n', '--number'):
            number = arg
        elif opt in ('-s', '--svg-target-dir'):
            svg_target_dir = arg
        elif opt in ('-x', '--xml-target-dir'):
            xml_target_dir = arg
    if rerun_errors:
        # -e: only rerun previously recorded error cases and exit.
        return error_handler.run(title=title, page_number=number, error_type=error_type)
    # Case 1: a single manuscript xml file -> derive pdf/svg directories
    # from the first page's recorded output/source/pdf paths.
    if len(args) == 1 and args[0].endswith('.xml'):
        source_tree = ET.parse(args[0])
        if source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
            svg_word_file_tree = ET.parse(source_tree.xpath('//page/@output')[0])
            svg_dir = dirname(svg_word_file_tree.xpath('//page/@source')[0])
            pdf_dir = dirname(svg_word_file_tree.xpath('//page/pdf/@file')[0])
        else:
            print('File {} is not of type {}'.format(args[0], FILE_TYPE_XML_MANUSCRIPT))
            usage()
            return 2
    elif len(args) < 1 or\
        (len(args) == 1\
        and (True not in [ pdffile.endswith('pdf') for pdffile in listdir(args[0]) ]\
        or True not in [ svgfile.endswith('svg') for svgfile in listdir(args[0]) ])\
        ):
        print("Please specify both PDFDIR and TEXT_SVG_DIR!")
        usage()
        return 2
    elif len(args) < 2:
        # Case 2: a single directory containing both pdf and svg files.
        pdf_dir, svg_dir = args[0], args[0]
    elif isdir(args[0]) and isdir(args[1]):
        # Case 3: two directories; swap them if the second one holds the pdfs.
        pdf_dir, svg_dir = args[0], args[1]
        if True in [ svgfile.endswith('pdf') for svgfile in listdir(args[1]) ]:
            pdf_dir, svg_dir = args[1], args[0]
    else:
        not_existing = args[0] if not isdir(args[0]) else args[1]
        print("ERROR directory {} does not exist!".format(not_existing))
        return 2
    list_of_svg = [ svgfile for svgfile in listdir(svg_dir) if svgfile.endswith('svg') ]
    list_of_pdf = [ pdffile for pdffile in listdir(pdf_dir) if pdffile.endswith('pdf') ]
    converter = Converter(target_dir=svg_target_dir, title=title)
    extractor = Extractor(xml_dir=xml_target_dir, title=title, extract_transkription_field_only=True, compare2pdf=True)
    exit_status = 0
    # Process every svg file that has a matching pdf file.
    for svgfile in list_of_svg:
        if svgfile.replace('.svg', '.pdf') in list_of_pdf:
            # Derive the manuscript title from the file name prefix
            # (e.g. 'XYp_A_12...' -> 'XYp A 12'); overrides the -t option.
            title = re.split(r'(^[A-Z]+p*_[A-Z]*_[0-9]*)', svgfile)[1].replace('_', ' ')
            if extractor.title is None or extractor.title != title:
                extractor.update_title_and_manuscript(title)
            if converter.title is None or converter.title != title:
                converter.title = title.replace(' ', '_')
            # Derive the page number from the file name (after 'page', or the
            # last '_'-separated component).
            if 'page' in svgfile:
                page_number = svgfile.replace('.svg','').split('page')[1]
            else:
                page_number = svgfile.replace('.svg','').split('_')[len(svgfile.replace('.svg','').split('_'))-1]
            pdffile = '{}{}{}'.format(pdf_dir, sep, svgfile.replace('.svg', '.pdf'))
            if not check_graphic_svg_exists and not is_page_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
                try:
                    svgfile = '{}{}{}'.format(svg_dir, sep, svgfile)
                    exit_status = process_file(converter, extractor, svgfile, pdffile, page_number)
                except Exception as err:
                    # Record the failure so it can be rerun later with -e.
                    error_handler.record_error(svgfile, pdffile, title, page_number, error=err)
                    if not UNITTESTING:
                        print(Fore.RED)
                        print('There was an error ->', err)
                        print(Style.RESET_ALL)
            elif not is_svg_ok(manuscript_file=extractor.manuscript_file, page_number=page_number):
                update_graphical_svg(converter, svgfile, pdffile, page_number, extractor.get_file_name(svgfile, page_number))
    error_handler.write()
    return exit_status
# Script entry point: forward the command line arguments (without the program name).
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/fix_missing_glyphs.py
===================================================================
--- svgscripts/fix_missing_glyphs.py (revision 65)
+++ svgscripts/fix_missing_glyphs.py (revision 66)
@@ -1,190 +1,192 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix missing glyphs.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.page import Page, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
-from myxmlwriter import write_pretty
from process_files import update_svgposfile_status
+sys.path.append('shared_util')
+from myxmlwriter import write_pretty
+
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
def find_missing_glyph_for_pwp(positional_word_part_node, svg_path_tree, namespaces, xmin=0.0, ymin=0.0):
    """Finds missing glyph for node of a PositionalWordPart.

    Retries the glyph lookup with a step-wise relaxed threshold (starting
    below zero) until at least one glyph has been found or the maximum
    threshold has been reached.
    :return: list of PositionalWordPart
    """
    MAX_THRESHOLD = 15.5
    pwp = PositionalWordPart(node=positional_word_part_node)
    word_part_obj = { "x": pwp.left, "y": pwp.top, "text": pwp.text, "matrix": pwp.transform, "class": pwp.style_class }
    first_id = int(pwp.id)
    current_threshold = -0.5
    found_parts = []
    while len(found_parts) < 1 and current_threshold < MAX_THRESHOLD:
        try:
            found_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces,\
                    start_id=first_id, xmin=xmin, ymin=ymin, threshold=current_threshold, throw_error_if_not_found=True)
        except Exception:
            # Nothing found at this threshold: relax it a little and retry.
            current_threshold += 0.1
    return found_parts
def update_word(page, positional_word_part_node, positional_word_parts):
    """Updates word according to new positional_word_parts.

    Replaces the positional word part identified by positional_word_part_node
    with the newly found positional_word_parts, renumbers all ids, rebuilds
    the affected transkription positions and re-attaches the word to the
    page tree. Does nothing if positional_word_parts is empty.
    """
    if len(positional_word_parts) > 0:
        debug_msg_string = 'update word from ' + __file__
        # Resolve word / transkription position / pwp ids from the xml node hierarchy.
        positional_word_part_id = int(positional_word_part_node.get('id'))
        transkription_position_id = int(positional_word_part_node.getparent().get('id'))
        word_id = int(positional_word_part_node.getparent().getparent().get('id'))
        word = page.words[word_id]
        transkription_position = word.transkription_positions[transkription_position_id]
        # Replace the old pwp by the new ones at the same index
        # (reverse + repeated insert preserves their original order).
        transkription_position.positional_word_parts.pop(positional_word_part_id)
        positional_word_parts.reverse()
        for positional_word_part in positional_word_parts:
            transkription_position.positional_word_parts.insert(positional_word_part_id, positional_word_part)
        # Renumber the pwp ids sequentially.
        for index, positional_word_part in enumerate(transkription_position.positional_word_parts):
            positional_word_part.id = index
        # Rebuild the transkription positions from the updated pwps and splice
        # them in at the position of the old one (same reverse/insert trick).
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                transkription_position.positional_word_parts, debug_msg_string=debug_msg_string, transkription_position_id=transkription_position_id)
        word.transkription_positions.pop(transkription_position_id)
        transkription_positions.reverse()
        for new_tp in transkription_positions:
            word.transkription_positions.insert(transkription_position_id, new_tp)
        # Renumber tps and rebuild the word text from all positional word parts.
        text = ''
        for index, tp in enumerate(word.transkription_positions):
            tp.id = index
            tp.writing_process_id = transkription_position.writing_process_id
            for pwp in tp.positional_word_parts:
                text += pwp.text
        if word.text != text:
            word.text = text
        word.attach_word_to_tree(page.page_tree)
def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None):
    """Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION.

    Scans the page for positional word parts without a 'symbol-id' attribute,
    tries to find the missing glyphs in the page's svg file and writes the
    updated page back. If afterwards no glyphs are missing, the file's status
    is set to 'OK' (and the manuscript file is updated accordingly).
    """
    if isfile(svg_word_pos_file):
        if not UNITTESTING:
            print(Fore.LIGHTBLUE_EX + 'Fixing missing glyphs for file {} ... '.format(svg_word_pos_file), end='')
            #print(Style.RESET_ALL)
        page = Page(xml_source_file=svg_word_pos_file)
        transkription_field = TranskriptionField(page.svg_file)
        svg_path_tree = ET.parse(page.svg_file)
        # Map the default (None) namespace to the prefix 'ns' so it is usable in xpath.
        namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
        number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'))
        for positional_word_part_node in page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'):
            pwps = find_missing_glyph_for_pwp(positional_word_part_node, svg_path_tree, namespaces, xmin=transkription_field.xmin, ymin=transkription_field.ymin)
            update_word(page, positional_word_part_node, pwps)
        write_pretty(xml_element_tree=page.page_tree, file_name=svg_word_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
        # Re-read the written file to count how many glyphs are still missing.
        page = Page(xml_source_file=svg_word_pos_file)
        new_number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'))
        if not UNITTESTING:
            result_color = Fore.LIGHTBLUE_EX if new_number_of_missing_glyphs == 0 else Fore.MAGENTA
            print(result_color + ' {0}/{1}'.format(number_of_missing_glyphs-new_number_of_missing_glyphs, number_of_missing_glyphs), end='')
            print(Fore.LIGHTBLUE_EX + ' fixed.', end='')
            print(Style.RESET_ALL)
        if len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) == 0:
            update_svgposfile_status(svg_word_pos_file, manuscript_file=manuscript_file, status='OK')
def get_filelist_and_manuscript_file(file_a, file_b=None):
    """Returns a file list and a manuscript file (or None).

    If file_a is a svg-word-position file with missing symbol ids, it becomes
    the (only) file to process and file_b (if given) the manuscript file.
    If file_a is a manuscript file, file_b (if given) becomes the file to
    process; otherwise all pages whose status contains the 'no use node'
    warning are taken from the manuscript.
    """
    files_to_process = []
    manuscript = None
    tree = ET.parse(file_a)
    doc_type = tree.getroot().find('metadata/type').text
    missing_symbol_xpath = '//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'
    if doc_type == FILE_TYPE_SVG_WORD_POSITION and len(tree.xpath(missing_symbol_xpath)) > 0:
        # file_a itself has missing symbol ids -> process it.
        files_to_process.append(file_a)
        if file_b is not None:
            manuscript = file_b
    elif doc_type == FILE_TYPE_XML_MANUSCRIPT:
        manuscript = file_a
        if file_b is not None:
            files_to_process.append(file_b)
        else:
            # Collect all page outputs flagged with the 'no use node' warning.
            files_to_process = tree.xpath('//page[contains(@status, "{}")]/@output'.format(PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower()))
    return files_to_process, manuscript
def usage():
    """Show how to invoke this script by printing main's docstring."""
    print(main.__doc__)
def main(argv):
    """This program can be used to fix missing glyphs.
    svgscripts/fix_missing_glyphs.py [OPTIONS] -File [-File]
    a xml file about a manuscript, containing information about its pages.
    a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help: show help
    :return: exit code (int)
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2
    # -h/--help is the only option and short-circuits everything else.
    if any(option in ('-h', '--help') for option, _ in opts):
        usage()
        return 0
    if len(args) < 1:
        usage()
        return 2
    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    file_b = args[1] if len(args) > 1 and isfile(args[1]) else None
    file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
    for svg_word_pos_file in file_list:
        fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file)
    return 0
# Script entry point: forward the command line arguments (without the program name).
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/datatypes/class_spec.py
===================================================================
--- svgscripts/datatypes/class_spec.py (revision 65)
+++ svgscripts/datatypes/class_spec.py (revision 66)
@@ -1,118 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-""" This is an abstract class for all classes that are semantically relevant.
-"""
-# Copyright (C) University of Basel 2019 {{{1
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see 1}}}
-
-__author__ = "Christian Steiner"
-__maintainer__ = __author__
-__copyright__ = 'University of Basel'
-__email__ = "christian.steiner@unibas.ch"
-__status__ = "Development"
-__license__ = "GPL v3"
-__version__ = "0.0.1"
-
-import abc
-import inspect
-import warnings
-
-class SemanticClass(metaclass=abc.ABCMeta):
- """
- This is an abstract class for all classes that are semantically relevant.
- """
- SINGLE_VALUE = 1
- LIST = -99
-
- @classmethod
- def get_class_dictionary(cls):
- """Creates and returns a class_dictionary with the keys 'this' ['type'].
- """
- class_dict = {'this': cls }
- if cls.__dict__.get('RDF_SUBCLASSES') and len(cls.RDF_SUBCLASSES) > 0:
- class_dict.update({'owl:equivalentClass': cls.RDF_SUBCLASSES })
- else:
- direct_super_class = inspect.getclasstree([cls],unique=True)[0][0]
- if issubclass(direct_super_class, SemanticClass) and direct_super_class != SemanticClass:
- class_dict.update({'type': direct_super_class})
- return class_dict
-
- def get_data_dictionary(self): # DEPRECIATED
- """Returns a data dictionary with the keys 'head' and 'body'.
-
- Key 'head' points to a dictionary with class information (key: 'class').
- Key 'body' points to a dictionary with the data.
- """
- warnings.warn("deprecated", DeprecationWarning)
- data_dict = {}
- semantic_dict = self.get_semantic_dictionary()
- data_dict.update({'head': {'class': semantic_dict['class'].get('this')}})
- body = {}
- """
- for key, (datatype, cardinality) in semantic_dict['properties'].items():
- if self.__dict__.get(key) is not None:
- if issubclass(datatype, SemanticClass):
- if cardinality != SemanticClass.SINGLE_VALUE and len(self.__dict__.get(key)) > 1:
- items = []
- for item in self.__dict__.get(key):
- items.append(item.get_data_dictionary().get('body'))
- body.update({ key: items})
- else:
- body.update({ key: self.__dict__.get(key).get_data_dictionary().get('body')})
- else:
- body.update({ key: self.__dict__.get(key) })
- data_dict.update({'body': body})
- """
- return data_dict
-
- @classmethod
- def get_cls_hasPart_objectCls_dictionaries(cls, object_cls, xpath, object_seqnum_xpath=None, cardinality=0, cardinality_restriction='minCardinality'):
- """Return a dictionary containing the information for creating a class that can act
- as an intermediary between cls and a number of object_cls if object_cls has
- a position in a sequence of object_classes that belong to cls.
- """
- part_name = object_cls.__name__ + 'Part'
- has_part_name = object_cls.__name__.lower() + 'PartHas' + object_cls.__name__
- has_seqnum_name = object_cls.__name__.lower() + 'HasSeqNum'
- if object_seqnum_xpath is None:
- object_seqnum_xpath = xpath + '/@id'
- object_part_dictionary = { 'class': object_cls, 'cardinality': 1, 'xpath': xpath,\
- 'name': has_part_name, 'label': '{0} has a {1}'.format(part_name, object_cls.__name__),\
- 'comment': '{0} has a part, that is a {1}'.format(part_name, object_cls.__name__)}
- object_seqnum_dictionary = { 'class': int, 'cardinality': 1, 'xpath': object_seqnum_xpath,\
- 'name': has_seqnum_name, 'label': '{0} has a sequence number'.format(part_name),\
- 'comment': '{0} has a part, that stands in a sequence with this number'.format(part_name, object_cls.__name__)}
- object_dictionary = { 'class_name': part_name, 'has_part': object_part_dictionary, 'has_seqnum': object_seqnum_dictionary,\
- 'label': '{0} part'.format(object_cls.__name__.lower()),\
- 'comment': 'This class servers as a intermediary between {0} and {1}. {0} has some {1} in a specific sequence.'.format(cls.__name__, object_cls.__name__)}
- dictionary = { 'flag': 'ordered_list' , 'class': object_dictionary, 'cardinality': cardinality, 'cardinality_restriction': cardinality_restriction, 'xpath': xpath,\
- 'name': cls.__name__.lower() + 'Has' + part_name, 'label': '{0} has a part that connects it with a {1}'.format(cls.__name__, object_cls.__name__),\
- 'comment': '{0} has a part that connects it with a {1}, that has a position in a sequence of {1}'.format(cls.__name__, object_cls.__name__)}
- return dictionary
-
- @classmethod
- @abc.abstractmethod
- def get_semantic_dictionary(cls):
- """Creates a semantic dictionary with 'class' and 'properties' as its keys.
-
- The class-key points to a class_dictionary with the keys: 'this' ['type', 'rdf:subClassOf']
-
- The properties-key points to a properties_dictionary with semantically relevant keys
- of self.__dict__ as keys, and tuples of datatype (class), cardinality (int) as its values.
-
- Cardinality can be SemanticClass.SINGLE_VALUE, SemanticClass.LIST.
- """
- pass
Index: svgscripts/datatypes/lineNumber.py
===================================================================
--- svgscripts/datatypes/lineNumber.py (revision 65)
+++ svgscripts/datatypes/lineNumber.py (revision 66)
@@ -1,101 +1,104 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a line number.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import re
from lxml import etree as ET
from os.path import isfile
+import sys
-from .class_spec import SemanticClass
from .matrix import Matrix
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
class LineNumber(SemanticClass):
    """
    This class represents a line number.

    Args:
        id (int): line number id.
        bottom (float): bottom position of the line number.
        top (float): top position of the line number.
        raw_text_node: svg text node to init from (requires transkription_field).
        transkription_field: field used to transform the node's coordinates.
        xml_text_node: xml node of a previously serialized LineNumber.
    """
    XML_TAG = 'line-number'
    WARN_NO_LINE_NUMBER = 'No line number found'
    def __init__(self, id=0, bottom=0.0, top=0.0, raw_text_node=None, transkription_field=None, xml_text_node=None):
        self.id = id
        self.bottom = bottom
        self.top = top
        # A previously serialized xml node overrides the keyword defaults.
        if xml_text_node is not None:
            self.id = int(xml_text_node.get('id'))
            self.bottom = float(xml_text_node.get('bottom'))
            self.top = float(xml_text_node.get('top'))
        # Init from a raw svg text node: the transform matrix yields the bottom
        # position; the node's text (or its tspan children) is the number itself.
        if raw_text_node is not None and transkription_field is not None:
            matrix = Matrix(raw_text_node.get('transform'), transkription_field=transkription_field)
            self.bottom = matrix.getY()
            self.id = int(raw_text_node.text) if raw_text_node.text is not None\
            else int(''.join([x.text for x in raw_text_node.findall('.//tspan', raw_text_node.nsmap)]))
    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        # 'id' uses the verbose dict form (name/label/comment), while
        # 'bottom'/'top' use the short tuple form (datatype, cardinality, xpath).
        properties = {\
        'id': { 'class': int, 'cardinality': 1, 'xpath': '{}/@id'.format(LineNumber.XML_TAG),\
                'name': 'lineHasNumber', 'label': 'line has number',\
                'comment': 'Relating a line to a number it has.'},\
        'bottom': (float, 1, '{}/@bottom'.format(LineNumber.XML_TAG)),\
        'top': (float, 1, '{}/@top'.format(LineNumber.XML_TAG))\
        }
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary
    @staticmethod
    def IS_A_LINE_NUMBER(raw_text_node):
        """Returns whether svg node contains a line number.

        True if the node's text (or the concatenation of its tspan children)
        consists of digits only.
        """
        if raw_text_node.text is not None:
            return bool(re.search(r'^[0-9]+$', raw_text_node.text))
        elif len(raw_text_node.findall('.//tspan', raw_text_node.nsmap)) > 0:
            text = ''.join([x.text for x in raw_text_node.findall('.//tspan', raw_text_node.nsmap)])
            return bool(re.search(r'^[0-9]+$', text))
        return False
    def setTop(self, top):
        """Sets top position of line number.
        """
        self.top = top
    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.

        Updates an existing <line-number> node with this id, or creates a new one.
        """
        obj_node = target_tree.getroot().xpath('//' + LineNumber.XML_TAG + '[@id="%s"]' % self.id)[0] \
                if(len(target_tree.getroot().xpath('//' + LineNumber.XML_TAG + '[@id="%s"]' % self.id)) > 0) \
                else ET.SubElement(target_tree.getroot(), LineNumber.XML_TAG)
        # All instance attributes are numeric; values are rounded to 3 decimals.
        for key in self.__dict__.keys():
            obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3)))
Index: svgscripts/datatypes/positional_object.py
===================================================================
--- svgscripts/datatypes/positional_object.py (revision 65)
+++ svgscripts/datatypes/positional_object.py (revision 66)
@@ -1,136 +1,140 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent an object with positional information.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
+import sys
from .matrix import Matrix
from .attachable_object import AttachableObject
-from .class_spec import SemanticClass
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
class PositionalObject(AttachableObject,SemanticClass):
    """
    This (super) class represents an object with positional information.

    Args:
        node: xml node to init from (overrides the other keyword arguments).
        id (int): object id
        matrix (datatypes.Matrix): matrix containing information about conversion.
        height (float): height of object
        width (float): width of object
        x (float): x position of object
        y (float): y position of object
        tag (str): xml tag used when (de)serializing the object.
    """
    XML_TAG = 'positional-object'
    floatKeys = [ 'height', 'width', 'left', 'top', 'bottom']
    intKeys = [ ]
    stringKeys = [ ]
    def __init__(self, node=None, id=0, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, tag=XML_TAG):
        # Instance copies of the class-level key lists, so subclasses can
        # extend them without mutating the shared class attributes.
        self.floatKeys = []
        self.floatKeys += PositionalObject.floatKeys
        self.intKeys = []
        self.intKeys += PositionalObject.intKeys
        self.stringKeys = [ 'id' ]
        self.stringKeys += PositionalObject.stringKeys
        self.attachable_objects = []
        if node is not None:
            # Deserialize from an existing xml node.
            self.id = str(node.get('id'))
            self.height = float(node.get('height'))
            self.width = float(node.get('width'))
            self.left = float(node.get('left'))
            self.top = float(node.get('top'))
            self.bottom = float(node.get('bottom'))
            # Only 'matrix(...)' transforms are kept; others are discarded.
            self.transform = Matrix(node.get('transform')) if bool(node.get('transform')) and 'matrix(' in node.get('transform') else None
            self.tag = node.tag
        else:
            # Init from the keyword arguments; positions are rounded to 3 decimals.
            self.id = str(id)
            self.height = round(height, 3)
            self.width = round(width, 3)
            self.left = round(x, 3)
            self.top = round(y, 3)
            self.bottom = round(y + height, 3)
            self.transform = matrix
            self.tag = tag
    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.

        Updates an existing node with this tag and id, or creates a new one,
        then recursively attaches all attachable child objects.
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \
                if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \
                else ET.SubElement(target_tree, self.tag)
        for key in self.floatKeys:
            if self.__dict__[key] is not None:
                obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3)))
        for key in self.intKeys + self.stringKeys:
            if self.__dict__[key] is not None:
                obj_node.set(key.replace('_','-'), str(self.__dict__[key]))
        # Only rotation matrices are serialized as a transform attribute.
        if self.transform is not None and self.transform.isRotationMatrix():
            obj_node.set('transform', self.transform.toString())
        for attachable_object in self.attachable_objects:
            attachable_object.attach_object_to_tree(obj_node)
    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.

        Properties are derived from the class-level int/float/string key lists
        as tuples of (datatype, cardinality, xpath).
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        properties = {}
        properties.update(dict(zip(cls.intKeys, [ (int, 0, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in cls.intKeys])))
        properties.update(dict(zip(cls.floatKeys, [ (float, 1, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in cls.floatKeys])))
        properties.update(dict(zip(cls.stringKeys, [ (str, 1, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in cls.stringKeys])))
        properties.update({'transform': (str, 0, '{}/@transform'.format(cls.XML_TAG))})
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary
    @staticmethod
    def POSITIONS_OVERLAP_HORIZONTALLY(position_a, position_b):
        """Returns whether position a and b overlap horizontally.
        """
        return (position_a.left < position_b.left+position_b.width)\
        and (position_a.left+position_a.width > position_b.left)
    @staticmethod
    def POSITIONS_OVERLAP_VERTICALLY(position_a, position_b):
        """Returns whether position a and b overlap vertically.
        """
        return (position_a.top < position_b.bottom)\
        and (position_a.bottom > position_b.top)
    @staticmethod
    def POSITIONS_ARE_STACKED(position_a, position_b):
        """Returns whether position a and b are stacked, i.e. are above each other.

        Stacked means horizontally overlapping but either not vertically
        overlapping, or vertically offset by more than a quarter of each height.
        """
        return PositionalObject.POSITIONS_OVERLAP_HORIZONTALLY(position_a, position_b)\
        and (not PositionalObject.POSITIONS_OVERLAP_VERTICALLY(position_a, position_b)\
        or abs(position_a.top-position_b.top) > (position_a.height/4 + position_b.height/4))
Index: svgscripts/datatypes/word.py
===================================================================
--- svgscripts/datatypes/word.py (revision 65)
+++ svgscripts/datatypes/word.py (revision 66)
@@ -1,484 +1,484 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from operator import attrgetter
import warnings
-from .class_spec import SemanticClass
from .lineNumber import LineNumber
from .matrix import Matrix
from .path import Path
from .simple_word import SimpleWord
from .word_position import WordPosition
from .transkription_position import TranskriptionPosition
from .writing_process import WritingProcess
+
class Word(SimpleWord):
"""
This class represents a word.
"""
DATA = 'debug-data'
XML_TAG = 'word'
XML_EARLIER_VERSION = 'earlier-version'
    def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None):
        super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
                faksimile_positions=faksimile_positions)
        self.deleted = deleted
        self.debug_container = {}
        # If the transkription positions carry more text than 'text',
        # rebuild the word text from the positions.
        if len([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ]) > len(self.text):
            self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ])
        self.word_part_objs = word_part_objs if word_part_objs is not None else []
        # Flags describing this word's relation to inserted words.
        self.is_head_of_inserted_words = False
        self.is_tail_of_inserted_words = False
        self.is_before_inserted_words = False
        self.is_after_inserted_words = False
        self.word_insertion_mark = None
        self.debug_msg = None
        self.writing_process_id = writing_process_id
        self.word_parts = word_parts if word_parts is not None else []
        self.earlier_version = earlier_version
        self.box_paths = box_paths if box_paths is not None else []
def attach_word_to_tree(self, target_tree):
"""Attaches word to tree target_tree.
"""
word_node = super(Word,self).attach_word_to_tree(target_tree)
if self.deleted is not None:
word_node.set('deleted', str(self.deleted).lower())
if self.writing_process_id > -1:
word_node.set('writing-process-id', str(self.writing_process_id))
for word_part in self.word_parts:
word_part.attach_word_to_tree(word_node)
if self.earlier_version is not None:
earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
self.earlier_version.attach_word_to_tree(earlier_node)
for index, box_path in enumerate(self.box_paths):
box_path.id = index
box_path.attach_object_to_tree(word_node)
return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
"""Returns true if transkription_positions belong to different WritingProcesses.
"""
if len(self.word_parts) > 0 and include_parts:
return len(set(word.writing_process_id for word in self.word_parts)) > 1
return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1
def get_partial_word_over_box(self):
    """Partition a word according to its transkription_positions' has_box
        ->split word and add partial words as its parts.

    :return: word over box or self
    """
    word_over_box = self
    if self.has_mixed_status('has_box'):
        # group consecutive transkription positions by their has_box value;
        # every time the value changes, flush the group as a new partial word
        transkription_positions = []
        last_status = None
        for transkription_position in self.transkription_positions:
            if transkription_position.has_box != last_status\
            and len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_status:
                    # the group just flushed lies over a box
                    word_over_box = newWord
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_status = transkription_position.has_box
        if len(transkription_positions) > 0:
            # flush the trailing group
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
            self.word_parts.append(newWord)
            if last_status:
                word_over_box = newWord
        # self becomes a container word: its content now lives in word_parts
        self.transkription_positions = []
        self.line_number = -1
    elif len(self.word_parts) > 0:
        # no mixed status on this level: recurse into existing parts
        self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, 'get_partial_word_over_box')
    return word_over_box
def has_mixed_status(self, property_key, include_parts=False):
    """Return True if the word's transkription_positions (or, with
    include_parts=True and existing word_parts, the parts themselves)
    carry more than one distinct value for property_key.

    Returns False if any inspected object lacks property_key.
    """
    if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions):
        return False
    if len(self.word_parts) > 0 and include_parts:
        if False in set(property_key in word.__dict__.keys() for word in self.word_parts):
            return False
        # BUGFIX: compare the requested property on the parts instead of
        # unconditionally comparing their 'deleted' flag.
        return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1
    return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1
def partition_according_to_deletion(self):
    """Partition a word according to its transkription_positions' deletion status
        ->split word and add partial words as its parts.
    """
    if self.has_mixed_status('deleted'):
        # group consecutive transkription positions by deletion status;
        # each flushed group becomes a partial word in self.word_parts
        transkription_positions = []
        last_status = None
        for transkription_position in self.transkription_positions:
            if transkription_position.deleted != last_status\
            and len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_status = transkription_position.deleted
        if len(transkription_positions) > 0:
            # flush the trailing group
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id)
            self.word_parts.append(newWord)
        # self becomes a container word: content lives in its parts now
        self.transkription_positions = []
        self.line_number = -1
        self.deleted = False
    elif len(self.word_parts) > 0:
        # no mixed status on this level: recurse into existing parts
        self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
    elif not self.deleted\
    and len(self.transkription_positions) > 0\
    and self.transkription_positions[0].deleted:
        # uniform deletion status: adopt the positions' deletion flag
        self.deleted = True
def partition_according_to_writing_process_id(self):
    """Partition a word according to its transkription_positions' writing_process_ids
        ->split word and add partial words as its parts.
    """
    if self.belongs_to_multiple_writing_processes():
        # group consecutive positions by writing_process_id; each flushed
        # group becomes a partial word in self.word_parts
        last_writing_process_id = -1
        transkription_positions = []
        for transkription_position in self.transkription_positions:
            if transkription_position.writing_process_id != last_writing_process_id\
            and len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
                self.word_parts.append(newWord)
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_writing_process_id = transkription_position.writing_process_id
        if len(transkription_positions) > 0:
            # flush the trailing group
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, writing_process_id=last_writing_process_id)
            self.word_parts.append(newWord)
        # self becomes a container word
        self.transkription_positions = []
        self.line_number = -1
    elif len(self.word_parts) > 0:
        # no mixed ids on this level: recurse into existing parts
        self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
    if self.belongs_to_multiple_writing_processes(include_parts=True):
        # parts span several processes: adopt the highest (latest) id
        self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0]
    elif len(self.transkription_positions) > 0:
        self.writing_process_id = self.transkription_positions[0].writing_process_id
def process_boxes(self, box_paths, parent_word=None, tr_xmin=0.0, tr_ymin=0.0):
    """Determines whether word is over a word box.

    Splits transkription positions that only partially overlap a box,
    marks the overlapping position(s) via tp.has_box, and creates an
    earlier version of the resulting partial word from the box's
    earlier_text.

    :param box_paths: list of box (Path) objects to test against
    :param parent_word: word holding this word as a part; the box is
        recorded on the parent when given
    :param tr_xmin: x offset of the transkription field
    :param tr_ymin: y offset of the transkription field
    :return: the later version word (with earlier_version set) or None
    """
    test_case = len(box_paths) == 1   # NOTE(review): unused local -- confirm and remove
    later_version_word = None
    if len(self.word_parts) > 0:
        # delegate to the parts; remember the last part that yielded an earlier version
        for word in self.word_parts:
            later_version = word.process_boxes(box_paths, parent_word=self, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            if later_version is not None and later_version.earlier_version is not None:
                later_version_word = later_version
    else:
        # maps a transkription position to the fragments that replace it
        new_tp_dict = {}
        for transkription_position in self.transkription_positions:
            word_path = Path.create_path_from_transkription_position(transkription_position,\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            containing_boxes = [ box_path for box_path in box_paths\
                    if word_path.is_partially_contained_by(box_path)\
                    or box_path.do_paths_intersect(word_path) ]
            if len(containing_boxes) > 0:
                # only the first overlapping box is considered
                box_path = containing_boxes[0]
                if box_path.contains_path(word_path):
                    # position lies completely inside the box
                    transkription_position.has_box = box_path
                elif box_path.contains_start_of_path(word_path):
                    # split at the box's right edge (bbox()[1] == xmax); left fragment is boxed
                    split_position = box_path.path.bbox()[1] - tr_xmin
                    new_tps = transkription_position.split(split_position)
                    if len(new_tps) == 2:
                        new_tps[0].has_box = box_path
                        new_tp_dict.update({ transkription_position: new_tps })
                    else:
                        transkription_position.has_box = box_path
                elif box_path.contains_end_of_path(word_path):
                    # split at the box's left edge (bbox()[0] == xmin); right fragment is boxed
                    split_position = box_path.path.bbox()[0] - tr_xmin
                    new_tps = transkription_position.split(split_position)
                    if len(new_tps) == 2:
                        new_tps[1].has_box = box_path
                        new_tp_dict.update({ transkription_position: new_tps })
                    else:
                        transkription_position.has_box = box_path
                else:
                    # box lies in the middle of the position: split twice,
                    # middle fragment is boxed
                    split_position1 = box_path.path.bbox()[0] - tr_xmin
                    split_position2 = box_path.path.bbox()[1] - tr_xmin
                    new_tps = transkription_position.split(split_position1, split_position2)
                    if len(new_tps) >= 2:
                        new_tps[1].has_box = box_path
                        new_tp_dict.update({ transkription_position: new_tps })
                    else:
                        transkription_position.has_box = box_path
        # replace each split position by its fragments, preserving order
        for replace_tp in new_tp_dict.keys():
            for tp in new_tp_dict.get(replace_tp):
                self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
            self.transkription_positions.remove(replace_tp)
        update_transkription_position_ids(self)
        later_version_word = self.get_partial_word_over_box()
        if len(later_version_word.transkription_positions) > 0\
        and later_version_word.transkription_positions[0].has_box is not None:
            # record the box and build the earlier version from the box text
            box_holder = self if parent_word is None else parent_word
            box_holder.box_paths.append(later_version_word.transkription_positions[0].has_box)
            box_text = later_version_word.transkription_positions[0].has_box.earlier_text
            transkription_positions = TranskriptionPosition.copy_list_of_cls(later_version_word.transkription_positions)
            later_version_word.earlier_version = Word(text=box_text, transkription_positions=transkription_positions)
            #print(later_version_word.text, later_version_word.earlier_version.text)
            return later_version_word
    return later_version_word
def split(self, split_string, start_id=0):
    """Splits the word and returns an 3-tuple of new words.

    The text is partitioned at split_string; the word's positional word
    parts are then regrouped so that each resulting word
    (previousWord, currentWord, nextWord) owns the parts covering its
    share of the text. Pieces that do not exist stay None.

    :param split_string: substring of self.text to split at
    :param start_id: id assigned to the first resulting word
    :return: (previousWord, currentWord, nextWord)
    """
    previousString, currentString, nextString = self.text.partition(split_string)
    currentWord = None
    previousWord = None
    nextWord = None
    previousIndex = 0
    current_id = start_id
    # collect the positional word parts of all transkription positions
    all_positional_word_parts = []
    for position in self.transkription_positions:
        all_positional_word_parts += position.positional_word_parts
    if len(all_positional_word_parts) == 0:
        warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
    if len(previousString) > 0:
        # consume parts from the front until their joined text equals previousString
        previous_pwps = []
        while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            previous_pwps.append(all_positional_word_parts[previousIndex])
            previousIndex += 1
        if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            # the part boundaries do not line up with the requested split
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
        else:
            previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
            previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
            previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
            current_id += 1
            all_positional_word_parts = all_positional_word_parts[previousIndex:]
    if len(nextString) > 0:
        # consume parts matching currentString; whatever follows belongs to nextWord
        tmp_pwps = []
        index = 0
        while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            tmp_pwps.append(all_positional_word_parts[index])
            index += 1
        if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
        else:
            next_pwps = all_positional_word_parts[index:]
            next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
            next_text = ''.join([ pwp.text for pwp in next_pwps ])
            nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
            all_positional_word_parts = all_positional_word_parts[:index]
    # the remaining parts make up the current word
    current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
    current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
    currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
    return previousWord, currentWord, nextWord
def split_according_to_status(self, status):
    """Split the word into new words wherever the value of *status*
    changes between consecutive transkription positions.

    :return: a list of new word.Word
    """
    new_words = []
    if not self.has_mixed_status(status):
        return new_words
    copy_keys = [ 'line_number', 'text', 'deleted', 'writing_process_id' ]

    def flush(positions):
        # build a partial word from the collected positions, inheriting
        # the copyable attributes of self (except the split criterion)
        newWord = Word(id=self.id+len(new_words), transkription_positions=positions)
        for key in copy_keys:
            if key != status and key in self.__dict__.keys():
                newWord.__dict__[key] = self.__dict__[key]
        newWord.__dict__[status] = positions[0].__dict__[status]
        new_words.append(newWord)

    collected = []
    last_value = None
    for tp in self.transkription_positions:
        if tp.__dict__[status] != last_value and len(collected) > 0:
            flush(collected)
            collected = []
        collected.append(tp)
        last_value = tp.__dict__[status]
    if len(collected) > 0:
        flush(collected)
    return new_words
def join(self, other_word, append_at_end_of_new_word=True):
    """Merge other_word into this word: concatenate the texts and take
    over other_word's transkription_positions (appended or prepended),
    then simplify the combined positions.
    """
    if append_at_end_of_new_word:
        self.text = self.text + other_word.text
        for position in other_word.transkription_positions:
            # incoming positions are renumbered to follow the existing ones
            position.id = str(len(self.transkription_positions))
            self.transkription_positions.append(position)
    else:
        self.text = other_word.text + self.text
        incoming = list(other_word.transkription_positions)
        # prepend the incoming positions (they keep their original ids),
        # then renumber the positions that were already present
        self.transkription_positions[0:0] = incoming
        for idx in range(len(incoming), len(self.transkription_positions)):
            self.transkription_positions[idx].id = str(idx)
    self.simplify_transkription_positions()
def set_word_insertion_mark(self, word_insertion_mark):
    """Attach the given word insertion mark to this word."""
    self.word_insertion_mark = word_insertion_mark
def simplify_transkription_positions(self):
    """Merge transkription_positions if possible.

    Walks the positions from the end towards the front and merges two
    adjacent positions that share the same writing_process_id into one
    position rebuilt from their combined positional word parts.
    """
    index = len(self.transkription_positions)-1
    # merging is only possible while every position carries positional_word_parts
    while index > 0\
    and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]:
        current_tp = self.transkription_positions[index]
        index -= 1
        previous_tp = self.transkription_positions[index]
        if previous_tp.writing_process_id == current_tp.writing_process_id:
            # same writing stage: rebuild one position from both parts lists
            positional_word_parts = previous_tp.positional_word_parts
            positional_word_parts += current_tp.positional_word_parts
            transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                    positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
            if len(transkription_positions) == 1:
                # merge succeeded: replace the pair by the single new position
                transkription_positions[0].writing_process_id = previous_tp.writing_process_id
                self.transkription_positions.pop(index+1)
                self.transkription_positions[index] = transkription_positions[0]
    #print(self.text, len(self.transkription_positions))
@classmethod
def create_cls(cls, word_node):
    """Create a word from a (lxml.Element) node.

    :param word_node: lxml.Element node representing a word
    :return: Word
    """
    # FIX: do not rebind the classmethod parameter `cls` to an instance;
    # use a clearly named local instead.
    word = super(Word,cls).create_cls(word_node)
    word.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
    # 'deleted' is tri-state: True/False when the attribute exists, else None
    word.deleted = word_node.get('deleted') == 'true'\
            if bool(word_node.get('deleted')) else None
    word.word_parts = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_TAG) ]
    word.box_paths = [ Path(node=node) for node in word_node.xpath('.//' + Path.BOX_TAG ) ]
    earlier_versions = [ cls.create_cls(node) for node in word_node.xpath('.//' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ]
    if len(earlier_versions) > 0:
        word.earlier_version = earlier_versions[0]
    return word
@staticmethod
def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None):
    """Creates a word from a (lxml.Element) node or word_part_objs.

    NOTE(review): the mutable default word_part_objs=[] is only read,
    never mutated in place, but should still become None per convention.

    [:return:] Word
    """
    if word_node is not None: # init word from xml node
        id = int(word_node.get('id'))
        line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number
        text = word_node.get('text')
        deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true'
        transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
        faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
        # prefer parts nested under the data element; fall back to bare parts
        word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\
                if len(word_node.findall('.//' + Word.DATA)) > 0\
                else [ item.attrib for item in word_node.findall('.//part')]
        return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\
                faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
    elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file
        WIDTH = 5
        TOPCORRECTION = 2.0
        FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
        height = height    # NOTE(review): no-op assignment -- confirm and remove
        x = round(float(word_part_objs[0]['x']), 3)
        if(page is not None and bool(page.style_dict)):
            # style information available: derive height/corrections from font sizes
            HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
            # NOTE: 'dict' shadows the builtin here (kept as in the original)
            style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' '))
            biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
            height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
            TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
            if endSign is not None and '%' in endSign:
                # word ends with a percent sign: widen by the last char's font width
                lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\
                        for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\
                        if bool(page.style_dict[key].get('font-size'))]
                lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
                endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
        elif endSign is not None and '%' in endSign:
            # no style information: use a fixed fallback width for the end sign
            endX = float(endX) + WIDTH
        bottom = round(float(word_part_objs[0]['y']), 3)
        y = round(bottom - height + TOPCORRECTION, 3)
        width = round(float(endX) - x, 3)
        transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ]
        text = ''.join([ dict['text'] for dict in word_part_objs])
        # line number is determined from the vertical center of the word
        line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number
        word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs)
        word.debug_msg = debug_msg
        return word
    else:
        error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty'
        raise Exception('Error: {}'.format(error_msg))
@classmethod
def get_semantic_dictionary(cls):
    """Create and return a semantic dictionary as specified by SemanticClass."""
    dictionary = super(Word,cls).get_semantic_dictionary()
    properties = dictionary['properties']
    properties['deleted'] = { 'class': bool, 'cardinality': 1,\
            'xpath': '{0}/@deleted'.format(cls.XML_TAG),\
            'name': 'isWordDeleted', 'label': 'has word been deleted' }
    properties['writing_process_id'] = { 'class': WritingProcess, 'cardinality': 0,\
            'xpath': '{0}/@writing-process-id'.format(cls.XML_TAG),\
            'name': 'wordBelongsToWritingProcess', 'label': 'word has been written in a specific writing process' }
    properties['word_parts'] = Word.get_cls_hasPart_objectCls_dictionaries(Word, xpath='word/word')
    return dictionary
def execute_function_on_parts(word_parts, func_name):
    """Call the method named func_name on every word in word_parts and
    replace any word that was split into parts by those parts.

    :param word_parts: list of words to process
    :param func_name: name of the argument-less method to invoke on each word
    :return: (new word_parts, output of the last invocation or None)
    """
    copy_parts = word_parts[:]
    # FIX: initialize output so an empty word_parts list no longer raises
    # NameError on return.
    output = None
    for word in word_parts:
        # FIX: getattr instead of eval -- same dynamic dispatch without
        # evaluating code assembled from a string.
        output = getattr(word, func_name)()
        if len(word.word_parts) > 0:
            # splice the word's parts into the result in place of the word
            insert_at = copy_parts.index(word)
            for part_word in word.word_parts:
                copy_parts.insert(insert_at, part_word)
                insert_at += 1
            copy_parts.remove(word)
            word.word_parts = []
    return copy_parts, output
def update_transkription_position_ids(word):
    """Renumber word's transkription_positions by their left coordinate
    (ascending), assigning ids 0..n-1.
    """
    ordered = sorted(word.transkription_positions, key=attrgetter('left'))
    for new_id, transkription_position in enumerate(ordered):
        transkription_position.id = new_id
Index: svgscripts/datatypes/page.py
===================================================================
--- svgscripts/datatypes/page.py (revision 65)
+++ svgscripts/datatypes/page.py (revision 66)
@@ -1,634 +1,637 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
+import sys
from .box import Box
-from .class_spec import SemanticClass
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .lineNumber import LineNumber
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .text_connection_mark import TextConnectionMark
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_insertion_mark import WordInsertionMark
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
FILE_TYPE_SVG_WORD_POSITION = 'svgWordPosition'
FILE_TYPE_XML_MANUSCRIPT = 'xmlManuscriptFile'
class Page(SemanticClass):
"""
This class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
xml_target_file (str): name of the xml file to which page info will be written.
"""
UNITTESTING = False
WARNING_MISSING_USE_NODE4PWP = PositionalWordPart.WARN_NO_USE_NODE_FOUND
WARNING_MISSING_GLYPH_ID4WIM = WordInsertionMark.WARN_NO_GLYPH_ID
PAGE_RECTO = 'recto'
PAGE_VERSO = 'verso'
def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, faksimile_image=None, faksimile_svgFile=None, pdfFile=None, svg_file=None, orientation='North', page_type=PAGE_VERSO, extract_transkription_field_only=False):
    """Initialize a page either by reading an existing xml source file or
    by preparing a tree to be written to an xml target file.
    """
    # defaults shared by both initialization modes
    self.title = title
    self.mark_foreign_hands = []
    self.text_connection_marks = []
    self.line_numbers = []
    self.style_dict = {}
    self.sonderzeichen_list = []
    self.svg_file = None
    self.svg_image = None
    self.pdfFile = None
    self.faksimile_svgFile = None
    self.source = None
    self.number = page_number if page_number is not None else -1
    self.orientation = orientation
    self.page_type = page_type
    self.word_deletion_paths = []
    self.faksimile_image = faksimile_image
    if xml_source_file is not None:
        # read mode: populate the page from an existing xml file
        if isfile(xml_source_file):
            parser = ET.XMLParser(remove_blank_text=True)
            self.page_tree = ET.parse(xml_source_file, parser)
            self.title = self.page_tree.getroot().get('title')
            self.number = self.page_tree.getroot().get('number')
            self.source = self.page_tree.getroot().get('source')
            self.orientation = self.page_tree.getroot().get('orientation')
            self.page_type = self.page_tree.getroot().get('pageType')
            self.init_words()
            self.add_style(style_node=self.page_tree.getroot().find('.//style'))
            self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
                    if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
            self.faksimile_svgFile = self.page_tree.xpath('.//faksimile-svg/@file')[0]\
                    if len(self.page_tree.xpath('.//faksimile-svg/@file')) > 0 else None
            self.svg_image = SVGImage(node=self.page_tree.xpath('.//' + SVGImage.XML_TAG)[0])\
                    if len(self.page_tree.xpath('.//' + SVGImage.XML_TAG)) > 0 else None
            self.faksimile_image = FaksimileImage(node=self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)[0])\
                    if len(self.page_tree.xpath('.//' + FaksimileImage.XML_TAG)) > 0 else None
            self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\
                    if len(self.page_tree.xpath('.//svg/@file')) > 0 else None
            self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
                    if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
            self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
                    if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
            # fill in whatever the xml file did not provide from the arguments
            if pdfFile is not None and self.pdfFile is None:
                self.pdfFile = pdfFile
                ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
            if faksimile_svgFile is not None and self.faksimile_svgFile is None:
                self.faksimile_svgFile = faksimile_svgFile
                ET.SubElement(self.page_tree.getroot(), 'faksimile-svg', attrib={'file': self.faksimile_svgFile})
            if faksimile_image is not None:
                self.faksimile_image = faksimile_image
                self.faksimile_image.attach_object_to_tree(self.page_tree)
            if svg_file is not None and self.svg_file is None:
                self.svg_file = svg_file
                tf = TranskriptionField(svg_file)
                self.width = round(tf.documentWidth, 3)
                self.height = round(tf.documentHeight, 3)
                self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
                self.svg_image.attach_object_to_tree(self.page_tree)
            # derive missing file name / dimensions from the svg image
            if self.svg_image is not None and self.svg_file is None:
                self.svg_file = self.svg_image.file_name
            if self.svg_image is not None and self.width == 0.0:
                self.width = self.svg_image.width
            if self.svg_image is not None and self.height == 0.0:
                self.height = self.svg_image.height
        else:
            raise Exception('File "{}" does not exist!'.format(xml_source_file))
    elif xml_target_file is not None:
        # write mode: prepare a tree that will be written to xml_target_file
        self.word_insertion_marks = []
        self.words = []
        self.writing_processes = []
        self.svg_file = svg_file
        self.pdfFile = pdfFile
        self.faksimile_svgFile = faksimile_svgFile
        if isfile(xml_target_file):
            # target exists: reuse its tree, later stripping extracted content
            parser = ET.XMLParser(remove_blank_text=True)
            self.page_tree = ET.parse(xml_target_file, parser)
            self.source = self.page_tree.getroot().get('source')
            if bool(self.page_tree.getroot().get('orientation')):
                self.orientation = self.page_tree.getroot().get('orientation')
            elif orientation is not None:
                self.page_tree.getroot().set('orientation', orientation)
            if bool(self.page_tree.getroot().get('title')):
                self.title = self.page_tree.getroot().get('title')
            elif title is not None:
                self.page_tree.getroot().set('title', title)
            if self.svg_file is None:
                # take svg info from the existing tree
                self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\
                        if len(self.page_tree.xpath('.//svg/@file')) > 0 else None
                self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
                        if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
                self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
                        if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
            elif len(self.page_tree.xpath('.//svg/@file')) == 0:
                # svg file given as argument but not yet recorded in the tree
                tf = TranskriptionField(svg_file)
                self.width = round(tf.documentWidth, 3)
                self.height = round(tf.documentHeight, 3)
                self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
                self.svg_image.attach_object_to_tree(self.page_tree)
                #ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
            else:
                self.width = float(self.page_tree.xpath('.//svg/@width')[0])\
                        if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0
                self.height = float(self.page_tree.xpath('.//svg/@height')[0])\
                        if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0
            if self.pdfFile is None:
                self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\
                        if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None
            elif len(self.page_tree.xpath('.//pdf/@file')) == 0:
                ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
            # remove previously extracted content so it can be rewritten
            for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG, WordInsertionMark.XML_TAG,\
                    WritingProcess.XML_TAG, Path.WORD_DELETION_PATH_TAG ]:
                for node in self.page_tree.xpath('//' + xpath2remove):
                    node.getparent().remove(node)
        else:
            # fresh target: create a new page tree from scratch
            self.page_tree = ET.ElementTree(ET.Element('page'))
            self.pdfFile = pdfFile
            self.svg_file = svg_file
            if title is not None:
                self.page_tree.getroot().set('title', title)
            if orientation is not None:
                self.page_tree.getroot().set('orientation', orientation)
            self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower())
            if page_number is not None:
                self.page_tree.getroot().set('number', str(page_number))
            if self.pdfFile is not None:
                ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile})
            if self.svg_file is not None:
                tf = TranskriptionField(self.svg_file)
                self.width = round(tf.documentWidth, 3)
                self.height = round(tf.documentHeight, 3)
                self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
                self.svg_image.attach_object_to_tree(self.page_tree)
                #ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file})
    # fallback: ensure an svg image object exists whenever an svg file is known.
    # NOTE(review): self.width/self.height may be unset here when no branch
    # above assigned them -- confirm
    if self.svg_image is None and self.svg_file is not None:
        self.svg_image = SVGImage(file_name=self.svg_file, width=self.width, height=self.height)
        self.svg_image.attach_object_to_tree(self.page_tree)
def add_style(self, sonderzeichen_list=None, letterspacing_list=None, style_dict=None, style_node=None):
    """Add a list of classes that are sonderzeichen and a style dictionary
    to the page.

    If style_node is given, the lists and dictionary are read from it;
    otherwise a non-empty style_dict is written to a new <style> node.
    Finally a mapping from font-size style keys to writing-process stages
    is built.
    """
    # FIX: the former [] / {} defaults were mutable objects shared between
    # all calls (and assigned to self) -- use None and create fresh ones.
    self.sonderzeichen_list = sonderzeichen_list if sonderzeichen_list is not None else []
    self.letterspacing_list = letterspacing_list if letterspacing_list is not None else []
    self.style_dict = style_dict if style_dict is not None else {}
    if style_node is not None:
        # read style information from the xml node
        self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') }
        self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\
                if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ]
        self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\
                if bool(item.get('letterspacing-list')) ]
    elif bool(self.style_dict):
        # write the given style information to the page tree
        style_node = ET.SubElement(self.page_tree.getroot(), 'style')
        if len(self.sonderzeichen_list) > 0:
            style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list))
        if len(self.letterspacing_list) > 0:
            style_node.set('letterspacing-list', ' '.join(self.letterspacing_list))
        for key in self.style_dict.keys():
            self.style_dict[key]['name'] = key
            ET.SubElement(style_node, 'class', attrib=self.style_dict[key])
    fontsize_dict = { key: float(value.get('font-size').replace('px','')) for key, value in self.style_dict.items() if 'font-size' in value }
    fontsizes = sorted(fontsize_dict.values(), reverse=True)
    # create a mapping between fontsizes and word stages
    self.fontsizekey2stage_mapping = {}
    for fontsize_key, value in fontsize_dict.items():
        if value >= fontsizes[0]-1:
            self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.FIRST_VERSION })
        elif value <= fontsizes[len(fontsizes)-1]+1:
            self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.LATER_INSERTION_AND_ADDITION })
        else:
            self.fontsizekey2stage_mapping.update({ fontsize_key: WritingProcess.INSERTION_AND_ADDITION })
def add_source(self, source):
    """Set the page's source file and record it on the page_tree root."""
    root = self.page_tree.getroot()
    self.source = source
    root.set('source', source)
def categorize_paths(self, transkription_field=None):
    """Categorize all paths that are part of the transkription field.

    :return: a dictionary containig a list for each category of path.
    """
    if self.source is not None and isfile(self.source):
        MAX_HEIGHT_LINES = 1
        # max_line: tallest even-numbered line (+2); defaults to 17 without line numbers
        max_line = sorted(\
                [line_number.bottom-line_number.top for line_number in self.line_numbers if line_number.id % 2 == 0],\
                reverse=True)[0] + 2 if len(self.line_numbers) > 0 else 17
        tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
        tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
        paths, attributes = svg_to_paths.svg2paths(self.source)
        allpaths_on_tf = []
        allpaths_outside_tf = []
        attributes_outside_tf = []
        if transkription_field is not None:
            # separate paths inside the transkription field from the rest
            # (bbox() returns xmin, xmax, ymin, ymax)
            for index in range(0, len(paths)):
                path = paths[index]
                attribute = attributes[index]
                if len(path) > 0\
                and path != transkription_field.path\
                and path.bbox()[0] > tr_xmin\
                and path.bbox()[1] < transkription_field.xmax:
                    allpaths_on_tf.append(Path(id=index, path=path, style_class=attribute.get('class')))
                elif len(path) > 0\
                and path != transkription_field.path:
                    allpaths_outside_tf.append(path)
                    attributes_outside_tf.append(attribute)
        path_dict = { 'text_area_deletion_paths': [],\
                'deletion_or_underline_paths': [],\
                'box_paths': [],\
                'dots_paths': [],\
                'word_connector_paths': [],\
                'uncategorized_paths': [] }
        for mypath in allpaths_on_tf:
            xmin, xmax, ymin, ymax = mypath.path.bbox()
            start_line_number = self.get_line_number(mypath.path.start.imag-tr_ymin)
            if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1:
                # tiny bounding box -> a dot
                path_dict.get('dots_paths').append(mypath)
            elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
                # closed path roughly of line height -> a word box
                path_dict.get('box_paths').append(mypath)
            elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
                # open path taller than a line -> connects words across lines
                path_dict.get('word_connector_paths').append(mypath)
            elif abs(ymax-ymin) < MAX_HEIGHT_LINES:
                # flat path -> deletion stroke or underline
                path_dict.get('deletion_or_underline_paths').append(mypath)
            elif start_line_number != -1 and start_line_number != self.get_line_number(mypath.path.end.imag-tr_ymin):
                # path spans several lines -> deletion of a text area
                path_dict.get('text_area_deletion_paths').append(mypath)
            else:
                path_dict.get('uncategorized_paths').append(mypath)
        underline_path = self.mark_words_intersecting_with_paths_as_deleted(path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin)
        path_dict.update({'underline_path': underline_path})
        self.process_word_boxes(path_dict.get('box_paths'), transkription_field,\
                paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)
        return path_dict
    elif not Page.UNITTESTING:
        error_msg = 'Svg source file {} does not exist!'.format(self.source)\
                if self.source is not None else 'Page does not contain a source file!'
        raise FileNotFoundError(error_msg)
    return {}
def create_writing_processes_and_attach2tree(self):
    """Create the three stages of Nietzsche's process of writing and attach them to page_tree.

    Instantiates one WritingProcess per known version (first version,
    insertion and addition, later insertion and addition), attaches each
    to self.page_tree, and then assigns a writing_process_id to every
    transkription position whose first positional word part carries a
    style class key found in self.fontsizekey2stage_mapping.
    """
    self.writing_processes = [ WritingProcess(version=WritingProcess.FIRST_VERSION),\
            WritingProcess(version=WritingProcess.INSERTION_AND_ADDITION),\
            WritingProcess(version=WritingProcess.LATER_INSERTION_AND_ADDITION) ]
    for writing_process in self.writing_processes:
        writing_process.attach_object_to_tree(self.page_tree)
    for word in self.words:
        for transkription_position in word.transkription_positions:
            # style_class is a space-separated list of css class keys;
            # only keys present in the mapping determine the stage.
            for font_key in transkription_position.positional_word_parts[0].style_class.split(' '):
                if font_key in self.fontsizekey2stage_mapping.keys():
                    transkription_position.writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
def find_special_words(self, transkription_field=None):
    """Find special words (mark-foreign-hands / text-connection-marks),
    remove them from self.words and process their content.

    :param transkription_field: TranskriptionField of self.source
        (created from self.source when None).
    :raise FileNotFoundError: if self.source is None or not a file.
    """
    if self.source is None or not isfile(self.source):
        raise FileNotFoundError('Page does not have a source!')
    if transkription_field is None:
        transkription_field = TranskriptionField(self.source)
    special_char_list = MarkForeignHands.get_special_char_list()
    special_char_list += TextConnectionMark.get_special_char_list()
    # special words consist of exactly one special character
    single_char_words = [ word for word in self.words if len(word.text) == 1 and word.text in special_char_list ]
    for word in single_char_words:
        if word.text == MarkForeignHands.CLASS_MARK:
            id = len(self.mark_foreign_hands)
            self.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id))
            self.words.remove(word)
        elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\
        or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\
        and any(style in self.sonderzeichen_list for style\
        in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))):
            # first special char always qualifies; the others only when the word
            # is set in one of the "Sonderzeichen" style classes
            id = len(self.text_connection_marks)
            self.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id))
            self.words.remove(word)
    svg_tree = ET.parse(self.source)
    self.update_page_type(transkription_field=transkription_field)
    self.update_line_number_area(transkription_field, svg_tree=svg_tree)
    # style keys whose font-family ends with 'Italic'; presumably used to
    # recognize editorial text in the footnotes — TODO confirm
    italic_classes = [ key for key in self.style_dict\
            if bool(self.style_dict[key].get('font-family')) and self.style_dict[key]['font-family'].endswith('Italic') ]
    if len(self.mark_foreign_hands) > 0:
        MarkForeignHands.find_content(self.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\
                SonderzeichenList=self.sonderzeichen_list)
    if len(self.text_connection_marks) > 0:
        TextConnectionMark.find_content_in_footnotes(self.text_connection_marks, transkription_field, svg_tree,\
                title=self.title, page_number=self.number)
def get_biggest_fontSize4styles(self, style_set=None):
    """Return the biggest font size from self.style_dict for a set of style class names.

    :param style_set: iterable of style class keys to consider
        (default: empty set, i.e. no keys).
    :return: (float) biggest font size OR 1 if style_dict is empty or no
        given key has a 'font-size' entry.
    """
    # BUGFIX(idiom): the former default `style_set={}` was a mutable default
    # argument; use the None sentinel instead (observable behavior unchanged,
    # since the set is only read).
    if style_set is None:
        style_set = set()
    if not bool(self.style_dict):
        return 1
    # strip the 'px' unit and sort descending so the biggest size comes first
    sorted_font_sizes = sorted((float(self.style_dict[key]['font-size'].replace('px', ''))\
            for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True)
    return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1
def get_line_number(self, y):
    """Return the id of the line number whose area contains the vertical position y.

    :param y: vertical position (relative to the transkription field).
    :return: (int) line number id or -1 if no line number area contains y.
    """
    matching_ids = ( line_number.id for line_number in self.line_numbers\
            if line_number.top <= y <= line_number.bottom )
    # first match wins; -1 also covers an empty self.line_numbers
    return next(matching_ids, -1)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', word_selection_function=None):
    """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
    or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.

    :param xml_file: path of the source xml file.
    :param status_contains: only pages whose @status contains this string are
        instantiated (manuscript files only).
    :param word_selection_function: when not None, a page is only returned if the
        function selects at least one of its words.
    :return: list of Page (empty for unknown file types or when nothing matches).
    """
    source_tree = ET.parse(xml_file)
    if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION:
        page = cls(xml_source_file=xml_file)
        if word_selection_function is None or len(word_selection_function(page.words)) > 0:
            return [ page ]
        else:
            return []
    elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
        pages = []
        xpath = '//page/@output'\
                if status_contains == ''\
                else '//page[contains(@status, "{0}")]/@output'.format(status_contains)
        # NOTE(review): the recursive call does not forward status_contains; the
        # status filter has already been applied by the xpath above.
        for xml_source_file in source_tree.xpath(xpath):
            if isfile(xml_source_file):
                pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
        return pages
    else:
        return []
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
- properties = { 'title': (str, 1, '/page/@title'), 'number': (str, 1, '/page/@number'),\
- 'image': { 'class': Image, 'cardinality': 1, 'xpath': '/page/{}'.format(FaksimileImage.XML_TAG)},\
+ properties = { 'number': (str, 1, '/page/@number'),\
+ 'faksimile_image': { 'class': FaksimileImage, 'cardinality': 1, 'xpath': '/page/{}'.format(FaksimileImage.XML_TAG)},\
'line_numbers': (LineNumber, SemanticClass.LIST, '/page/@number|/page/@title'),\
'orientation': { 'class': str, 'cardinality': 1, 'xpath': '/page/@orientation'},\
'words': (Word, SemanticClass.LIST, '/page/@number|/page/@title'),\
'svg_image': { 'class': SVGImage, 'cardinality': 1, 'xpath': '/page/{}'.format(SVGImage.XML_TAG)},\
'writing_processes': (WritingProcess, SemanticClass.LIST, '/page/@number|/page/@title'),\
'word_deletion_paths': (Path, SemanticClass.LIST, '/page/@number|/page/@title'),\
'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST, '/page/@number|/page/@title')}
dictionary.update({'class': class_dict})
dictionary.update({'properties': properties})
return dictionary
def init_line_numbers(self, line_numbers, document_bottom):
    """Init self.line_numbers from the detected line numbers and attach them to page_tree.

    The passed line_numbers appear to be every second line (assumed — TODO
    confirm with caller); the lines in between are synthesized from the
    vertical gaps, and a final line is appended that reaches document_bottom.

    :param line_numbers: list of detected LineNumber objects, ordered top to bottom.
    :param document_bottom: bottom coordinate of the document.
    """
    even_index = 0
    MINABOVE = 1  # minimal vertical gap between two adjacent line areas
    self.line_numbers = []
    if len(line_numbers) > 0:
        # synthesize the line above the first detected line number
        first_line_bottom = line_numbers[even_index].top - MINABOVE
        self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom))
        self.line_numbers.append(line_numbers[even_index])
        even_index += 1
        while even_index < len(line_numbers):
            # synthesize the line between two detected lines from their gap
            self.line_numbers.append(LineNumber(id=line_numbers[even_index].id-1,\
                    top=line_numbers[even_index-1].bottom+MINABOVE,\
                    bottom=line_numbers[even_index].top-MINABOVE))
            self.line_numbers.append(line_numbers[even_index])
            even_index += 1
        # synthesize the last line, reaching down to the document bottom
        self.line_numbers.append(LineNumber(id=line_numbers[even_index-1].id+1,\
                top=line_numbers[even_index-1].bottom+MINABOVE,\
                bottom=document_bottom))
        for line_number in self.line_numbers:
            line_number.attach_object_to_tree(self.page_tree)
def init_words(self):
    """Initialize the page's word-related attributes (words, insertion marks,
    special marks, line numbers, writing processes, deletion paths) from the
    nodes of self.page_tree."""
    root = self.page_tree.getroot()
    self.word_insertion_marks = [ WordInsertionMark(wim_node=node) for node in root.xpath('//' + WordInsertionMark.XML_TAG) ]
    self.words = [ Word.create_cls(node) for node in root.xpath('./word') ]
    self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in root.xpath('//' + MarkForeignHands.XML_TAG) ]
    self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in root.xpath('//' + TextConnectionMark.XML_TAG) ]
    self.line_numbers = [ LineNumber(xml_text_node=node) for node in root.xpath('//' + LineNumber.XML_TAG) ]
    # writing processes and deletion paths are looked up on the tree itself
    self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
    self.word_deletion_paths = [ Path(node=node) for node in self.page_tree.xpath('//' + Path.WORD_DELETION_PATH_TAG) ]
def is_locked(self):
    """Return True if the page tree contains a metadata/lock node."""
    lock_nodes = self.page_tree.xpath('//metadata/lock')
    return len(lock_nodes) > 0
def lock(self, reference_file, message=''):
    """Lock tree such that ids of words etc. correspond to ids
    in reference_file, optionally add a message that will be shown.
    """
    if self.is_locked():
        return
    existing_metadata = self.page_tree.xpath('./metadata')
    metadata = existing_metadata[0] if len(existing_metadata) > 0\
            else ET.SubElement(self.page_tree.getroot(), 'metadata')
    lock = ET.SubElement(metadata, 'lock')
    ET.SubElement(lock, 'reference-file').text = reference_file
    if message != '':
        ET.SubElement(lock, 'message').text = message
def mark_words_intersecting_with_paths_as_deleted(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Marks all words that intersect with deletion paths as deleted
    and adds these paths to word_deletion_paths.

    :param deletion_paths: candidate list of .path.Path objects.
    :param tr_xmin: x offset of the transkription field.
    :param tr_ymin: y offset of the transkription field.
    [:return:] list of .path.Path that might be word_underline_paths
        (those candidates that intersected no word).
    """
    if not Page.UNITTESTING:
        bar = Bar('mark words that intersect with deletion paths', max=len(self.words))
    for word in self.words:
        # progress bar only advances outside of unit tests
        not bool(Page.UNITTESTING) and bar.next()
        word.deleted = False
        for transkription_position in word.transkription_positions:
            word_path = Path.create_path_from_transkription_position(transkription_position,\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            intersecting_paths = [ deletion_path for deletion_path in deletion_paths\
                    if do_paths_intersect_saveMode(deletion_path.path, word_path.path) ]
            if len(intersecting_paths) > 0:
                # only the individual position is marked deleted; word.deleted
                # stays False and the partitioning below derives the word state
                transkription_position.deleted = True
                for deletion_path in intersecting_paths:
                    if deletion_path not in self.word_deletion_paths:
                        deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                        deletion_path.attach_object_to_tree(self.page_tree)
                        self.word_deletion_paths.append(deletion_path)
        word.partition_according_to_writing_process_id()
        word.partition_according_to_deletion()
    not bool(Page.UNITTESTING) and bar.finish()
    # return those paths in deletion_paths that are not in self.word_deletion_paths
    return [ word_underline_path for word_underline_path in set(deletion_paths) - set(self.word_deletion_paths) ]
def process_word_boxes(self, box_paths, transkription_field, paths=None, attributes=None, max_line=17):
    """Process word boxes: partition words according to word boxes.

    :param box_paths: list of .path.Path objects that were categorized as boxes.
    :param transkription_field: TranskriptionField of self.source.
    :param paths: svg paths outside the transkription field
        (parsed from self.source when None).
    :param attributes: attributes corresponding to paths.
    :param max_line: maximal height of a line; used to filter margin paths.
    """
    MAX_HEIGHT_LINES = 1  # NOTE(review): unused here — confirm whether it can be removed
    if not Page.UNITTESTING:
        bar = Bar('process word boxes', max=len(self.words))
    svg_tree = ET.parse(self.source)
    # lxml's nsmap may contain None as key for the default namespace;
    # xpath requires a prefix, so map it to 'ns'
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    allpaths_on_margin_field = []
    if paths is None or attributes is None:
        paths, attributes = svg_to_paths.svg2paths(self.source)
    for index in range(0, len(paths)):
        path = paths[index]
        xmin, xmax, ymin, ymax = path.bbox()
        attribute = attributes[index]
        # margin field lies left of the transkription field on verso pages
        # (xmax < field xmin) and right of it on recto pages (xmin > field xmax)
        if len(path) > 0\
        and path != transkription_field.path\
        and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
            or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
        and abs(ymax-ymin) < max_line:
            allpaths_on_margin_field.append(Path(id=index, path=path, style_class=attribute.get('class')))
    # group the box paths by the line number of their vertical center
    box_line_number_dict = {}
    for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
        line_number = self.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin))
        if line_number not in box_line_number_dict.keys():
            box_line_number_dict.update({ line_number: [ box_path ]})
        else:
            box_line_number_dict.get(line_number).append(box_path)
    boxes = []
    for line_number in box_line_number_dict.keys():
        box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
        margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
                if self.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\
                key=lambda path: path.get_x())
        # NOTE(review): the matching threshold differs for even/odd line
        # numbers — rationale not evident from this code, confirm.
        threshold = 3 if line_number % 2 == 0 else 1.5
        for box_path in box_paths_on_line:
            box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
                    transkription_field=transkription_field, namespaces=namespaces, threshold=threshold)
            if box is not None:
                boxes.append(box)
    for word in self.words:
        # progress bar only advances outside of unit tests
        not bool(Page.UNITTESTING) and bar.next()
        word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
    not bool(Page.UNITTESTING) and bar.finish()
def unlock(self):
    """Remove the lock from the page tree (inverse of lock()).

    NOTE(review): the previous docstring was a copy of lock()'s docstring;
    this method deletes the //metadata/lock node if the page is locked.
    """
    if self.is_locked():
        lock = self.page_tree.xpath('//metadata/lock')[0]
        lock.getparent().remove(lock)
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=None):
    """Update word ids and attach them to page.page_tree.

    Does nothing if the page is locked.

    :param update_function_on_word: a callable or a list of callables applied
        to every word (and, when their class is listed in
        include_special_words_of_type, to every special word) before attaching.
    :param include_special_words_of_type: list of special word classes
        (e.g. MarkForeignHands, TextConnectionMark) whose instances should
        also receive the update functions.
    """
    if include_special_words_of_type is None:
        # avoid the former mutable default argument `[]`
        include_special_words_of_type = []
    if not self.is_locked():
        update_function_on_word = [ update_function_on_word ]\
                if type(update_function_on_word) != list\
                else update_function_on_word
        # remove all current word / special-word nodes before re-attaching
        for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
            node.getparent().remove(node)
        for index, word in enumerate(self.words):
            word.id = index
            for func in update_function_on_word:
                # BUGFIX: formerly tested/called `update_function_on_word`
                # (the wrapping list) instead of `func`, so the update
                # functions were never applied to any word.
                if callable(func):
                    func(word)
            word.attach_word_to_tree(self.page_tree)
        for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
            mark_foreign_hands.id = index
            if MarkForeignHands in include_special_words_of_type:
                for func in update_function_on_word:
                    if callable(func):
                        func(mark_foreign_hands)
            mark_foreign_hands.attach_word_to_tree(self.page_tree)
        for index, text_connection_mark in enumerate(self.text_connection_marks):
            text_connection_mark.id = index
            if TextConnectionMark in include_special_words_of_type:
                for func in update_function_on_word:
                    if callable(func):
                        func(text_connection_mark)
            text_connection_mark.attach_word_to_tree(self.page_tree)
def update_line_number_area(self, transkription_field, svg_tree=None):
    """Determines the width of the area where the line numbers are written
    in the page.source file and adds it to transkription_field.

    :param transkription_field: TranskriptionField that receives the width.
    :param svg_tree: parsed self.source (parsed here when None).
    """
    THRESHOLD = 0.4  # tolerance (svg units) when matching glyph positions
    if svg_tree is None:
        svg_tree = ET.parse(self.source)
    if len(self.line_numbers) > 1:
        # reference line: the 10th line number on verso pages (when present),
        # the 2nd otherwise.
        # BUGFIX: the former guard `len(self.line_numbers) > 8` allowed an
        # IndexError for self.line_numbers[9] when exactly 9 entries exist.
        line_number = self.line_numbers[9]\
                if transkription_field.is_page_verso() and len(self.line_numbers) > 9\
                else self.line_numbers[1]
        # svg text nodes near the transkription field that represent exactly
        # this line number
        ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
                if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
                and LineNumber.IS_A_LINE_NUMBER(item)\
                and LineNumber(raw_text_node=item, transkription_field=transkription_field).id == line_number.id ]
        if len(ln_nodes) > 0:
            matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
            if transkription_field.is_page_verso():
                transkription_field.add_line_number_area_width(matrix.getX())
            elif self.svg_file is not None and isfile(self.svg_file):
                # on recto pages the glyph width itself is added; look the
                # glyph up in the svg path file via its <use> reference
                svg_path_tree = ET.parse(self.svg_file)
                namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
                svg_x = matrix.getX()
                svg_y = self.line_numbers[1].bottom + transkription_field.ymin
                use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
                        .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
                if len(use_nodes) > 0:
                    # resolve the referenced symbol and measure its path width
                    symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
                    d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
                    if len(d_strings) > 0 and d_strings[0] != '':
                        path = parse_path(d_strings[0])
                        xmin, xmax, ymin, ymax = path.bbox()
                        width = xmax - xmin
                        transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
    """Determine the page type (verso/recto) and attach it to page_tree
    as the 'pageType' attribute.

    :param transkription_field: TranskriptionField of self.source
        (created from self.source when None).
    :raise FileNotFoundError: if a TranskriptionField has to be created
        but self.source is None or not a file.
    """
    if transkription_field is None:
        if self.source is None or not isfile(self.source):
            raise FileNotFoundError('Page does not have a source!')
        transkription_field = TranskriptionField(self.source)
    if transkription_field.is_page_verso():
        self.page_type = Page.PAGE_VERSO
    else:
        self.page_type = Page.PAGE_RECTO
    self.page_tree.getroot().set('pageType', self.page_type)
def do_paths_intersect_saveMode(path1, path2):
    """Returns true if paths intersect, false if not or if there was an exception.
    """
    try:
        result = path1.intersect(path2, justonemode=True)
    except AssertionError:
        # intersect() may assert on degenerate paths; treat that as "no intersection"
        result = False
    return result
Index: svgscripts/datatypes/writing_process.py
===================================================================
--- svgscripts/datatypes/writing_process.py (revision 65)
+++ svgscripts/datatypes/writing_process.py (revision 66)
@@ -1,85 +1,89 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a text version.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
+import sys
from .attachable_object import AttachableObject
-from .class_spec import SemanticClass
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
class WritingProcess(AttachableObject,SemanticClass):
"""
This class represents a stage in Nietzsche's process of writing the text.
Args:
version (int): stage in the writing process
words (list of datatypes.word.Word): all words that belong to this stage
"""
XML_TAG = 'writing-process'
FIRST_VERSION = 0
INSERTION_AND_ADDITION = 1
LATER_INSERTION_AND_ADDITION = 2
VERSION_DESCRIPTION = [ 'first version', 'insertion and addition', 'later insertion and addition' ]
def __init__(self, version=FIRST_VERSION):
+ self.id = version
self.version = version
self.description = WritingProcess.VERSION_DESCRIPTION[self.version]\
if self.version < len(WritingProcess.VERSION_DESCRIPTION) else ''
@classmethod
def create_writing_process_from_xml(cls, node, all_words=None):
    """Creates a WritingProcess by instantiating a <writing-process> xml node.

    :param node: (lxml.etree.Element) node with an optional 'version' attribute;
        cls.FIRST_VERSION is used when the attribute is missing/empty.
    :param all_words: unused; kept for interface compatibility
        (BUGFIX(idiom): formerly a mutable default argument `[]`).
    [:return:] (datatypes.writing_process) WritingProcess
    """
    version = int(node.get('version'))\
            if bool(node.get('version')) else cls.FIRST_VERSION
    return cls(version=version)
@classmethod
def get_semantic_dictionary(cls):
    """ Creates and returns a semantic dictionary as specified by SemanticClass.
    """
    properties = {
        'version': (int, 1, '{}/@version'.format(WritingProcess.XML_TAG)),
        'description': (str, 1, '{}/@description'.format(WritingProcess.XML_TAG)),
    }
    # same structure as the other datatypes: 'class' info plus 'properties'
    return { 'class': cls.get_class_dictionary(), 'properties': properties }
def attach_object_to_tree(self, target_tree):
    """Attach this writing process to target_tree, reusing an existing node
    with the same version if one is present.
    """
    existing_nodes = target_tree.getroot().xpath('//' + WritingProcess.XML_TAG + '[@version="%s"]' % self.version)
    obj_node = existing_nodes[0] if len(existing_nodes) > 0\
            else ET.SubElement(target_tree.getroot(), WritingProcess.XML_TAG)
    obj_node.set('version', str(self.version))
    if self.description != '':
        obj_node.set('description', self.description)
Index: svgscripts/datatypes/image.py
===================================================================
--- svgscripts/datatypes/image.py (revision 65)
+++ svgscripts/datatypes/image.py (revision 66)
@@ -1,113 +1,116 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent all image types.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
+import sys
from .attachable_object import AttachableObject
-from .class_spec import SemanticClass
from .text_field import TextField
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
class Image(AttachableObject,SemanticClass):
    """
    This super class represents all types of images.

    Args:
        file_name (str): name of the image file.
        node (lxml.etree.Element) node, containing information
        URL (str): URL of image file.
        height (float): height of image
        width (float): width of image
        text_field (.text_field.TextField) text_field on image representation
    """
    stringKeys = [ 'file_name', 'URL', 'local_path' ]
    floatKeys = [ 'height', 'width' ]
    XML_TAG = 'image'

    def __init__(self, node=None, file_name=None, local_path=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
        self.text_field = text_field
        # BUGFIX: self.tag was formerly assigned only in the else branch, so
        # instances created from a node raised AttributeError in
        # attach_object_to_tree; assign it unconditionally.
        self.tag = tag
        if node is not None:
            # init from an existing xml node
            self.file_name = node.get('file-name')
            self.local_path = node.get('local-path')
            self.URL = node.get('URL')
            self.height = float(node.get('height'))
            self.width = float(node.get('width'))
            if len(node.findall(TextField.XML_TAG)) > 0:
                self.text_field = TextField(node=node.find(TextField.XML_TAG))
        else:
            self.file_name = file_name
            self.local_path = local_path
            self.URL = URL
            self.height = height
            self.width = width

    def attach_object_to_tree(self, target_tree):
        """Attach object to tree, reusing an existing node with self.tag if present.
        """
        obj_node = target_tree.getroot().find('.//' + self.tag) \
                if(len(target_tree.getroot().findall('.//' + self.tag)) > 0) \
                else ET.SubElement(target_tree.getroot(), self.tag)
        # float values are rounded to 3 digits; attribute names use '-' for '_'
        for key in self.floatKeys:
            if self.__dict__[key] is not None:
                obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3)))
        for key in self.stringKeys:
            if self.__dict__[key] is not None:
                obj_node.set(key.replace('_','-'), self.__dict__[key])
        if self.text_field is not None:
            self.text_field.attach_object_to_tree(obj_node)

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        properties = {}
        properties.update(dict(zip(Image.floatKeys, [ (float, 1, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in Image.floatKeys])))
        properties.update({'file_name': (str, 1, '{}/@file-name'.format(cls.XML_TAG))})
        # NOTE(review): URL maps to '@absolute-path' here but is written as
        # attribute 'URL' by attach_object_to_tree — confirm which is intended.
        properties.update({'URL': (str, 0, '{}/@absolute-path'.format(cls.XML_TAG))})
        properties.update({'text_field': (TextField, 0, '{}/{}'.format(cls.XML_TAG, TextField.XML_TAG))})
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary
class SVGImage(Image):
    """This class represents a svg image.
    """
    XML_TAG = 'svg-image'

    def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, text_field=None, tag=XML_TAG):
        # A node with a foreign tag is presumably an older representation
        # (TODO confirm): copy its 'file'/'height'/'width' attributes over
        # and discard the node before delegating to Image.__init__.
        if node is not None and node.tag != self.XML_TAG:
            file_name = node.get('file')
            height = float(node.get('height')) if bool(node.get('height')) else 0.0
            width = float(node.get('width')) if bool(node.get('width')) else 0.0
            node = None
        super(SVGImage, self).__init__(node=node, file_name=file_name, URL=URL,\
                height=height, width=width, text_field=text_field, tag=self.XML_TAG)
Index: svgscripts/datatypes/simple_word.py
===================================================================
--- svgscripts/datatypes/simple_word.py (revision 65)
+++ svgscripts/datatypes/simple_word.py (revision 66)
@@ -1,104 +1,107 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent a simple word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import abc
from lxml import etree as ET
+import sys
-from .class_spec import SemanticClass
from .lineNumber import LineNumber
from .transkription_position import TranskriptionPosition
from .word_position import WordPosition
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
class SimpleWord(SemanticClass, metaclass=abc.ABCMeta):
    """
    This abstract class represents a simple word.
    """
    XML_TAG = 'simple-word'
    XML_SUB_TAG = 'content'

    def __init__(self, id=0, line_number=-1, text='', deleted=False, transkription_positions=None, faksimile_positions=None):
        # NOTE(review): the 'deleted' parameter is accepted but not stored
        # here — presumably handled by subclasses, confirm.
        self.id = id
        self.text = text
        self.line_number = line_number
        self.transkription_positions = transkription_positions if transkription_positions is not None else []
        self.faksimile_positions = faksimile_positions if faksimile_positions is not None else []

    def attach_word_to_tree(self, target_tree):
        """Attaches word to tree target_tree, replacing any previous node
        with the same id.

        :return: (lxml.etree.Element) the newly created word node.
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        # remove a stale node with the same id before re-attaching
        if len(target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)) > 0:
            word_node = target_tree.xpath('.//' + self.XML_TAG + '[@id="%s"]' % self.id)[0]
            word_node.getparent().remove(word_node)
        word_node = ET.SubElement(target_tree, self.XML_TAG, attrib={'id': str(self.id)})
        word_node.set('text', self.text)
        if self.line_number > -1:
            word_node.set('line-number', str(self.line_number))
        for transkription_position in self.transkription_positions:
            transkription_position.attach_object_to_tree(word_node)
        for faksimile_position in self.faksimile_positions:
            faksimile_position.attach_object_to_tree(word_node)
        return word_node

    @classmethod
    def create_cls(cls, word_node):
        """Creates a cls from a (lxml.Element) node.

        :raise Exception: if word_node is None.
        [:return:] cls
        """
        if word_node is not None: # init word from xml node
            id = int(word_node.get('id'))
            line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else -1
            text = word_node.get('text')
            transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ]
            faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ]
            return cls(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\
                    faksimile_positions=faksimile_positions)
        else:
            error_msg = 'word_node has not been defined'
            raise Exception('Error: {}'.format(error_msg))

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        properties = {'text': (str, 1, '{0}/@text'.format(cls.XML_TAG)),\
                'line_number': {'class': LineNumber, 'cardinality': 0,\
                'name': 'wordHasLineNumber', 'xpath': '{0}/@line-number'.format(cls.XML_TAG),\
                'label': 'word has a line number',\
                'comment': 'Relating a word to a line number it has.'},\
                'transkription_positions': (TranskriptionPosition, SemanticClass.LIST, '{0}/@id'.format(cls.XML_TAG)),\
                'faksimile_positions': (WordPosition, SemanticClass.LIST, '{0}/@id'.format(cls.XML_TAG))}
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary
Index: svgscripts/datatypes/manuscript.py
===================================================================
--- svgscripts/datatypes/manuscript.py (revision 0)
+++ svgscripts/datatypes/manuscript.py (revision 66)
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+""" This class can be used to represent an archival unity of manuscript pages, i.e. workbooks, notebooks, folders of handwritten pages.
+"""
+# Copyright (C) University of Basel 2019 {{{1
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see 1}}}
+
+__author__ = "Christian Steiner"
+__maintainer__ = __author__
+__copyright__ = 'University of Basel'
+__email__ = "christian.steiner@unibas.ch"
+__status__ = "Development"
+__license__ = "GPL v3"
+__version__ = "0.0.1"
+
+from lxml import etree as ET
+from os.path import isfile
+import sys
+
+from .page import Page, FILE_TYPE_XML_MANUSCRIPT, FILE_TYPE_SVG_WORD_POSITION
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
+sys.path.append('shared_util')
+from myxmlwriter import parse_xml_of_type, xml_has_type
+
+class ArchivalManuscriptUnity(SemanticClass):
+ """
+ This class represents an archival unity of manuscript pages, i.e. workbooks, notebooks, folders of handwritten pages.
+ @label archival unity of manuscript pages
+
+ Args:
+ title title of archival unity
+ manuscript_type type of manuscript: 'Arbeitsheft', 'Notizheft', 'Mappe'
+ """
+ XML_TAG = 'manuscript'
+ RDFS_SUBCLASSOF = 'http://www.knora.org/ontology/0068/nietzsche#Manuscript'
+
+ def __init__(self, title='', manuscript_type=''):
+ self.title = title
+ self.manuscript_type = manuscript_type
+ self.pages = []
+
+ def get_name_and_id(self):
+ """Return an identification for object as 2-tuple.
+ """
+ return '', self.title.replace(' ', '_')
+
+ @classmethod
+ def get_semantic_dictionary(cls):
+ """ Creates a semantic dictionary as specified by SemanticClass.
+ """
+ dictionary = {}
+ class_dict = cls.get_class_dictionary()
+ properties = {}
+ properties.update({'title': (str, 1, '{}/@title'.format(cls.XML_TAG))})
+ properties.update({'manuscript_type': (str, 1, '{}/@type'.format(cls.XML_TAG))})
+ properties.update({'pages': (Page, SemanticClass.LIST, '{}/pages/page'.format(cls.XML_TAG))})
+ dictionary.update({'class': class_dict})
+ dictionary.update({'properties': properties})
+ return dictionary
+
+ @classmethod
+ def create_cls(cls, xml_manuscript_file, page_status_list=None, page_xpath=''):
+ """Create an instance of ArchivalManuscriptUnity from a xml file of type FILE_TYPE_XML_MANUSCRIPT.
+
+ :return: ArchivalManuscriptUnity
+ """
+ manuscript_tree = parse_xml_of_type(xml_manuscript_file, FILE_TYPE_XML_MANUSCRIPT)
+ title = manuscript_tree.getroot().get('title') if bool(manuscript_tree.getroot().get('title')) else ''
+ manuscript_type = manuscript_tree.getroot().get('type') if bool(manuscript_tree.getroot().get('type')) else ''
+ manuscript = cls(title=title, manuscript_type=manuscript_type)
+ if page_xpath == '':
+ page_status = ''
+ if page_status_list is not None\
+ and type(page_status_list) is list\
+ and len(page_status_list) > 0:
+ page_status = '[' + ' and '.join([ 'contains(@status, "{}")'.format(status) for status in page_status_list ]) + ']'
+ page_xpath = '//pages/page{0}/@output'.format(page_status)
+ manuscript.pages = [ Page(xml_source_file=page_source)\
+ for page_source in manuscript_tree.xpath(page_xpath)\
+ if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ]
+ return manuscript
+
Index: svgscripts/datatypes/text_field.py
===================================================================
--- svgscripts/datatypes/text_field.py (revision 65)
+++ svgscripts/datatypes/text_field.py (revision 66)
@@ -1,52 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-""" This class represents a text field on a faksimile svg.
+""" This class represents a text field on a faksimile image.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
import sys
import re
from os import path, sep
import lxml.etree as ET
from .positional_object import PositionalObject
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__version__ = "0.0.1"
class TextField(PositionalObject):
    """
    This class represents the text field of a faksimile image.

    Args:
        id (str): id from svg file.
        width (float)
        height (float)
        x (float)
        y (float)
    """
    XML_TAG = 'text-field'

    def __init__(self, id=0, node=None, width=0.0, height=0.0, x=0.0, y=0.0):
        super(TextField, self).__init__(node=node, id=id, width=width, height=height, x=x, y=y, tag=self.XML_TAG)
        # convenience aliases for the corners of the text field's bounding box
        self.xmin = self.left
        self.ymin = self.top
        self.xmax = self.xmin + self.width
        self.ymax = self.ymin + self.height
Index: svgscripts/datatypes/transkription_position.py
===================================================================
--- svgscripts/datatypes/transkription_position.py (revision 65)
+++ svgscripts/datatypes/transkription_position.py (revision 66)
@@ -1,174 +1,177 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a transkription word position.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
+import sys
-
-from .class_spec import SemanticClass
from .debug_message import DebugMessage
from .positional_word_part import PositionalWordPart
from .word_position import WordPosition
from .matrix import Matrix
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
+
class TranskriptionPosition(WordPosition):
    """
    This class represents a transkription word position.
    Args:
        id (int): word id
        matrix (datatypes.Matrix): matrix containing information about transformation.
        height (float): height of word
        width (float): width of word
        x (float): x position of word
        y (float): y position of word
        positional_word_parts a list of (datatypes.positional_word_part) PositionalWordPart
        debug_message a (datatypes.debug_message) DebugMessage
    """
    # paddings applied when deriving a bounding box from positional word parts
    ADD2X = 0.15
    ADD2TOP = 1.0
    ADD2BOTTOM = 0.2
    HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
    XML_TAG = WordPosition.TRANSKRIPTION
    def __init__(self, id=0, node=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, positional_word_parts=None, debug_message=None):
        super(TranskriptionPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION)
        self.positional_word_parts = positional_word_parts if positional_word_parts is not None else []
        self.debug_message = debug_message
        self.deleted = False
        self.has_box = None
        if node is not None:
            # when created from a xml node, read debug message and word parts from child nodes
            self.debug_message = DebugMessage(node=node.xpath('.//' + DebugMessage.XML_TAG)[0])\
                    if len(node.xpath('.//' + DebugMessage.XML_TAG)) > 0 else None
            self.positional_word_parts = [ PositionalWordPart(node=pwp_node) for pwp_node in node.xpath('.//' + PositionalWordPart.XML_TAG) ]
        # word parts (and the debug message, if any) are serialized together with this position
        self.attachable_objects += self.positional_word_parts
        if self.debug_message is not None:
            self.attachable_objects.append(self.debug_message)
    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.
        """
        dictionary = super(TranskriptionPosition,cls).get_semantic_dictionary()
        dictionary['properties'].update({'positional_word_parts': (PositionalWordPart, SemanticClass.LIST, '{}/@id'.format(cls.XML_TAG))})
        return dictionary
    def get_text(self):
        """Returns the concatenated text of all positional_word_parts.
        """
        return ''.join([pwp.text for pwp in self.positional_word_parts])
    def split(self, split_position, second_split=-1):
        """Split a transkription_position in two at split_position.
        :return: a list of the new transkription_positions
        """
        transkription_positions = []
        # word parts entirely left of split_position form the first new position
        left_pwp = [ pwp for pwp in self.positional_word_parts if pwp.left + pwp.width < split_position ]
        transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(left_pwp, transkription_position_id=self.id)
        if second_split == -1:
            # two-way split: everything not in left_pwp becomes the second position
            right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp ]
            next_id = int(self.id) + 1
            transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id))
        else:
            # three-way split: middle part up to second_split, remainder to the right.
            # NOTE(review): next_id is recomputed from self.id in both branches below, so the
            # middle and right positions receive the same id — confirm this is intended.
            middle_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp.left + pwp.width < second_split ]
            next_id = int(self.id) + 1
            transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(middle_pwp, transkription_position_id=str(next_id))
            right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp not in middle_pwp ]
            next_id = int(self.id) + 1
            transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id))
        return transkription_positions
    @staticmethod
    def CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=None, debug_msg_string=None, transkription_position_id=0):
        """Creates a list of TranskriptionPosition from a list of (datatypes.positional_word_part) PositionalWordPart.
        [:return:] a list of (datatypes.transkription_position) TranskriptionPosition
        """
        TOPCORRECTION = 1
        debug_message = DebugMessage(message=debug_msg_string)\
                if debug_msg_string is not None else debug_message
        transkription_positions = []
        if len(positional_word_parts) < 1:
            return []
        matrix = positional_word_parts[0].transform
        index = 0
        matrices_differ = False
        style_class = positional_word_parts[0].style_class
        styles_differ = False
        # find the first word part whose transform or style class differs from the first one
        while index < len(positional_word_parts) and not matrices_differ and not styles_differ:
            if Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix, positional_word_parts[index].transform):
                matrices_differ = True
            elif style_class != positional_word_parts[index].style_class:
                styles_differ = True
            else:
                index += 1
        if (matrices_differ or styles_differ) and index < len(positional_word_parts):
            # recurse for the heterogeneous remainder; this call keeps only the homogeneous prefix
            debug_msg_string = 'matrices differ' if matrices_differ else 'styles differ'
            transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                    positional_word_parts[index:], debug_msg_string=debug_msg_string, transkription_position_id=int(transkription_position_id)+1)
            positional_word_parts = positional_word_parts[:index]
        # bounding box around the remaining word parts, padded by TOPCORRECTION and ADD2X
        height = [ pwp.height for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.height, reverse=True)][0] + 2*TOPCORRECTION
        x = positional_word_parts[0].left - TranskriptionPosition.ADD2X
        y = [ pwp.top for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.top)][0] - TOPCORRECTION
        width = positional_word_parts[len(positional_word_parts)-1].left - x\
                + positional_word_parts[len(positional_word_parts)-1].width + TranskriptionPosition.ADD2X
        # renumber word parts relative to their new position
        for pwp_index, pwp in enumerate(positional_word_parts):
            pwp.id = pwp_index
        transkription_positions.insert(0, TranskriptionPosition(id=transkription_position_id, height=height, width=width, x=x, y=y, matrix=matrix,\
                positional_word_parts=positional_word_parts, debug_message=debug_message))
        return transkription_positions
    @staticmethod
    def CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=None, debug_msg_string=None, transkription_field=None):
        """Creates a list of TranskriptionPosition from word_part_objs (i.e. a list of dictionaries
        with the keys: text, x, y, matrix, class).
        [:return:] a list of (datatypes.transkription_position) TranskriptionPosition
        """
        positional_word_parts = []
        debug_message = DebugMessage(message=debug_msg_string)\
                if debug_msg_string is not None else None
        if page.svg_file is not None and isfile(page.svg_file):
            # derive exact positions from the svg paths of the page
            svg_path_tree = ET.parse(page.svg_file)
            namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
            xmin = 0.0
            ymin = 0.0
            if transkription_field is not None:
                # shift coordinates relative to the transkription field
                xmin = transkription_field.xmin
                ymin = transkription_field.ymin
            for part_obj in word_part_objs:
                positional_word_parts += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(\
                        part_obj, svg_path_tree, namespaces, page, start_id=len(positional_word_parts),\
                        xmin=xmin, ymin=ymin)
        else:
            # no svg file available -> fall back to simple positions
            positional_word_parts = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
        if len(positional_word_parts) > 0:
            return TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=debug_message)
        else:
            return [ TranskriptionPosition(matrix=matrix, debug_message=debug_message) ]
Index: svgscripts/datatypes/path.py
===================================================================
--- svgscripts/datatypes/path.py (revision 65)
+++ svgscripts/datatypes/path.py (revision 66)
@@ -1,163 +1,170 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This super class can be used to represent all svg path types.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile
from svgpathtools.parser import parse_path
+import sys
from .attachable_object import AttachableObject
-from .class_spec import SemanticClass
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
class Path(AttachableObject,SemanticClass):
"""
This super class represents all types of svg paths.
Args:
node (lxml.etree.Element) node, containing information
path (svgpathtools.path.Path) svg path representation.
"""
XML_TAG = 'path'
WORD_DELETION_PATH_TAG = 'word-deletion-path'
BOX_TAG = 'box-path'
def __init__(self, id=0, node=None, path=None, d_string=None, style_class='', tag=XML_TAG):
self.intKeys = [ 'id' ]
self.stringKeys = [ 'style_class' ]
self.floatKeys = []
if node is not None:
self.id = int(node.get('id')) if bool(node.get('id')) else 0
self.path = parse_path(node.get('d')) if bool(node.get('d')) else None
+ self.d_attribute = node.get('d')
self.style_class = node.get('style-class')
self.tag = node.tag
else:
self.tag = tag
self.id = id
self.path = path
if self.path is None\
and d_string is not None\
and d_string != '':
self.path = parse_path(d_string)
+ self.d_attribute = self.path.d() if self.path is not None else ''
self.style_class = style_class
def attach_object_to_tree(self, target_tree):
"""Attach object to tree.
"""
if target_tree.__class__.__name__ == '_ElementTree':
target_tree = target_tree.getroot()
obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \
if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \
else ET.SubElement(target_tree, self.tag)
for key in self.floatKeys:
if self.__dict__[key] is not None:
obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3)))
for key in self.intKeys + self.stringKeys:
if self.__dict__[key] is not None:
obj_node.set(key.replace('_','-'), str(self.__dict__[key]))
if self.path is not None:
obj_node.set('d', self.path.d())
def contains_path(self, other_path):
"""Returns true if other_path is contained in this path.
"""
this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox()
other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox()
return other_xmin >= this_xmin and other_xmax <= this_xmax\
and other_ymin >= this_ymin and other_ymax <= this_ymax
def contains_start_of_path(self, other_path):
"""Returns true if start of other_path is contained in this path.
"""
this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox()
other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox()
return other_xmin >= this_xmin and other_xmin < this_xmax\
and other_ymin >= this_ymin and other_ymax <= this_ymax
def contains_end_of_path(self, other_path):
"""Returns true if end of other_path is contained in this path.
"""
this_xmin, this_xmax, this_ymin, this_ymax = self.path.bbox()
other_xmin, other_xmax, other_ymin, other_ymax = other_path.path.bbox()
return other_xmax >= this_xmin and other_xmax < this_xmax\
and other_ymin >= this_ymin and other_ymax <= this_ymax
@classmethod
def create_path_from_transkription_position(cls, transkription_position, tr_xmin=0.0, tr_ymin=0.0):
"""Create a .path.Path from a .transkription_position.TranskriptionPosition.
"""
if len(transkription_position.positional_word_parts) > 0:
first_pwp = transkription_position.positional_word_parts[0]
last_pwp = transkription_position.positional_word_parts[len(transkription_position.positional_word_parts)-1]
xmin = tr_xmin + first_pwp.left
xmax = tr_xmin + last_pwp.left + last_pwp.width
ymin = tr_ymin + sorted(pwp.top for pwp in transkription_position.positional_word_parts)[0]
ymax = tr_ymin + sorted([pwp.bottom for pwp in transkription_position.positional_word_parts], reverse=True)[0]
else:
xmin = tr_xmin + transkription_position.left
xmax = xmin + transkription_position.width
ymin = tr_ymin + transkription_position.top
ymax = ymin + transkription_position.height
word_path = parse_path('M {}, {} L {}, {} L {}, {} L {}, {} z'.format(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax))
return cls(path=word_path)
def do_paths_intersect(self, other_path):
"""Returns true if paths intersect, false if not or if there was an exception.
"""
try:
return self.path.intersect(other_path.path, justonemode=True)
except AssertionError:
return False
-
def get_median_y(self, tr_ymin=0.0):
"""Return the median of ymin + ymax.
"""
return (self.path.bbox()[2] + self.path.bbox()[3])/2 - tr_ymin
def get_x(self, tr_xmin=0.0):
"""Return xmin.
"""
return self.path.bbox()[0] - tr_xmin
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = {}
- properties.update({'path': (str, 0, '{}/@d'.format(cls.XML_TAG))})
+ properties.update({'d_attribute': { 'class': str, 'cardinality': 0,\
+ 'name': 'hasDAttribute', 'label': 'svg path has d attribute',\
+ 'comment': 'The d attribute defines a path to be drawn.',\
+ 'xpath': '{}/@d'.format(cls.XML_TAG)}})
properties.update({'style_class': (str, 0, '{}/@style-class'.format(cls.XML_TAG))})
dictionary.update({'class': class_dict})
dictionary.update({'properties': properties})
return dictionary
def is_partially_contained_by(self, other_path):
"""Returns true if other_path containes this path partially.
"""
return other_path.contains_start_of_path(self) or other_path.contains_end_of_path(self)
Index: svgscripts/datatypes/reference.py
===================================================================
--- svgscripts/datatypes/reference.py (revision 65)
+++ svgscripts/datatypes/reference.py (revision 66)
@@ -1,144 +1,148 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a text reference.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
import re
+import sys
from .attachable_object import AttachableObject
-from .class_spec import SemanticClass
+
+sys.path.append('py2ttl')
+from class_spec import SemanticClass
+
class Reference(AttachableObject,SemanticClass):
    """
    This class represents a text reference.
    Args:
        id (int): object id
        first_line (int) first line of reference
        last_line (int) last line of reference
        is_uncertain (bool) whether reference is uncertain
        title (str) title of reference
        page_number (str) page_number of reference
        tag (str) xml tag
    """
    XML_TAG = 'reference'
    # class-level key lists; per-instance copies are made in __init__
    intKeys = [ 'first_line', 'last_line']
    boolKeys = [ 'is_uncertain' ]
    stringKeys = [ 'title', 'page_number' ]
    def __init__(self, node=None, id=0, first_line=-1, last_line=-1, is_uncertain=False, title='', page_number='', tag=XML_TAG):
        # copy the class-level key lists so that instance mutation does not leak into the class
        self.intKeys = []
        self.intKeys += Reference.intKeys
        self.intKeys.append('id')
        self.stringKeys = []
        self.stringKeys += Reference.stringKeys
        self.boolKeys = []
        self.boolKeys += Reference.boolKeys
        self.id = id
        self.first_line = first_line
        self.last_line = last_line
        self.is_uncertain = is_uncertain
        self.title = title
        self.page_number = page_number
        self.tag = tag
    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        # reuse a node with the same tag and id if present, otherwise create a new subelement
        obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \
                if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \
                else ET.SubElement(target_tree, self.tag)
        for key in self.boolKeys:
            if self.__dict__[key] is not None:
                obj_node.set(key.replace('_','-'), str(self.__dict__[key]).lower())
        for key in self.intKeys:
            # int attributes are only serialized when set (> -1)
            if self.__dict__[key] is not None and self.__dict__[key] > -1:
                obj_node.set(key.replace('_','-'), str(self.__dict__[key]))
        for key in self.stringKeys:
            if self.__dict__[key] is not None and self.__dict__[key] != '':
                obj_node.set(key.replace('_','-'), str(self.__dict__[key]))
    @classmethod
    def create_cls(cls, node=None, id=0, is_uncertain=False, reference_string='', title='', page_number=''):
        """Creates a Reference from a (lxml.etree.Element) node or a reference_string.
        :return: (datatypes.reference) Reference
        """
        if node is not None:
            # initialize from xml attributes (hyphenated names map to underscored keys)
            instance = cls()
            for key in instance.boolKeys:
                xml_key = key.replace('_', '-')
                if bool(node.get(xml_key)):
                    instance.__dict__[key] = node.get(xml_key) == 'true'
            for key in instance.intKeys:
                xml_key = key.replace('_', '-')
                if bool(node.get(xml_key)):
                    instance.__dict__[key] = int(node.get(xml_key))
            for key in instance.stringKeys:
                xml_key = key.replace('_', '-')
                if bool(node.get(xml_key)):
                    instance.__dict__[key] = node.get(xml_key)
            return instance
        else:
            first_line = -1
            last_line = -1
            # e.g. "14,25-26" -> page_number "14", first_line 25, last_line 26
            if re.match(r'[0-9]+([a-z]+)*,[0-9]+(-[0-9]+)*', reference_string):
                page_number = reference_string.split(',')[0]
                line_numbers = reference_string.split(',')[1].split('-')
                first_line = int(line_numbers[0])
                last_line = int(line_numbers[1]) if len(line_numbers) > 1 else -1
            else:
                if ',' not in reference_string:
                    # plain line reference, e.g. "25" or "25-26"
                    line_numbers = reference_string.split('-')
                    first_line = int(line_numbers[0])
                    last_line = int(line_numbers[1]) if len(line_numbers) > 1 else -1
                else:
                    # "<title> <page,lines>" -> strip the leading title and recurse
                    if ' ' not in reference_string:
                        raise Exception('String "{}" is not a valid reference_string'.format(reference_string))
                    title = reference_string.split(' ')[0]
                    return cls.create_cls(id=id, is_uncertain=is_uncertain, reference_string=reference_string[len(title)+1:],\
                            title=title, page_number=page_number)
        return cls(id=id, is_uncertain=is_uncertain, first_line=first_line, last_line=last_line,\
                title=title, page_number=page_number)
    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.
        """
        dictionary = {}
        class_dict = cls.get_class_dictionary()
        properties = {}
        # map each key list to (type, cardinality, xpath) triples on the xml attributes
        properties.update(dict(zip(cls.intKeys, [ (int, 0, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in cls.intKeys])))
        properties.update(dict(zip(cls.boolKeys, [ (bool, 1, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in cls.boolKeys])))
        properties.update(dict(zip(cls.stringKeys, [ (str, 1, '{}/@{}'.format(cls.XML_TAG, i.replace('_','-'))) for i in cls.stringKeys])))
        dictionary.update({'class': class_dict})
        dictionary.update({'properties': properties})
        return dictionary
Index: svgscripts/datatypes/word_position.py
===================================================================
--- svgscripts/datatypes/word_position.py (revision 65)
+++ svgscripts/datatypes/word_position.py (revision 66)
@@ -1,90 +1,92 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word position.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from .matrix import Matrix
from .positional_object import PositionalObject
from .writing_process import WritingProcess
class WordPosition(PositionalObject):
"""
This class represents a word position.
Args:
id (int): word id
matrix (datatypes.Matrix): matrix containing information about conversion.
height (float): height of word
width (float): width of word
x (float): x position of word
y (float): y position of word
tag (str) location of the word position: 'WordPosition.TRANSKRIPTION' (default) or 'WordPosition.FAKSIMILE'
"""
TRANSKRIPTION = 'transkription-position'
FAKSIMILE = 'faksimile-position'
XML_TAG = 'faksimile-position'
    def __init__(self, id=0, node=None, text=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, tag=TRANSKRIPTION):
        super(WordPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=tag)
        # writing_process_id is serialized as an int attribute; -1 means "not set"
        self.intKeys.append('writing_process_id')
        self.writing_process_id = -1
        self.text = text
        if node is not None:
            self.writing_process_id = int(node.get('writing-process-id'))\
                    if bool(node.get('writing-process-id')) else -1
@classmethod
def copy_list_of_cls(cls, word_positions):
"""Return a copy of word_positions.
"""
return [ cls(id=wp.id, height=wp.height, width=wp.width, x=wp.left, y=wp.top, matrix=wp.transform)\
for wp in word_positions ]
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = super(WordPosition,cls).get_semantic_dictionary()
+ """
dictionary['properties'].update({'writing_process_id':\
{ 'class': WritingProcess, 'cardinality': 1, 'cardinality_restriction': 'cardinality',\
'name': '{}BelongsTo{}'.format(WordPosition.__name__, WritingProcess.__name__),\
'label': "connects a {} with a stage in Nietzsche's process of writing".format(WordPosition.__name__),\
'xpath': '{}/@writing-process-id'.format(cls.XML_TAG)}})
+ """
return dictionary
    def isOnTranskription(self):
        """Returns whether position is on transkription.

        True iff this position was created with tag WordPosition.TRANSKRIPTION.
        """
        return self.tag == self.TRANSKRIPTION
    def isOnFaksimile(self):
        """Returns whether position is on faksimile.

        True iff this position was created with tag WordPosition.FAKSIMILE.
        (Docstring previously said 'transkription' — copy/paste slip.)
        """
        return self.tag == self.FAKSIMILE
Index: svgscripts/util.py
===================================================================
--- svgscripts/util.py (revision 65)
+++ svgscripts/util.py (revision 66)
@@ -1,319 +1,321 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to copy a faksimile svg file with the option of highlighting some word boxes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. 1}}}
from colorama import Fore, Style
from functools import cmp_to_key
import getopt
import inspect
import itertools
import lxml.etree as ET
import re
import shutil
import signal
import string
import subprocess
from svgpathtools import svg_to_paths
import sys
import tempfile
import os
from os import listdir, sep, path, setpgrp, devnull, makedirs
from os.path import basename, commonpath, dirname, exists, isfile, isdir, realpath, splitext
import warnings
import wget
import xml.etree.ElementTree as XET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.faksimile_image import FaksimileImage
from datatypes.lineNumber import LineNumber
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import update_transkription_position_ids
from local_config import FAKSIMILE_LOCATION, PDF_READER, SVG_EDITOR, USER_ROOT_LOCATION_DICT
-from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from process_files import update_svgposfile_status
+sys.path.append('shared_util')
+from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
+
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
class ExternalViewer:
    """This class can be used to show files with external viewers.
    """
    file_format_viewer_dict = { '.pdf': PDF_READER, '.svg': SVG_EDITOR }
    @classmethod
    def show_files(cls, single_file=None, list_of_files=None):
        """Opens file(s) with corresponding external viewer(s).

        All files but the last are opened as background processes (output to devnull);
        the last file is opened in the foreground, and when its viewer exits all
        background viewers are terminated.

        :param single_file: a single file, or a list of files, to open
        :param list_of_files: further files to open
        """
        # BUGFIX: the default used to be a shared mutable list ([]) that was mutated
        # (append/reverse/pop), leaking state across calls and destructively draining
        # caller-supplied lists; work on a fresh copy instead.
        list_of_files = [] if list_of_files is None else list(list_of_files)
        DEVNULL = None
        if type(single_file) == list:
            list_of_files = list(single_file)
        elif single_file is not None:
            list_of_files.append(single_file)
        if len(list_of_files) > 1:
            DEVNULL = open(devnull, 'wb')
        process_list = []
        list_of_files.reverse()
        while len(list_of_files) > 0:
            file2open = list_of_files.pop()
            viewer = cls.file_format_viewer_dict.get(splitext(file2open)[1])
            if viewer is not None:
                if len(list_of_files) > 0:
                    # background viewers get their own process group so they can be killed below
                    process_list.append(\
                            subprocess.Popen([viewer, file2open], stdout=DEVNULL, stderr=DEVNULL, preexec_fn=os.setsid))
                else:
                    subprocess.run([viewer, file2open])
        for process in process_list:
            os.killpg(os.getpgid(process.pid), signal.SIGTERM)
        if DEVNULL is not None:
            DEVNULL.close()
def copy_faksimile_svg_file(target_file=None, faksimile_source_file=None, faksimile_tree=None, target_directory=None, abs_image_path=None, local_image_path=None, namespaces=None):
    """Copy a faksimile_svg_file to target_file.

    Either faksimile_source_file or faksimile_tree must be given, and either
    target_file or target_directory. When local_image_path/abs_image_path are given,
    the first image node's xlink:href / sodipodi:absref attributes are rewritten.
    :raises Exception: when no source (file or tree) or no target (file or directory) is provided
    """
    if faksimile_source_file is None and faksimile_tree is not None:
        faksimile_source_file = faksimile_tree.docinfo.URL
    elif faksimile_source_file is None:
        raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
    if target_file is not None and target_directory is not None:
        target_file = target_directory + sep + target_file
    elif target_file is None and target_directory is not None:
        target_file = target_directory + sep + basename(faksimile_source_file)
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    # read svg attributes of the source in order to register its namespace prefixes
    paths, attributes, svg_attributes = svg_to_paths.svg2paths(faksimile_source_file, return_svg_attributes=True)
    for key in [ key for key in svg_attributes.keys() if key.startswith('xmlns:') ]:
        try:
            XET.register_namespace(key.replace('xmlns:', ''), svg_attributes[key])
        except ValueError: pass
    XET.register_namespace('', 'http://www.w3.org/2000/svg')
    if namespaces is None:
        namespaces = { 'ns': svg_attributes['xmlns'], 'xlink': svg_attributes['xmlns:xlink'],\
                'sodipodi': svg_attributes['xmlns:sodipodi'] }
    if faksimile_tree is not None:
        # convert the lxml tree to a xml.etree tree for writing
        element = XET.fromstring(ET.tostring(faksimile_tree))\
                if type(faksimile_tree) == ET._ElementTree\
                else XET.fromstring(XET.tostring(faksimile_tree.getroot()))
        target_tree = XET.ElementTree(element)
    else:
        target_tree = XET.parse(faksimile_source_file)
    if (local_image_path is not None or abs_image_path is not None)\
        and len(target_tree.findall('.//ns:image', namespaces=namespaces)) > 0:
        # update the image location of the (first) image node
        image_node = target_tree.findall('.//ns:image', namespaces=namespaces)[0]
        if local_image_path is not None:
            image_node.set('{%s}href' % namespaces['xlink'], local_image_path)
        if abs_image_path is not None:
            image_node.set('{%s}absref' % namespaces['sodipodi'], abs_image_path)
    target_tree.write(target_file)
def copy_faksimile_update_image_location(faksimile_source_file=None, faksimile_tree=None, target_file=None, target_directory=None, overwrite=False):
    """Copy a faksimile_svg_file to target_file and update image location.

    Needs either faksimile_source_file or faksimile_tree, and either
    target_file or target_directory; raises an Exception otherwise. The linked
    background image is re-resolved for the target location (relative path if
    the target lies inside FAKSIMILE_LOCATION, user-specific absolute path if
    the target directory matches an entry of USER_ROOT_LOCATION_DICT) and the
    image is downloaded via wget if it is missing locally. Unless ``overwrite``
    is True an existing target_file is left untouched and a warning is issued.
    """
    if faksimile_source_file is None and faksimile_tree is not None:
        # fall back to the path recorded by lxml when the tree was parsed
        faksimile_source_file = faksimile_tree.docinfo.URL
    elif faksimile_source_file is None:
        raise Exception('copy_faksimile_svg_file needs either a faksimile_tree (lxml.etree.ElementTree) or a faksimile_source_file')
    if target_file is not None and target_directory is not None:
        target_file = target_directory + sep + target_file
    elif target_file is None and target_directory is not None:
        target_file = target_directory + sep + basename(faksimile_source_file)
    elif target_directory is None and target_file is not None:
        target_directory = dirname(target_file)
    elif target_file is None:
        raise Exception('copy_faksimile_svg_file needs either a target_file or a target_directory')
    source_tree = ET.parse(faksimile_source_file) if faksimile_tree is None else faksimile_tree
    # lxml maps the default namespace to key None; rename it to 'ns' for xpath
    namespaces = { k if k is not None else 'ns': v for k, v in source_tree.getroot().nsmap.items() }
    image_nodes = source_tree.xpath('//ns:image', namespaces=namespaces)
    local_image_path = None
    abs_image_path = None
    user_abs_image_path = None
    if len(image_nodes) > 0:
        image = FaksimileImage.CREATE_IMAGE(image_nodes[0], source_file=faksimile_source_file)
        abs_image_path = image.local_path
        # rewrite the absolute path for a user-specific root location, if any
        for user_name in USER_ROOT_LOCATION_DICT.keys():
            if user_name in target_directory:
                user_abs_image_path = abs_image_path.replace(FAKSIMILE_LOCATION, USER_ROOT_LOCATION_DICT[user_name]).replace('//','/')
                break
        # if target_directory is subdir of FAKSIMILE_LOCATION:
        # link the image relative to the target directory ('../../<path>')
        if realpath(target_directory).startswith(realpath(FAKSIMILE_LOCATION)):
            common_path = commonpath([ realpath(target_directory), realpath(dirname(image.local_path)) ])
            relative_directory = '/'.join(\
                    [ '..' for d in realpath(target_directory).replace(common_path + '/', '').split('/') ])
            local_image_path = relative_directory + realpath(image.local_path).replace(common_path, '')
            if not isfile(target_directory + sep + local_image_path):
                # relative link would be dangling -> do not use it
                local_image_path = None
        elif abs_image_path is not None:
            local_image_path = abs_image_path
        if abs_image_path is not None and not isfile(abs_image_path):
            # fetch the image from its URL if it is not present locally
            wget.download(image.URL, out=dirname(abs_image_path))
    if overwrite or not isfile(target_file):
        abs_image_path = user_abs_image_path if user_abs_image_path is not None else abs_image_path
        copy_faksimile_svg_file(target_file=target_file, faksimile_source_file=faksimile_source_file,\
                faksimile_tree=faksimile_tree, abs_image_path=abs_image_path,\
                local_image_path=local_image_path, namespaces=namespaces)
    else:
        msg = 'File {0} not copied to directory {1}, it already contains a file {2}.'.format(faksimile_source_file, target_directory, target_file)
        warnings.warn(msg)
def create_highlighted_svg_file(faksimile_tree, node_ids, target_file=None, target_directory=None, local_image_path=None, namespaces=None, highlight_color=HIGHLIGHT_COLOR, opacity=OPACITY):
    """Highlight the rect/path nodes of faksimile_tree given by node_ids and write the tree to a file.

    :param faksimile_tree: lxml ElementTree of the faksimile svg file
    :param node_ids: list of ids of rect/path nodes to highlight
    :param namespaces: xpath prefix->uri mapping; derived from the tree's root
        nsmap if omitted (mutable-default {} replaced by None)
    :param highlight_color: value for the 'fill' attribute
    :param opacity: value for the 'opacity' attribute (svg attribute, str)
    """
    if not namespaces:
        # lxml maps the default namespace to key None; rename it to 'ns' for xpath
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    for node_id in node_ids:
        for node in faksimile_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces):
            node.set('fill', highlight_color)
            node.set('opacity', opacity)
            node.set('style', '')   # clear any inline style that would override fill/opacity
    # NOTE(review): local_image_path is accepted but not forwarded;
    # copy_faksimile_update_image_location derives the image paths itself — confirm intended.
    copy_faksimile_update_image_location(target_file=target_file, faksimile_tree=faksimile_tree, target_directory=target_directory)
def get_empty_node_ids(faksimile_tree, x_min=0.0, x_max=0.0, y_min=0.0, y_max=0.0, text_field_id=None, faksimile_page=None, namespaces=None):
    """Return a list of ids of rect and path nodes that do not have a title element.

    If ``faksimile_page`` is given, the bounding box (x_min/x_max/y_min/y_max)
    and ``text_field_id`` are derived from its text field and image offsets.
    :return: list of id attribute values (str)
    """
    THRESHOLD_X = 10    # stay clear of the right edge of the text field
    if faksimile_page is not None:
        x_min = faksimile_page.text_field.xmin + faksimile_page.faksimile_image.x
        x_max = faksimile_page.text_field.xmax + faksimile_page.faksimile_image.x - THRESHOLD_X
        y_min = faksimile_page.text_field.ymin + faksimile_page.faksimile_image.y
        y_max = faksimile_page.text_field.ymax + faksimile_page.faksimile_image.y
        text_field_id = faksimile_page.text_field.id
    if not namespaces:
        # mutable-default {} replaced by None; lxml maps the default namespace
        # to key None -> rename it to 'ns' for xpath
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    # rects inside the bounding box (excluding the text field rect itself) ...
    nodes_without_title = faksimile_tree.xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}" and not(./ns:title)]'.format(\
            x_min, x_max, y_min, y_max, text_field_id), namespaces=namespaces)
    # ... plus paths inside the same area
    nodes_without_title += get_paths_inside_rect(faksimile_tree, '//ns:path[not(./ns:title)]', x_min, x_max, y_min, y_max, text_field_id, namespaces=namespaces)
    # fix: local was misspelled 'empyt_node_ids'; build the result list directly
    return [ node_without_title.get('id') for node_without_title in nodes_without_title ]
def record_changes(original_svg_file, changed_svg_file, node_ids, namespaces={}):
    """Copy changes made to changed_svg_file to original_svg_file.

    For each id in node_ids: if the changed file contains a rect/path with that
    id and a title element, the title text is copied to (or created on) the
    corresponding node of the original file; if the changed file no longer has
    a title for that id, the node is removed from the original file. The
    updated original tree is then written back to original_svg_file.
    """
    old_tree = ET.parse(original_svg_file)
    new_tree = ET.parse(changed_svg_file)
    if len(namespaces) == 0:
        # lxml maps the default namespace to key None; rename it to 'ns' for xpath
        namespaces = { k if k is not None else 'ns': v for k, v in new_tree.getroot().nsmap.items() }
    for node_id in node_ids:
        new_titles = new_tree.xpath('//ns:rect[@id="{0}"]/ns:title|//ns:path[@id="{0}"]/ns:title'.format(node_id), namespaces=namespaces)
        old_nodes = old_tree.xpath('//ns:rect[@id="{0}"]|//ns:path[@id="{0}"]'.format(node_id), namespaces=namespaces)
        if len(new_titles) > 0 and len(old_nodes) > 0:
            if old_nodes[0].find('ns:title', namespaces=namespaces) is not None:
                # node already has a title -> just update the text
                old_nodes[0].find('ns:title', namespaces=namespaces).text = new_titles[0].text
            else:
                # node has no title yet -> create one, reusing the changed file's title id
                old_title_id_string = new_titles[0].get('id')
                old_title = ET.SubElement(old_nodes[0], 'title', attrib={ 'id': old_title_id_string })
                old_title.text = new_titles[0].text
        elif len(old_nodes) > 0:
            # id no longer carries a title in the changed file -> drop the node
            for old_node in old_nodes:
                old_node.getparent().remove(old_node)
    copy_faksimile_svg_file(target_file=original_svg_file, faksimile_tree=old_tree)
def record_changes_on_svg_file_to_page(xml_source_file, svg_file, word_ids=None):
    """Copy changes made to svg_file to xml_source_file.

    Reads the rect nodes of the svg file's 'Transkription' group back into the
    transkription positions of the page (geometry and title text). Rects whose
    id carries another word's original id in their inkscape:label are treated
    as moved between words. Afterwards words are split according to mixed text
    status, and the page tree is written back (unless UNITTESTING).
    :return: datatypes.page.Page
    """
    svg_tree = ET.parse(svg_file)
    # lxml maps the default namespace to key None; rename it to 'ns' for xpath
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    transkription_field = TranskriptionField(svg_file)
    page = Page(xml_source_file=xml_source_file)
    # restrict processing to word_ids if given
    words = [ word for word in page.words if word.id in word_ids ]\
            if word_ids is not None else page.words
    new_page_words = []
    for word in words:
        word_id = 'word_' + str(word.id) + '_'
        recorded_ids = []
        for transkription_position in word.transkription_positions:
            transkription_position_id = word_id + str(transkription_position.id)
            tp_nodes = svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]'.format(transkription_position_id), namespaces=namespaces)
            if len(tp_nodes) > 0:
                record_changes_to_transkription_position(tp_nodes[0], transkription_position,\
                        transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                recorded_ids.append(transkription_position_id)
        # rects that carry this word's id prefix but were not recorded above:
        # they were reassigned to this word in the svg editor
        extra_nodes = [ node for node in\
                svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[contains(@id, "{0}")]'.format(word_id), namespaces=namespaces)\
                if node.get('id') not in recorded_ids ]
        if len(extra_nodes) > 0:
            for extra_node in extra_nodes:
                # the inkscape:label preserves the original 'word_<id>_<tp-id>' id
                old_ids = [ inkscape_id.replace('#','') for inkscape_id in\
                        svg_tree.xpath('//ns:g[@id="Transkription"]/ns:rect[@id="{0}"]/@inkscape:label'.format(extra_node.get('id')),\
                        namespaces=namespaces) ]
                if len(old_ids) > 0 and re.match(r'word_[0-9]+_[0-9]+', old_ids[0]):
                    old_id_list = old_ids[0].split('_')
                    ref_word_id = int(old_id_list[1])
                    # NOTE(review): ref_tp_id stays a str while ref_word_id is cast
                    # to int — confirm transkription position ids are str, otherwise
                    # the 'tp.id == ref_tp_id' comparison below never matches.
                    ref_tp_id = old_id_list[2]
                    ref_words = [ word for word in page.words if word.id == ref_word_id ]
                    if len(ref_words) > 0:
                        ref_tps = [ tp for tp in ref_words[0].transkription_positions\
                                if tp.id == ref_tp_id ]
                        if len(ref_tps) > 0:
                            # move the transkription position from the source word
                            # to this word, updating it from the svg node
                            ref_words[0].transkription_positions.remove(ref_tps[0])
                            record_changes_to_transkription_position(extra_node,\
                                    ref_tps[0], transkription_field.xmin, transkription_field.ymin, namespaces=namespaces)
                            word.transkription_positions.append(ref_tps[0])
    for word in page.words:
        if word.has_mixed_status('text'):
            # word parts disagree on their text -> split into several words
            new_page_words += [ word for word in word.split_according_to_status('text') if word.text is not None and word.text != '' ]
        elif len(word.transkription_positions) > 0:
            # adopt the (first non-empty) text recorded on the positions
            new_text = [ tp.text for tp in word.transkription_positions if tp.text is not None and tp.text != '' ]
            if len(new_text) > 0:
                word.text = new_text[0]
            new_page_words.append(word)
    page.words = new_page_words
    page.update_and_attach_words2tree(update_function_on_word=update_transkription_position_ids)
    page.unlock()
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file,\
                script_name=__file__ + ' -> ' + inspect.currentframe().f_code.co_name, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return page
def record_changes_to_transkription_position(node, transkription_position, xmin=0.0, ymin=0.0, namespaces=None):
    """Record changes made to a svg rect/path node to transkription_position.

    Copies x/y (shifted by xmin/ymin), width and height from the node's
    attributes — each only if present and non-empty — and adopts the text of
    the node's title child, if any.
    """
    if namespaces is None:
        namespaces = { ('ns' if key is None else key): uri for key, uri in node.nsmap.items() }
    x_value = node.get('x')
    if x_value:
        transkription_position.left = float(x_value) - xmin
    y_value = node.get('y')
    if y_value:
        transkription_position.top = float(y_value) - ymin
    width_value = node.get('width')
    if width_value:
        transkription_position.width = float(width_value)
    height_value = node.get('height')
    if height_value:
        transkription_position.height = float(height_value)
    title_texts = node.xpath('./ns:title/text()', namespaces=namespaces)
    if title_texts:
        transkription_position.text = title_texts[0]
Index: svgscripts/convert_wordPositions.py
===================================================================
--- svgscripts/convert_wordPositions.py (revision 65)
+++ svgscripts/convert_wordPositions.py (revision 66)
@@ -1,372 +1,372 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import cairosvg
import getopt
from lxml.html import builder as E
from lxml.html import open_in_browser
import lxml
from os import sep, listdir, mkdir, path, remove
from os.path import exists, isfile, isdir, dirname
import re
import sys
from svgpathtools import svg_to_paths
import xml.etree.ElementTree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.transkriptionField import TranskriptionField
from datatypes.writing_process import WritingProcess
from datatypes.word import Word
-from myxmlwriter import write_pretty
+
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Converter:
    """The converter super class.

    Subclasses (e.g. SVGConverter, HTMLConverter) override ``convert`` to emit
    other formats; this base implementation writes plain text to stdout or to
    ``output_file``.
    """
    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        self.page = page                                           # datatypes.page.Page to convert
        self.non_testing = non_testing                             # False suppresses side effects (e.g. opening a browser)
        self.show_word_insertion_mark = show_word_insertion_mark   # render word insertion marks (used by HTMLConverter)
    def _get_transkription_positions(self, transkription_positions, stage_version=''):
        """Returns the transkription_positions of the indicated stage_version.

        stage_version formats: '' (all), 'N' (exactly writing process N),
        'N+' (N and later), 'N-M' (inclusive range).
        """
        convertable_transkription_positions = transkription_positions
        if stage_version != '':
            convertable_transkription_positions = []
            if re.match(r'^\d$', stage_version):
                writing_process_id = int(stage_version)
                for transkription_position in transkription_positions:
                    if transkription_position.writing_process_id == writing_process_id:
                        convertable_transkription_positions.append(transkription_position)
            elif re.match(r'^\d\+$', stage_version):
                # 'N+': N up to the last known writing process version
                version_range = [ *range(int(stage_version.replace('+','')), len(WritingProcess.VERSION_DESCRIPTION)) ]
                for transkription_position in transkription_positions:
                    if transkription_position.writing_process_id in version_range:
                        convertable_transkription_positions.append(transkription_position)
            elif re.match(r'^\d\-\d$', stage_version):
                # 'N-M': inclusive range
                start_stop = [ int(i) for i in re.split(r'-', stage_version) ]
                version_range = [ *range(start_stop[0], start_stop[1]+1) ]
                for transkription_position in transkription_positions:
                    if transkription_position.writing_process_id in version_range:
                        convertable_transkription_positions.append(transkription_position)
        return convertable_transkription_positions
    def _get_words(self, words, highlighted_words=None):
        """Return the words that will be highlighted.
        """
        return highlighted_words if highlighted_words is not None else words
    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Prints all words, grouped by manuscript line, to stdout or output_file.
        """
        first_word_of_line = None
        out = sys.stdout
        if output_file is not None:
            out = open(output_file, 'w')
        try:
            for word in self.page.words:
                if first_word_of_line is None or first_word_of_line.line_number != word.line_number:
                    out.write('\n')
                    first_word_of_line = word
                    # print line numbers for even lines only, pad odd lines
                    if word.line_number % 2 == 0:
                        out.write(str(word.line_number).zfill(2) + ' ')
                    else:
                        out.write(' ')
                if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0:
                    if word.text is not None:
                        out.write(word.text + ' ')
        finally:
            # BUGFIX: only close the stream we opened ourselves; the original
            # implementation unconditionally closed sys.stdout as well.
            if out is not sys.stdout:
                out.close()
    @classmethod
    def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False):
        """Returns a converter of type converter_type.

        [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None
        """
        cls_dict = { subclass.__name__: subclass for subclass in cls.__subclasses__() }
        cls_key = converter_type + 'Converter'
        if bool(cls_dict.get(cls_key)):
            return cls_dict.get(cls_key)(page, non_testing, show_word_insertion_mark)
        else:
            return Converter(page, non_testing, show_word_insertion_mark)
class SVGConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text.
    """
    BG_COLOR = 'yellow'   # default highlight color
    OPACITY = '0.2'       # default opacity (svg attribute value, hence str)
    def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY):
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
        self.bg_color = bg_color
        self.opacity = opacity
    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to SVG: copies the page's svg file and adds a
        'Transkription' group of semi-transparent rects, one per transkription
        position, each carrying the word text in a <title> child (tooltip).
        """
        title = self.page.title if(self.page.title is not None) else 'Test Page'
        title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
        transkription_field = TranskriptionField(self.page.svg_file)
        # re-register the source file's namespaces so xml.etree serializes the
        # svg namespace unprefixed and xlink with its usual prefix
        if bool(transkription_field.get_svg_attributes('xmlns')):
            ET.register_namespace('', transkription_field.get_svg_attributes('xmlns'))
        if bool(transkription_field.get_svg_attributes('xmlns:xlink')):
            ET.register_namespace('xlink', transkription_field.get_svg_attributes('xmlns:xlink'))
        svg_tree = ET.parse(self.page.svg_file)
        transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'})
        # alternate yellow/orange per word unless a custom bg_color was given
        colors = [ 'yellow', 'orange' ] if self.bg_color == self.BG_COLOR else [ self.bg_color ]
        if highlighted_words is not None:
            colors = ['yellow']
        else:
            highlighted_words = []
        color_index = 0
        for word in self.page.words:
            word_id = 'word_' + str(word.id)
            for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
                transkription_position_id = word_id + '_' + str(transkription_position.id)
                # highlighted words get the explicit bg_color
                color = colors[color_index] if word not in highlighted_words else self.bg_color
                rect_node = ET.SubElement(transkription_node, 'rect',\
                        attrib={'id': transkription_position_id, 'x': str(transkription_position.left + transkription_field.xmin),\
                        'y': str(transkription_position.top + transkription_field.ymin), 'width': str(transkription_position.width),\
                        'height': str(transkription_position.height), 'fill': color, 'opacity': self.opacity})
                if transkription_position.transform is not None:
                    # transformed positions: shift the transformation matrix by the
                    # transkription field offset and express x/y relative to it
                    matrix = transkription_position.transform.clone_transformation_matrix()
                    matrix.matrix[Matrix.XINDEX] = round(transkription_position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3)
                    matrix.matrix[Matrix.YINDEX] = round(transkription_position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3)
                    rect_node.set('transform', matrix.toString())
                    rect_node.set('x', str(round(transkription_position.left - transkription_position.transform.matrix[Matrix.XINDEX], 3)))
                    rect_node.set('y', str(round((transkription_position.height-1.5)*-1, 3)))
                ET.SubElement(rect_node, 'title').text = word.text  # shown as tooltip
            color_index = (color_index + 1) % len(colors)
        if output_file is not None:
            svg_tree.write(output_file)
class HTMLConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file.

    Words are rendered as absolutely positioned, semi-transparent <a> elements
    on top of the page's svg background image; hovering shows id/line/text.
    """
    CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; }
    .highlight1 { background-color: pink; opacity: 0.2; }
    .foreign { background-color: blue; opacity: 0.4; }
    .word-insertion-mark { background-color: orange; opacity: 0.2; }
    .deleted { background-color: grey; opacity: 0.2; }
    """
    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to HTML; opens the result in a browser unless testing.
        """
        title = self.page.title if(self.page.title is not None) else 'Test Page'
        title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
        if stage_version != '':
            title = title + ', Schreibstufe: ' + stage_version
        width = self.page.width
        height = self.page.height
        # page background: the svg file scaled to the page dimensions
        style_content = ' position: relative; width: {}px; height: {}px; background-image: url({}); background-size: {}px {}px '\
                .format(width, height, path.abspath(self.page.svg_file), width, height)
        style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS)
        head = E.HEAD(E.TITLE(title),E.META(charset='UTF-8'), style)
        transkription = E.DIV(id="transkription")
        counter = 0
        for word in self.page.words:
            highlight_class = 'highlight' + str(counter)\
                    if not word.deleted else 'deleted'
            # prefer the word's earlier version text, possibly from a word part
            earlier_text = '' if word.earlier_version is None else word.earlier_version.text
            if earlier_text == '' and len(word.word_parts) > 0:
                earlier_versions = [ word for word in word.word_parts if word.earlier_version is not None ]
                earlier_text = earlier_versions[0].text if len(earlier_versions) > 0 else ''
            if earlier_text != '':
                word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(word.id), str(word.line_number), earlier_text, word.text)
            else:
                word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text)
            for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
                self._append2transkription(transkription, highlight_class, word_title, transkription_position)
            for part_word in word.word_parts:
                for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version):
                    self._append2transkription(transkription, highlight_class, word_title, part_transkription_position)
            counter = (counter + 1) % 2
        word_insertion_mark_class = 'word-insertion-mark'
        counter = 0
        for mark_foreign_hands in self.page.mark_foreign_hands:
            highlight_class = 'foreign'
            # BUGFIX: the line number used to be read from the stale loop
            # variable 'word' of the preceding loop.
            title = 'id: {}/line: {}\n{} {}'.format(str(mark_foreign_hands.id), str(mark_foreign_hands.line_number),\
                    mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen)
            for transkription_position in mark_foreign_hands.transkription_positions:
                self._append2transkription(transkription, highlight_class, title, transkription_position)
        if self.show_word_insertion_mark:
            for word_insertion_mark in self.page.word_insertion_marks:
                wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
                style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                        word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
                link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content)
                transkription.append(link)
        html = E.HTML(head,E.BODY(transkription))
        bool(self.non_testing) and open_in_browser(html)
        if output_file is not None:
            with open(output_file, 'wb') as f:
                f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))
            # BUGFIX: removed the no-op 'f.closed' statement; the context
            # manager already closes the file.
    def _append2transkription(self, transkription, highlight_class, title, transkription_position):
        """Append an absolutely positioned link element for transkription_position to the transkription div.
        """
        style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                transkription_position.top, transkription_position.left, transkription_position.width, transkription_position.height)
        if transkription_position.transform is not None:
            style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString())
            # shift the transform origin left only when the rotated box would
            # otherwise start left of its css position
            transform_origin_x = (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1\
                    if (transkription_position.left-round(transkription_position.transform.getX(), 1))*-1 < 0 else 0
            style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height)
        link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content)
        transkription.append(link)
def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR):
    """Create a pdf file highlighting some words.

    Renders a temporary highlighted svg via create_svg_with_highlighted_words,
    converts it to pdf with cairosvg and removes the temporary svg afterwards.
    """
    if not pdf_file_name.endswith('pdf'):
        pdf_file_name += '.pdf'
    tmp_svg_file = pdf_file_name.replace('.pdf', '.svg')
    create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page,\
            svg_file_name=tmp_svg_file, highlighted_words=highlighted_words, bg_color=bg_color)
    if isfile(tmp_svg_file):
        cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
        remove(tmp_svg_file)
def create_svg_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, svg_file_name='output.svg', bg_color=SVGConverter.BG_COLOR):
    """Create a svg file highlighting some words.

    If no page is given it is loaded from xml_source_file; the '.svg' suffix
    is appended to svg_file_name when missing.
    """
    if page is None and xml_source_file is not None:
        page = Page(xml_source_file=xml_source_file)
    if not svg_file_name.endswith('svg'):
        svg_file_name += '.svg'
    svg_converter = SVGConverter(page, bg_color=bg_color)
    svg_converter.convert(output_file=svg_file_name, highlighted_words=highlighted_words)
def usage():
    """Print information on how to use the script (main's docstring).
    """
    sys.stdout.write('{}\n'.format(main.__doc__))
def main(argv):
    """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes.

    svgscripts/convert_wordPositions.py OPTIONS

    OPTIONS:
        -h|--help: show help
        -H|--HTML [default] convert to HTML test file
        -o|--output=outputFile save output to file outputFile
        -P|--PDF convert to PDF test file
        -S|--SVG convert to SVG test file
        -s|--svg=svgFile: svg web file
        -T|--TEXT convert to TEXT output
        -t|--testing execute in test mode, do not write to file or open browser
        -w|--word-insertion-mark show word insertion mark on HTML
        -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. }

    :return: exit code (int)
    """
    convert_to_type = None
    svg_file = None
    output_file = None
    non_testing = True
    show_word_insertion_mark = False
    page = None
    stage_version = ''
    try:
        opts, args = getopt.getopt(argv, "htHPSTws:o:v:", ["help", "testing", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-v', '--version'):
            # only accept 'N', 'N+' or 'N-M'
            if re.match(r'^(\d|\d\+|\d\-\d)$', arg):
                stage_version = arg
            else:
                raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg))
        elif opt in ('-w', '--word-insertion-mark'):
            show_word_insertion_mark = True
        elif opt in ('-P', '--PDF'):
            convert_to_type = 'PDF'
        elif opt in ('-S', '--SVG'):
            convert_to_type = 'SVG'
        elif opt in ('-T', '--TEXT'):
            convert_to_type = 'TEXT'
        elif opt in ('-H', '--HTML'):
            convert_to_type = 'HTML'
        elif opt in ('-t', '--testing'):
            non_testing = False
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-o', '--output'):
            output_file = arg
    if len(args) < 1:
        usage()
        return 2
    if convert_to_type is None:
        # derive the target type from the output file's extension, default HTML
        if output_file is not None and len(re.split(r'\.', output_file)) > 1:
            output_file_part_list = re.split(r'\.', output_file)
            convert_to_type = output_file_part_list[len(output_file_part_list)-1].upper()
        else:
            convert_to_type = 'HTML'
    for word_position_file in args:
        if not isfile(word_position_file):
            print("'{}' does not exist!".format(word_position_file))
            return 2
        if convert_to_type == 'PDF':
            if output_file is None:
                output_file = 'output.pdf'
            create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file)
        else:
            if svg_file is not None:
                if isfile(svg_file):
                    page = Page(xml_source_file=word_position_file, svg_file=svg_file)
                else:
                    # BUGFIX: report the missing svg file; the original printed
                    # word_position_file here, which does exist at this point.
                    print("'{}' does not exist!".format(svg_file))
                    return 2
            else:
                page = Page(xml_source_file=word_position_file)
                if page.svg_file is None:
                    print('Please specify a svg file!')
                    usage()
                    return 2
            converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark)
            converter.convert(output_file=output_file, stage_version=stage_version)
    return 0
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py (revision 65)
+++ svgscripts/extractWordPosition.py (revision 66)
@@ -1,588 +1,590 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract the position of the words in a svg file and write them to a xml file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
import inspect
import getopt
from lxml import etree as ET
from os import sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
from progress.bar import Bar
import re
import sys
import warnings
-from myxmlwriter import write_pretty
from datatypes.lineNumber import LineNumber
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.pdf import PDFText
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from datatypes.word_insertion_mark import WordInsertionMark
+sys.path.append('shared_util')
+from myxmlwriter import write_pretty
+
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Extractor:
"""
This class can be used to extract the word positions in a svg file and write it to a xml file.
Args:
[xml_dir (str): target directory]
[title (str): title of document]
[manuscript_file (str): xml file containing information about the archival unit to which the current page belongs]
[extract_transkription_field_only (Boolean): if true extract_word_position will extract word positions only that
are part of the transkription field.
"""
UNITTESTING = False
SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]
    def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False, compare2pdf=False):
        # Target directory for xml output: use the given dir (creating it if
        # necessary), else './xml' if that exists, else the current directory.
        if bool(xml_dir):
            self.xml_dir = xml_dir
            # side effect: create the directory if it does not exist yet
            not isdir(self.xml_dir) and mkdir(self.xml_dir)
        else:
            self.xml_dir = 'xml' if(isdir('xml')) else ''
        self.latest_status = None    # status message of the most recent extraction run
        self.compare2pdf = compare2pdf
        self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
        self.title = title
        self.manuscript_file = manuscript_file
        self.extract_transkription_field_only = extract_transkription_field_only
        self.manuscript_tree = None
        # Without a title: read it from the manuscript file, if one exists.
        if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
            self.manuscript_tree = ET.parse(self.manuscript_file)
            self.title = self.manuscript_tree.getroot().get('title')
        elif bool(self.manuscript_file):
            # NOTE(review): this branch is also reached when BOTH a title and an
            # existing manuscript_file are given — confirm that raising here is intended.
            raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
        elif bool(self.title):
            self.update_title_and_manuscript(self.title, False)
    def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None):
        """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).

        If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.
        A second split criterion is a large horizontal gap between a digit and a
        letter (line number followed by text). Splitting recurses into add_word
        for each fragment.

        :returns: the new word counter (int)
        """
        break_points = []
        if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
            for Sonderzeichen in self.SONDERZEICHEN_LIST:
                # flags per word part: is this part exactly the Sonderzeichen in one of the page's special classes?
                contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
                if True in contains_Sonderzeichen:
                    break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]]
                    # create a word insertion mark at each break point position
                    for sz_point in [i for i, e in break_points]:
                        wim_index = len(page.word_insertion_marks)
                        x = float(word_part_objs[sz_point]['x'])
                        y = float(word_part_objs[sz_point]['y'])
                        if page.svg_file is not None and isfile(page.svg_file) and transkription_field is not None:
                            svg_path_tree = ET.parse(page.svg_file)
                            # lxml maps the default namespace to key None; rename it to 'ns'
                            namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
                            xmin = transkription_field.xmin
                            ymin = transkription_field.ymin
                            wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\
                                    line_number=page.get_line_number(y-1), mark_type=Sonderzeichen)
                            page.word_insertion_marks.append(wim)
        if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points
            THRESHOLDX = 20 # Threshold between line number and text
            last_x = -1
            for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
                if(last_x > -1 and (x - last_x > THRESHOLDX)):
                    break_points.append((i, i))
                last_x = x
        if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
            from_index = 0
            for end_point, next_from_index in break_points:
                new_word_part_objs = word_part_objs[from_index:end_point]
                new_endX = word_part_objs[end_point]['x']
                from_index = next_from_index
                index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
            # trailing fragment after the last break point
            if from_index > 0 and from_index < len(word_part_objs):
                new_word_part_objs = word_part_objs[from_index:]
                index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
            return index
        else:
            if len(word_part_objs) > 0:
                transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
                        debug_msg_string=debug_msg, transkription_field=transkription_field)
                text = self.get_word_from_part_obj(word_part_objs)
                line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
                if line_number == -1:
                    # no line matched: infer the line from the previous word —
                    # same line if this word continues to the right at roughly
                    # the same bottom, next line otherwise
                    if len(page.words) > 0:
                        lastWord = page.words[len(page.words)-1]
                        lastWord_lastTP = lastWord.transkription_positions[len(lastWord.transkription_positions)-1]
                        lastTP = transkription_positions[len(transkription_positions)-1]
                        if transkription_positions[0].left > lastWord_lastTP.left\
                                and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2:
                            line_number = lastWord.line_number
                        else:
                            line_number = lastWord.line_number+1
                newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
                page.words.append(newWord)
                return int(index) + 1
            else:
                return int(index)
def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default'):
"""Extracts information about positions of text elements and writes them to a xml file.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
exit_status = 0
with warnings.catch_warnings(record=record_warnings) as w:
warnings.simplefilter(warning_filter)
page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile)
status_message = 'OK'
if w is not None and len(w) > 0:
status_message = 'with warnings'
if True in [ str(warn.message).startswith(Page.WARNING_MISSING_USE_NODE4PWP) for warn in w ]:
status_message += ':{}:'.format(Page.WARNING_MISSING_USE_NODE4PWP.lower())
if True in [ str(warn.message).startswith(Page.WARNING_MISSING_GLYPH_ID4WIM) for warn in w ]:
status_message += ':{}:'.format(Page.WARNING_MISSING_GLYPH_ID4WIM.lower())
self.latest_status = status_message
exit_status = 1
else:
self.latest_status = None
page.page_tree.getroot().set('status', status_message)
write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type='svgWordPosition')
return exit_status
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None):
"""Extracts information about positions of text elements.
[:returns:] (datatypes.page) the Page containing all information.
"""
if isfile(file_name):
if not bool(xml_target_file):
xml_target_file = self.get_file_name(file_name, page_number)
if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)):
xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
transkription_field = TranskriptionField(file_name) if bool(self.extract_transkription_field_only) else None
svg_tree = ET.parse(file_name)
page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\
svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only)
page.add_source(file_name)
sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot())
page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
if transkription_field is not None:
page.init_line_numbers(self.extract_line_numbers(svg_tree, transkription_field), transkription_field.ymax)
self.extract_word_position(svg_tree, page, transkription_field=transkription_field)
#if page.pdfFile is not None and isfile(page.pdfFile):
# pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST)
# pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field, split_wrongly_concatenated_words=self.compare2pdf)
page.create_writing_processes_and_attach2tree()
#page.categorize_paths(transkription_field=transkription_field)
page.update_and_attach_words2tree()
for word_insertion_mark in page.word_insertion_marks:
# it is not clear if we really need to know this alternative word ordering. See 'TODO.md'
#word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark)
word_insertion_mark.attach_object_to_tree(page.page_tree)
return page
else:
raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
def extract_line_numbers(self, svg_tree, transkription_field):
"""Extracts line numbers and write them to a xml file.
"""
nodes_near_tf = [ item for item in filter(lambda x: Matrix.IS_NEARX_TRANSKRIPTION_FIELD(x.get('transform'), transkription_field),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
line_numbers = [ LineNumber(raw_text_node=item, transkription_field=transkription_field)\
for item in filter(lambda x: LineNumber.IS_A_LINE_NUMBER(x), nodes_near_tf)]
if len(line_numbers) > 0:
MINABOVE = 3
last_to_position = transkription_field.ymin
for line_number in line_numbers:
above_current_line_bottom = line_number.bottom + transkription_field.ymin - MINABOVE
bottoms = self.get_bottoms(svg_tree.getroot(), from_position=last_to_position, to_position=above_current_line_bottom)
last_to_position = above_current_line_bottom
if len(bottoms) > 0:
current_line_top = float(bottoms[len(bottoms)-1]) - transkription_field.ymin + MINABOVE
line_number.setTop(current_line_top)
return line_numbers
def extract_word_position(self, svg_tree, page, transkription_field=None):
"""Extracts word positions.
"""
counter = 0
word_part_obj = []
endSign = '%'
last_matrix = None
MAXBOTTOMDIFF = 5
MAXXDIFF = 6
if not Extractor.UNITTESTING:
bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)]))
for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
# check for line breaks
if (last_matrix is not None and len(word_part_obj) > 0 and (\
Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
(abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
endSign = '%'
if(self.get_word_from_part_obj(word_part_obj) != ''):
debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
word_part_obj = []
endX = current_matrix.getX()
if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: