Index: py2ttl/test_py2ttl.py =================================================================== --- py2ttl/test_py2ttl.py (revision 15) +++ py2ttl/test_py2ttl.py (revision 16) @@ -1,19 +1,21 @@ import unittest from os import sep, path import lxml.etree as ET import py2ttl +from py2ttl import Py2TTLConverter class TestPy2TTL(unittest.TestCase): def test_main(self): argv = ['-t', 'asdf'] self.assertEqual(py2ttl.main(argv), 0) def test_get_semantic_classes(self): - classes = py2ttl.get_semantic_classes('svgscripts/datatypes') + converter = Py2TTLConverter() + classes = converter.get_semantic_classes('svgscripts/datatypes') self.assertEqual('FaksimileImage' in [ name for name, cls in classes ], True) self.assertEqual('Image' in [ name for name, cls in classes ], True) self.assertEqual('SemanticClass' in [ name for name, cls in classes ], False) if __name__ == "__main__": unittest.main() Index: py2ttl/py2ttl.py =================================================================== --- py2ttl/py2ttl.py (revision 15) +++ py2ttl/py2ttl.py (revision 16) @@ -1,118 +1,133 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert py classes that are subclasses of DATATYPES_DIR.class_spec.SemanticClass to rdf. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import getopt import importlib import importlib.util import inspect -import pkgutil from os import sep, path, listdir from os.path import isfile, isdir, dirname, basename +from rdflib import Graph import re import sys +from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL, SHARED_ONTOLOGIES_DIR __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" -def get_semantic_classes(datatypes_dir): - """Returns a list of all classes that are contained in datatypes_dir that are subclasses of DATATYPES_DIR.class_spec.SemanticClass. - - :return: a list of (str_name, class) +class Py2TTLConverter: + """This class can be used to convert semantic_dictionaries to ttl.
""" - base_dir = dirname(dirname(__file__)) - sys.path.append(base_dir) - root_modul_name = datatypes_dir.replace('/','.') - reference_cls = importlib.import_module('{}.{}'.format(root_modul_name, 'class_spec')) - files = [ file.replace('.py','') for file in listdir(datatypes_dir) if file.endswith('.py') and not file.startswith('test_') and not file.startswith('_')] - all_modules = [] - for name in files: - all_modules.append(importlib.import_module('{}.{}'.format(root_modul_name, name))) - all_classes = [] - for modul in all_modules: - all_classes += inspect.getmembers(modul, inspect.isclass) - all_classes = sorted(set(all_classes)) - semantic_classes = [ (name, cls) for name, cls in all_classes if issubclass(cls, reference_cls.SemanticClass) and not (cls == reference_cls.SemanticClass)] - return semantic_classes - -def convert_py2ttl(datatypes_dir, target_dir): - """Convert all classes contained in datatypes_dir that are subclasses of svgscripts.datatypes.class.SemanticClass to rdf. - - :return: exit code (int) - """ - if isdir(datatypes_dir): - semantic_classes = get_semantic_classes(datatypes_dir) - semantic_dicts = [] - for name, cls in semantic_classes: - semantic_dicts.append(cls.get_semantic_dict()) - print(semantic_dicts) - else: - print('Error: dir {} does not exist!'.format(datatypes_dir)) - usage - return 1 - return 0 + def __init__(self, project_onotology_file=None): + self.project_graph = Graph() + if project_onotology_file is not None and isfile(project_onotology_file): + self.project_graph.parse(project_onotology_file, format="turtle") + + def get_semantic_classes(self, datatypes_dir): + """Returns a list of all classes that are contained in datatypes_dir that are subclasses of DATATYPES_DIR.class_spec.SemanticClass. + + :return: a list of (str_name, class) + """ + base_dir = dirname(dirname(__file__)) + sys.path.append(base_dir) + root_modul_name = datatypes_dir.replace('/','.') + reference_cls = importlib.import_module('{}.{}'.format(root_modul_name, 'class_spec')) + files = [ file.replace('.py','') for file in listdir(datatypes_dir) if file.endswith('.py') and not file.startswith('test_') and not file.startswith('_')] + all_modules = [] + for name in files: + all_modules.append(importlib.import_module('{}.{}'.format(root_modul_name, name))) + all_classes = [] + for modul in all_modules: + all_classes += inspect.getmembers(modul, inspect.isclass) + all_classes = sorted(set(all_classes)) + semantic_classes = [ (name, cls) for name, cls in all_classes if issubclass(cls, reference_cls.SemanticClass) and not (cls == reference_cls.SemanticClass)] + return semantic_classes + + def convert_py2ttl(self, datatypes_dir, target_ontology_file): + """Convert all classes contained in datatypes_dir that are subclasses of DATATYPES_DIR.class_spec.SemanticClass to rdf. + + :return: exit code (int) + """ + if isdir(datatypes_dir): + semantic_classes = self.get_semantic_classes(datatypes_dir) + semantic_dicts = [] + for name, cls in semantic_classes: + semantic_dicts.append(cls.get_semantic_dictionary()) + print(semantic_dicts) + else: + print('Error: dir {} does not exist!'.format(datatypes_dir)) + usage + return 1 + return 0 def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): - """This program can be used to convert py classes that are subclasses of svgscripts.datatypes.class.SemanticClass to rdf. + """This program can be used to convert py classes that are subclasses of .class_spec.SemanticClass to owl:Class. 
py2ttl/py2ttl.py [OPTIONS] - directory containing datatypes that are subclasses of svgscripts.datatypes.class.SemanticClass. - + directory containing datatypes that are subclasses of .class_spec.SemanticClass. + Overwrites DATATYPES_DIR in config.py. OPTIONS: -h|--help: show help + -s|--source=source_ontology_file source ontology ttl file, option overwrites PROJECT_ONTOLOGY_FILE in config.py + -t|--target=target_ontology_file target ontology ttl file :return: exit code (int) """ - datatypes_dir = 'svgscripts/datatypes' - target_dir = '.{}ontologies'.format(sep) + check_config_files_exist() + datatypes_dir = get_datatypes_dir() + source_ontology_file = PROJECT_ONTOLOGY_FILE + target_ontology_file = '.{0}{1}-onotology_autogenerated.ttl'.format(sep, PROJECT_NAME) try: - opts, args = getopt.getopt(argv, "ht:", ["help", "target="]) + opts, args = getopt.getopt(argv, "hs:t:", ["help","source=", "target="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-t', '--target'): - target_dir = arg - if len(args) < 1: - return convert_py2ttl(datatypes_dir, target_dir) + target_ontology_file = arg + elif opt in ('-s', '--source'): + source_ontology_file = arg + + converter = Py2TTLConverter(project_onotology_file=source_ontology_file) + if len(args) < 1 and datatypes_dir is not None: + return converter.convert_py2ttl(datatypes_dir, target_ontology_file) else: - result = 0 for datatypes_dir in args: - result = convert_py2ttl(datatypes_dir, target_dir) - if result > 0: - break - return result + if converter.convert_py2ttl(datatypes_dir, target_ontology_file) > 0: + return 2 + return 0 if len(args) > 1 else 2 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: py2ttl/test_config.py =================================================================== --- py2ttl/test_config.py (revision 0) +++ py2ttl/test_config.py (revision 16) @@ -0,0 +1,28 @@ +import unittest +from os import sep, path +import lxml.etree as ET + +import config + +class TestConfig(unittest.TestCase): + def test_check_config_files_exist(self): + self.assertEqual(config.check_config_files_exist(), 0) + dir_saved = config.__dict__.get('ONTOLOGY_DIR') + config.__dict__['ONTOLOGY_DIR'] = __file__ + with self.assertRaises(NotADirectoryError): + config.check_config_files_exist() + config.__dict__['ONTOLOGY_DIR'] = 'asf' + with self.assertRaises(FileNotFoundError): + config.check_config_files_exist() + config.__dict__['ONTOLOGY_DIR'] = dir_saved + + def test_get_datatypes_dir(self): + self.assertEqual(config.get_datatypes_dir(), config.DATATYPES_DIR.replace('./', '')) + dir_saved = config.__dict__.get('DATATYPES_DIR') + del config.__dict__['DATATYPES_DIR'] + self.assertEqual(config.get_datatypes_dir(), None) + config.__dict__['DATATYPES_DIR'] = dir_saved + + +if __name__ == "__main__": + unittest.main() Index: py2ttl/config.py =================================================================== --- py2ttl/config.py (revision 0) +++ py2ttl/config.py (revision 16) @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import getpass +from os.path import isfile, isdir, exists +import re + +PROJECT_NAME = 'tln' +PROJECT_URL = 'http://www.knora.org/ontology/0068/nietzsche' + +ONTOLOGY_DIR = './ontologies' if getpass.getuser() == 'knister0' else './ontologies' # local onotology dir, script will read only +SHARED_ONTOLOGIES_DIR = '{}/Ontologies-shared'.format(ONTOLOGY_DIR) +PROJECT_ONTOLOGY_FILE = 
'{}/project-ontologies/nietzsche-ontology.ttl'.format(ONTOLOGY_DIR) + +DATATYPES_DIR = './svgscripts/datatypes' # optional in config file, can be overwritten by passing a datatypes directory to py2ttl/py2ttl.py + +def check_config_files_exist(): + """Checks whether all files exist that are specified in this file by uppercase variables ending in 'DIR' or 'FILE'. + + :return: exit code (int) + """ + for key in [ key for key in globals().keys() if re.match(r'^[A-Z_-]+(DIR|FILE)$', key) ]: + if not exists(globals().get(key)): + raise FileNotFoundError('Key {} does not specify an existing file or directory'.format(key)) + if key.endswith('DIR') and not isdir(globals().get(key)): + raise NotADirectoryError('Key {} does not specify an existing directory'.format(key)) + return 0 + +def get_datatypes_dir(): + """Returns value of DATATYPES_DIR if set, else None. + """ + if 'DATATYPES_DIR' in globals().keys(): + return DATATYPES_DIR.replace('./','') + else: + return None Index: TODO.md =================================================================== --- TODO.md (revision 15) +++ TODO.md (revision 16) @@ -1,21 +1,24 @@ +# Script: +- write a script that creates a class from an equivalentClass, i.e. a class from the shared ontologies. + # Problems: - How to deal with word insertions in compound words? # TODO - make datatypes: - Page [ok] - Word [ok] --> deal with non-horizontal text <<<< DONE! --> add style info to word --> connect style with character glyph from svg path file - Style - WordPosition [ok] - TranskriptionPosition [ok] -->TODO: simplify by joining - FaksimilePosition [ok] - LineNumber [ok] - Marginalien - Freehand: - Deletion - WordInsertionMark [ok] ...reDO - Underline Index: .svn_ignore =================================================================== --- .svn_ignore (revision 15) +++ .svn_ignore (revision 16) @@ -1,4 +1,5 @@ svg pdf text_svg xml +ontologies Index: svgscripts/py2ttl.py =================================================================== --- svgscripts/py2ttl.py (revision 15) +++ svgscripts/py2ttl.py (revision 16) @@ -1,115 +1,115 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to convert xml data to rdf. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program.
If not, see 1}}} import re import getopt import sys from os import sep, listdir, mkdir, path from os.path import exists, isfile, isdir from rdflib import Graph from rdflib.namespace import Namespace import xml.etree.ElementTree as ET from convert_wordPositions import Converter from myxmlwriter import write_pretty from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class TurtleConverter(Converter): """This class can be used to convert a xml file to a ttl data file. It will also create the corresponding ontology. """ def __init__(self, page, non_testing=True, ontology_dir='.{}ontology'.format(sep), data_dir='.{}ttl_data'.format(sep)): super(TurtleConverter, self).__init__(page, non_testing) self.ontology_dir = ontology_dir #not isdir(self.ontology_dir) and mkdir(self.ontology_dir) self.data_dir = data_dir #not isdir(self.data_dir) and mkdir(self.data_dir) self.ontology_graph = Graph() def convert(self, output_file=None): """Converts Page to ttl. """ data = self.page.get_data_dictionary() dictionary = self.page.get_semantic_dictionary() - #print(data) + print(dictionary) def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to convert xml data to rdf. svgscripts/py2ttl.py [-h|--help, -o|--output=outputFile, -t|--testing] -h|--help: show help -o|--output=outputFile save output to file outputFile -t|--testing execute in test mode, do not write to file or open browser :return: exit code (int) """ output_file = None non_testing = True try: opts, args = getopt.getopt(argv, "hto:", ["help", "testing", "output="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 elif opt in ('-t', '--testing'): non_testing = False elif opt in ('-o', '--output'): output_file = arg if len(args) < 1: usage() return 2 for input_file in args: if not isfile(input_file): print("'{}' does not exist!".format(input_file)) return 2 else: page = Page(xml_source_file=input_file) converter = TurtleConverter(page, non_testing=non_testing) converter.convert(output_file=output_file) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/test_faksimile_image.py =================================================================== --- svgscripts/test_faksimile_image.py (revision 15) +++ svgscripts/test_faksimile_image.py (revision 16) @@ -1,56 +1,57 @@ import unittest from os import sep, path from os.path import isdir, dirname, basename import lxml.etree as ET import sys from datatypes.faksimile_image import FaksimileImage +from datatypes.image import Image class TestFaksimileImage(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' if not isdir(DATADIR): DATADIR = dirname(dirname(__file__)) + sep + 'test_data' self.svg_file = DATADIR + sep + 'W-II-1,49et50.svg' def test_init(self): image = FaksimileImage(file_name='test.jpg', height=10, width=10) self.assertEqual(image.tag, FaksimileImage.XML_TAG) self.assertEqual(image.width, 10) def test_attach_object_to_tree(self): image = FaksimileImage(file_name='test.jpg', height=10, width=10, x=-100, y=-200) empty_tree = ET.ElementTree(ET.Element('faksimile')) 
image.attach_object_to_tree(empty_tree) self.assertEqual(image.tag, FaksimileImage.XML_TAG) for node in empty_tree.getroot().xpath('//' + image.tag): self.assertEqual(node.get('file-name'), 'test.jpg') self.assertEqual(node.get('height'), '10') self.assertEqual(node.get('width'), '10') def test_CREATE_IMAGE(self): svg_tree = ET.parse(self.svg_file) image_node = svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap) image = FaksimileImage.CREATE_IMAGE(image_node) self.assertEqual(image.file_name, 'W-II-1,49et50.jpg') image_node = ET.Element('image') file_name = 'test.jpg' width = 10 height = 10 image_node.set('href', file_name) image_node.set('height', str(height)) image_node.set('width', str(width)) image = FaksimileImage.CREATE_IMAGE(image_node) self.assertEqual(image.height, height) self.assertEqual(image.width, width) self.assertEqual(image.file_name, file_name) def test_get_semantic_dict(self): image = FaksimileImage(file_name='test.jpg', URL=FaksimileImage.NIETZSCHE_SOURCES_URL + "N-II-1,2et3", height=10, width=10) self.assertEqual(FaksimileImage.get_semantic_dictionary()['class'].get('this'), FaksimileImage) - #print(FaksimileImage.get_semantic_dictionary()) + print(Image.get_semantic_dictionary()) if __name__ == "__main__": unittest.main() Index: svgscripts/datatypes/image.py =================================================================== --- svgscripts/datatypes/image.py (revision 15) +++ svgscripts/datatypes/image.py (revision 16) @@ -1,88 +1,88 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This super class can be used to represent all image types. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile from .attachable_object import AttachableObject from .class_spec import SemanticClass class Image(AttachableObject,SemanticClass): """ This super class represents all types of images. Args: file_name (str): name of the image file. node (lxml.etree.Element) node, containing information URL (str): URL of image file. 
height (float): height of image width (float): width of image """ stringKeys = [ 'file_name', 'URL' ] floatKeys = [ 'height', 'width' ] - RDF_SUBCLASSES = ['http://www.knora.org/ontology/0000/image#Image'] + RDF_SUBCLASSES = ['http://www.knora.org/ontology/shared/image#Image'] def __init__(self, node=None, file_name=None, URL=None, height=0.0, width=0.0, tag='image'): if node is not None: self.file_name = node.get('file-name') self.URL = node.get('absolute-path') self.height = float(node.get('height')) self.width = float(node.get('width')) else: self.tag = tag self.file_name = file_name self.URL = URL self.height = height self.width = width def attach_object_to_tree(self, target_tree): """Attach object to tree. """ obj_node = target_tree.getroot().find('.//' + self.tag) \ if(len(target_tree.getroot().findall('.//' + self.tag)) > 0) \ else ET.SubElement(target_tree.getroot(), self.tag) for key in self.floatKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) for key in self.stringKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), self.__dict__[key]) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {} properties.update(dict(zip(Image.floatKeys, [ (float, 1) for i in Image.floatKeys]))) properties.update(dict(zip(Image.stringKeys, [ (str, 1) for i in Image.stringKeys]))) dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary Index: svgscripts/datatypes/lineNumber.py =================================================================== --- svgscripts/datatypes/lineNumber.py (revision 15) +++ svgscripts/datatypes/lineNumber.py (revision 16) @@ -1,100 +1,100 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a line number. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import re from lxml import etree as ET from os.path import isfile from .class_spec import SemanticClass from .matrix import Matrix class LineNumber(SemanticClass): """ This class represents a line number. Args: file_name (str): name of the xml file to be instantiated. 
""" - RDF_SUBCLASSES = [ 'http://www.knora.org/ontology/0000/????'] + RDF_SUBCLASSES = [ 'http://www.knora.org/ontology/shared/????'] def __init__(self, id=0, bottom=0.0, top=0.0, raw_text_node=None, transkription_field=None, xml_text_node=None): self.id = id self.bottom = bottom self.top = top if xml_text_node is not None: self.id = int(xml_text_node.get('id')) self.bottom = float(xml_text_node.get('bottom')) self.top = float(xml_text_node.get('top')) if raw_text_node is not None and transkription_field is not None: matrix = Matrix(raw_text_node.get('transform'), transkription_field=transkription_field) self.bottom = matrix.getY() self.id = int(raw_text_node.text) if raw_text_node.text is not None\ else int(''.join([x.text for x in raw_text_node.findall('.//tspan', raw_text_node.nsmap)])) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'id': (int, 1), 'bottom': (float, 1), 'top': (float, 1)} dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary @staticmethod def XML_TAG(): """Returns the xml tag for this class. """ return 'line-number' @staticmethod def IS_A_LINE_NUMBER(raw_text_node): """Returns whether svg node contains a line number. """ if raw_text_node.text is not None: return bool(re.search(r'^[0-9]+$', raw_text_node.text)) elif len(raw_text_node.findall('.//tspan', raw_text_node.nsmap)) > 0: text = ''.join([x.text for x in raw_text_node.findall('.//tspan', raw_text_node.nsmap)]) return bool(re.search(r'^[0-9]+$', text)) return False def setTop(self, top): """Sets top position of line number. """ self.top = top def attach_object_to_tree(self, target_tree): """Attach object to tree. """ obj_node = target_tree.getroot().xpath('//' + LineNumber.XML_TAG() + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.getroot().xpath('//' + LineNumber.XML_TAG() + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree.getroot(), LineNumber.XML_TAG()) for key in self.__dict__.keys(): obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) Index: svgscripts/datatypes/positional_object.py =================================================================== --- svgscripts/datatypes/positional_object.py (revision 15) +++ svgscripts/datatypes/positional_object.py (revision 16) @@ -1,136 +1,136 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent an object with positional information. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from .matrix import Matrix from .attachable_object import AttachableObject from .class_spec import SemanticClass class PositionalObject(AttachableObject,SemanticClass): """ This (super) class represents an object with positional information. Args: id (int): object id matrix (datatypes.Matrix): matrix containing information about conversion. height (float): height of width (float): width of object x (float): x position of object y (float): y position of object """ - RDF_SUBCLASSES = ['http://www.knora.org/ontology/0000/????#PositionalObject'] + RDF_SUBCLASSES = ['http://www.knora.org/ontology/shared/????#PositionalObject'] floatKeys = [ 'height', 'width', 'left', 'top', 'bottom'] intKeys = [ ] stringKeys = ['id' ] def __init__(self, node=None, id=0, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, tag='positional-object'): self.floatKeys = [] self.floatKeys += PositionalObject.floatKeys self.intKeys = [] self.intKeys += PositionalObject.intKeys self.stringKeys = [] self.stringKeys += PositionalObject.stringKeys self.attachable_objects = [] if node is not None: self.id = str(node.get('id')) self.height = float(node.get('height')) self.width = float(node.get('width')) self.left = float(node.get('left')) self.top = float(node.get('top')) self.bottom = float(node.get('bottom')) self.transform = Matrix(node.get('transform')) if bool(node.get('transform')) else None self.tag = node.tag else: self.id = str(id) self.height = round(height, 3) self.width = round(width, 3) self.left = round(x, 3) self.top = round(y, 3) self.bottom = round(y + height, 3) self.transform = matrix self.tag = tag def attach_object_to_tree(self, target_tree): """Attach object to tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree, self.tag) for key in self.floatKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(round(self.__dict__[key], 3))) for key in self.intKeys + self.stringKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(self.__dict__[key])) if self.transform is not None and self.transform.isRotationMatrix(): obj_node.set('transform', self.transform.toString()) for attachable_object in self.attachable_objects: attachable_object.attach_object_to_tree(obj_node) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {} properties.update(dict(zip(cls.intKeys, [ (int, 1) for i in cls.intKeys]))) properties.update(dict(zip(cls.floatKeys, [ (float, 1) for i in cls.floatKeys]))) properties.update(dict(zip(cls.stringKeys, [ (str, 1) for i in cls.stringKeys]))) properties.update({'transform': (Matrix, 1)}) dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary @staticmethod def POSITIONS_OVERLAP_HORIZONTALLY(position_a, position_b): """Returns whether position a and b overlap horizontally. 
""" return (position_a.left < position_b.left+position_b.width)\ and (position_a.left+position_a.width > position_b.left) @staticmethod def POSITIONS_OVERLAP_VERTICALLY(position_a, position_b): """Returns whether position a and b overlap vertically. """ return (position_a.top < position_b.bottom)\ and (position_a.bottom > position_b.top) @staticmethod def POSITIONS_ARE_STACKED(position_a, position_b): """Returns whether position a and b are stacked, i.e. are above each other. """ return PositionalObject.POSITIONS_OVERLAP_HORIZONTALLY(position_a, position_b)\ and (not PositionalObject.POSITIONS_OVERLAP_VERTICALLY(position_a, position_b)\ or abs(position_a.top-position_b.top) > (position_a.height/4 + position_b.height/4)) Index: svgscripts/datatypes/class_spec.py =================================================================== --- svgscripts/datatypes/class_spec.py (revision 15) +++ svgscripts/datatypes/class_spec.py (revision 16) @@ -1,89 +1,89 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This is an abstract class for all classes that are semantically interesting. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import abc import inspect class SemanticClass(metaclass=abc.ABCMeta): """ This is an abstract class for all classes that are semantically interesting. """ SINGLE_VALUE = 1 LIST = 99 @classmethod def get_class_dictionary(cls): """Creates and returns a class_dictionary with the keys 'this' ['type']. """ class_dict = {'this': cls } if cls.__dict__.get('RDF_SUBCLASSES') and len(cls.RDF_SUBCLASSES) > 0: - class_dict.update({'rdf:subClassOf': cls.RDF_SUBCLASSES }) + class_dict.update({'owl:equivalentClass': cls.RDF_SUBCLASSES }) else: direct_super_class = inspect.getclasstree([cls],unique=True)[0][0] if issubclass(direct_super_class, SemanticClass) and direct_super_class != SemanticClass: class_dict.update({'type': direct_super_class}) return class_dict def get_data_dictionary(self): """Returns a data dictionary with the keys 'head' and 'body'. Key 'head' points to a dictionary with class information (key: 'class'). Key 'body' points to a dictionary with the data. 
""" data_dict = {} semantic_dict = self.get_semantic_dictionary() data_dict.update({'head': {'class': semantic_dict['class'].get('this')}}) body = {} for key, (datatype, cardinality) in semantic_dict['properties'].items(): if self.__dict__.get(key) is not None: if issubclass(datatype, SemanticClass): if cardinality > SemanticClass.SINGLE_VALUE: items = [] for item in self.__dict__.get(key): items.append(item.get_data_dictionary().get('body')) body.update({ key: items}) else: body.update({ key: self.__dict__.get(key).get_data_dictionary().get('body')}) else: body.update({ key: self.__dict__.get(key) }) data_dict.update({'body': body}) return data_dict @classmethod @abc.abstractmethod def get_semantic_dictionary(cls): """Creates a semantic dictionary with 'class' and 'properties' as its keys. The class-key points to a class_dictionary with the keys: 'this' ['type', 'rdf:subClassOf'] The properties-key points to a properties_dictionary with semantically relevant keys of self.__dict__ as keys, and tuples of datatype (class), cardinality (int) as its values. Cardinality can be SemanticClass.SINGLE_VALUE, 2, 3, ... SemanticClass.LIST. """ pass Index: svgscripts/datatypes/word.py =================================================================== --- svgscripts/datatypes/word.py (revision 15) +++ svgscripts/datatypes/word.py (revision 16) @@ -1,214 +1,214 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import warnings from .class_spec import SemanticClass from .matrix import Matrix from .word_position import WordPosition from .transkription_position import TranskriptionPosition class Word(SemanticClass): """ This class represents a word. """ DATA = 'debug-data' - RDF_SUBCLASSES = ['http://www.knora.org/ontology/0000/????#Word'] + RDF_SUBCLASSES = ['http://www.knora.org/ontology/shared/????#Word'] def __init__(self, id=0, text='', line_number=-1, transkription_positions=[], faksimile_positions=[], word_part_objs=[]): self.id = id self.text = text self.line_number = line_number self.transkription_positions = transkription_positions self.faksimile_positions = faksimile_positions self.word_part_objs = word_part_objs self.is_head_of_inserted_words = False self.is_tail_of_inserted_words = False self.is_before_inserted_words = False self.is_after_inserted_words = False self.word_insertion_mark = None self.debug_msg = None @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. 
""" dictionary = {} class_dict = cls.get_class_dictionary() properties = {'id': (int, 1), 'text': (str, 1), 'line_number': (int, 1),\ 'transkription_positions': (TranskriptionPosition, SemanticClass.LIST), 'faksimile_positions': (WordPosition, SemanticClass.LIST)} dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. """ word_node = target_tree.getroot().xpath('//word[@id="%s"]' % self.id)[0] \ if(len(target_tree.getroot().xpath('//word[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree.getroot(), 'word', attrib={'id': str(self.id)}) word_node.set('text', self.text) if self.line_number > -1: word_node.set('line-number', str(self.line_number)) for transkription_position in self.transkription_positions: transkription_position.attach_object_to_tree(word_node) """ data_node = word_node.find(self.DATA) if bool(word_node.find(self.DATA)) else ET.SubElement(word_node, self.DATA) for part_index, word_part in enumerate(self.word_part_objs): part_node = data_node.xpath('./part[@index="%s"]' % part_index)[0] \ if(len(data_node.xpath('./part[@index="%s"]' % part_index)) > 0) \ else ET.SubElement(data_node, 'part', attrib={'index': str(part_index)}) part_node.set('text', word_part['text']) part_node.set('class', word_part['class']) part_node.set('x', str(round(float(word_part['x']), 3))) part_node.set('y', str(round(float(word_part['y']), 3))) if self.debug_msg is not None: ET.SubElement(data_node, 'end', attrib={'debug-msg': self.debug_msg}) """ def split(self, page, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. 
""" previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, next_pwps, debug_msg_string='word.split') next_text = ''.join([ pwp.text for pwp in next_pwps ]) nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) return previousWord, currentWord, nextWord def join(self, other_word, append_at_end_of_new_word=True): """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. """ if append_at_end_of_new_word: self.text = self.text + other_word.text for position in other_word.transkription_positions: position.id = str(len(self.transkription_positions)) self.transkription_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. 
[:return:] Word """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number text = word_node.get('text') transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ] word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\ if len(word_node.findall('.//' + Word.DATA)) > 0\ else [ item.attrib for item in word_node.findall('.//part')] return Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions, word_part_objs=word_part_objs) elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file WIDTH = 5 TOPCORRECTION = 2.0 FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize height = height x = round(float(word_part_objs[0]['x']), 3) if(page is not None and bool(page.style_dict)): HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' ')) biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set) height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3) TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size if endSign is not None and '%' in endSign: lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\ for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\ if bool(page.style_dict[key].get('font-size'))] lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1 endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR elif endSign is not None and '%' in endSign: endX = float(endX) + WIDTH bottom = round(float(word_part_objs[0]['y']), 3) y = round(bottom - height + TOPCORRECTION, 3) width = round(float(endX) - x, 3) transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ] text = ''.join([ dict['text'] for dict in word_part_objs]) line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs) word.debug_msg = debug_msg return word else: error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty' raise Exception('Error: {}'.format(error_msg)) Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 15) +++ svgscripts/datatypes/page.py (revision 16) @@ -1,236 +1,236 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile #from myxmlwriter import write_pretty from .class_spec import SemanticClass from .image import Image from .word import Word from .lineNumber import LineNumber from .word_insertion_mark import WordInsertionMark from .transkriptionField import TranskriptionField class Page(SemanticClass): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. """ - RDF_SUBCLASSES = ['http://www.knora.org/ontology/0000/information-carrier#Page'] + RDF_SUBCLASSES = ['http://www.knora.org/ontology/shared/information-carrier#Page'] def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, pdfFile=None, svg_file=None, extract_transkription_field_only=False): self.title = title self.line_numbers = [] self.style_dict = {} self.sonderzeichen_list = [] self.svg_file = None self.pdfFile = None self.source = None self.number = int(page_number) if page_number is not None else -1 if xml_source_file is not None: if isfile(xml_source_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_source_file, parser) self.title = self.page_tree.getroot().get('title') self.number = self.page_tree.getroot().get('number') self.source = self.page_tree.getroot().get('source') self.init_words() self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None if pdfFile is not None and self.pdfFile is None: self.pdfFile = pdfFile ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) #write_pretty(xml_element_tree=self.page_tree, file_name=xml_source_file, script_name=__file__, file_type='svgWordPosition') if svg_file is not None and self.svg_file is None: self.svg_file = svg_file tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) #write_pretty(xml_element_tree=self.page_tree, file_name=xml_source_file, script_name=__file__, file_type='svgWordPosition') else: raise Exception('File "{}" does not exist!'.format(xml_source_file)) elif xml_target_file is not None: self.word_insertion_marks = [] self.words = [] self.svg_file = svg_file self.pdfFile = pdfFile if isfile(xml_target_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_target_file, parser) self.source = self.page_tree.getroot().get('source') if bool(self.page_tree.getroot().get('title')): self.title = self.page_tree.getroot().get('title') elif 
title is not None: self.page_tree.getroot().set('title', title) if self.svg_file is None: self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 elif len(self.page_tree.xpath('.//svg/@file')) == 0: tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) else: self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 if self.pdfFile is None: self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None elif len(self.page_tree.xpath('.//pdf/@file')) == 0: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG() ]: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) else: self.page_tree = ET.ElementTree(ET.Element('page')) self.pdfFile = pdfFile self.svg_file = svg_file if title is not None: self.page_tree.getroot().set('title', title) self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower()) if page_number is not None: self.page_tree.getroot().set('number', str(page_number)) if self.pdfFile is not None: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) if self.svg_file is not None: tf = TranskriptionField(self.svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) self.svg_image = Image(file_name=self.svg_file, height=self.height, width=self.width) if self.svg_file is not None\ else None def init_line_numbers(self, line_numbers, document_bottom): """Init line numbers. 
""" even_index = 0 MINABOVE = 1 self.line_numbers = [] if len(line_numbers) > 0: first_line_bottom = line_numbers[even_index].top - MINABOVE self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 while even_index < len(line_numbers): self.line_numbers.append(LineNumber(id=line_numbers[even_index].id-1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=line_numbers[even_index].top-MINABOVE)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 self.line_numbers.append(LineNumber(id=line_numbers[even_index-1].id+1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=document_bottom)) for line_number in self.line_numbers: line_number.attach_object_to_tree(self.page_tree) def init_words(self): self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG()) ] self.words = [ Word.CREATE_WORD(word_node=word_node) for word_node in self.page_tree.getroot().xpath('//word') ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG()) ] for index, word in enumerate(self.words): for word_insertion_mark in self.word_insertion_marks: self.words[index] = word_insertion_mark.attach_and_update_word_if_involved(word) if self.words[index] != word: break def add_style(self, sonderzeichen_list=[], letterspacing_list=[], style_dict={}, style_node=None): """Adds a list of classes that are sonderzeichen and a style dictionary to page. """ self.sonderzeichen_list = sonderzeichen_list self.letterspacing_list = letterspacing_list self.style_dict = style_dict if style_node is not None: self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') } self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ] self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('letterspacing-list')) ] elif bool(self.style_dict): style_node = ET.SubElement(self.page_tree.getroot(), 'style') if len(self.sonderzeichen_list) > 0: style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list)) if len(self.letterspacing_list) > 0: style_node.set('letterspacing-list', ' '.join(self.letterspacing_list)) for key in self.style_dict.keys(): self.style_dict[key]['name'] = key ET.SubElement(style_node, 'class', attrib=self.style_dict[key]) def get_biggest_fontSize4styles(self, style_set={}): """Returns biggest font size from style_dict for a set of style class names. [:returns:] (float) biggest font size OR 1 if style_dict is empty """ if bool(self.style_dict): sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True) return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1 else: return 1 def get_line_number(self, y): """Returns line number id for element at y. 
[:return:] (int) line number id or -1 """ if len(self.line_numbers) > 0: result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ] return result_list[0] if len(result_list) > 0 else -1 else: return -1 @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {'title': (str, 1), 'number': (str, 1), 'line_numbers': (LineNumber, SemanticClass.LIST), 'words': (Word, SemanticClass.LIST),\ 'svg_image': (Image, 1), 'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST)} dictionary.update({'class': class_dict}) dictionary.update({'properties': properties}) return dictionary Index: . =================================================================== --- . (revision 15) +++ . (revision 16) Property changes on: . ___________________________________________________________________ Modified: svn:ignore ## -2,3 +2,4 ## pdf text_svg xml +ontologies
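The py2ttl converter introduced in this revision works off the dictionaries returned by get_semantic_dictionary(), whose layout is specified in svgscripts/datatypes/class_spec.py above. As a reading aid only (not part of revision 16), the following Python sketch shows roughly what Image.get_semantic_dictionary() evaluates to after the RDF_SUBCLASSES change; key order and the inline comments are illustrative, and 'Image' stands for the class object imported from svgscripts/datatypes/image.py.

    # Approximate return value of Image.get_semantic_dictionary() (sketch only):
    {
        'class': {
            'this': Image,   # the Python class itself
            'owl:equivalentClass': ['http://www.knora.org/ontology/shared/image#Image']
        },
        'properties': {
            'height': (float, 1),     # from Image.floatKeys
            'width': (float, 1),
            'file_name': (str, 1),    # from Image.stringKeys
            'URL': (str, 1)
        }
    }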
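In revision 16, Py2TTLConverter.convert_py2ttl() still only collects and prints these dictionaries; the rdflib Graph parsed from PROJECT_ONTOLOGY_FILE is not yet written back out. Below is a minimal, hypothetical sketch (not the project's implementation) of how one such dictionary could be mapped to owl:Class and owl:DatatypeProperty triples and serialized to an autogenerated ttl file; the '#' namespace suffix, the property-naming scheme, the XSD range mapping and the output file name are all assumptions.

    # Hypothetical sketch only -- revision 16 just prints the semantic dictionaries.
    from rdflib import Graph, Namespace, URIRef
    from rdflib.namespace import OWL, RDF, RDFS, XSD

    XSD_RANGES = {float: XSD.float, int: XSD.integer, str: XSD.string}  # assumed mapping

    def add_semantic_class(graph, project_ns, semantic_dict):
        """Add one semantic dictionary as an owl:Class with datatype properties."""
        cls = semantic_dict['class']['this']          # the Python class, e.g. Image
        class_uri = project_ns[cls.__name__]
        graph.add((class_uri, RDF.type, OWL.Class))
        # get_class_dictionary() exposes RDF_SUBCLASSES as 'owl:equivalentClass' (see class_spec.py)
        for equivalent_class in semantic_dict['class'].get('owl:equivalentClass', []):
            graph.add((class_uri, OWL.equivalentClass, URIRef(equivalent_class)))
        for name, (datatype, cardinality) in semantic_dict['properties'].items():
            # property naming is an assumption; cardinality is ignored in this sketch
            property_uri = project_ns['has_' + name]
            graph.add((property_uri, RDF.type, OWL.DatatypeProperty))
            graph.add((property_uri, RDFS.domain, class_uri))
            if datatype in XSD_RANGES:
                graph.add((property_uri, RDFS.range, XSD_RANGES[datatype]))

    project_graph = Graph()
    project_ns = Namespace('http://www.knora.org/ontology/0068/nietzsche#')  # PROJECT_URL + '#'
    project_graph.bind('tln', project_ns)                                     # PROJECT_NAME
    # Inside the repository one could feed it the real dictionaries, e.g.:
    # for name, cls in converter.get_semantic_classes(get_datatypes_dir()):
    #     add_semantic_class(project_graph, project_ns, cls.get_semantic_dictionary())
    project_graph.serialize(destination='tln-ontology_autogenerated.ttl', format='turtle')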