Page Menu | Home | c4science

py2ttl_data.py
No One | Temporary

File Metadata

Created
Thu, May 2, 20:55

py2ttl_data.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert py objects to data in turtle format.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
from os import sep, path, listdir
from os.path import isfile, isdir, dirname, basename, getmtime
from progress.bar import Bar
import re
import sys
sys.path.append('svgscripts')
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.super_page import SuperPage
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from class_spec import SemanticClass
from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL
from data_handler import RDFDataHandler
sys.path.append('shared_util')
from myxmlwriter import xml2dict
# Module metadata: authorship, licence and version of this script.
__author__ = "Christian Steiner"
# The maintainer is the same person as the author.
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Py2TTLDataConverter:
    """This class can be used to convert py objects to rdf data in turtle format.

    Progress messages are printed to stdout unless the class attribute
    ``UNITTESTING`` is set to ``True``.
    """
    UNITTESTING = False

    def __init__(self, manuscript_file, xml_dictionary_file=None, mapping_dictionary=None):
        """Store the manuscript file and set up the mapping dictionary.

        :param manuscript_file: path of the manuscript xml file to convert.
        :param xml_dictionary_file: xml file from which the mapping dictionary
            is read with ``xml2dict``; only used if ``mapping_dictionary`` is None.
        :param mapping_dictionary: ready-made mapping dictionary; takes
            precedence over ``xml_dictionary_file``.
        :raises ValueError: if neither ``xml_dictionary_file`` nor
            ``mapping_dictionary`` is given.
        """
        verbose = not Py2TTLDataConverter.UNITTESTING
        if mapping_dictionary is not None:
            self.mapping_dictionary = mapping_dictionary
        elif xml_dictionary_file is not None:
            if verbose:
                print(Fore.CYAN + 'initializing mapping dictionary from file "{}" ...'.format(xml_dictionary_file))
            self.mapping_dictionary = xml2dict(xml_dictionary_file)
            if verbose:
                print(Fore.GREEN + '[{} classes added]'.format(len(self.mapping_dictionary['classes'])))
        else:
            # ValueError is a subclass of Exception, so callers that catch
            # Exception keep working.
            raise ValueError('Error: Py2TTLDataConverter init expects either a xml_dictionary_file or a mapping_dictionary!')
        self.manuscript_file = manuscript_file

    def convert(self, page_status_list=None, create_or_update_changed_pages=False):
        """Convert manuscript instantiated with manuscript_file to rdf data and write it to a ttl file.

        The name of the target file is the manuscript title (blanks replaced
        by '_'), followed by '_INCLUDE' if 'OK' is the only requested status,
        followed by '_DATA.ttl'.

        :param page_status_list: list of page status values that should be
            included; ``None`` or an empty list selects
            ``['OK', SuperPage.STATUS_MERGED_OK]``.
        :param create_or_update_changed_pages: if true, 'Page' instances are
            skipped in the manuscript data and each page is written to its own
            ttl file whenever that file is missing or older than the page's
            xml file.
        """
        verbose = not Py2TTLDataConverter.UNITTESTING
        if not page_status_list:
            page_status_list = ['OK', SuperPage.STATUS_MERGED_OK]
        if verbose:
            print(Fore.CYAN + 'initializing python objects with file "{}" ...'.format(self.manuscript_file))
        manuscript = ArchivalManuscriptUnity.create_cls(self.manuscript_file, page_status_list=page_status_list, update_page_styles=True)
        include_tag = '_INCLUDE' if 'OK' in page_status_list and len(page_status_list) == 1 else ''
        target_data_file = manuscript.title.replace(' ', '_') + include_tag + '_DATA.ttl'
        # The flag is evaluated by truthiness here AND below, so that pages are
        # never skipped in the manuscript data without also being written to
        # their own files (the former `is False` test disagreed for None/0).
        skip_list = ['Page'] if create_or_update_changed_pages else None
        data_handler = RDFDataHandler(target_data_file, self.mapping_dictionary)
        identifier_uri = data_handler.add_data(manuscript, '', skip_data_instance_list=skip_list)
        if create_or_update_changed_pages:
            counter = self._write_changed_pages(manuscript, identifier_uri)
            if verbose:
                print(Fore.GREEN + f'[{counter} pages created/updated]')
        elif verbose:
            page_count = sum(1 for page in manuscript.pages if 'xml_file' in page.__dict__)
            print(Fore.GREEN + '[{} pages added]'.format(page_count))
        if verbose:
            print(Fore.CYAN + 'adding triples to rdf graph ... ')
            print(Fore.GREEN + '[{} statements added]'.format(len(data_handler.data_graph)))
            print(Fore.CYAN + 'writing graph to file "{}" ...'.format(target_data_file))
        data_handler.write()
        if verbose:
            print(Fore.GREEN + '[OK]')
            print(Style.RESET_ALL)

    def _write_changed_pages(self, manuscript, identifier_uri):
        """Write every new or changed page to its own ttl file and return the number of pages written.

        :param manuscript: the manuscript instance whose pages are processed.
        :param identifier_uri: uri returned by adding the manuscript to the
            data handler; the part after '#' is passed on as identifier.
        :return: number of created or updated page files (int)
        """
        counter = 0
        # Iterate over a snapshot of the page list, as the original code did.
        for page in list(manuscript.pages):
            if 'xml_file' not in page.__dict__:
                # TODO: change xml_file to @output in manuscript_tree
                page.xml_file = manuscript.manuscript_tree.docinfo.URL.replace('.xml', '_') + 'page' + page.number + '.xml'
            # str.replace substitutes EVERY occurrence of 'xml' in the path,
            # i.e. the extension as well as any directory named 'xml'.
            target_page_file = page.xml_file.replace('xml', 'ttl')
            if isfile(page.xml_file)\
                    and (not isfile(target_page_file) or getmtime(page.xml_file) > getmtime(target_page_file)):
                counter += 1
                page_data_handler = RDFDataHandler(target_page_file, self.mapping_dictionary)
                page_data_handler.add_data(page, identifier_uri.split('#')[1], parent_data_instance=manuscript)
                page_data_handler.write()
        return counter
def usage():
    """Print the help text of this script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to convert py objects to rdf data in turtle format.

    py2ttl/py2ttl_data.py [OPTIONS] <manuscript.xml>

        <manuscript.xml> xml file of type shared_util.myxmlwriter.FILE_TYPE_XML_MANUSCRIPT.

        OPTIONS:
        -h|--help:                      show help
        -c|--create-or-update-pages     create or update pages as separate ttl files in dir 'ttl'
        -i|--include-status=STATUS      include pages with status = STATUS. STATUS is a ':' separated string of status, e.g. 'OK:faksimile merged'.
        -m|--mapping=mapping_dict.xml   xml file generated by py2ttl/py2ttl.py containing mapping information for each property of a class.

        :return: exit code (int)
    """
    # NOTE: the docstring above doubles as the usage text printed by usage(),
    # so it is kept free of developer-only documentation.
    # argv: the command line arguments without the program name.
    # Exit codes: 0 on success or help, 2 on a usage error (unknown option,
    # missing manuscript argument, mapping or manuscript file not found).
    check_config_files_exist()
    datatypes_dir = get_datatypes_dir()
    target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME)
    # Default mapping file; its name is derived from the datatypes dir and the
    # ontology file name and can be overridden with -m|--mapping.
    xml_dictionary_file = 'mapping_file4' + datatypes_dir.replace(sep, '.') + '2'\
        + target_ontology_file.replace('.' + sep, '').replace(sep, '.').replace('.ttl', '.xml')
    create_or_update_pages = False
    page_status_list = None

    try:
        # 'm:' / 'mapping=' were missing here, so the documented mapping
        # option was rejected by getopt. '-I' / '--Include-files-only' is
        # still accepted for backward compatibility; no handler for it is
        # visible in this function.
        opts, args = getopt.getopt(argv, "hci:m:I",
                                   ["help", "create-or-update-pages", "include-status=", "mapping=", "Include-files-only"])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-i', '--include-status'):
            page_status_list = arg.split(':')
        elif opt in ('-c', '--create-or-update-pages'):
            create_or_update_pages = True
        elif opt in ('-m', '--mapping'):
            xml_dictionary_file = arg

    if not args:
        usage()
        return 2

    manuscript_file = args[0]
    if not isfile(xml_dictionary_file) or not isfile(manuscript_file):
        usage()
        return 2

    converter = Py2TTLDataConverter(manuscript_file, xml_dictionary_file=xml_dictionary_file)
    converter.convert(page_status_list=page_status_list, create_or_update_changed_pages=create_or_update_pages)
    return 0
if __name__ == "__main__":
    # Run the converter and hand its exit code to the calling shell.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline