py2ttl_data.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, Jul 27, 20:33

py2ttl_data.py
View Options

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	""" This program can be used to convert py objects to data in turtle format.
	"""
	# Copyright (C) University of Basel 2019 {{{1
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}

	from colorama import Fore, Style
	import getopt
	import lxml.etree as ET
	from os import sep, path, listdir
	from os.path import isfile, isdir, dirname, basename, getmtime
	from progress.bar import Bar
	import re
	import sys

	sys.path.append('svgscripts')
	from datatypes.archival_manuscript import ArchivalManuscriptUnity
	from datatypes.super_page import SuperPage

	if dirname(__file__) not in sys.path:
	sys.path.append(dirname(__file__))

	from class_spec import SemanticClass
	from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL
	from data_handler import RDFDataHandler

	sys.path.append('shared_util')
	from myxmlwriter import xml2dict

	# Module metadata: authorship, contact, development status, license and version.
	__author__ = "Christian Steiner"
	__maintainer__ = __author__
	__copyright__ = 'University of Basel'
	__email__ = "christian.steiner@unibas.ch"
	__status__ = "Development"
	__license__ = "GPL v3"
	__version__ = "0.0.1"


	class Py2TTLDataConverter:
	    """This class can be used to convert py objects to rdf data in turtle format.

	    The mapping information comes either from a mapping dictionary that is
	    passed directly or from a xml dictionary file that is read with xml2dict.
	    """
	    # Set to True in order to suppress the progress output on stdout.
	    UNITTESTING = False

	    def __init__(self, manuscript_file, xml_dictionary_file=None, mapping_dictionary=None):
	        """Initialize the converter.

	        :param manuscript_file: manuscript file that is converted by convert().
	        :param xml_dictionary_file: xml file from which the mapping dictionary is
	            read; only used if no mapping_dictionary is passed.
	        :param mapping_dictionary: mapping dictionary; takes precedence over
	            xml_dictionary_file.
	        :raises ValueError: if neither xml_dictionary_file nor mapping_dictionary
	            is passed.
	        """
	        verbose = not Py2TTLDataConverter.UNITTESTING
	        if mapping_dictionary is not None:
	            self.mapping_dictionary = mapping_dictionary
	        elif xml_dictionary_file is not None:
	            if verbose:
	                print(Fore.CYAN + 'initializing mapping dictionary from file "{}" ...'.format(xml_dictionary_file))
	            self.mapping_dictionary = xml2dict(xml_dictionary_file)
	            if verbose:
	                print(Fore.GREEN + '[{} classes added]'.format(len(self.mapping_dictionary['classes'])))
	        else:
	            # ValueError is a subclass of Exception, so callers catching Exception still work.
	            raise ValueError('Error: Py2TTLDataConverter init expects either a xml_dictionary_file or a mapping_dictionary!')
	        self.manuscript_file = manuscript_file

	    def convert(self, page_status_list=None, create_or_update_changed_pages=False):
	        """Convert manuscript instantiated with manuscript_file to rdf data and write it to a ttl file.

	        The name of the target file is the manuscript title (spaces replaced by '_')
	        followed by '_DATA.ttl'; '_INCLUDE' is inserted before '_DATA.ttl' if 'OK' is
	        the only status in page_status_list.

	        :param page_status_list: list of page status that should be included;
	            defaults to ['OK', SuperPage.STATUS_MERGED_OK] if None or empty.
	        :param create_or_update_changed_pages: if true, 'Page' is passed as
	            skip_data_instance_list for the manuscript data and each page whose
	            xml file is newer than its ttl file (or that has no ttl file yet) is
	            written to a ttl file of its own.
	        """
	        verbose = not Py2TTLDataConverter.UNITTESTING
	        if not page_status_list:
	            page_status_list = ['OK', SuperPage.STATUS_MERGED_OK]
	        if verbose:
	            print(Fore.CYAN + 'initializing python objects with file "{}" ...'.format(self.manuscript_file))
	        manuscript = ArchivalManuscriptUnity.create_cls(self.manuscript_file, page_status_list=page_status_list, update_page_styles=True)
	        include_tag = '_INCLUDE' if len(page_status_list) == 1 and 'OK' in page_status_list else ''
	        target_data_file = manuscript.title.replace(' ', '_') + include_tag + '_DATA.ttl'
	        # Use the same truthiness test as the page branch below: pages are only
	        # skipped here if they are also written to separate files.
	        skip_list = ['Page'] if create_or_update_changed_pages else None
	        data_handler = RDFDataHandler(target_data_file, self.mapping_dictionary)
	        identifier_uri = data_handler.add_data(manuscript, '', skip_data_instance_list=skip_list)
	        if create_or_update_changed_pages:
	            counter = self._write_changed_pages(manuscript, identifier_uri)
	            if verbose:
	                print(Fore.GREEN + f'[{counter} pages created/updated]')
	        elif verbose:
	            page_count = sum(1 for page in manuscript.pages if 'xml_file' in vars(page))
	            print(Fore.GREEN + '[{} pages added]'.format(page_count))
	        if verbose:
	            print(Fore.CYAN + 'adding triples to rdf graph ... ')
	            print(Fore.GREEN + '[{} statements added]'.format(len(data_handler.data_graph)))
	            print(Fore.CYAN + 'writing graph to file "{}" ...'.format(target_data_file))
	        data_handler.write()
	        if verbose:
	            print(Fore.GREEN + '[OK]')
	            print(Style.RESET_ALL)

	    def _write_changed_pages(self, manuscript, identifier_uri):
	        """Write each page with a missing or outdated ttl file to its own ttl file.

	        :param manuscript: the manuscript whose pages are processed.
	        :param identifier_uri: uri returned by add_data for the manuscript; the
	            part after '#' is passed on to add_data for each page.
	        :return: number of created/updated page files (int)
	        """
	        counter = 0
	        for page in list(manuscript.pages):
	            if 'xml_file' not in vars(page):
	                #TODO: change xml_file to @output in manuscrit_tree
	                page.xml_file = manuscript.manuscript_tree.docinfo.URL.replace('.xml', '_') + 'page' + page.number + '.xml'
	            # Every occurrence of 'xml' in the path is replaced, not only the file extension.
	            target_page_file = page.xml_file.replace('xml', 'ttl')
	            if not isfile(page.xml_file):
	                continue
	            if isfile(target_page_file) and getmtime(page.xml_file) <= getmtime(target_page_file):
	                # The ttl file is up to date.
	                continue
	            counter += 1
	            page_data_handler = RDFDataHandler(target_page_file, self.mapping_dictionary)
	            page_data_handler.add_data(page, identifier_uri.split('#')[1], parent_data_instance=manuscript)
	            page_data_handler.write()
	        return counter

	def usage():
	    """Print the docstring of main, which describes how to call the script.
	    """
	    help_text = main.__doc__
	    print(help_text)

	def main(argv):
	    """This program can be used to convert py objects to rdf data in turtle format.

	    py2ttl/py2ttl_data.py [OPTIONS] <manuscript.xml>

	    <manuscript.xml> xml file of type shared_util.myxmlwriter.FILE_TYPE_XML_MANUSCRIPT.

	    OPTIONS:
	    -h|--help: show help
	    -c|--create-or-update-pages create or update pages as seperate ttl files in dir 'ttl'
	    -i|--include-status=STATUS include pages with status = STATUS. STATUS is a ':' seperated string of status, e.g. 'OK:faksimile merged'.
	    -m|--mapping=mapping_dict.xml xml file generated by py2ttl/py2ttl.py containing mapping information for each property of a class.

	    :return: exit code (int)
	    """
	    # NOTE: the docstring above is printed verbatim by usage(), so it only
	    # contains what a user of the command line needs to know.
	    # argv is the list of command line arguments without the program name.
	    # Exit code 0 means success (or help shown), 2 means wrong options/arguments
	    # or a missing mapping/manuscript file.
	    check_config_files_exist()
	    # The default name of the mapping file is derived from the datatypes dir
	    # and the name of the autogenerated ontology file.
	    datatypes_dir = get_datatypes_dir()
	    target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME)
	    xml_dictionary_file = 'mapping_file4' + datatypes_dir.replace(sep, '.') + '2' + target_ontology_file.replace('.' + sep, '').replace(sep, '.').replace('.ttl', '.xml')
	    create_or_update_pages = False
	    page_status_list = None
	    try:
	        # 'm:'/'mapping=' must be declared here, otherwise getopt rejects the
	        # documented -m|--mapping option before it can be handled below.
	        opts, args = getopt.getopt(argv, "hci:Im:", ["help", "create-or-update-pages", "include-status=", "Include-files-only", "mapping="])
	    except getopt.GetoptError:
	        usage()
	        return 2
	    for opt, arg in opts:
	        if opt in ('-h', '--help'):
	            usage()
	            return 0
	        elif opt in ('-i', '--include-status'):
	            page_status_list = arg.split(':')
	        elif opt in ('-c', '--create-or-update-pages'):
	            create_or_update_pages = True
	        elif opt in ('-m', '--mapping'):
	            xml_dictionary_file = arg
	        # -I|--Include-files-only is still accepted but has no effect.
	    if not args:
	        usage()
	        return 2
	    manuscript_file = args[0]
	    if not isfile(xml_dictionary_file) or not isfile(manuscript_file):
	        usage()
	        return 2
	    converter = Py2TTLDataConverter(manuscript_file, xml_dictionary_file=xml_dictionary_file)
	    converter.convert(page_status_list=page_status_list, create_or_update_changed_pages=create_or_update_pages)
	    return 0


	if __name__ == "__main__":
	    # Run the converter with the command line arguments (program name
	    # stripped) and hand its return value to the shell as exit code.
	    exit_code = main(sys.argv[1:])
	    sys.exit(exit_code)

py2ttl_data.pyNo OneTemporaryActions

File Metadata

py2ttl_data.pyView Options

Event Timeline

py2ttl_data.py
No OneTemporary
Actions

py2ttl_data.py
View Options