Page MenuHomec4science

create_manuscript.py
No OneTemporary

File Metadata

Created
Sat, Nov 9, 06:47

create_manuscript.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to create an ArchivalManuscriptUnity.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from colorama import Fore, Style
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname, basename
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.super_page import SuperPage
sys.path.append('shared_util')
from myxmlwriter import parse_xml_of_type, write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
UNITTESTING = False
class ManuscriptCreator:
    """Create or update the XML files of ArchivalManuscriptUnity manuscripts.
    """

    # The only predicate form that can be turned into a new element:
    # name[@attribute="value"], with either double or single quotes.
    # Groups: 1 = element name, 2 = attribute name, 3 = quote, 4 = value.
    _PREDICATE_XPATH = re.compile(r'''([a-z]+)\[@([a-z-]+)=(["'])(.+?)\3\]''')

    def __init__(self, xml_target_dir):
        """Store the directory that is used for manuscript files given without a directory part.

        :param xml_target_dir: directory that is prepended to bare manuscript file names.
        """
        self.xml_target_dir = xml_target_dir

    def _get_or_create_element(self, node, xpath, create_id=False) ->ET.Element:
        """Return the first element that matches xpath on node, or create it as a child of node.

        If xpath has the form name[@attribute="value"], the new child is called
        name and gets attribute set to value. Any other xpath is used unchanged
        as the tag name of the new child.

        :param node: the parent element, it must provide a xpath method.
        :param xpath: a tag name or an expression of the form name[@attribute="value"].
        :param create_id: if True, set the new element's 'id' to the number of
            children with that name that existed before the element was added.
        :return: the matching or the newly created element
        """
        existing = node.xpath(xpath)
        if len(existing) > 0:
            return existing[0]
        # Parse the expression completely before touching the tree, so that an
        # unsupported predicate can no longer leave a half initialized child behind.
        predicate = ManuscriptCreator._PREDICATE_XPATH.match(xpath)
        if predicate is None:
            # NOTE(review): an unsupported predicate ends up here as a tag name,
            # lxml is expected to reject it -- confirm.
            element_name, attribute_name, attribute_value = xpath, None, None
        else:
            element_name, attribute_name, _, attribute_value = predicate.groups()
        # Count the siblings first: the count is the id of the new element.
        num_elements = len(node.xpath(element_name))
        element = ET.SubElement(node, element_name)
        if attribute_name is not None:
            element.set(attribute_name, attribute_value)
        if create_id:
            element.set('id', str(num_elements))
        return element

    def _create_or_update_pages(self, pages_node, manuscript_page_url_mapping):
        """Make sure that pages_node has a page element for each mapped page number.

        A page element without (or with an empty) 'alias' gets the basename of
        its url as alias, an existing alias is kept.

        :param pages_node: the element that contains the page elements.
        :param manuscript_page_url_mapping: dictionary mapping page numbers to urls.
        """
        for page_number, url in manuscript_page_url_mapping.items():
            xpath = SuperPage.XML_TAG + f'[@number="{page_number}"]'
            page_node = self._get_or_create_element(pages_node, xpath, create_id=True)
            if not page_node.get('alias'):
                page_node.set('alias', basename(url))

    def create_or_update_manuscripts(self, manuscript_files, page_url_mapping):
        """Create or update the manuscript files and add the mapped pages to them.

        For each title of page_url_mapping that has no corresponding file in
        manuscript_files, the file name '<title with underscores>.xml' is
        appended to manuscript_files, i.e. the list of the caller is extended
        in place. The result is written with write_pretty unless UNITTESTING is set.

        :param manuscript_files: list of manuscript file names, file names
            without a directory part are located in self.xml_target_dir.
        :param page_url_mapping: dictionary mapping manuscript titles to
            dictionaries that map page numbers to urls.
        """
        known_names = { basename(manuscript_file) for manuscript_file in manuscript_files }
        for key in page_url_mapping:
            file_name = key.replace(' ', '_') + '.xml'
            if file_name not in known_names:
                manuscript_files.append(file_name)
                known_names.add(file_name)
        for manuscript_file in manuscript_files:
            target_file = self.xml_target_dir + sep + manuscript_file\
                    if dirname(manuscript_file) == ''\
                    else manuscript_file
            # The title is the inverse of the file name rule used above.
            title = basename(target_file).replace('.xml', '').replace('_', ' ')
            manuscript = ArchivalManuscriptUnity(title=title)
            if isfile(target_file):
                manuscript = ArchivalManuscriptUnity.create_cls(target_file)
            else:
                # New manuscript: start with an empty tree that carries title and type.
                manuscript.manuscript_tree = ET.ElementTree(ET.Element(ArchivalManuscriptUnity.XML_TAG))
                manuscript.manuscript_tree.docinfo.URL = target_file
                manuscript.manuscript_tree.getroot().set('title', manuscript.title)
                manuscript.manuscript_tree.getroot().set('type', manuscript.manuscript_type)
            if title in page_url_mapping:
                pages_node = self._get_or_create_element(manuscript.manuscript_tree.getroot(), 'pages')
                self._create_or_update_pages(pages_node, page_url_mapping[title])
            if not UNITTESTING:
                write_pretty(xml_element_tree=manuscript.manuscript_tree, file_name=target_file,\
                        script_name=__file__, file_type=FILE_TYPE_XML_MANUSCRIPT)
def create_page_url_mapping(input_file, mapping_dictionary, default_title=''):
    """Add the page to url mapping of input_file to mapping_dictionary.

    File content: a page line, followed by a line with the url of that page.
    A page line is either 'TITLE, PAGENUMBER' or just 'PAGENUMBER', every
    occurrence of 'S.' is removed from it. A page line with a title selects
    the manuscript for all following lines, pages without any title are added
    to default_title. Urls starting with 'www' get the prefix 'http://'.
    Empty lines are ignored, and so is a url that has no preceding page line.
    See: 'tests_svgscripts/test_data/content.txt'

    :param input_file: name of the text file that contains the mapping.
    :param mapping_dictionary: dictionary that is updated in place, it maps
        titles to dictionaries that map page numbers to urls.
    :param default_title: the title used as long as no page line names a title.
    """
    current_key = default_title
    key = None
    with open(input_file, 'r') as f:
        for line in f:
            content = line.strip()
            if not content:
                # An empty line is neither a page nor a url.
                continue
            if content.startswith(('http', 'www')):
                if key is None:
                    # There is no page number this url could belong to.
                    continue
                url = content if content.startswith('http') else 'http://' + content
                mapping_dictionary.setdefault(current_key, {})[key] = url
            else:
                key_parts = [ part.strip() for part in content.replace('S.', '').split(',') ]
                key_index = 0
                if len(key_parts) > 1:
                    # Always switch to the title of this line, even if the title
                    # is already known: otherwise its pages would be filed
                    # under the previously active title.
                    current_key = key_parts[0]
                    mapping_dictionary.setdefault(current_key, {})
                    key_index = 1
                key = key_parts[key_index]
def usage():
    """Print the help text of this script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to create or update one or more manuscripts.
    svgscripts/create_manuscript.py [OPTIONS] [<input_fileA.txt>, ...] [<xmlManuscriptFile>, ...]
    <input_file.txt> One or more files mapping pages to faksimile URLs, with 'txt'-suffix
    <xmlManuscriptFile> manuscript file(s) (~ArchivalManuscriptUnity).
    OPTIONS:
    -h|--help: show help
    -t|--title=title manuscript's title, e.g. "Mp XV".
    -x|--xml-target-dir directory containing xmlManuscriptFile, default "./xml"
    :return: exit code (int)
    """
    # NOTE: the docstring above is printed by usage(), keep its wording in sync with the options.
    try:
        opts, args = getopt.getopt(argv, "ht:x:", ["help", "title=", "xml-target-dir="])
    except getopt.GetoptError:
        usage()
        return 2

    title = ''
    xml_target_dir = ".{}xml".format(sep)
    for option, value in opts:
        if option in ('-h', '--help'):
            usage()
            return 0
        if option in ('-t', '--title'):
            title = value
            continue
        if option in ('-x', '--xml-target-dir'):
            xml_target_dir = value

    # Sort the positional arguments: manuscript files are xml files that are
    # no page files, input files are existing txt files. Anything else is ignored.
    manuscript_files = []
    input_files = []
    for argument in args:
        if argument.endswith('.xml') and '_page' not in argument:
            manuscript_files.append(argument)
        if argument.endswith('.txt') and isfile(argument):
            input_files.append(argument)

    # All input files fill the same mapping, the title option is the default title.
    page_url_mapping = {}
    for input_file in input_files:
        create_page_url_mapping(input_file, page_url_mapping, default_title=title)

    ManuscriptCreator(xml_target_dir=xml_target_dir)\
            .create_or_update_manuscripts(manuscript_files, page_url_mapping)
    return 0
# Run main with the command line arguments and hand its return value to the shell as exit code.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline