process_footnotes.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Thu, Apr 18, 09:40

process_footnotes.py
View Options

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	""" This program can be used to process the footnotes of pages after their words have been merged with faksimile data.
	"""
	# Copyright (C) University of Basel 2019 {{{1
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}

	from colorama import Fore, Style
	import getopt
	import lxml.etree as ET
	import os
	from os import listdir, sep, path, setpgrp, devnull
	from os.path import exists, isfile, isdir, dirname, basename
	from pathlib import Path as PathlibPath
	from progress.bar import Bar
	import inspect
	import re
	import shutil
	import sys
	import warnings

	if dirname(__file__) not in sys.path:
	sys.path.append(dirname(__file__))

	from datatypes.archival_manuscript import ArchivalManuscriptUnity
	from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
	from datatypes.atypical_writing import AtypicalWriting
	from datatypes.clarification import Clarification
	from datatypes.editor_comment import EditorComment
	from datatypes.editor_correction import EditorCorrection
	from datatypes.footnotes import extract_footnotes
	from datatypes.imprint import extract_imprints
	from datatypes.line_continuation import LineContinuation
	from datatypes.standoff_tag import StandoffTag
	from datatypes.text import Text
	from datatypes.text_connection_mark import TextConnectionMark
	from datatypes.uncertain_decipherment import UncertainDecipherment

	from util import back_up
	from process_files import update_svgposfile_status

	sys.path.append('shared_util')
	from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT


	__author__ = "Christian Steiner"
	__maintainer__ = __author__
	__copyright__ = 'University of Basel'
	__email__ = "christian.steiner@unibas.ch"
	__status__ = "Development"
	__license__ = "GPL v3"
	__version__ = "0.0.1"

	# When True, the functions of this module neither write xml files nor print progress.
	UNITTESTING = False

	# Patterns that are applied with re.match to the content of a footnote.
	# NOTE(review): the '*' quantifiers and the plain '|' alternation were reconstructed
	# from the way the match groups are used in this module (optional groups tested
	# against None, text sliced at match.end()) -- verify against real footnotes.

	# "<reference>: <word>] ¿ ...": the word is written atypically.
	ATYPICAL_GROUP = re.compile(r'(.*:.*]\s*)(¿)(.*)')
	# "<reference>: <word>] Vk ...": clarification of a word.
	CLARIFICATION_GROUP = re.compile(r'(.*:.*]\s*)(Vk)(.*)')
	# "<reference>: Fortsetzung ...": what follows the match is the continuation reference.
	CONTINUATION_GROUP = re.compile(r'(.*:\s*)(Fortsetzung\s*)')
	# Everything up to and including the last ']': what follows is the comment on a word.
	COMMENT_GROUP = re.compile(r'(.*:.*])')
	# "<reference>: <word>] >..." : group 3 holds the text of the editor correction.
	EDITOR_CORRECTION_GROUP = re.compile(r'(.*:.*]\s*)(>[?]*)(.*)')
	# Line reference at the start of a footnote:
	#   group 1: optional prefix, either "<start>-" (range) or "<n>/<m>/..." (list of lines),
	#   group 2: last "<m>/" of a list prefix (None for a range or a single line),
	#   group 3: the (last) line number,
	#   group 4: the colon and the rest of the footnote.
	LINE_REFERENCE_GROUP = re.compile(r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)')
	LINE_REFERENCE_GROUP_START_INDEX = 1
	LINE_REFERENCE_GROUP_MID_INDEX = 2
	LINE_REFERENCE_GROUP_END_INDEX = 3
	# Everything up to and including the last "<digits>:": what follows is the comment on a line.
	LINE_COMMENT_GROUP = re.compile(r'(.*\d+:)')
	# "<reference>: <word>] ?" or "... >?": the decipherment or correction is uncertain.
	UNCERTAINTY_WORD_GROUP = re.compile(r'(.*:.*]\s*)([>]*\?)(.*)')
	# Everything up to and including the last '?' of the footnote.
	UNCERTAINTY_EDITOR_GROUP = re.compile(r'(.*)(\?)')
	# "<line number>: <word(s)>] ...": group 2 holds the word(s) the footnote refers to.
	WORD_REFERENCE_GROUP = re.compile(r'(.*[0-9]+:\s*)(.*)(].*)')
	# Debug flag of this module; nothing in the code shown here reads it.
	DEBUG = False

	def categorize_footnotes(page, footnotes=None, debug=False, skip_after=-1.0, find_content=False):
	    """Categorize the footnotes of a page and attach the results to the page tree.

	    Every footnote whose content starts with a line reference is passed on to
	    _process_line_match; for any other footnote a warning is issued. Afterwards
	    the words and lines of the page are attached to page.page_tree and, unless
	    UNITTESTING is set, the tree is written to page.page_tree.docinfo.URL.

	    :param page: the page whose footnotes are categorized.
	    :param footnotes: the footnotes to categorize; extracted from page if None.
	    :param debug: value of the module flag DEBUG while the footnotes are processed.
	    :param skip_after: passed on to extract_footnotes.
	    :param find_content: if True and the page has text connection marks,
	        TextConnectionMark.find_content_in_footnotes is called.
	    """
	    # Without the global statement the assignments would only bind a local name.
	    global DEBUG
	    DEBUG = debug
	    try:
	        if footnotes is None:
	            footnotes = extract_footnotes(page, skip_after=skip_after)
	        for footnote in footnotes:
	            line_match = re.match(LINE_REFERENCE_GROUP, footnote.content)
	            if line_match is not None:
	                _process_line_match(page, footnote, line_match)
	            else:
	                warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>')
	        if find_content and len(page.text_connection_marks) > 0:
	            TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes)
	        page.update_and_attach_words2tree()
	        for line in page.lines:
	            line.attach_object_to_tree(page.page_tree)
	    finally:
	        # Reset the flag even if processing a footnote raised.
	        DEBUG = False
	    if not UNITTESTING:
	        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,
	                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)

	def save_imprints(page):
	    """Attach the imprints of a page to its tree and save the tree.

	    The imprints are obtained from extract_imprints. Unless UNITTESTING is set,
	    the tree is written to page.page_tree.docinfo.URL; the script name passed to
	    write_pretty consists of this file and the name of the calling function.

	    :param page: the page whose imprints are saved.
	    """
	    for imprint in extract_imprints(page):
	        imprint.attach_object_to_tree(page.page_tree)
	    if UNITTESTING:
	        return
	    # f_back is the frame of whoever called save_imprints.
	    caller_name = inspect.currentframe().f_back.f_code.co_name
	    write_pretty(
	        xml_element_tree=page.page_tree,
	        file_name=page.page_tree.docinfo.URL,
	        script_name=f'{__file__}:{caller_name}',
	        file_type=FILE_TYPE_SVG_WORD_POSITION,
	    )

	def _is_uncertain(footnote) -> bool:
	    """Return whether the footnote carries an italic question mark.

	    The footnote counts as uncertain if UNCERTAINTY_EDITOR_GROUP matches its
	    content and the end of that match lies within a standoff markup whose
	    css string ends with 'italic;'.
	    """
	    question_mark = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content)
	    if question_mark is None:
	        return False
	    position = question_mark.end()
	    return any(
	        markup.css_string.endswith('italic;')
	        and markup.startIndex <= position <= markup.endIndex
	        for markup in footnote.standoff_markups
	    )

	def _process_line_match(page, footnote, line_match):
	    """Process a footnote whose content starts with a line reference.

	    If the footnote also refers to a word, it is passed on to _process_word_match
	    together with the last line number of the reference. Otherwise it is passed
	    on to _process_line_reference once for every referenced line that exists on
	    the page; if no such line exists, a warning is issued.

	    :param page: the page the footnote belongs to.
	    :param footnote: the footnote to process.
	    :param line_match: the match of LINE_REFERENCE_GROUP on footnote.content.
	    """
	    word_match = re.match(WORD_REFERENCE_GROUP, footnote.content)
	    end_line_number = int(line_match.group(LINE_REFERENCE_GROUP_END_INDEX))
	    start_reference = line_match.group(LINE_REFERENCE_GROUP_START_INDEX)
	    if start_reference is None:
	        # Reference to a single line.
	        lines = [line for line in page.lines if line.id == end_line_number]
	    elif line_match.group(LINE_REFERENCE_GROUP_MID_INDEX) is not None:
	        # Reference to a list of lines separated by '/'.
	        line_ids = [int(line_id) for line_id in start_reference.split('/') if line_id != '']
	        line_ids.append(end_line_number)
	        lines = [line for line in page.lines if line.id in line_ids]
	    else:
	        # The prefix is a start number followed by one separator character:
	        # all lines from start to end are referenced.
	        start_line_number = int(start_reference[0:-1])
	        lines = [line for line in page.lines if start_line_number <= line.id <= end_line_number]
	    if word_match is not None:
	        _process_word_match(page.words, footnote, line_match, word_match.group(2), end_line_number)
	    elif len(lines) > 0:
	        is_uncertain = _is_uncertain(footnote)
	        for line in lines:
	            _process_line_reference(page, footnote, line, is_uncertain)
	    else:
	        warnings.warn(f'Footnote refers to missing line {end_line_number}: {footnote}')

	def _process_line_reference(page, footnote, line, is_uncertain):
	    """Attach the editor comment of a footnote to the line it refers to.

	    A footnote matching CONTINUATION_GROUP yields a LineContinuation, a footnote
	    matching LINE_COMMENT_GROUP an EditorComment; both are appended to
	    line.editor_comments. Any other footnote only triggers a warning.

	    :param page: the page the line belongs to (not used here).
	    :param footnote: the footnote to process.
	    :param line: the line that receives the comment.
	    :param is_uncertain: whether the footnote is marked as uncertain; used for
	        line continuations, recomputed for plain comments.
	    """
	    continuation = re.match(CONTINUATION_GROUP, footnote.content)
	    if continuation is not None:
	        target = footnote.content[continuation.end():]
	        if is_uncertain:
	            # drop the last character of an uncertain reference
	            target = target[:-1]
	        line.editor_comments.append(
	            LineContinuation.create_cls(reference_string=target, is_uncertain=is_uncertain))
	        return
	    comment_prefix = re.match(LINE_COMMENT_GROUP, footnote.content)
	    if comment_prefix is None:
	        warnings.warn(f'Unknown editor comment for line "{line.id}": <{footnote}>')
	        return
	    is_uncertain = _is_uncertain(footnote)
	    # An uncertain comment loses its last character; None keeps the slice open-ended.
	    stop = -1 if is_uncertain else None
	    comment = footnote.content[comment_prefix.end():stop].strip()
	    line.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain))

	def _process_word_match(words, footnote, line_match, word_text, line_number, parent_word_composition=None):
	"""Process footnote if there is a word reference.
	"""
	referred_words = [ word for word in words\
	if word.line_number == line_number\
	and (word.text == word_text\
	or re.match(rf'\W*{word_text}\W', word.text)\
	or word.edited_text == word_text) ]
	referred_word_parts = [ word.word_parts for word in words\
	if word.line_number == line_number\
	and len(word.word_parts) > 0\
	and word_text in [ wp.text for wp in word.word_parts ] ]
	overwritten_word_matches = [ word for word in words\
	if word.line_number == line_number\
	and len(word.word_parts) > 0\
	and len([word_part for word_part in word.word_parts\
	if word_part.overwrites_word is not None\
	and word_part.overwrites_word.text == word_text]) > 0]
	if len(referred_words) > 0\
	or len(overwritten_word_matches) > 0\
	or len(referred_word_parts) > 0:
	word = None
	if len(referred_words) == 1:
	word = referred_words[0]
	elif len(overwritten_word_matches) > 0:
	word = [ word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts\
	if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text][0]
	elif len(referred_word_parts) > 0:
	word = [ word_part for word_part in referred_word_parts[0] if word_part.text == word_text ][0]
	elif len([ better_word for better_word in referred_words if better_word.text == word_text]) > 0:
	word = [ better_word for better_word in referred_words if better_word.text == word_text][0]
	else:
	word = referred_words[0]
	atypical_match = re.match(ATYPICAL_GROUP, footnote.content)
	correction_match = re.match(EDITOR_CORRECTION_GROUP, footnote.content)
	clarification_match = re.match(CLARIFICATION_GROUP, footnote.content)
	is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, footnote.content) is not None
	if correction_match is not None:
	correction = correction_match.group(3).strip()
	word.editor_comments.append(EditorCorrection(correction_text=correction, is_uncertain=is_uncertain))
	if not is_uncertain:
	word.edited_text = correction
	elif clarification_match is not None:
	word.editor_comments.append(Clarification(text=footnote.extract_part(word_text, css_filter='bold;')))
	elif atypical_match is not None:
	text = footnote.extract_part(word_text, css_filter='bold;')\
	if footnote.markup_contains_css_filter('bold;')\
	else None
	word.editor_comments.append(AtypicalWriting(text=text))
	elif is_uncertain:
	word.editor_comments.append(UncertainDecipherment())
	else:
	comment_match = re.match(COMMENT_GROUP, footnote.content)
	if comment_match is not None:
	is_uncertain = _is_uncertain(footnote)
	comment = footnote.content[comment_match.end():-1].strip()\
	if is_uncertain\
	else footnote.content[comment_match.end():].strip()
	word.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain))
	else:
	warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>')
	elif re.match(r'.\s.', word_text):
	for word_part in word_text.split(' '):
	_process_word_match(words, footnote, line_match, word_part, line_number, parent_word_composition=word_text)
	elif len([word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]) > 0:
	new_words = []
	for word in [word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]:
	new_words += word.word_parts
	_process_word_match(new_words, footnote, line_match, word_text, line_number)
	else:
	warnings.warn(f'No word found with text "{word_text}" on line {line_number}: <{footnote}>')

	def usage():
	    """Print the help text of this script, i.e. the docstring of main."""
	    help_text = main.__doc__
	    print(help_text)

	def main(argv):
	    """This program can be used to process the footnotes of a page.

	    svgscripts/process_footnotes.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

	        <xmlManuscriptFile>     a xml file about a manuscript, containing information about its pages.
	        <svg_pos_file>          a xml file about a page, containing information about svg word positions.

	        OPTIONS:
	        -h|--help               show help
	        -s|--skip-until=left    skip all nodes.get('X') < left

	        :return: exit code (int)
	    """
	    skip_after = -1.0
	    try:
	        opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until="])
	    except getopt.GetoptError:
	        usage()
	        return 2
	    for opt, arg in opts:
	        if opt in ('-h', '--help'):
	            usage()
	            return 0
	        if opt in ('-s', '--skip-until'):
	            try:
	                skip_after = float(arg)
	            except ValueError:
	                # a non-numeric value is a usage error like any other bad option
	                usage()
	                return 2
	    if len(args) < 1:
	        usage()
	        return 2
	    file_a = args[0]
	    if not isfile(file_a):
	        raise FileNotFoundError('File {} does not exist!'.format(file_a))
	    counter = 0
	    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
	        if not UNITTESTING:
	            print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
	            # NOTE(review): the back-up is assumed to belong to the non-test branch -- confirm.
	            back_up(page, page.xml_file)
	        categorize_footnotes(page, skip_after=skip_after, find_content=True)
	        save_imprints(page)
	        counter += 1
	    if not UNITTESTING:
	        print(Style.RESET_ALL + f'[{counter} pages processed]')
	    return 0

	# Run main with the command line arguments when this file is executed as a script
	# and use its return value as exit code.
	if __name__ == "__main__":
	    exit_code = main(sys.argv[1:])
	    sys.exit(exit_code)

process_footnotes.pyNo OneTemporaryActions

File Metadata

process_footnotes.pyView Options

Event Timeline

process_footnotes.py
No OneTemporary
Actions

process_footnotes.py
View Options