Page MenuHomec4science

process_footnotes.py
No OneTemporary

File Metadata

Created
Thu, Apr 18, 09:40

process_footnotes.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process the footnotes of pages whose words have been merged with facsimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path as PathlibPath
from progress.bar import Bar
import inspect
import re
import shutil
import sys
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.atypical_writing import AtypicalWriting
from datatypes.clarification import Clarification
from datatypes.editor_comment import EditorComment
from datatypes.editor_correction import EditorCorrection
from datatypes.footnotes import extract_footnotes
from datatypes.imprint import extract_imprints
from datatypes.line_continuation import LineContinuation
from datatypes.standoff_tag import StandoffTag
from datatypes.text import Text
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.uncertain_decipherment import UncertainDecipherment
from util import back_up
from process_files import update_svgposfile_status
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# When True, the functions below neither write the page tree to disk nor
# print progress output or create back-ups.
UNITTESTING = False
# The following patterns are applied with re.match to the content of a footnote.
# Most of them start with a reference prefix: text containing a ':' and ending
# with ']' (plus optional whitespace), followed by a marker and the remaining text.
# Marker '¿' after the reference prefix.
ATYPICAL_GROUP = re.compile(r'(.*:.*]\s*)(¿)(.*)')
# Marker 'Vk' after the reference prefix.
CLARIFICATION_GROUP = re.compile(r'(.*:.*]\s*)(Vk)(.*)')
# Text ending with ':' followed by the keyword 'Fortsetzung' ("continuation").
CONTINUATION_GROUP = re.compile(r'(.*:\s*)(Fortsetzung\s*)')
# The reference prefix alone: everything up to the last ']' after a ':'.
COMMENT_GROUP = re.compile(r'(.*:.*])')
# Marker '>' (optionally followed by '?') after the reference prefix;
# group 3 is the text that follows the marker.
EDITOR_CORRECTION_GROUP = re.compile(r'(.*:.*]\s*)(>[?]*)(.*)')
# A line reference at the start of a footnote: an optional, repeatable prefix
# (either 'N-' or a single digit plus '/' followed by further 'N/' items),
# then the final line number, then ':' and the rest of the footnote.
LINE_REFERENCE_GROUP = re.compile(r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)')
# Group indices of LINE_REFERENCE_GROUP: the prefix, the 'N/' item nested in
# the prefix, and the final line number.
LINE_REFERENCE_GROUP_START_INDEX = 1
LINE_REFERENCE_GROUP_MID_INDEX = 2
LINE_REFERENCE_GROUP_END_INDEX = 3
# Everything up to and including the last 'N:' of the first line of the text.
LINE_COMMENT_GROUP = re.compile(r'(.*\d+:)')
# Marker '?' (optionally preceded by '>') after the reference prefix.
UNCERTAINTY_WORD_GROUP = re.compile(r'(.*:.*]\s*)([>]*\?)(.*)')
# Any text up to and including the last '?' of the first line of the text.
UNCERTAINTY_EDITOR_GROUP = re.compile(r'(.*)(\?)')
# Group 2 is the text between 'N:' (plus optional whitespace) and the last ']'.
WORD_REFERENCE_GROUP = re.compile(r'(.*[0-9]+:\s*)(.*)(].*)')
# Module-level debug flag; it is not read by the code visible in this file.
DEBUG = False
def categorize_footnotes(page, footnotes=None, debug=False, skip_after=-1.0, find_content=False):
    """Categorize the footnotes of a page and attach the results to its tree.

    Every footnote whose content starts with a line reference is handed to
    ``_process_line_match``; for any other footnote a warning is issued.
    Afterwards the words and lines of the page are attached to
    ``page.page_tree`` and, unless ``UNITTESTING`` is set, the tree is written
    to the file it was read from.

    :param page: the page whose footnotes are categorized.
    :param footnotes: the footnotes to process; if ``None`` they are extracted
        from ``page`` with ``extract_footnotes``.
    :param debug: kept for backward compatibility; it has no effect.
    :param skip_after: passed on to ``extract_footnotes``.
    :param find_content: if true and the page has text connection marks, let
        ``TextConnectionMark`` look for their content in the footnotes.
    """
    # NOTE: the former ``DEBUG = debug`` / ``DEBUG = False`` statements only
    # bound a function-local name and never touched the module-level flag,
    # so they have been dropped.
    if footnotes is None:
        footnotes = extract_footnotes(page, skip_after=skip_after)
    for footnote in footnotes:
        line_match = re.match(LINE_REFERENCE_GROUP, footnote.content)
        if line_match is not None:
            _process_line_match(page, footnote, line_match)
        else:
            warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>')
    if find_content and len(page.text_connection_marks) > 0:
        TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes)
    page.update_and_attach_words2tree()
    for line in page.lines:
        line.attach_object_to_tree(page.page_tree)
    if not UNITTESTING:
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,
                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def save_imprints(page):
    """Attach the imprints of a page to its tree and save the tree.

    The imprints are obtained with ``extract_imprints``. The tree is only
    written to its file if ``UNITTESTING`` is not set; the name of the calling
    function is recorded as part of the script name.
    """
    for imprint in extract_imprints(page):
        imprint.attach_object_to_tree(page.page_tree)
    if UNITTESTING:
        return
    # The frame one level up belongs to whoever called save_imprints.
    caller_name = inspect.currentframe().f_back.f_code.co_name
    write_pretty(
        xml_element_tree=page.page_tree,
        file_name=page.page_tree.docinfo.URL,
        script_name=f'{__file__}:{caller_name}',
        file_type=FILE_TYPE_SVG_WORD_POSITION,
    )
def _is_uncertain(footnote) -> bool:
    """Return whether the footnote carries an italic sign for uncertainty.

    The content has to match ``UNCERTAINTY_EDITOR_GROUP`` (text up to a
    question mark), and the end of that match has to lie within the range of
    a standoff markup whose css string ends with ``italic;``.
    """
    question_mark_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content)
    if question_mark_match is None:
        return False
    match_end = question_mark_match.end()
    return any(
        markup.css_string.endswith('italic;')
        and match_end >= markup.startIndex
        and match_end <= markup.endIndex
        for markup in footnote.standoff_markups
    )
def _process_line_match(page, footnote, line_match):
    """Process a footnote whose content starts with a line reference.

    The referred lines are determined from the groups of ``line_match``:

    * no prefix: the single line with the final line number,
    * a prefix containing ``N/`` items: the listed lines plus the final line,
    * any other prefix: all lines from the prefix number up to the final line.

    If the footnote also contains a word reference it is passed on to
    ``_process_word_match`` (using the final line number); otherwise it is
    attached to each of the referred lines. A warning is issued if none of
    the referred lines exists on the page.

    :param page: the page the footnote belongs to.
    :param footnote: the footnote to process.
    :param line_match: the match of ``LINE_REFERENCE_GROUP`` on the content of
        the footnote.
    """
    word_match = re.match(WORD_REFERENCE_GROUP, footnote.content)
    end_line_number = int(line_match.group(LINE_REFERENCE_GROUP_END_INDEX))
    start_group = line_match.group(LINE_REFERENCE_GROUP_START_INDEX)
    if start_group is None:
        lines = [line for line in page.lines if line.id == end_line_number]
    elif line_match.group(LINE_REFERENCE_GROUP_MID_INDEX) is not None:
        # Enumeration such as "1/2/3": split the prefix at '/' and add the final number.
        line_ids = [int(line_id) for line_id in start_group.split('/') if line_id != '']
        line_ids.append(end_line_number)
        lines = [line for line in page.lines if line.id in line_ids]
    else:
        # The prefix ends with a separator character that is not part of the number.
        start_line_number = int(start_group[0:-1])
        lines = [line for line in page.lines
                 if line.id >= start_line_number and line.id <= end_line_number]
    if word_match is not None:
        _process_word_match(page.words, footnote, line_match, word_match.group(2), end_line_number)
    elif len(lines) > 0:
        # The uncertainty depends on the footnote only, so determine it once.
        is_uncertain = _is_uncertain(footnote)
        for line in lines:
            _process_line_reference(page, footnote, line, is_uncertain)
    else:
        # Formerly this formatted the undefined name ``line_number`` and
        # raised a NameError instead of issuing the warning.
        warnings.warn(f'Footnote refers to missing line {end_line_number}: {footnote}')
def _process_line_reference(page, footnote, line, is_uncertain):
    """Attach a footnote that refers to a whole line to that line.

    A footnote matching ``CONTINUATION_GROUP`` is added to the editor comments
    of the line as a ``LineContinuation``, a footnote matching
    ``LINE_COMMENT_GROUP`` as an ``EditorComment``. For any other footnote a
    warning is issued. If the footnote is uncertain, the last character of
    its text is not part of the reference or comment.
    """
    content = footnote.content
    continuation = re.match(CONTINUATION_GROUP, content)
    if continuation is not None:
        target = content[continuation.end():]
        if is_uncertain:
            target = target[:-1]
        line.editor_comments.append(
            LineContinuation.create_cls(reference_string=target, is_uncertain=is_uncertain))
        return
    line_comment = re.match(LINE_COMMENT_GROUP, content)
    if line_comment is None:
        warnings.warn(f'Unknown editor comment for line "{line.id}": <{footnote}>')
        return
    # For plain comments the uncertainty is determined anew from the footnote.
    is_uncertain = _is_uncertain(footnote)
    comment = content[line_comment.end():]
    if is_uncertain:
        comment = comment[:-1]
    line.editor_comments.append(EditorComment(comment=comment.strip(), is_uncertain=is_uncertain))
def _process_word_match(words, footnote, line_match, word_text, line_number, parent_word_composition=None):
"""Process footnote if there is a word reference.
"""
referred_words = [ word for word in words\
if word.line_number == line_number\
and (word.text == word_text\
or re.match(rf'\W*{word_text}\W', word.text)\
or word.edited_text == word_text) ]
referred_word_parts = [ word.word_parts for word in words\
if word.line_number == line_number\
and len(word.word_parts) > 0\
and word_text in [ wp.text for wp in word.word_parts ] ]
overwritten_word_matches = [ word for word in words\
if word.line_number == line_number\
and len(word.word_parts) > 0\
and len([word_part for word_part in word.word_parts\
if word_part.overwrites_word is not None\
and word_part.overwrites_word.text == word_text]) > 0]
if len(referred_words) > 0\
or len(overwritten_word_matches) > 0\
or len(referred_word_parts) > 0:
word = None
if len(referred_words) == 1:
word = referred_words[0]
elif len(overwritten_word_matches) > 0:
word = [ word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts\
if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text][0]
elif len(referred_word_parts) > 0:
word = [ word_part for word_part in referred_word_parts[0] if word_part.text == word_text ][0]
elif len([ better_word for better_word in referred_words if better_word.text == word_text]) > 0:
word = [ better_word for better_word in referred_words if better_word.text == word_text][0]
else:
word = referred_words[0]
atypical_match = re.match(ATYPICAL_GROUP, footnote.content)
correction_match = re.match(EDITOR_CORRECTION_GROUP, footnote.content)
clarification_match = re.match(CLARIFICATION_GROUP, footnote.content)
is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, footnote.content) is not None
if correction_match is not None:
correction = correction_match.group(3).strip()
word.editor_comments.append(EditorCorrection(correction_text=correction, is_uncertain=is_uncertain))
if not is_uncertain:
word.edited_text = correction
elif clarification_match is not None:
word.editor_comments.append(Clarification(text=footnote.extract_part(word_text, css_filter='bold;')))
elif atypical_match is not None:
text = footnote.extract_part(word_text, css_filter='bold;')\
if footnote.markup_contains_css_filter('bold;')\
else None
word.editor_comments.append(AtypicalWriting(text=text))
elif is_uncertain:
word.editor_comments.append(UncertainDecipherment())
else:
comment_match = re.match(COMMENT_GROUP, footnote.content)
if comment_match is not None:
is_uncertain = _is_uncertain(footnote)
comment = footnote.content[comment_match.end():-1].strip()\
if is_uncertain\
else footnote.content[comment_match.end():].strip()
word.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain))
else:
warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>')
elif re.match(r'.*\s.*', word_text):
for word_part in word_text.split(' '):
_process_word_match(words, footnote, line_match, word_part, line_number, parent_word_composition=word_text)
elif len([word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]) > 0:
new_words = []
for word in [word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]:
new_words += word.word_parts
_process_word_match(new_words, footnote, line_match, word_text, line_number)
else:
warnings.warn(f'No word found with text "{word_text}" on line {line_number}: <{footnote}>')
def usage():
    """Print the usage text of the script, i.e. the docstring of ``main``.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to process the footnotes of a page.

    svgscripts/process_footnotes.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

        <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
        <svg_pos_file> a xml file about a page, containing information about svg word positions.

        OPTIONS:
        -h|--help show help
        -s|--skip-until=left skip all nodes.get('X') < left

        :return: exit code (int)
    """
    # NOTE: the docstring above is printed by usage(), so it is user-facing text.
    # Returns 0 on success (and for --help) and 2 on invalid options, an invalid
    # option value or a missing file argument; raises FileNotFoundError if the
    # given path is not a file.
    skip_after = -1.0
    try:
        opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-s', '--skip-until'):
            try:
                skip_after = float(arg)
            except ValueError:
                # Treat a non-numeric value like any other invalid option
                # instead of aborting with a traceback.
                print(f'Invalid value for --skip-until: {arg!r}', file=sys.stderr)
                usage()
                return 2
    if len(args) < 1:
        usage()
        return 2
    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    counter = 0
    # Only pages whose status contains STATUS_MERGED_OK are processed.
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
        if not UNITTESTING:
            print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
            back_up(page, page.xml_file)
        categorize_footnotes(page, skip_after=skip_after, find_content=True)
        save_imprints(page)
        counter += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages processed]')
    return 0
# When run as a script: call main with the command line arguments (without
# the script name) and use its return value as the exit code of the process.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

Event Timeline