Page MenuHomec4science

fix_boxes.py
No OneTemporary

File Metadata

Created
Sat, Nov 9, 05:57

fix_boxes.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
from fix_old_data import save_page
sys.path.append('svgscripts')
from convert_wordPositions import HTMLConverter
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word, update_transkription_position_ids
from join_faksimileAndTranskription import sort_words
from util import back_up, back_up_svg_file, copy_faksimile_svg_file
from process_files import update_svgposfile_status
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# When True, main() and fix_boxes() skip their console output and fix_boxes()
# does not save the page.
UNITTESTING = False
# NOTE(review): not referenced in the visible part of this file; presumably a
# tolerance for svg x/y coordinates -- confirm before relying on it.
MAX_SVG_XY_THRESHOLD = 10
# Passed by main() as ``status_contains`` when selecting the pages to process.
BOX_ERROR_STATUS = 'box error'
# Value of the ``msg`` attribute of the <debug> child nodes that fix_boxes() searches for.
DEBUG_MSG = 'TODO: should have a box'
class WordWithBoxes(Word):
    """Word that gets its boxes attached according to the <debug> child nodes of its xml node."""

    @classmethod
    def create_cls(cls, word_node):
        """Create a word from a (lxml.Element) node and attach the boxes its debug nodes describe.

        Each ``./debug`` child of word_node is read for its ``text``,
        ``earlier-version`` and ``overwritten-by`` attributes. A debug node with an
        ``overwritten-by`` attribute makes the word being split so that the box is
        attached to the matching part; otherwise a single box with id 0 is attached
        to the word itself.

        [:return:] WordWithBoxes
        """
        word = super().create_cls(word_node)
        word.missing_boxes = []
        for box_index, debug_node in enumerate(word_node.xpath('./debug')):
            earlier_text = debug_node.get('text')
            overwriting_text = debug_node.get('overwritten-by')
            if overwriting_text is None:
                # No overwriting text given: the box belongs to the whole word.
                attach_box(word, 0, earlier_text, False)
                continue
            # Only the literal attribute value 'true' counts as an earlier version.
            is_earlier_version = debug_node.get('earlier-version') == 'true'
            split_into_parts_and_attach_box(word, box_index, earlier_text, is_earlier_version, overwriting_text)
        word.create_correction_history()
        if len(word.corrections) > 0:
            # Once corrections exist, the parts no longer reference an overwritten word.
            for word_part in word.word_parts:
                word_part.overwrites_word = None
        return word
def attach_box(target_word, box_index, earlier_text, is_earlier_version):
    """Create a box around the word and store it as ``target_word.word_box``.

    The box path is created from the word's transkription position. If the word
    has more than one transkription position, the positional word parts of all
    positions are gathered in one new TranskriptionPosition first.

    :param target_word: the word that receives the box (needs at least one transkription position)
    :param box_index: id of the new box
    :param earlier_text: text stored as ``earlier_text`` of the box
    :param is_earlier_version: flag stored as ``earlier_version`` of the box
    :raises IndexError: if target_word has no transkription positions
    """
    positions = target_word.transkription_positions
    box_position = positions[0]
    if len(positions) > 1:
        all_parts = [ part for position in positions for part in position.positional_word_parts ]
        box_position = TranskriptionPosition(positional_word_parts=all_parts)
    box_path = Path.create_path_from_transkription_position(box_position).path
    target_word.word_box = Box(id=box_index, path=box_path,
                               earlier_text=earlier_text, earlier_version=is_earlier_version)
def split_into_parts_and_attach_box(target_word, box_index, missing_text, is_earlier_version, overwritten_by, child_process=False)->list:
    """Split word into word parts and attach a box to the part with text == overwritten_by.

    Two situations are handled:

    - target_word already has word parts: the function recurses into the parts
      that follow the last part already carrying a box. On the top level call
      (child_process is False) every part that could be split is replaced, in
      place and in order, by the parts it was split into.
    - target_word has no word parts and its text contains overwritten_by: the
      word is split at overwritten_by and the box is attached to the part whose
      text equals overwritten_by.

    :param target_word: the word (or word part) to process
    :param box_index: id of the box that will be attached
    :param missing_text: earlier text stored in the box
    :param is_earlier_version: flag stored in the box
    :param overwritten_by: text of the part that should receive the box
    :param child_process: True for recursive calls; the caller is then
        responsible for storing the returned parts
    :return: on the top level call with existing word parts, the (updated)
        ``target_word.word_parts``; otherwise the list of new parts, or an
        empty list if nothing was split
    :raises IndexError: if none of the parts returned by ``split`` has a text
        equal to overwritten_by
    """
    if len(target_word.word_parts) > 0:
        # Continue behind the last part that already has a box, so that a later
        # box is not attached to an earlier occurrence of the same text.
        start_index = 0
        boxed_parts = [ wp for wp in target_word.word_parts if wp.word_box is not None ]
        if boxed_parts:
            start_index = target_word.word_parts.index(boxed_parts[-1]) + 1
        child_word_parts = []
        # The slice is a copy, hence word_parts may safely be modified inside the loop.
        for wp in target_word.word_parts[start_index:]:
            word_parts = split_into_parts_and_attach_box(wp, box_index, missing_text, is_earlier_version, overwritten_by, child_process=True)
            if child_process:
                child_word_parts += word_parts
            elif len(word_parts) > 0:
                old_index = target_word.word_parts.index(wp)
                # Replace wp by its new parts in their original order. (Inserting
                # the remaining parts one by one at old_index+1 reversed them.)
                target_word.word_parts[old_index:old_index + 1] = word_parts
            if overwritten_by in [ new_wp.text for new_wp in word_parts ]:
                # The box has been attached: stop looking at the following parts.
                break
        if child_process:
            return child_word_parts
        return target_word.word_parts
    elif overwritten_by in target_word.text:
        new_words_triple = target_word.split(overwritten_by)
        word_with_box = [ wp for wp in new_words_triple if wp is not None and wp.text == overwritten_by ][0]
        attach_box(word_with_box, box_index, missing_text, is_earlier_version)
        if not child_process:
            # NOTE(review): the name suggests that split() returns a fixed-size
            # triple that may contain None entries; if so, this length check is
            # always true and the else branch is never reached -- confirm.
            if len(new_words_triple) > 1:
                target_word.word_parts = [ i for i in new_words_triple if i is not None ]
                target_word.transkription_positions = []
            else:
                target_word.word_box = word_with_box.word_box
        return [ i for i in new_words_triple if i is not None ]
    return []
def fix_boxes(page)->int:
    """Rebuild all words of the page that are flagged as missing a box.

    Every word node that has a <debug> child whose ``msg`` attribute equals
    DEBUG_MSG is recreated as WordWithBoxes and put in the place of the word of
    ``page.words`` that has the same id and text. Unless UNITTESTING is set,
    the page is saved afterwards.

    :param page: the page to fix
    :return: exit code: 0 on success, 2 if a flagged word has no counterpart in
        ``page.words`` (the page is not saved in that case)
    """
    debug_xpath = '//' + Word.XML_TAG + f'/debug[@msg="{DEBUG_MSG}"]'
    # A word node can have several debug nodes: the set keeps each word node once.
    flagged_word_nodes = { debug_node.getparent() for debug_node in page.page_tree.xpath(debug_xpath) }
    for word_node in flagged_word_nodes:
        word = WordWithBoxes.create_cls(word_node)
        outdated_word = next((w for w in page.words if w.id == word.id and w.text == word.text), None)
        if outdated_word is None:
            return 2
        page.words[page.words.index(outdated_word)] = word
    if not UNITTESTING:
        save_page(page, attach_first=True)
    return 0
def usage():
    """Print the help text of this script, i.e. the docstring of main().
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to fix boxes.
    svgscripts/fix_boxes.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>
        <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
        <svg_pos_file> a xml file about a page, containing information about svg word positions.
        OPTIONS:
        -h|--help show help
        :return: exit code (int)
    """
    # NOTE: the docstring above is printed by usage() as help text, therefore
    # implementation notes are kept in comments.
    #
    # argv: command line arguments without the program name.
    # Returns 0 after processing or after showing the help, 2 on wrong usage.
    # Raises FileNotFoundError if the given file does not exist.
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
    if len(args) < 1:
        usage()
        return 2
    # A page that cannot be fixed is simply not counted; the exit status stays 0.
    exit_status = 0
    xml_file = args[0]
    if not isfile(xml_file):
        raise FileNotFoundError('File {} does not exist!'.format(xml_file))
    # Initialised once, before the loop: resetting it for every page would make
    # the summary report at most one changed page.
    counter = 0
    for page in Page.get_pages_from_xml_file(xml_file, status_contains=BOX_ERROR_STATUS):
        if not UNITTESTING:
            print(Fore.CYAN + f'Fixing boxes of {page.title}, {page.number} ...' + Style.RESET_ALL)
            back_up(page, page.xml_file)
        if fix_boxes(page) == 0:
            counter += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages changed]')
    return exit_status
if __name__ == "__main__":
    # Run the script with the command line arguments (program name excluded)
    # and hand the exit code of main() to the calling process.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline