Page MenuHomec4science

fix_old_data.py
No OneTemporary

File Metadata

Created
Sun, Nov 10, 02:28

fix_old_data.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix old data in svg word position files: it can check and fix
faksimile positions and create faksimile line positions.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
# Put the directory of this script on the module search path (once) before the
# project-local imports that follow.
if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))
from datatypes.box import Box
from datatypes.faksimile import FaksimilePage
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word, update_transkription_position_ids
from util import back_up
from process_files import update_svgposfile_status
from process_words_post_merging import update_faksimile_line_positions
sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# Module-level switch read by the functions below: while it is False they save
# the pages they changed and main prints its progress and summary output.
# NOTE(review): presumably set to True by the unit tests to suppress writing
# and console output -- confirm against the test suite.
UNITTESTING = False
def save_page(page):
    """Write the xml tree of page back to its own file.

    The tree is written to ``page.page_tree.docinfo.URL`` as a file of type
    FILE_TYPE_SVG_WORD_POSITION. The script name handed to write_pretty has the
    form "<this file>:<name of the function that called save_page>".

    :param page: the page whose ``page_tree`` is written.
    """
    # f_back is the frame of our caller, so this lookup has to stay directly in
    # the body of save_page (moving it into a helper would name the wrong function).
    script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}'
    tree = page.page_tree
    # NOTE(review): write_pretty presumably records script_name as
    # metadata/modifiedBy/@script, which page_already_changed queries -- confirm.
    write_pretty(
        xml_element_tree=tree,
        file_name=tree.docinfo.URL,
        script_name=script_name,
        file_type=FILE_TYPE_SVG_WORD_POSITION,
    )
def page_already_changed(page) -> bool:
    """Return whether page has already been changed by the calling function.

    The page counts as changed if its tree contains a ``metadata/modifiedBy``
    element whose ``script`` attribute equals
    "<this file>:<name of the function that called page_already_changed>".

    :param page: the page whose ``page_tree`` is searched.
    :return: True if such an element exists, False otherwise.
    """
    # f_back is the frame of our caller, so this lookup has to stay directly in
    # the body of this function.
    script_name = f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}'
    # Hand the script name over as an XPath variable instead of pasting it into
    # the expression: a file path containing a double quote would otherwise
    # produce an invalid XPath.
    matches = page.page_tree.xpath('//metadata/modifiedBy[@script=$script]', script=script_name)
    return len(matches) > 0
def fix_faksimile_line_position(page, redo=False) -> bool:
    """Create the faksimile line positions of page.

    The page is skipped if page_already_changed reports that this function has
    processed it before, unless redo is set. Otherwise
    update_faksimile_line_positions is applied and, outside of unit testing,
    the page is saved.

    :param page: the page to process.
    :param redo: process the page even if it has been processed before.
    :return: True if the page has been processed, False if it was skipped.
    """
    # page_already_changed and save_page both identify their caller by name, so
    # they must be called directly from this function.
    if redo or not page_already_changed(page):
        update_faksimile_line_positions(page)
        if not UNITTESTING:
            save_page(page)
        return True
    return False
def check_faksimile_positions(page, redo=False) -> bool:
    """Check the faksimile positions of page against its faksimile svg file.

    The svg file is taken from the first ``//data-source/@file`` entry of the
    page tree. For every faksimile page in that file with the same title and
    page number, each faksimile position of each word is compared with the svg
    word position that has the same id. If left or top differ, left, top and
    bottom (top + height of the svg position) are updated and the word is
    attached to the page tree again. Outside of unit testing, a changed page
    is saved.

    :param page: the page to check.
    :param redo: not used by this check; accepted so that main can call every
        function with the same arguments.
    :return: True if at least one faksimile position has been updated, False
        otherwise -- also if the page does not reference a data source file.
    """
    source_files = page.page_tree.xpath('//data-source/@file')
    if len(source_files) == 0:
        # No svg file to compare with: report "unchanged" explicitly so that
        # callers always get the bool promised by the signature.
        return False
    svg_tree = ET.parse(source_files[0])
    positions_are_equal_counter = 0
    page_changed = False
    for faksimile_page in FaksimilePage.GET_FAKSIMILEPAGES(svg_tree):
        if page.title == faksimile_page.title\
        and page.number == faksimile_page.page_number:
            # Index the svg word positions by id once instead of scanning the
            # whole list for every faksimile position. setdefault keeps the
            # first position per id, i.e. the one a linear search would find.
            svg_position_by_id = {}
            for svg_position in faksimile_page.word_positions:
                svg_position_by_id.setdefault(svg_position.id, svg_position)
            for word in page.words:
                for fp in word.faksimile_positions:
                    rfp = svg_position_by_id.get(fp.id)
                    if rfp is None:
                        # No svg counterpart: nothing to compare.
                        continue
                    if fp.left != rfp.left or fp.top != rfp.top:
                        fp.left = rfp.left
                        fp.top = rfp.top
                        fp.bottom = fp.top + rfp.height
                        word.attach_word_to_tree(page.page_tree)
                        page_changed = True
                    else:
                        positions_are_equal_counter += 1
            # The counter counts faksimile positions, the total counts words.
            print(f'{positions_are_equal_counter}/{len(page.words)} are equal')
    # save_page identifies its caller by name, so it is called from here directly.
    if page_changed and not UNITTESTING:
        save_page(page)
    return page_changed
def fix_faksimile_positions(page, redo=False) -> bool:
    """Set faksimile positions to absolute values.

    The xmin/ymin of the page's text field are added to left, top and bottom of
    every faksimile position, and each word is attached to the page tree again.
    Outside of unit testing the page tree is then written to its own file with
    this file's path as script name.

    The offsets are added on top of the current values, so processing a page
    twice shifts it twice. The page is therefore skipped if its tree already
    contains a ``metadata/modifiedBy`` element whose ``script`` attribute equals
    this file's path, unless redo is set.

    :param page: the page to fix.
    :param redo: fix the page even if it carries the modification mark.
    :return: True if the page has been fixed, False if it was skipped.
    """
    if not redo:
        # Hand the path over as an XPath variable instead of pasting it into the
        # expression: a path containing a double quote would otherwise produce
        # an invalid XPath.
        modification_marks = page.page_tree.xpath('//metadata/modifiedBy[@script=$script]', script=__file__)
        if len(modification_marks) > 0:
            return False
    x_min = page.text_field.xmin
    y_min = page.text_field.ymin
    for word in page.words:
        for fp in word.faksimile_positions:
            fp.left = fp.left + x_min
            fp.top = fp.top + y_min
            fp.bottom = fp.bottom + y_min
        word.attach_word_to_tree(page.page_tree)
    if not UNITTESTING:
        print(f'writing to {page.page_tree.docinfo.URL}')
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,
                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
    return True
def usage():
    """Print the command line help, which is the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to fix faksimile position ->set them to their absolute value.
    svgscripts/fix_old_data.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>
    <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
    <svg_pos_file> a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help show help
    -c|--check-faksimile-positions check whether faksimile positions have been updated
    -l|--faksimile-line-position create faksimile line positions
    -p|--faksimile-positions fix old faksimile positions
    -r|--redo rerun
    :return: exit code (int)
    """
    # The docstring above is what usage() prints as help text, so its wording
    # is user-facing output.
    redo = False
    selected_functions = []

    # Map the command line flags (and the key 'default') to the functions they select.
    # NOTE(review): 'default' is listed for both -c and -l; which function ends up
    # under 'default' depends on create_function_dictionary -- confirm.
    function_dict = create_function_dictionary(['default', '-c', '--check-faksimile-positions'], check_faksimile_positions)
    function_dict = create_function_dictionary(['default', '-l', '--faksimile-line-position'], fix_faksimile_line_position, function_dictionary=function_dict)
    function_dict = create_function_dictionary(['-p', '--faksimile-positions'], fix_faksimile_positions, function_dictionary=function_dict)

    long_options = ["help", "check-faksimile-positions", "faksimile-positions", "faksimile-line-position", "redo" ]
    try:
        opts, args = getopt.getopt(argv, "hcplr", long_options)
    except getopt.GetoptError:
        usage()
        return 2

    for opt, _ in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        if opt in ('-r', '--redo'):
            redo = True
        elif opt in function_dict:
            selected_functions.append(function_dict[opt])

    # Without an explicit function flag, fall back to the 'default' entry.
    if not selected_functions:
        selected_functions.append(function_dict['default'])

    if not args:
        usage()
        return 2

    xml_file = args[0]
    if not isfile(xml_file):
        raise FileNotFoundError('File {} does not exist!'.format(xml_file))

    # Number of pages each selected function reported as changed, keyed by function name.
    counters = dict.fromkeys((selected.__name__ for selected in selected_functions), 0)
    for page in Page.get_pages_from_xml_file(xml_file, status_contains=STATUS_MERGED_OK):
        for current_function in selected_functions:
            if not UNITTESTING:
                print(Fore.CYAN + f'Processing {page.title}, {page.number} with function {current_function.__name__} ...' + Style.RESET_ALL)
                back_up(page, page.xml_file)
            if current_function(page, redo=redo):
                counters[current_function.__name__] += 1

    if not UNITTESTING:
        for function_name, counter in counters.items():
            print(Style.RESET_ALL + f'[{counter} pages changed by {function_name}]')
    return 0
if __name__ == "__main__":
    # Run main with the command line arguments (without the script name) and
    # use its return value as the exit status of the process.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline