Page MenuHomec4science

compare_faksimile_words_line_wise.py
No OneTemporary

File Metadata

Created
Fri, May 10, 10:53

compare_faksimile_words_line_wise.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path
from progress.bar import Bar
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.word_position import WordPosition
from datatypes.faksimile_position import FaksimilePosition
from datatypes.word import Word
from datatypes.lineNumber import LineNumber
from datatypes.page import Page, STATUS_MERGED_OK
from datatypes.transkriptionField import TranskriptionField
from interactive_merger import LineComposer, InteractiveMergerShell, ManualMergerShell
from join_faksimileAndTranskription import get_filelist_and_manuscript_file, sort_faksimile_positions, sort_words
from process_files import update_svgposfile_status
from process_words_post_merging import post_merging_processing_and_saving
from util import ExternalViewer, create_highlighted_svg_file, get_empty_node_ids, record_changes,\
record_changes_on_svg_file_to_page, record_changes_on_xml_file_to_page, get_mismatching_ids,\
replace_chars
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# If set to True, the colored progress messages of this module are suppressed
# and plain diagnostic lists are printed instead.
UNITTESTING = False

# string.punctuation has to be escaped before it is put inside a character class:
# unescaped, its sequence '\]' is read as an escaped ']' and a backslash is never matched.
_ESCAPED_PUNCTUATION = re.escape(string.punctuation)
# any punctuation character
PUNCTUATION_PATTERN = r"[{}]".format(_ESCAPED_PUNCTUATION)
# word characters followed by a double quote at the end of the text
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
# a text consisting of exactly one punctuation character or an en dash
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(_ESCAPED_PUNCTUATION)
# a text consisting of exactly one word character or punctuation character
SINGLE_WORD_PATTERN = r"^[\w{}]$".format(_ESCAPED_PUNCTUATION)

HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
# lowest threshold that is tried when faksimile positions are regrouped to lines
MIN_THRESHOLD = 2
DO_DEBUG = False
class FaksimileLineComposer(LineComposer):
    """This class arranges the faksimile positions to lines and merges these lines with lines of words.

    Words and faksimile positions are expected to carry the attributes 'text', 'joined' and 'mergeable'
    (the latter two are set by mark_unmergeable_words_and_faksimile_positions).
    """
    DEBUG = False

    def __init__(self, faksimile_positions, threshold=10, num_lines_with_words=-1, page=None):
        """Sort the faksimile positions, arrange them to lines and create the interactive shell.

        If a page is given, the first faksimile position of each of its verified words
        is passed as reference_list to sort_faksimile_positions.
        """
        self.current_line_index = 0
        self.current_faksimile_index = 0
        reference_list = None
        if page is not None:
            reference_list = [ word.faksimile_positions[0] for word in page.words\
                    if len(word.faksimile_positions) > 0 and word.verified ]
            print([fp.text for fp in reference_list])
        faksimile_positions = sort_faksimile_positions(faksimile_positions, reference_list=reference_list)
        self.lines_of_faksimile_positions = self._init_faksimile_positions_per_line(faksimile_positions,\
                threshold=threshold, num_lines_with_words=num_lines_with_words)
        self.interactive_shell = InteractiveMergerShell(self, page=page)

    def _init_faksimile_positions_per_line(self, faksimile_positions, threshold=10, num_lines_with_words=-1) -> list:
        """Return a list containing for each line a list of faksimile positions.

        A new line is started whenever the vertical center ((top+bottom)/2) of a position exceeds
        the vertical center of its predecessor by more than threshold.
        If num_lines_with_words is set (> -1) and differs from the number of lines found, the grouping
        is repeated with a threshold reduced by one as long as threshold is bigger than MIN_THRESHOLD.
        """
        if len(faksimile_positions) == 0:
            return [[]]
        lines_of_faksimile_positions = [[]]
        last_wp = faksimile_positions[0]
        for wp in faksimile_positions:
            if (wp.top+wp.bottom)/2 - (last_wp.top+last_wp.bottom)/2 > threshold:
                lines_of_faksimile_positions.append([])
            lines_of_faksimile_positions[-1].append(wp)
            last_wp = wp
        if num_lines_with_words > -1\
        and num_lines_with_words != len(lines_of_faksimile_positions)\
        and threshold > MIN_THRESHOLD:
            return self._init_faksimile_positions_per_line(faksimile_positions,\
                    threshold=threshold-1, num_lines_with_words=num_lines_with_words)
        return lines_of_faksimile_positions

    def create_faksimile_dictionary(self, line_of_faksimile_positions, mergeables_only=False) ->dict:
        """Create a faksimile_dictionary with fp.text as key and a list of fp as value.

        A position with the text '-' is additionally listed under the key '–' (en dash).
        If mergeables_only is set, only positions flagged as mergeable are included.
        """
        faksimile_text_dictionary = {}
        for faksimile_position in line_of_faksimile_positions:
            # the original condition was inverted and selected the unmergeable positions
            if mergeables_only and not faksimile_position.mergeable:
                continue
            faksimile_text_dictionary.setdefault(faksimile_position.text, []).append(faksimile_position)
            if faksimile_position.text == '-':
                faksimile_text_dictionary.setdefault('–', []).append(faksimile_position)
        return faksimile_text_dictionary

    def fix_for_unmereged_items_if_two_left(self, new_words, unmerged_words, unmerged_faksimile_positions) ->int:
        """Merge the remaining word with the remaining faksimile position if there is exactly one of each.

        The lists passed as arguments are not modified.
        [:return:] number of unmerged items
        """
        if len(unmerged_words) == 1 and len(unmerged_faksimile_positions) == 1:
            self.merge_word_with_fp(unmerged_words[0], unmerged_faksimile_positions[0], new_words)
            return 0
        return len(unmerged_words) + len(unmerged_faksimile_positions)

    def fix_for_unmereged_items_split_words(self, new_words, unmerged_words, unmerged_faksimile_positions) ->int:
        """Split already merged words if an unmerged faksimile position corresponds to a part of their text.

        This is tried only if there are more unmerged faksimile positions than unmerged words.
        A merged word on the line of the faksimile position is split if its text without the text of its
        first faksimile position equals the text of the unmerged faksimile position.
        Faksimile positions that could be merged are removed from unmerged_faksimile_positions.
        [:return:] number of unmerged items
        """
        if len(unmerged_words) < len(unmerged_faksimile_positions):
            # iterate over a copy, because merged positions are removed from the original list
            for faksimile_position in list(unmerged_faksimile_positions):
                line_number = self.get_line_number(faksimile_position, new_words)
                words_on_line = [ word for word in new_words\
                        if word.line_number == line_number and len(word.faksimile_positions) > 0 ]
                for word in words_on_line:
                    if word.text.replace(word.faksimile_positions[0].text, '') == faksimile_position.text:
                        left_word, right_word, _ = word.split(faksimile_position.text)
                        new_words.remove(word)
                        self.merge_word_with_fp(left_word, word.faksimile_positions[0], new_words)
                        self.merge_word_with_fp(right_word, faksimile_position, new_words)
                        unmerged_faksimile_positions.remove(faksimile_position)
                        # a faksimile position can be merged only once
                        break
        return len(unmerged_words) + len(unmerged_faksimile_positions)

    def fix_for_unmereged_items_startswith(self, new_words, unmerged_words, unmerged_faksimile_positions, ignoreCase=False) ->int:
        """Merge words and faksimile positions if the text of one of them starts with the text of the other.

        Words are processed from the longest to the shortest text; of several matching positions,
        the one with the longest text is taken. Merged items are removed from both lists.
        [:return:] number of unmerged items
        """
        for word in sorted(unmerged_words, key=lambda word: len(word.text), reverse=True):
            # first try: the word starts with the text of the faksimile position
            matches = [ fp for fp in unmerged_faksimile_positions\
                    if text_starts_with(word.text, fp.text, ignoreCase=ignoreCase) and not fp.joined ]
            if len(matches) == 0:
                # second try: the faksimile position starts with the text of the word
                matches = [ fp for fp in unmerged_faksimile_positions\
                        if text_starts_with(fp.text, word.text, ignoreCase=ignoreCase) and not fp.joined ]
            if len(matches) > 0:
                faksimile_position = max(matches, key=lambda fp: len(fp.text))
                self.merge_word_with_fp(word, faksimile_position, new_words)
                unmerged_words.remove(word)
                unmerged_faksimile_positions.remove(faksimile_position)
        return len(unmerged_words) + len(unmerged_faksimile_positions)

    def final_fix_for_unmereged_items(self, new_words, unmerged_words, unmerged_faksimile_positions) ->int:
        """Do a final attempt at fixing unmerged words and faksimile_positions.

        The following fixes are applied one after the other: merging of a single remaining pair,
        splitting of merged words, merging by common start of text, merging of words with faksimile
        positions of the same line, merging by common start of text ignoring case.
        [:return:] number of unmerged items
        """
        self.fix_for_unmereged_items_if_two_left(new_words, unmerged_words, unmerged_faksimile_positions)
        self.fix_for_unmereged_items_split_words(new_words, unmerged_words, unmerged_faksimile_positions)
        self.fix_for_unmereged_items_startswith(new_words, unmerged_words, unmerged_faksimile_positions)
        latest_unmerged_words = [ word for word in unmerged_words if not word.joined ]
        latest_unmerged_fps = [ fp for fp in unmerged_faksimile_positions if not fp.joined ]
        if len(latest_unmerged_fps) > 0:
            # group the remaining faksimile positions by the line number of their line
            fp_ln_dict = {}
            for fp in latest_unmerged_fps:
                line_number = self.get_line_number(fp, new_words)
                if line_number > -1:
                    fp_ln_dict.setdefault(line_number, []).append(fp)
            for word in latest_unmerged_words:
                matches = fp_ln_dict.get(word.line_number)
                if matches:
                    self.merge_word_with_fp(word, matches.pop(0), new_words)
            latest_unmerged_words = [ word for word in unmerged_words if not word.joined ]
            latest_unmerged_fps = [ fp for fp in unmerged_faksimile_positions if not fp.joined ]
            if len(latest_unmerged_words) + len(latest_unmerged_fps) > 0:
                if self.fix_for_unmereged_items_startswith(new_words, latest_unmerged_words,\
                        latest_unmerged_fps, ignoreCase=True) == 0:
                    return 0
        return self.fix_for_unmereged_items_if_two_left(new_words, latest_unmerged_words, latest_unmerged_fps)

    def get_lines_of_faksimile_positions(self) ->list:
        """Return lines_of_faksimile_positions.
        """
        return self.lines_of_faksimile_positions

    def get_next_faksimile(self) -> WordPosition:
        """Return next faksimile position.

        The positions are returned line by line; after the last position of the last line,
        iteration starts again with the first line.
        [:return:] next faksimile position or None if there is no faksimile position at all
        """
        lines = self.lines_of_faksimile_positions
        # without this guard the wrap-around below would never end if all lines are empty
        if not any(lines):
            return None
        while True:
            if self.current_line_index >= len(lines):
                self.current_line_index = 0
                self.current_faksimile_index = 0
                continue
            current_line = lines[self.current_line_index]
            if self.current_faksimile_index < len(current_line):
                self.current_faksimile_index += 1
                return current_line[self.current_faksimile_index-1]
            self.current_line_index += 1
            self.current_faksimile_index = 0

    def get_line_number(self, faksimile_position, new_words) -> int:
        """Return line_number of line containing faksimile_position.

        The line number is taken from the first word in new_words that has been merged with
        a joined faksimile position of the same line.
        [:return:] line number or -1 if it cannot be determined
        """
        line_number = -1
        for line in self.lines_of_faksimile_positions:
            if faksimile_position in line:
                joined_fps = [ fp for fp in line if fp.joined ]
                if len(joined_fps) > 0:
                    joined_words = [ word for word in new_words\
                            if len(word.faksimile_positions) > 0\
                            and any(fp in word.faksimile_positions for fp in joined_fps) ]
                    if len(joined_words) > 0:
                        line_number = joined_words[0].line_number
        return line_number

    def get_line(self, line_of_words, index=-1, offset=2, interactive=False) -> list:
        """Return the line that corresponds to the line_of_words.

        The concatenated text of the mergeable words is compared with the concatenated text of the
        mergeable faksimile positions that are not joined yet. The lines from index-offset to index+offset
        are taken into account (all lines if index is -1). A line matches if its text contains the text of
        the words. If the text of the words contains the text of the line, the neighbouring lines are
        added to the line as long as the text of the words is longer.

        [:return:] list of faksimile positions of the first matching line (empty list if there is none).
            If interactive: a list of tuples (line index, faksimile positions) of all matching lines,
            or of all lines taken into account if no line matches.
        """
        number_of_lines = len(self.lines_of_faksimile_positions)
        if index > -1:
            start_index = max(index-offset, 0)
            end_index = min(index+offset+1, number_of_lines)
        else:
            start_index, end_index = 0, number_of_lines
        matched_line = []
        word_text = ''.join([ word.text for word in line_of_words if word.mergeable ])
        interactive_list = []
        for i in range(start_index, end_index):
            current_line = [ fp for fp in self.lines_of_faksimile_positions[i] if not fp.joined ]
            current_text = ''.join([ fp.text for fp in current_line if fp.mergeable ])
            if word_text in current_text:
                if interactive:
                    interactive_list.append((i, current_line))
                else:
                    matched_line = current_line
                    break
            elif len(current_text) > 0 and len(word_text) > len(current_text) and current_text in word_text:
                # the words extend over several lines of faksimile positions:
                # go forward if the line matches the start of the text, otherwise go backward
                step = 1 if word_text.find(current_text) == 0 else -1
                next_i = i + step
                while len(word_text) > len(current_text) and -1 < next_i < number_of_lines:
                    current_line += [ fp for fp in self.lines_of_faksimile_positions[next_i] if not fp.joined ]
                    current_text = ''.join([ fp.text for fp in current_line if fp.mergeable ])
                    next_i += step
                if interactive:
                    interactive_list.append((i, current_line))
                else:
                    matched_line = current_line
                    break
        if interactive:
            if len(interactive_list) > 0:
                return interactive_list
            # no match: offer all lines that have been taken into account
            for i in range(start_index, end_index):
                current_line = [ fp for fp in self.lines_of_faksimile_positions[i] if not fp.joined ]
                matched_line.append((i, current_line))
        return matched_line

    def get_new_index(self, word, line_of_words, new_list_of_words, old_word_new_word_mapping):
        """Return index of word in new_list_of_words such that it can be inserted before this index.

        The index is the position after the predecessor of word on line_of_words (or after the word
        the predecessor is mapped to in old_word_new_word_mapping). If the predecessor is not part
        of new_list_of_words, its index is determined recursively. The first word of a line gets index 0.
        """
        old_index = line_of_words.index(word)
        new_index = 0
        if old_index > 0:
            previous_word = line_of_words[old_index-1]
            new_previous_word = old_word_new_word_mapping[previous_word]\
                    if old_word_new_word_mapping.get(previous_word) is not None\
                    else previous_word
            if new_previous_word in new_list_of_words:
                new_index = new_list_of_words.index(new_previous_word)+1
            else:
                new_index = self.get_new_index(new_previous_word, line_of_words,\
                        new_list_of_words, old_word_new_word_mapping)+1
        return new_index

    def join_unmergeable_words(self, words, old_word_new_word_mapping) -> Word:
        """Join all words to the first word and return it.

        Each word is registered in old_word_new_word_mapping with the first word as its value.
        """
        new_word = words[0]
        for word2join in words[1:]:
            new_word.join(word2join)
            old_word_new_word_mapping.update({word2join: new_word})
        old_word_new_word_mapping.update({words[0]: new_word})
        return new_word

    def join_unmergeable_words_with_punctuation(self, line_of_words, old_word_new_word_mapping):
        """Join unmergeable words on line with punctuation words.

        An unmergeable word that is followed by an unmergeable '.' or ',' is joined with it. The punctuation
        word is removed from line_of_words and mapped to the joined word in old_word_new_word_mapping.
        """
        index = 0
        while index < len(line_of_words):
            if not line_of_words[index].mergeable\
            and index+1 < len(line_of_words)\
            and not line_of_words[index+1].mergeable\
            and re.match('^[.,]$', line_of_words[index+1].text):
                line_of_words[index].join(line_of_words[index+1])
                old_word_new_word_mapping.update({line_of_words[index+1]: line_of_words[index]})
                line_of_words.remove(line_of_words[index+1])
                # NOTE(review): together with the increment below, the word that follows the removed
                # punctuation is skipped as a join candidate -- confirm that this is intended
                index += 1
            index += 1

    def merge_lines(self, line_of_words, new_words, index=-1, offset=2, interactive=False) -> bool:
        """Merge a line of words with the corresponding line of faksimile positions.

        If no corresponding line is found, the words are merged interactively (if interactive is set),
        a single '–' is assigned to the previous line number, or the search is repeated with
        an increased offset (up to an offset of 10).
        [:return:] interactive
        """
        if len([ word for word in line_of_words if not word.joined ]) == 0:
            # nothing to merge; return the flag only, a tuple would always be truthy for the caller
            return interactive
        line_of_faksimile_positions = self.get_line(line_of_words, index, offset=offset)
        if len(line_of_faksimile_positions) > 0:
            faksimile_text_dictionary = self.create_faksimile_dictionary(line_of_faksimile_positions)
            self.merge_mergeables(line_of_words, faksimile_text_dictionary, new_words)
            self.merge_unmergeables(line_of_words, line_of_faksimile_positions, new_words)
        elif interactive:
            interactive = self.interactive_shell.interactive_merge_lines(line_of_words, new_words, index, offset+4)
        elif len(line_of_words) == 1 and line_of_words[0].text == '–':
            line_of_words[0].line_number -= 1
        elif offset < 10:
            interactive = self.merge_lines(line_of_words, new_words, index, offset=offset+1)
        return interactive

    def merge_mergeables(self, line_of_words, faksimile_text_dictionary, new_words):
        """Merge words with faksimile positions for which there are keys in faksimile_text_dictionary.
        """
        for word in line_of_words:
            fp_list = faksimile_text_dictionary.get(word.text)
            if not fp_list:
                continue
            # a position with text '-' is listed under the keys '-' and '–':
            # skip positions that have already been merged by means of the other key
            while len(fp_list) > 0 and fp_list[0].joined:
                fp_list.pop(0)
            if len(fp_list) > 0:
                self.merge_word_with_fp(word, fp_list.pop(0), new_words)

    def merge_unmergeables(self, line_of_words, line_of_faksimile_positions, new_words):
        """Merge unmergeable words and faksimile_positions.

        If there are as many unmerged unmergeable words as faksimile positions, they are merged in order.
        Otherwise words that are direct neighbours on line_of_words are joined to one word
        and each joined word is merged with the next faksimile position.
        [:return:] new_words
        """
        old_word_new_word_mapping = {}
        self.join_unmergeable_words_with_punctuation(line_of_words, old_word_new_word_mapping)
        unmerged_words = [ word for word in line_of_words if not word.joined and not word.mergeable ]
        unmerged_fps = [ fp for fp in line_of_faksimile_positions if not fp.joined and not fp.mergeable ]
        if len(unmerged_words) > 0:
            if len(unmerged_words) == len(unmerged_fps):
                for i, word in enumerate(unmerged_words):
                    new_index = self.get_new_index(word, line_of_words, new_words, old_word_new_word_mapping)
                    self.merge_word_with_fp(word, unmerged_fps[i], new_words, new_index)
            else:
                fp_index = 0
                unmerged_unity = []
                for word in unmerged_words:
                    if len(unmerged_unity) > 0 and fp_index < len(unmerged_fps):
                        previous_index = line_of_words.index(unmerged_unity[-1])
                        # a gap between the words closes the current unity
                        if line_of_words.index(word) - previous_index > 1:
                            new_word = self.join_unmergeable_words(unmerged_unity, old_word_new_word_mapping)
                            new_index = self.get_new_index(unmerged_unity[0], line_of_words, new_words, old_word_new_word_mapping)
                            self.merge_word_with_fp(new_word, unmerged_fps[fp_index], new_words, new_index)
                            fp_index += 1
                            unmerged_unity = []
                    unmerged_unity.append(word)
                if len(unmerged_unity) > 0 and fp_index < len(unmerged_fps):
                    new_word = self.join_unmergeable_words(unmerged_unity, old_word_new_word_mapping)
                    new_index = self.get_new_index(unmerged_unity[0], line_of_words, new_words, old_word_new_word_mapping)
                    self.merge_word_with_fp(new_word, unmerged_fps[fp_index], new_words, new_index)
        # words that have been joined to another word share its state
        # NOTE(review): the nesting of this loop could not be recovered from the flattened source;
        # it is run unconditionally here so that joined punctuation words are always updated -- confirm
        for old_word, new_word in old_word_new_word_mapping.items():
            old_word.joined = new_word.joined
        return new_words

    def merge_word_with_fp(self, word, faksimile_position, list_of_new_words, index=-1):
        """Merge word with faksimile position.

        Both are flagged as joined and the word is inserted to list_of_new_words at index
        (appended if index is -1).
        """
        word.joined, faksimile_position.joined = True, True
        word.faksimile_positions.append(faksimile_position)
        if index == -1:
            list_of_new_words.append(word)
        else:
            list_of_new_words.insert(index, word)
def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False):
    """Return svg_pos_file and manuscript_file if they are ready for processing.

    The manuscript file is used if its name starts with the title of the faksimile page (blanks
    replaced by underscores). Otherwise a file named after the title is looked up in the
    directory './xml' if this directory exists, else in the current directory.

    [:return:] tuple (svg_pos_file, manuscript_file); svg_pos_file is the output attribute of
        the first page with the page number of faksimile_page if there is such a page with
        status "OK" (or with a status containing "OK" if redo_ok is set), otherwise None.
    """
    title_string = faksimile_page.title.replace(' ', '_')
    manuscript_tree = None
    if manuscript_file is not None and basename(manuscript_file).startswith(title_string):
        manuscript_tree = ET.parse(manuscript_file)
    else:
        xml_dir = '.{}xml'.format(sep)
        if isdir(xml_dir):
            manuscript_file = xml_dir + sep + title_string + '.xml'
        else:
            manuscript_file = title_string + '.xml'
        if isfile(manuscript_file):
            manuscript_tree = ET.parse(manuscript_file)
    svg_pos_file = None
    if manuscript_tree is None:
        return svg_pos_file, manuscript_file
    root = manuscript_tree.getroot()
    page_number = faksimile_page.page_number
    all_outputs = root.xpath('//page[@number="%s"]/@output' % page_number)
    ready_outputs = root.xpath('//page[@number="%s" and @status="OK"]/@output' % page_number)
    merged_outputs = root.xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % page_number)
    if redo_ok and len(merged_outputs) > 0:
        svg_pos_file = all_outputs[0]
    if len(ready_outputs) > 0:
        svg_pos_file = all_outputs[0]
    elif not UNITTESTING:
        # tell the user why this page is not processed
        if len(merged_outputs) > 0:
            msg = Fore.LIGHTBLUE_EX +'->' + Fore.CYAN + 'Data from page {0} already merged with {1}!'.format(\
                    page_number, merged_outputs[0])
        else:
            msg = Fore.MAGENTA + 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, page_number)
        print(msg, end='')
        print(Style.RESET_ALL)
    return svg_pos_file, manuscript_file
def merge_faksimile_file_and_pages(faksimile_file, manuscript_file=None, page=None) -> int:
"""Merge the data of a faksimile file with the data of svgposfile.

For each faksimile page of faksimile_file for which get_svgPosFile_and_manuscriptFile returns
a svg_pos_file, the word positions of the faksimile page are merged with the words of the page.
If items remain unmerged, the page is loaded again and merging is repeated in interactive mode.
If a page is passed, only faksimile pages whose svg_pos_file equals the URL of the page's tree are processed.

NOTE(review): the indentation of this function was lost; the nesting of its statements
has to be confirmed against the original file.

[:return:] exit status: the result of the last call of merge_faksimile_positions_and_words
(0 if no page has been processed)
"""
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
print(Style.RESET_ALL)
faksimile_tree = ET.parse(faksimile_file)
# the default namespace has no prefix (key None), it gets the prefix 'ns'
namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
if page is not None:
# keep only those faksimile pages that correspond to the file of the page
faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\
if get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)[0]\
== page.page_tree.docinfo.URL ]
exit_status = 0
for faksimile_page in faksimile_pages:
svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
if svg_pos_file is not None:
image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field)
if page is None:
page = Page(svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file)
write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\
file_type=FILE_TYPE_SVG_WORD_POSITION)
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + '->', end='')
print(Fore.CYAN + 'Merging faksimile positions from page {0} with words from file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='')
# first attempt: automatic merging
exit_status = num_unmerged = merge_faksimile_positions_and_words(page, faksimile_page.word_positions)
if num_unmerged > 0:
# second attempt in interactive mode: start with a page loaded from file and with unjoined faksimile positions
page = Page(page.page_tree.docinfo.URL)
for carrier in faksimile_page.word_positions:
carrier.joined = False
exit_status = num_unmerged = merge_faksimile_positions_and_words(page, faksimile_page.word_positions, interactive=True)
if not UNITTESTING:
if num_unmerged == 0:
print(Fore.GREEN + '[OK]')
new_words = sort_words(page)
# words without faksimile position or with a differing text are marked as not verified
for word in new_words:
if len(word.faksimile_positions) == 0 or word.text != word.faksimile_positions[0].text:
word.verified = False
if page.is_locked():
page.unlock()
post_merging_processing_and_saving(svg_pos_file=page.page_tree.docinfo.URL, new_words=new_words, page=page, manuscript_file=manuscript_file)
else:
print(Fore.RED + f'[ERROR: {num_unmerged} not joined!]\n')
print([ (word.id, word.text,word.line_number) for word in page.words if not word.joined])
print([ (fp.id, fp.text) for fp in faksimile_page.word_positions if not fp.joined])
print(Fore.RESET)
else:
# unittesting: print plain lists of the items that are unmerged or differ in text
if num_unmerged > 0:
unmerged_words = [ word for word in page.words if not word.joined]
unmerged_fps = [ fp for fp in faksimile_page.word_positions if not fp.joined ]
print([ (word.id, word.text,word.line_number) for word in unmerged_words])
print([ (fp.id, fp.text) for fp in unmerged_fps])
if len(unmerged_fps) == 0:
for word in page.words:
if len(word.faksimile_positions) < 1:
print(f'{word.line_number}: {word.id} {word.text}')
elif word.text != word.faksimile_positions[0].text:
print(f'{word.line_number}: {word.id} {word.text} {[(fp.id,fp.text) for fp in word.faksimile_positions]}')
else:
words = sort_words(page)
for word in words:
if len(word.faksimile_positions) < 1:
print(f'{word.line_number}: {word.id} {word.text}')
elif not word.verified and word.text != word.faksimile_positions[0].text:
print(f'{word.line_number}: {word.id} {word.text} {[(fp.id,fp.text) for fp in word.faksimile_positions]}')
# forget the page such that a new page is created for the next faksimile page
page = None
return exit_status
def merge_faksimile_positions_and_words(page, faksimile_positions, interactive=False) -> int:
"""Merge words with faksimile positions.

The sorted words of page are merged line by line with the faksimile positions by means of
a FaksimileLineComposer. If items remain unmerged after the composer's final fix,
a ManualMergerShell is run. The words of page are replaced by the merged words
if the final fix leaves nothing unmerged.

NOTE(review): the indentation of this function was lost; in particular it has to be confirmed
whether manual_merger.print_history() belongs to the except block.

[:return:] exit code: 0 if everything has been merged, otherwise the result of the manual merger
(666 if the manual merger raised an exception)
"""
words = sort_words(page)
# sets the attributes 'joined' and 'mergeable' that are used by the composer
mark_unmergeable_words_and_faksimile_positions(words, faksimile_positions)
lines_with_words = set([ word.line_number for word in words])
faksimile_lines_composer = FaksimileLineComposer(faksimile_positions, page=page)
new_words = []
if interactive:
faksimile_lines_composer.interactive_shell.set_command_history()
# index is the position of the line among the lines with words, it is not the line number
for index, line_number in enumerate(sorted(lines_with_words)):
words_on_line = [ word for word in words if word.line_number == line_number]
interactive = faksimile_lines_composer.merge_lines(words_on_line, new_words, index, interactive=interactive)
unmerged_words = [ word for word in page.words if not word.joined ]
unmerged_fps = [ fp for fp in faksimile_positions if not fp.joined ]
exit_code = faksimile_lines_composer.final_fix_for_unmereged_items(new_words, unmerged_words, unmerged_fps)
if exit_code == 0:
page.words = new_words
else:
faksimile_lines_composer.interactive_shell.print_command_history()
manual_merger = ManualMergerShell(unmerged_words, unmerged_fps, new_words, page=page)
try:
exit_code = manual_merger.run()
except Exception as e:
# any failure of the manual merger is reported by exit code 666
exit_code = 666
print(e)
manual_merger.print_history()
return exit_code
def mark_unmergeable_words_and_faksimile_positions(words, faksimile_positions):
    """Mark all words and faksimile_positions for which the number of text instances does not accord.

    Each item gets the attribute 'mergeable': it is True if there are as many words as
    faksimile positions with the text of the item, otherwise False.
    Items that do not have an instance attribute 'joined' yet get 'joined' set to False.
    """
    word_counts, fp_counts = {}, {}
    # count the instances of each text in one pass instead of rescanning both lists for every text
    for text_carriers, counts in ((words, word_counts), (faksimile_positions, fp_counts)):
        for text_carrier in text_carriers:
            if 'joined' not in text_carrier.__dict__:
                text_carrier.joined = False
            text_carrier.mergeable = True
            counts[text_carrier.text] = counts.get(text_carrier.text, 0) + 1
    for text_carriers in (words, faksimile_positions):
        for text_carrier in text_carriers:
            if word_counts.get(text_carrier.text, 0) != fp_counts.get(text_carrier.text, 0):
                text_carrier.mergeable = False
def text_starts_with(text1, text2, ignoreCase=False) ->bool:
    """Return whether text1 starts with text2.

    If ignoreCase is set, the texts are compared caseless. str.casefold is used instead of
    str.lower, such that e.g. 'ß' and 'SS' are treated as equal.
    """
    if ignoreCase:
        return text1.casefold().startswith(text2.casefold())
    return text1.startswith(text2)
def usage():
    """Print the documentation of main, which explains how to call the script.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION line wise.

    svgscripts/compare_faksimile_words_line_wise.py [OPTIONS] <FAKSIMILE_DIR|faksimile_svg_file> [xmlManuscriptFile]

        <FAKSIMILE_DIR>         a directory containing <faksimile_svg_file>
        <faksimile_svg_file>    a svg file containing information about the word positions on the faksimile.
        <xmlManuscriptFile>     a xml file about a manuscript, containing information about its pages.

        OPTIONS:
        -h|--help:                  show help
        -c|--correct-words=DIR:     use DIR as correction directory

        :return: exit code (int)
    """
    correct_words_dir = None
    try:
        opts, args = getopt.getopt(argv, "hc:", ["help", "correct-words=", ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-c', '--correct-words'):
            correct_words_dir = arg
    if len(args) < 1:
        usage()
        return 2
    file_a = args[0]
    if not exists(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # the manuscript file is optional and ignored if it does not exist
    file_b = args[1] if len(args) > 1 and exists(args[1]) else None
    file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b, correction_dir=correct_words_dir)
    exit_status = 0
    for faksimile_file in file_list:
        file_status = merge_faksimile_file_and_pages(faksimile_file, manuscript_file=manuscript_file)
        # report the first failure; the status is limited to 255 because
        # bigger exit codes are truncated by the operating system and could end up as 0
        if exit_status == 0 and file_status:
            exit_status = min(file_status, 255)
    return exit_status
if __name__ == "__main__":
    # run the script and pass the exit code of main on to the caller
    raise SystemExit(main(sys.argv[1:]))

Event Timeline