Page MenuHomec4science

join_faksimileAndTranskription.py
No OneTemporary

File Metadata

Created
Sat, Apr 27, 13:58

join_faksimileAndTranskription.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to join the data of a faksimile svg files with the data of xml files of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
import re
import shutil
import string
import sys
import tempfile
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path
from progress.bar import Bar
import warnings
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convert_wordPositions import create_pdf_with_highlighted_words, create_svg_with_highlighted_words
from create_task import CorrectWords
from datatypes.faksimile import FaksimilePage, get_paths_inside_rect
from datatypes.lineNumber import LineNumber
from datatypes.page import Page, STATUS_MERGED_OK
from datatypes.transkriptionField import TranskriptionField
from process_files import update_svgposfile_status
from process_words_post_merging import post_merging_processing_and_saving
from util import ExternalViewer, create_highlighted_svg_file, get_empty_node_ids, record_changes,\
record_changes_on_svg_file_to_page, record_changes_on_xml_file_to_page, get_mismatching_ids,\
replace_chars
sys.path.append('shared_util')
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
# Regex patterns built from string.punctuation.  re.escape guards against
# punctuation characters that are metacharacters inside a character class
# (']', '\\', '^', '-'); without it the class only works by coincidence.
PUNCTUATION_PATTERN = r"[{}]".format(re.escape(string.punctuation))
# matches a word followed by a closing double quote at end of string
PUNCTUATION_EOW_PATTERN = r"\w+[{}]$".format('\"')
# a single punctuation char (incl. the en-dash, which is not in string.punctuation)
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(re.escape(string.punctuation))
# a single word char or punctuation char
SINGLE_WORD_PATTERN = r"^[\w{}]$".format(re.escape(string.punctuation))
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
def create_task_correct_words(target_dir, xml_source_file=None, source_svg_file=None, page=None, unmatched_word_ids=None, unmatched_node_ids=None):
    """Create a task CorrectWords or process corrected files.

    If the task does not exist yet, the page is locked (recording how to
    resume) and a task folder is created in target_dir.  If the task has
    been finished, the corrections are read back — preferring the corrected
    xml file over the corrected transkription svg — and the merge is re-run.

    :param target_dir: directory in which the CorrectWords task lives
    :param xml_source_file: svg-word-position xml file (derived from page if None)
    :param source_svg_file: faksimile svg file (derived from page if None)
    :param page: Page instance (instantiated from xml_source_file if None)
    :param unmatched_word_ids: ids of transkription words that could not be merged
    :param unmatched_node_ids: ids of faksimile nodes that could not be merged
    :return: exit status (int)
    """
    exit_status = 0
    # derive missing file arguments from the page, or fail early with a clear message
    if xml_source_file is None or source_svg_file is None:
        if xml_source_file is None and page is not None and isfile(page.page_tree.docinfo.URL):
            xml_source_file = page.page_tree.docinfo.URL if xml_source_file is None else xml_source_file
        elif xml_source_file is None:
            raise Exception('create_task_correct_words needs a xml_source_file or a page that has a valid tree source!')
        if source_svg_file is None and page is not None and isfile(page.faksimile_svgFile):
            source_svg_file = page.faksimile_svgFile if source_svg_file is None else source_svg_file
        elif source_svg_file is None:
            raise Exception('create_task_correct_words needs a source_svg_file or a page that has a faksimile_svgFile!')
    if page is None:
        page = Page(xml_source_file)
    correct_words = CorrectWords(xml_source_file, source_svg_file, target_dir, page=page,\
            unmatched_node_ids=unmatched_node_ids)
    if not correct_words.has_been_created(page):
        # first run: lock the page with a message describing how to resume, then create the task
        if not page.is_locked():
            reference_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.(xml|svg)')
            lock_dict = { 'reference_file': reference_file,\
                    'message': 'Run:$ python3 {0} -c {1} {2}'.format(__file__, target_dir, source_svg_file)}
            write_pretty(xml_element_tree=page.page_tree, file_name=xml_source_file, script_name=__file__,\
                    file_type=FILE_TYPE_SVG_WORD_POSITION, **lock_dict)
        correct_words.create()
        if not UNITTESTING:
            print('Created a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description))
    elif correct_words.has_been_finished(page):
        # corrections are available: load them back into the page
        msg = 'Task "correct words" for page {} has been finished!'.format(str(page.number))
        xml_file = correct_words.get_target_filepath(page, is_faksimile_svg=False, suffix='.xml', is_finished=True)
        transkription_svg = correct_words.get_target_filepath(page, is_faksimile_svg=False, is_finished=True)
        faksimile_svg = correct_words.get_target_filepath(page, is_finished=True)
        # fall back to the original faksimile svg when no corrected one exists
        faksimile_file = faksimile_svg if isfile(faksimile_svg) else source_svg_file
        if isfile(xml_file):
            msg += '\n Words loaded from file {}.'.format(xml_file)
            page = record_changes_on_xml_file_to_page(xml_source_file, xml_file)
            page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=xml_file)
        elif isfile(transkription_svg):
            msg += '\n Words loaded from file {}.'.format(transkription_svg)
            page = record_changes_on_svg_file_to_page(xml_source_file, transkription_svg, word_ids=unmatched_word_ids)
            page.update_data_source(faksimile_svgFile=faksimile_file, xml_correction_file=transkription_svg)
        msg += '\n Faksimile loaded from file {}.'.format(faksimile_file)
        if not UNITTESTING:
            print(msg)
        # re-run the merge with the corrected data
        exit_status = join_faksimileAndTranskription(faksimile_file, page=page)
    elif not UNITTESTING:
        print('There is a folder {0} for page {1} with the task:\n {2}'.format(correct_words.target_dir, str(page.number), correct_words.description))
    return exit_status
def debug_function(words, input=''):
    """Custom debug function: print the texts of words whose debug_container is marked.

    :param words: words to inspect
    :param input: optional label describing the calling context
    """
    marked_texts = [ word.text for word in words if word.debug_container.get('marked') ]
    if marked_texts:
        print(Fore.RED + 'marked word(s): {}'.format(marked_texts))
        if input != '':
            print('input: {}'.format(input))
        print(Fore.RESET)
def fix_errors(faksimile_file, unmerged_faksimile_positions, unmerged_words, text_field_id=None, faksimile_page=None, xml_source_file=None, manuscript_file=None, namespaces={}):
    """Create a faksimile svg file and a transkription svg file highlighting
    the positions of the words that could not be merged.  After manual
    correction in an external viewer, the results are inserted into the
    original files and the merge is processed again.

    NOTE(review): the mutable default ``namespaces={}`` is only rebound,
    never mutated in place, so it is harmless here.

    :return: exit status (int); 2 when the highlighted files could not be created
    """
    parser = ET.XMLParser(remove_blank_text=True)
    faksimile_tree = ET.parse(faksimile_file, parser)
    if len(namespaces) == 0:
        # map the default (None) namespace prefix to 'ns' so it is usable in xpath
        namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    if faksimile_page is None:
        # select the faksimile page by text_field_id, or default to the first page
        faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
        if text_field_id is not None\
        and text_field_id in [ faksimile_page.text_field.id for faksimile_page in faksimile_pages ]:
            faksimile_page = [ faksimile_page for faksimile_page in faksimile_pages if faksimile_page.text_field.id == text_field_id ][0]
        else:
            faksimile_page = faksimile_pages[0]
    if xml_source_file is None or manuscript_file is None:
        xml_source_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file)
    # write highlighted copies of both files to a temporary directory
    tmp_dir = tempfile.mkdtemp()
    tmp_pdf_file = tmp_dir + sep + 'output.pdf'
    tmp_svg_file = tmp_dir + sep + 'output.svg'
    tmp_faksimile = tmp_dir + sep + 'faksimile.svg'
    # only collect empty faksimile nodes when there are fewer positions than words
    empyt_node_ids = get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)\
            if len(unmerged_faksimile_positions) < len(unmerged_words) else []
    highlight_node_ids = [ faksimile_position.id for faksimile_position in unmerged_faksimile_positions ]
    highlight_node_ids += empyt_node_ids
    create_highlighted_svg_file(faksimile_tree, highlight_node_ids, target_file=tmp_faksimile,
            local_image_path=faksimile_page.faksimile_image.local_path, namespaces=namespaces, highlight_color=HIGHLIGHT_COLOR)
    #create_pdf_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, pdf_file_name=tmp_pdf_file, bg_color=HIGHLIGHT_COLOR)
    create_svg_with_highlighted_words(xml_source_file, highlighted_words=unmerged_words, svg_file_name=tmp_svg_file, bg_color=HIGHLIGHT_COLOR)
    exit_status = 2
    if isfile(tmp_svg_file) and isfile(tmp_faksimile):
        # let the user edit the highlighted files, then record the changes back
        # into the original xml/svg sources and redo the merge
        ExternalViewer.show_files(list_of_files=[tmp_svg_file, tmp_faksimile])
        record_changes_on_svg_file_to_page(xml_source_file, tmp_svg_file, word_ids=[ word.id for word in unmerged_words ])
        record_changes(faksimile_file, tmp_faksimile, highlight_node_ids, namespaces=namespaces)
        shutil.rmtree(tmp_dir)
        exit_status = join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, do_fix_errors=False, join_single_char_words=True)
    return exit_status
def get_filelist_and_manuscript_file(file_a, file_b=None, correction_dir=None):
    """Return a file list of faksimile svg files and a manuscript file (or None).

    Accepted combinations:
      * file_a is a svg file           -> [file_a], file_b as manuscript (if a file)
      * file_a is a xml manuscript     -> file_b as svg file, or all svg files in
        directory file_b, or — with correction_dir — the @output files of pages
        whose corrections are in the task's finish directory
      * file_a is a directory          -> all svg files in it, file_b as manuscript

    :param file_a: svg file, xml manuscript file, or directory of svg files
    :param file_b: optional companion file/directory (see above)
    :param correction_dir: optional CorrectWords task directory
    :return: (file_list, manuscript_file or None)
    """
    file_list = []
    manuscript_file = None
    if isfile(file_a) and file_a.endswith('svg'):
        file_list.append(file_a)
        if file_b is not None and isfile(file_b):
            manuscript_file = file_b
    elif isfile(file_a) and file_a.endswith('xml'):
        manuscript_file = file_a
        if file_b is not None and isfile(file_b):
            file_list.append(file_b)
        elif file_b is not None and isdir(file_b):
            # BUGFIX: prefix entries with their directory (as in the isdir(file_a)
            # branch below); bare filenames could not be opened later.
            file_list = [ file_b + sep + svgfile for svgfile in listdir(file_b) if svgfile.endswith('svg') ]
        elif correction_dir is not None and isdir(correction_dir)\
        and Path(correction_dir, CorrectWords.finish_dir).is_dir():
            # collect the @output files of pages whose corrections are finished
            finish_dir = Path(correction_dir, CorrectWords.finish_dir)
            xml_files = list(finish_dir.glob('*.xml'))
            svg_files = list(finish_dir.glob('*.svg'))
            if len(xml_files + svg_files) > 1:
                manuscript_tree = ET.parse(manuscript_file)
                for xml_file in xml_files:
                    output = manuscript_tree.xpath(f'.//page[contains(@output, "{xml_file.name}")]/@output')
                    if len(output) > 0:
                        file_list.append(output[0])
    elif isdir(file_a):
        file_list = [ file_a + sep + svgfile for svgfile in listdir(file_a) if svgfile.endswith('svg') ]
        if file_b is not None and isfile(file_b):
            manuscript_file = file_b
    return file_list, manuscript_file
def get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=None, redo_ok=False):
    """Return svg_pos_file and manuscript_file if they are ready for processing.

    The manuscript file is either given or derived from the faksimile page
    title (looked up in ./xml/ if that directory exists).  A page's @output
    is returned as svg_pos_file when its @status is exactly "OK", or — with
    redo_ok — when its @status merely contains "OK" (e.g. already merged).

    :param faksimile_page: FaksimilePage whose page number/title to look up
    :param manuscript_file: xml manuscript file (derived from title if None)
    :param redo_ok: also accept pages whose status contains "OK"
    :return: (svg_pos_file or None, manuscript_file)
    """
    svg_pos_file = None
    manuscript_tree = None
    if manuscript_file is not None:
        #and basename(manuscript_file).startswith(faksimile_page.title.replace(' ', '_')):
        manuscript_tree = ET.parse(manuscript_file)
    else:
        # derive the manuscript file name from the page title, preferring ./xml/
        title_string = faksimile_page.title.replace(' ', '_')
        manuscript_file = '.{}xml'.format(sep) + sep + title_string + '.xml'\
                if isdir('.{}xml'.format(sep)) else title_string + '.xml'
        if isfile(manuscript_file):
            manuscript_tree = ET.parse(manuscript_file)
    if manuscript_tree is not None:
        # redo_ok: accept any status containing "OK" (e.g. "OK:faksimile merged")
        if redo_ok and len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0:
            svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
        if len(manuscript_tree.getroot().xpath('//page[@number="%s" and @status="OK"]/@output' % faksimile_page.page_number)) > 0:
            svg_pos_file = manuscript_tree.getroot().xpath('//page[@number="%s"]/@output' % faksimile_page.page_number)[0]
        else:
            # status is not exactly "OK": report why (already merged vs. not ready);
            # note that svg_pos_file may still be set via the redo_ok branch above
            if not UNITTESTING:
                if len(manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)) > 0:
                    msg = Fore.LIGHTBLUE_EX +'->' + Fore.CYAN + 'Data from page {0} already merged with {1}!'.format(\
                            faksimile_page.page_number,\
                            manuscript_tree.getroot().xpath('//page[@number="%s" and contains(@status,"OK")]/@output' % faksimile_page.page_number)[0])
                else:
                    msg = Fore.MAGENTA + 'Manuscript file {} does not contain a page number {} ready for joining ...'.format(manuscript_file, faksimile_page.page_number)
                print(msg, end='')
                print(Style.RESET_ALL)
    return svg_pos_file, manuscript_file
def join_faksimileAndTranskription(faksimile_file, manuscript_file=None, page=None, do_fix_errors=False, redo_ok=False, debug_word_text='', **kwargs):
    """Joins the data of a faksimile file with the data of svgposfile.

    :param faksimile_file: svg file with the word positions on the faksimile
    :param manuscript_file: xml manuscript file (derived from page title if None)
    :param page: restrict processing to the faksimile page belonging to this Page
    :param do_fix_errors: open an external viewer to fix unmerged words/positions
    :param redo_ok: also process pages whose status already contains "OK"
    :param debug_word_text: mark words with this text for debug output
    :return: exit status (int); 0 on success, 2 when words could not be joined
    """
    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Processing file {} '.format(faksimile_file), end='')
        print(Style.RESET_ALL)
    # the flags may also arrive via **kwargs (e.g. from main's commando_dict)
    if not do_fix_errors and 'do_fix_errors' in kwargs.keys():
        do_fix_errors = kwargs.get('do_fix_errors')
    if not redo_ok and 'redo_ok' in kwargs.keys():
        redo_ok = kwargs.get('redo_ok')
    if debug_word_text == '' and 'debug_word_text' in kwargs.keys():
        debug_word_text = kwargs.get('debug_word_text')
    faksimile_tree = ET.parse(faksimile_file)
    # map the default (None) namespace prefix to 'ns' for xpath use
    namespaces = { k if k is not None else 'ns': v for k, v in faksimile_tree.getroot().nsmap.items() }
    faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(faksimile_tree, namespaces=namespaces)
    if page is not None:
        # only process the faksimile page whose svg pos file belongs to the given page
        faksimile_pages = [ faksimile_page for faksimile_page in faksimile_pages\
                if get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)[0]\
                == page.page_tree.docinfo.URL ]
    exit_status = 0
    for faksimile_page in faksimile_pages:
        svg_pos_file, manuscript_file = get_svgPosFile_and_manuscriptFile(faksimile_page, manuscript_file=manuscript_file, redo_ok=redo_ok)
        if svg_pos_file is not None:
            image4page = faksimile_page.faksimile_image.get_image_joined_with_text_field(faksimile_page.text_field)
            if page is None:
                page = Page(svg_pos_file, faksimile_image=image4page, faksimile_svgFile=faksimile_file)
                write_pretty(xml_element_tree=page.page_tree, file_name=svg_pos_file, script_name=__file__,\
                        file_type=FILE_TYPE_SVG_WORD_POSITION)
            if not UNITTESTING:
                print(Fore.LIGHTBLUE_EX + '->', end='')
                print(Fore.CYAN + 'Joining data from page {0} with file {1} ... '.format(faksimile_page.page_number, svg_pos_file), end='')
            words = sort_words(page)
            # mark words matching debug_word_text so debug_function reports them
            if debug_word_text != '' and len([ word for word in words if word.text == debug_word_text ]) > 0:
                for word in words:
                    if word.text == debug_word_text:
                        word.debug_container.update({'marked': True})
            if bool(kwargs.get('join_single_char_words')):
                removed_words = join_single_char_words(words)
                page.words = words
                page.update_and_attach_words2tree()
                #print([ word.text for word in page.words if word in removed_words ])
            faksimile_positions = sort_faksimile_positions(faksimile_page.word_positions)
            new_words = []
            # process shorter texts first
            unique_faksimile_words = sorted(set(faksimile_position.text for faksimile_position in faksimile_positions),\
                    key=lambda text: len(text))
            faksimile_positions, unique_faksimile_words = replace_chars(words, faksimile_positions, unique_faksimile_words)
            for word_text in unique_faksimile_words:
                process_word_text(new_words, word_text, faksimile_positions, words)
            # success: everything (except bare '.') joined on both sides
            if False not in [ word.joined for word in words if word.text != '.' ]\
            and False not in [ position.joined for position in faksimile_positions]\
            and not UNITTESTING:
                if page.is_locked():
                    page.unlock()
                post_merging_processing_and_saving(svg_pos_file=svg_pos_file, new_words=new_words, page=page, manuscript_file=manuscript_file)
                print(Fore.GREEN + '[OK]')
                print(Style.RESET_ALL)
            elif not UNITTESTING:
                # report the mismatches, then create a correction task, open the
                # fix-errors viewer, or just signal failure
                mismatch_words, mismatch_faksimile_positions = get_mismatching_ids(words, faksimile_positions)
                not_joined_fp = [ (position.id, position.text) for position in sorted(mismatch_faksimile_positions, key=lambda fp: fp.top) ]
                plural_fp = '' if len(not_joined_fp) < 2 else 's'
                not_joined_tw = [ (word.id, word.line_number, word.text) for word in sorted(mismatch_words, key=lambda word: word.transkription_positions[0].top) ]
                plural_tw = '' if len(not_joined_tw) < 2 else 's'
                print(Fore.MAGENTA + '\n--->Not joined faksimile position{0}: {1}'.format(plural_fp, not_joined_fp))
                print([(position.id, position.text) for position in faksimile_positions if not position.joined])
                print('--->Not joined word{0} from transkription: {1}'.format(plural_tw, not_joined_tw))
                print([(word.id, word.line_number, word.text) for word in words if not word.joined ])
                debug_function(new_words, input='new_words')
                debug_function(words, input='words')
                print(Style.RESET_ALL)
                if kwargs.get('correct_words') is not None:
                    unmatched_node_ids = [ position.id for position in mismatch_faksimile_positions ]
                    unmatched_node_ids += get_empty_node_ids(faksimile_tree, faksimile_page=faksimile_page, namespaces=namespaces)
                    exit_status = create_task_correct_words(kwargs.get('correct_words'), page=page, source_svg_file=faksimile_file,\
                            unmatched_word_ids=[ word.id for word in mismatch_words ],\
                            unmatched_node_ids=unmatched_node_ids)
                elif do_fix_errors:
                    exit_status = fix_errors(faksimile_file, [position for position in faksimile_positions if not position.joined],\
                            [ word for word in words if not word.joined ], text_field_id=faksimile_page.text_field.id,\
                            faksimile_page=faksimile_page, xml_source_file=svg_pos_file,\
                            manuscript_file=manuscript_file, namespaces=namespaces)
                else:
                    exit_status = 2
            elif False in [ word.joined for word in words ]:
                print([ (word.id, word.text) for word in words if not word.joined ])
                exit_status = 2
        # reset page so the next faksimile page loads its own Page
        page = None
    return exit_status
def join_single_char_words(words, threshold_x=5, threshold_y=5):
    """Join single char words.

    Walks each line's single-char words from right to left and joins a
    single-char word onto its predecessor in *words* when the two are close
    enough; joined words are removed from *words* (which is mutated).
    Punctuation gets more generous thresholds (15/12) than ordinary chars.

    :param words: all words of the page, in reading order (mutated in place)
    :param threshold_x: max horizontal distance for joining (non-punctuation)
    :param threshold_y: max vertical distance for joining (non-punctuation)
    :return: a list of removed words
    """
    #all_single_char_words = [ word for word in words if re.match(r'^\w$', word.text) ]
    removed_words = []
    all_single_char_words = [ word for word in words if re.match(SINGLE_WORD_PATTERN, word.text) ]
    if not UNITTESTING:
        bar = Bar('Joining single char words', max=len(all_single_char_words))
    line_numbers = sorted(set(word.line_number for word in all_single_char_words))
    for line_number in line_numbers:
        single_char_words = [ word for word in all_single_char_words if word.line_number == line_number ]
        # iterate from the end so popping from words does not disturb earlier indices
        index = len(single_char_words)
        while index > 0:
            index -= 1
            word = None
            not UNITTESTING and bar.next()
            # the word may already have been popped by a previous join
            if single_char_words[index] in words:
                single_char_word_index = words.index(single_char_words[index])
                if re.match(SINGLE_PUNCTUATION_PATTERN, single_char_words[index].text)\
                and words_close_enough(words[single_char_word_index-1], single_char_words[index], 15, 12):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
                    #print('{0} -> {1}, {2}'.format(word.text, words[single_char_word_index-1].text))
                elif index > 0\
                and words_close_enough(single_char_words[index-1], single_char_words[index], threshold_x, threshold_y):
                    # NOTE(review): closeness is checked against the previous
                    # single-char word but the join target is the previous word
                    # in *words* — confirm this asymmetry is intended
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
                elif single_char_word_index > 0\
                and words[single_char_word_index-1].line_number == line_number\
                and words_close_enough(words[single_char_word_index-1], single_char_words[index], threshold_x, threshold_y):
                    words[single_char_word_index-1].join(single_char_words[index])
                    removed_words.append(words.pop(single_char_word_index))
    not UNITTESTING and bar.finish()
    return removed_words
def process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text='', min_length_split=5):
    """Joins faksimile_positions with text == word_text with words with text == word_text.

    Successfully joined words are appended to new_words, and the matched
    faksimile positions and words are flagged as joined (arguments are
    mutated).  When there are fewer matching words than positions,
    alternative spellings are tried recursively ('ss' -> 'ß', '-' -> '–')
    via alt_word_text; remaining mismatches are only reported.

    :param new_words: output list the joined words are appended to
    :param word_text: text of the faksimile positions to join
    :param faksimile_positions: all faksimile positions of the page
    :param words: all transkription words of the page
    :param alt_word_text: alternative spelling of word_text to match as well
    :param min_length_split: minimum text length for prefix/suffix splitting
        (only used by the disabled code kept below)
    """
    text = word_text if alt_word_text == '' else alt_word_text
    fposition4word = [ faksimile_position for faksimile_position in faksimile_positions if faksimile_position.text == word_text ]
    words4word = [ word for word in words if word.text == word_text and not word.joined ]
    if alt_word_text != '':
        words4word += [ word for word in words if word.text == text and not word.joined ]
        words4word = sorted(words4word, key=attrgetter('id'))
    if len(fposition4word) == len(words4word):
        # one-to-one match: join pairwise in id order
        for index, faksimile_position in enumerate(fposition4word):
            faksimile_position.joined = True
            words4word[index].faksimile_positions = [ faksimile_position ]
            words[words.index(words4word[index])].joined = True
            new_words.append(words4word[index])
    elif len(words4word) < len(fposition4word):
        # fewer words than positions: retry with alternative spellings
        if re.match(r'(.*)ss(.*)', text):
            alt_word_text = re.sub(r'ss', 'ß', text)
            process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
        elif re.match(SINGLE_PUNCTUATION_PATTERN, text):
            if text == '-':
                alt_word_text = text.replace('-', '–')
                process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
            else:
                print('single', word_text, len(fposition4word), len(words4word))
        # disabled punctuation-stripping/splitting strategies, kept for reference
        """
        elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text):
            alt_word_text = re.sub(PUNCTUATION_PATTERN, "", text)
            debug_function(words4word, input='elif re.match(PUNCTUATION_PATTERN, text) or re.match(PUNCTUATION_EOW_PATTERN, text) text {0}'.format(text))
            if alt_word_text != '':
                pattern = r'(.*){0}(.*)'.format(alt_word_text)
                words4word += [ word for word in words if re.match(pattern, word.text) and not word.joined ]
                if len(words4word) < len(fposition4word):
                    process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
                else:
                    words4word = sorted(words4word, key=attrgetter('id'))
                    for index, faksimile_position in enumerate(fposition4word):
                        faksimile_position.joined = True
                        words4word[index].faksimile_positions = [ faksimile_position ]
                        if words4word[index].text[len(words4word[index].text)-1] != word_text[len(word_text)-1]\
                        and words.index(words4word[index])+1 < len(words)\
                        and words[words.index(words4word[index])+1].text == word_text[len(word_text)-1]:
                            words4word[index].join(words[words.index(words4word[index])+1])
                            words[words.index(words4word[index])+1].joined = True
                        words[words.index(words4word[index])].joined = True
                        words4word[index].text = word_text
                        new_words.append(words4word[index])
        elif len(text) >= min_length_split and len([ word for word in words if word.text.startswith(text) and not word.joined ]) == len(fposition4word):
            new_words4word = [ word for word in words if word.text.startswith(text) and not word.joined ]
            debug_function(new_words4word, input='word.startswith {}'.format(text))
            for index, fposition in enumerate(fposition4word):
                old_word = new_words4word[index]
                none_word, new_word, next_word = old_word.split(text, start_id=old_word.id)
                fposition4word[index].joined = True
                new_word.faksimile_positions = [ fposition4word[index] ]
                words[words.index(old_word)] = new_word
                if next_word is not None:
                    next_word.id = len(words)
                    next_word.joined = False
                    words.append(next_word)
                new_word.joined = True
                new_words.append(new_word)
        elif len(text) >= min_length_split and len([ word for word in words if word.text.endswith(text) and not word.joined ]) == len(fposition4word):
            new_words4word = [ word for word in words if word.text.endswith(text) and not word.joined ]
            debug_function(new_words4word, input='word.endswith {}'.format(text))
            for index, fposition in enumerate(fposition4word):
                old_word = new_words4word[index]
                before_word, new_word, none_word = old_word.split(text, start_id=old_word.id)
                fposition4word[index].joined = True
                new_word.faksimile_positions = [ fposition4word[index] ]
                words[words.index(old_word)] = new_word
                if before_word is not None:
                    before_word.id = len(words)
                    before_word.joined = False
                    words.append(before_word)
                new_word.joined = True
                new_words.append(new_word)
        else:
            if len(text) > 1:
                new_words4word = [ word for word in words if word.text == text[1:] and not word.joined ]
                debug_function(new_words4word, input='else text {0}'.format(text))
                if len(new_words4word) == 0:
                    alt_word_text = text[1:]
                    process_word_text(new_words, word_text, faksimile_positions, words, alt_word_text=alt_word_text, min_length_split=min_length_split)
                else:
                    for new_word in new_words4word:
                        collected_text = new_word.text
                        current_word = new_word
                        while collected_text != word_text and word_text.endswith(collected_text) and current_word.id > 0:
                            previous_word = words[current_word.id-1]
                            if word_text.endswith(previous_word.text + collected_text):
                                words[current_word.id].joined = True
                                previous_word.join(current_word)
                                current_word = previous_word
                                collected_text = current_word.text
                            else:
                                collected_text = previous_word.text + collected_text
                        words4word.append(current_word)
                    words4word = sorted(words4word, key=attrgetter('id'))
                    for index, faksimile_position in enumerate(fposition4word):
                        if index < len(words4word):
                            faksimile_position.joined = True
                            words4word[index].faksimile_positions = [ faksimile_position ]
                            words4word[index].text = word_text
                            words[words.index(words4word[index])].joined = True
                            new_words.append(words4word[index])
            else:
                print('<{0}> f{1}/t{2}, ids: {3}'.\
                        format(word_text, len(fposition4word), len(words4word), [ position.id for position in fposition4word ]))
        """
    else:
        # more words than positions: nothing to do but report the mismatch
        print('<{0}> f{1}/t{2}'.format(word_text, len(fposition4word), len(words4word)))
def sort_words(page)->list:
    """Return the page's words sorted in reading order (top left to bottom right).

    Words on even-numbered lines are ordered by the left edge of their first
    transkription position; words on odd-numbered lines use a comparator that
    additionally requires the bottoms to be within half a line height of each
    other.  Word ids are renumbered to match the new order and ``joined`` is
    initialised from the faksimile positions and the verification flag.
    """
    if -1 in [w.line_number for w in page.words]:
        warnings.warn('{} in page file {} for word with ids {}'.format(
            LineNumber.WARN_NO_LINE_NUMBER, page.page_tree.docinfo.URL,
            page.page_tree.xpath('./word[not(@line-number)]/@id')))
    ordered = []
    for line in page.line_numbers:
        on_line = [w for w in page.words if w.line_number == line.id]
        if line.id % 2 == 0:
            ordered += sorted(on_line, key=lambda w: w.transkription_positions[0].left)
        else:
            def _reading_order(word_a, word_b):
                tp_a = word_a.transkription_positions[0]
                tp_b = word_b.transkription_positions[0]
                if tp_a.left < tp_b.left and abs(tp_a.bottom - tp_b.bottom) < tp_a.height / 2:
                    return -1
                return 1
            ordered += sorted(on_line, key=cmp_to_key(_reading_order))
    for new_id, word in enumerate(ordered):
        word.id = new_id
        word.joined = len(word.faksimile_positions) > 0 and word.verified
    return ordered
def sort_faksimile_positions(faksimile_positions, reference_list=None):
    """Return the faksimile positions sorted (from top left to bottom right).

    Each position's ``joined`` flag is (re)initialised as a side effect:
    ``False`` when no reference_list is given, otherwise ``True`` exactly for
    positions contained in reference_list.

    :param faksimile_positions: positions to sort (must be mutually comparable)
    :param reference_list: positions already considered joined (or None)
    :return: new sorted list of the positions
    """
    # removed an unreachable commented-out sorting variant that followed the return
    for faksimile_position in faksimile_positions:
        faksimile_position.joined = False\
                if reference_list is None\
                else faksimile_position in reference_list
    return sorted(faksimile_positions)
@deprecated(reason="Writing process id is now set to word not word_position, TODO: check faksimile_positions for split candidates!")
def update_writing_process(word):
    """Synchronize the faksimile word position's writing process id with the
    word's transkription positions.

    If all transkription positions belong to a single writing process and the
    word has at least one faksimile position, that writing process id is
    copied onto the first faksimile position.  If the transkription positions
    span several writing processes the update is skipped; such faksimile
    positions are fixed manually at a later stage.
    """
    process_ids = { tp.writing_process_id for tp in word.transkription_positions }
    if len(process_ids) == 1 and len(word.faksimile_positions) > 0:
        word.faksimile_positions[0].writing_process_id = process_ids.pop()
def words_close_enough(wordA, wordB, threshold_x=10, threshold_y=5):
    """Return True if wordA's last transkription position and wordB's first
    transkription position are closer than the given thresholds.

    :param wordA: left/preceding word
    :param wordB: right/following word
    :param threshold_x: maximum horizontal distance between the left edges
    :param threshold_y: maximum vertical distance between the bottoms
    """
    last_tp = wordA.transkription_positions[-1]
    first_tp = wordB.transkription_positions[0]
    horizontally_close = abs(last_tp.left - first_tp.left) < threshold_x
    vertically_close = abs(last_tp.bottom - first_tp.bottom) < threshold_y
    return horizontally_close and vertically_close
def usage():
    """Print information on how to use the script (main's docstring).
    """
    print(main.__doc__)
def main(argv):
    """This program can be used to merge the data of some faksimile pages with the data of xml files that are of type myxmlwriter.FILE_TYPE_SVG_WORD_POSITION.

    svgscripts/join_faksimileAndTranskription.py [OPTIONS] <FAKSIMILE_DIR|faksimile_svg_file> [xmlManuscriptFile]

        <FAKSIMILE_DIR> a directory containing <faksimile_svg_file>
        <faksimile_svg_file> a svg file containing information about the word positions on the faksimile.
        <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.

    OPTIONS:
        -h|--help: show help
        -c|--correct-words=DIR create a task "CorrectWords" in target dir DIR
        -d|--debug-word=WORD show debug information for word == WORD
        -f|--fix-errors: open faksimile svg file if there are errors
        -i|--ignore-status-ok ignore status "OK:faksimile merged" in manuscript file and redo merging.
        -j|--join-single-char-words join single char words

    :return: exit code (int)
    """
    commando_dict = { 'do_fix_errors': False, 'redo_ok': False, 'debug_word_text': '', 'correct_words': None,\
            'join_single_char_words': False }
    try:
        opts, args = getopt.getopt(argv, "hc:d:fij", ["help", "correct-words=", "debug-word=", "fix-errors", "ignore-status-ok",\
                "join-single-char-words" ])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-c', '--correct-words'):
            commando_dict['correct_words'] = arg
        elif opt in ('-d', '--debug-word'):
            commando_dict['debug_word_text'] = arg
        elif opt in ('-f', '--fix-errors'):
            commando_dict['do_fix_errors'] = True
        elif opt in ('-i', '--ignore-status-ok'):
            commando_dict['redo_ok'] = True
        elif opt in ('-j', '--join-single-char-words'):
            commando_dict['join_single_char_words'] = True
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if exists(file_a):
        file_b = None
        if len(args) > 1 and exists(args[1]):
            file_b = args[1]
        file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b, correction_dir=commando_dict['correct_words'])
        for faksimile_file in file_list:
            # BUGFIX: the return value used to be discarded, so main always
            # returned 0; propagate any non-zero status to the caller.
            status = join_faksimileAndTranskription(faksimile_file, manuscript_file=manuscript_file, **commando_dict)
            if status > 0:
                exit_status = status
    else:
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    return exit_status
# script entry point: pass the command line (without the program name) to main
# and use its return value as the process exit code
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

Event Timeline