Page MenuHomec4science

fix_missing_glyphs.py
No OneTemporary

File Metadata

Created
Mon, May 6, 23:16

fix_missing_glyphs.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix missing glyphs.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.page import Page, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from util import update_svgposfile_status
sys.path.append('shared_util')
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
# Matches the '_page<digits><word characters>' part of a svg word position
# file name; it is removed to derive the manuscript file name.
# Raw string: '\w' in a plain string literal is an invalid escape sequence.
REMOVE_SVG_WORD_POS_PAGE_ENDING = re.compile(r'_page[0-9]+\w*')
def find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=0.0, ymin=0.0):
    """Finds missing glyph for a PositionalWordPart.

    The lookup is delegated to
    PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST and retried with a
    threshold that starts at -0.5 and grows in steps of 0.1 until either
    something is found or the threshold reaches 15.5.

    :param pwp: the positional word part that has no glyph; its left, top,
        text, transform, style_class and id attributes are read.
    :param svg_path_tree: parsed svg tree that is passed on to the lookup.
    :param namespaces: prefix -> namespace mapping that is passed on to the lookup.
    :param xmin: x offset that is passed on to the lookup.
    :param ymin: y offset that is passed on to the lookup.
    :return: list of PositionalWordPart (empty if no attempt succeeded)
    """
    THRESHOLD = 15.5
    word_part_obj = {
        "x": pwp.left,
        "y": pwp.top,
        "text": pwp.text,
        "matrix": pwp.transform,
        "class": pwp.style_class,
    }
    start_id = int(pwp.id)
    threshold = -0.5
    positional_word_parts = []
    while threshold < THRESHOLD and not positional_word_parts:
        try:
            positional_word_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(
                word_part_obj, svg_path_tree, namespaces,
                start_id=start_id, xmin=xmin, ymin=ymin,
                threshold=threshold, throw_error_if_not_found=True) or []
        except Exception:
            # Deliberate best effort: any failure of this attempt just means
            # "not found with this threshold".
            positional_word_parts = []
        if not positional_word_parts:
            # Advance on every unsuccessful attempt, not only when the lookup
            # raised; otherwise an empty result without an exception would
            # keep the loop spinning forever.
            threshold += 0.1
    return positional_word_parts
def update_word(word, old_transkription_position, old_positional_word_part, positional_word_parts):
    """Updates word according to new positional_word_parts.

    The old positional word part is replaced by the new ones inside the old
    transkription position, all parts are renumbered, and the old
    transkription position is replaced on the word by the positions that
    TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS builds
    from the updated parts. Finally the ids of the word's transkription
    positions and the word's text are brought up to date.

    Note that positional_word_parts is reversed in place, and that the list
    of new transkription positions is reversed before its first element is
    returned.

    :return: new transkription_position, or None if positional_word_parts is empty
    """
    if len(positional_word_parts) == 0:
        return None

    debug_msg_string = 'update word from ' + __file__

    # Swap the glyphless part for its replacements, keeping their order:
    # inserting the reversed sequence at a fixed index restores the order.
    current_parts = old_transkription_position.positional_word_parts
    current_parts.remove(old_positional_word_part)
    positional_word_parts.reverse()
    insert_at = int(old_positional_word_part.id)
    for replacement in positional_word_parts:
        current_parts.insert(insert_at, replacement)
    for new_id, part in enumerate(current_parts):
        part.id = new_id

    transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(
        current_parts,
        debug_msg_string=debug_msg_string,
        transkription_position_id=old_transkription_position.id)

    # Same trick one level up: replace the old position on the word.
    word.transkription_positions.remove(old_transkription_position)
    transkription_positions.reverse()
    for new_tp in transkription_positions:
        word.transkription_positions.insert(int(old_transkription_position.id), new_tp)

    # Renumber the word's positions and rebuild its text from all parts.
    text = ''
    for new_id, tp in enumerate(word.transkription_positions):
        tp.id = new_id
        tp.writing_process_id = old_transkription_position.writing_process_id
        text += ''.join(part.text for part in tp.positional_word_parts)
    if word.text != text:
        word.text = text
    return transkription_positions[0]
def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None):
    """Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION.

    Does nothing if svg_word_pos_file is not an existing file. Otherwise
    every positional word part without a symbol id is looked up again in the
    page's svg file, the words are updated, and the result is written back
    to svg_word_pos_file. If no part without a symbol id remains afterwards,
    the status of the file is set to 'OK'.

    :param svg_word_pos_file: path of the svg word position xml file.
    :param manuscript_file: manuscript file passed on to the status update (or None).
    """
    if not isfile(svg_word_pos_file):
        return

    if not UNITTESTING:
        print(Fore.LIGHTBLUE_EX + 'Fixing missing glyphs for file {} ... '.format(svg_word_pos_file), end='')
        print(Style.RESET_ALL)

    page = Page(svg_word_pos_file)
    xmin, ymin = 0, 0
    if page.svg_image is None or page.svg_image.text_field is None:
        transkription_field = TranskriptionField(page.svg_file)
        xmin, ymin = transkription_field.xmin, transkription_field.ymin

    svg_path_tree = ET.parse(page.svg_file)
    # XPath needs a named prefix, so the default namespace is mapped to 'ns'.
    namespaces = {}
    for prefix, uri in svg_path_tree.getroot().nsmap.items():
        namespaces['ns' if prefix is None else prefix] = uri

    missing_glyph_query = '//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'
    number_of_missing_glyphs = len(page.page_tree.xpath(missing_glyph_query))

    def lacks_glyph(word):
        # True if any positional word part of the word has no symbol id.
        return any(pwp.symbol_id is None
                   for tp in word.transkription_positions
                   for pwp in tp.positional_word_parts)

    words_without_glyphs = [word for word in page.words if lacks_glyph(word)]

    for word in words_without_glyphs:
        for transkription_position in word.transkription_positions:
            # Work on a snapshot: update_word modifies the underlying list.
            snapshot = transkription_position.positional_word_parts[:]
            for positional_word_part in snapshot:
                if positional_word_part.symbol_id is not None:
                    continue
                pwps = find_missing_glyph_for_pwp(
                    positional_word_part, svg_path_tree, namespaces, xmin=xmin, ymin=ymin)
                replacement = update_word(word, transkription_position, positional_word_part, pwps)
                if replacement is not None:
                    # Continue with the position that replaced the old one.
                    transkription_position = replacement

    page.update_and_attach_words2tree()
    write_pretty(xml_element_tree=page.page_tree, file_name=svg_word_pos_file,
                 script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)

    # Reload the written file to count what is still missing.
    page = Page(svg_word_pos_file)
    new_number_of_missing_glyphs = len(page.page_tree.xpath(missing_glyph_query))
    if not UNITTESTING:
        if new_number_of_missing_glyphs == 0:
            result_color = Fore.LIGHTBLUE_EX
        else:
            result_color = Fore.MAGENTA
        number_fixed = number_of_missing_glyphs - new_number_of_missing_glyphs
        print(result_color + ' {0}/{1}'.format(number_fixed, number_of_missing_glyphs), end='')
        print(Fore.LIGHTBLUE_EX + ' fixed.', end='')
        print(Style.RESET_ALL)
    if new_number_of_missing_glyphs == 0:
        update_svgposfile_status(svg_word_pos_file, manuscript_file=manuscript_file, status='OK')
def get_filelist_and_manuscript_file(file_a, file_b=None):
    """Returns a file list and a manuscript file (or None)

    file_a is parsed and its 'metadata/type' decides how the arguments are
    interpreted:

    - svg word position file with parts lacking a symbol id: the file list
      is [file_a]; the manuscript file is file_b, or, without file_b, the
      name of file_a with its '_page...' part removed if such a file exists.
    - manuscript file: it is the manuscript file; the file list is [file_b],
      or, without file_b, the output files of all pages whose status
      contains the lowercased PositionalWordPart.WARN_NO_USE_NODE_FOUND.
    - anything else: an empty list and None.

    :return: tuple (file_list, manuscript_file)
    """
    source_tree = ET.parse(file_a)
    file_type = source_tree.getroot().find('metadata/type').text

    # if symbol_ids are missing ...
    missing_glyph_query = '//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'
    if file_type == FILE_TYPE_SVG_WORD_POSITION and len(source_tree.xpath(missing_glyph_query)) > 0:
        if file_b is not None:
            return [file_a], file_b
        candidate = REMOVE_SVG_WORD_POS_PAGE_ENDING.sub('', file_a)
        if isfile(candidate):
            return [file_a], candidate
        return [file_a], None

    if file_type == FILE_TYPE_XML_MANUSCRIPT:
        if file_b is not None:
            return [file_b], file_a
        status_marker = PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower()
        page_outputs = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(status_marker))
        return page_outputs, file_a

    return [], None
def usage():
    """Prints the usage information of the script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to fix missing glyphs.
    svgscripts/fix_missing_glyphs.py [OPTIONS] <xmlManuscriptFile|svgWordPosition>-File [<xmlManuscriptFile|svgWordPosition>-File]
    <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
    <svgWordPosition> a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help: show help
    :return: exit code (int)
    """
    # NOTE: the docstring above is printed verbatim by usage(), so it is
    # user-facing text and not only documentation.
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2

    if any(opt in ('-h', '--help') for opt, _ in opts):
        usage()
        return 0

    if not args:
        usage()
        return 2

    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))

    # A second argument is only used if it names an existing file.
    has_second_file = len(args) > 1 and isfile(args[1])
    file_b = args[1] if has_second_file else None

    file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
    for svg_word_pos_file in file_list:
        fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file)
    return 0
# Script entry point: run main with the command line arguments (without the
# program name) and use its return value as the process exit code.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline