Page MenuHomec4science

fix_missing_glyphs.py
No OneTemporary

File Metadata

Created
Wed, May 8, 16:15

fix_missing_glyphs.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix missing glyphs.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from myxmlwriter import write_pretty, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.page import Page
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
# Module metadata: authorship, contact, license and version of this script.
__author__ = "Christian Steiner"
# The maintainer is the same person as the author.
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
def find_missing_glyph_for_pwp(positional_word_part_node, svg_path_tree, namespaces, xmin=0.0, ymin=0.0):
    """Finds missing glyph for node of a PositionalWordPart.

    A PositionalWordPart is instantiated from the node and its left, top, text,
    transform and style class are handed to
    PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST. The call is repeated
    with a threshold that starts at -0.5 and grows by 0.1 after every
    unsuccessful attempt, until a non-empty list is produced or the threshold
    reaches 15.5.

    :param positional_word_part_node: xml node used to instantiate the PositionalWordPart
    :param svg_path_tree: passed through to CREATE_POSITIONAL_WORD_PART_LIST
    :param namespaces: passed through to CREATE_POSITIONAL_WORD_PART_LIST
    :param xmin: passed through to CREATE_POSITIONAL_WORD_PART_LIST
    :param ymin: passed through to CREATE_POSITIONAL_WORD_PART_LIST
    :return: list of PositionalWordPart (empty if no attempt succeeded)
    """
    THRESHOLD = 15.5
    THRESHOLD_STEP = 0.1
    pwp = PositionalWordPart(node=positional_word_part_node)
    word_part_obj = { "x": pwp.left, "y": pwp.top, "text": pwp.text, "matrix": pwp.transform, "class": pwp.style_class }
    start_id = int(pwp.id)
    threshold = -0.5
    positional_word_parts = []
    while threshold < THRESHOLD and not positional_word_parts:
        try:
            positional_word_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(
                word_part_obj, svg_path_tree, namespaces,
                start_id=start_id, xmin=xmin, ymin=ymin,
                threshold=threshold, throw_error_if_not_found=True)
        except Exception:
            # Deliberate best effort: a failed lookup is not fatal, the search
            # is simply repeated with a larger threshold.
            pass
        if not positional_word_parts:
            # Advance on *every* unsuccessful attempt, not only on exceptions:
            # an empty result that does not raise would otherwise repeat the
            # identical call forever.
            threshold += THRESHOLD_STEP
    return positional_word_parts
def update_word(page, positional_word_part_node, positional_word_parts):
    """Updates word according to new positional_word_parts.

    The ids of the positional word part, of its transkription position and of
    its word are read from the 'id' attributes of the node, its parent and its
    grandparent. The old positional word part is replaced by the new ones, the
    affected transkription position is rebuilt from the resulting parts, ids
    are renumbered, the word text is recomputed and the word is attached to
    page.page_tree. Nothing happens if positional_word_parts is empty.

    :param page: page providing the words and the page_tree
    :param positional_word_part_node: xml node of the positional word part that is replaced
    :param positional_word_parts: replacement parts (list of PositionalWordPart); left unmodified
    """
    if not positional_word_parts:
        return
    # NOTE(review): the nesting below was reconstructed from a source without
    # indentation -- confirm that attach_word_to_tree is meant to run
    # regardless of whether the text changed.
    debug_msg_string = 'update word from ' + __file__
    transkription_position_node = positional_word_part_node.getparent()
    positional_word_part_id = int(positional_word_part_node.get('id'))
    transkription_position_id = int(transkription_position_node.get('id'))
    word_id = int(transkription_position_node.getparent().get('id'))
    word = page.words[word_id]
    transkription_position = word.transkription_positions[transkription_position_id]

    # Replace the old part by the new ones. Inserting in reverse order at a
    # fixed index keeps the new parts in their original order; reversed() is
    # used so that the caller's list is not reversed as a side effect.
    current_parts = transkription_position.positional_word_parts
    current_parts.pop(positional_word_part_id)
    for new_part in reversed(positional_word_parts):
        current_parts.insert(positional_word_part_id, new_part)
    for index, part in enumerate(transkription_position.positional_word_parts):
        part.id = index

    # Rebuild the transkription position(s) from the updated parts and put
    # them in place of the old transkription position.
    transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(
        page, transkription_position.positional_word_parts,
        debug_msg_string=debug_msg_string, transkription_position_id=transkription_position_id)
    word.transkription_positions.pop(transkription_position_id)
    for new_tp in reversed(transkription_positions):
        word.transkription_positions.insert(transkription_position_id, new_tp)

    # Renumber all transkription positions of the word; they all take over the
    # writing process id of the replaced transkription position.
    for index, tp in enumerate(word.transkription_positions):
        tp.id = index
        tp.writing_process_id = transkription_position.writing_process_id

    # The word text is the concatenation of the texts of all its parts.
    text = ''.join(pwp.text for tp in word.transkription_positions for pwp in tp.positional_word_parts)
    if word.text != text:
        word.text = text
    word.attach_word_to_tree(page.page_tree)
def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None):
    """Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION.

    Does nothing if svg_word_pos_file is not an existing file. Otherwise every
    positional word part node of the page that lacks a 'symbol-id' attribute is
    passed to find_missing_glyph_for_pwp together with the parsed svg file of
    the page.

    :param svg_word_pos_file: path of the xml file with the svg word positions
    :param manuscript_file: not used by this function
    """
    if not isfile(svg_word_pos_file):
        return
    page = Page(xml_source_file=svg_word_pos_file)
    transkription_field = TranskriptionField(page.svg_file)
    svg_path_tree = ET.parse(page.svg_file)
    # XPath needs a named prefix, so the default namespace (key None) is
    # registered under the prefix 'ns'.
    namespaces = {}
    for prefix, uri in svg_path_tree.getroot().nsmap.items():
        namespaces['ns' if prefix is None else prefix] = uri
    nodes_without_symbol_id = page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')
    for node in nodes_without_symbol_id:
        # NOTE(review): the returned list is discarded here and the page is
        # not written back -- presumably update_word and a write step are
        # still to be wired in; confirm.
        find_missing_glyph_for_pwp(node, svg_path_tree, namespaces,
                                   xmin=transkription_field.xmin, ymin=transkription_field.ymin)
def usage():
    """Prints information on how to use the script (the docstring of main).
    """
    print(main.__doc__)


def main(argv):
    """This program can be used to fix missing glyphs.

    svgscripts/fix_missing_glyphs.py [OPTIONS] <xmlManuscriptFile|svgWordPosition>-File

        <xmlManuscriptFile>     a xml file about a manuscript, containing information about its pages.
        <svgWordPosition>       a xml file about a page, containing information about svg word positions.

        OPTIONS:
        -h|--help:      show help

        :return: exit code (int)
    """
    # NOTE: the docstring above doubles as the help text printed by usage(),
    # so developer documentation is kept in comments.
    # argv: command line arguments without the program name.
    # Returns 0 on success or after showing help, 2 on a usage error; raises
    # FileNotFoundError if the given source file does not exist.
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, _ in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0

    if not args:
        usage()
        return 2

    source_file = args[0]
    if not isfile(source_file):
        raise FileNotFoundError('File {} does not exist!'.format(source_file))

    exit_status = 0
    manuscript_file = None
    file_list = []
    source_tree = ET.parse(source_file)
    # findtext yields None instead of failing with an AttributeError when the
    # file has no metadata/type element; such a file is reported as a usage
    # error below.
    file_type = source_tree.getroot().findtext('metadata/type')
    if file_type == FILE_TYPE_SVG_WORD_POSITION\
            and len(source_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) > 0:
        # A single page file in which symbol ids are missing.
        file_list.append(source_file)
    elif file_type == FILE_TYPE_XML_MANUSCRIPT:
        # A manuscript file: collect the output files of all pages whose
        # status contains the lower-cased WARN_NO_USE_NODE_FOUND warning.
        manuscript_file = source_file
        file_list = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(
            PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower()))
    else:
        usage()
        exit_status = 2

    for svg_word_pos_file in file_list:
        fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file)
    return exit_status
if __name__ == "__main__":
    # Executed as a script: hand the command line arguments (without the
    # program name) to main() and use its return value as exit status.
    raise SystemExit(main(sys.argv[1:]))

Event Timeline