Page MenuHomec4science

create_task.py
No OneTemporary

File Metadata

Created
Wed, May 1, 05:51

create_task.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to create a task.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import abc
from colorama import Fore, Style
import getopt
import inspect
import itertools
import lxml.etree as ET
import re
import shutil
import sys
import os
from os import listdir, sep, makedirs
from os.path import exists, isfile, isdir, dirname, basename, splitext
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from convert_wordPositions import create_pdf_with_highlighted_words, create_svg_with_highlighted_words
from util import copy_xml_file_word_pos_only, get_mismatching_ids
from datatypes.page import Page
from datatypes.faksimile import FaksimilePage
#from join_faksimileAndTranskription import STATUS_MERGED_OK
from util import ExternalViewer, create_highlighted_svg_file
# --- Module metadata -------------------------------------------------------
__author__ = 'Christian Steiner'
__maintainer__ = __author__  # same person as the author
__email__ = 'christian.steiner@unibas.ch'
__copyright__ = 'University of Basel'
__license__ = 'GPL v3'
__status__ = 'Development'
__version__ = '0.0.1'

# --- Module configuration --------------------------------------------------
# NOTE(review): not read anywhere in the visible part of this file; presumably
# toggled by the test-suite -- confirm before removing.
UNITTESTING = False
# Default values for the `bg_color` and `opacity` parameters of Task.__init__.
# The opacity is kept as a string.
HIGHLIGHT_COLOR = 'red'
OPACITY = '0.5'
class Task(metaclass=abc.ABCMeta):
    """This abstract class can be used to create a task.
    """
    # NOTE: the class docstring above is deliberately left unchanged, because
    # __init__ falls back to `self.__doc__` as the task description -- for
    # this class and its subclasses the docstring is a runtime value.
    #
    # A task is materialised as a set of files inside `target_dir`:
    #   * a transkription file (pdf or svg) with the selected words highlighted,
    #   * optionally a copy of the xml file (`edit_xml`),
    #   * optionally a text note (see `create_note_about_missing_words`),
    #   * a faksimile svg file in which the nodes returned by `get_node_ids`
    #     are highlighted.
    # Subclasses have to implement `get_node_ids` and `select_words`.

    # Name of the sub directory of `target_dir` whose content is listed as
    # "finished" files.
    finish_dir = 'Fertig'

    def __init__(self, xml_source_file, target_dir, page=None, faksimile_svgFile=None, dirname=None, description='', edit_transkription=False, edit_xml=False, manual=None, status_contains='', bg_color=HIGHLIGHT_COLOR, opacity=OPACITY):
        """Initialise the task and register files that already exist.

        :param xml_source_file: xml file from which a Page is instantiated if
            `page` is None.
        :param target_dir: directory the task files are written to.
        :param page: an already instantiated Page (optional).
        :param faksimile_svgFile: fallback faksimile svg file that is used if
            the page does not reference one itself.
        :param dirname: optional sub directory name appended to `target_dir`
            (shadows os.path.dirname inside this method only).
        :param description: task description; defaults to the class docstring.
        :param edit_transkription: if True, `create` writes the transkription
            as svg, otherwise as pdf.
        :param edit_xml: if True, `create` copies the xml file to `target_dir`.
        :param manual: optional file that `create` copies to `target_dir`.
        :param status_contains: stored on the instance; not read in this class.
        :param bg_color: highlight color.
        :param opacity: opacity used for the highlighted faksimile svg.
        """
        self.xml_source_file = xml_source_file
        self.page = page if page is not None else Page(self.xml_source_file)
        self.faksimile_svgFile = faksimile_svgFile
        self.target_dir = target_dir + sep + dirname if dirname is not None else target_dir
        self.dirname = dirname
        self.description = description if description != '' else self.__doc__
        self.edit_transkription = edit_transkription
        self.edit_xml = edit_xml
        self.manual = manual
        self.status_contains = status_contains
        self.bg_color = bg_color
        self.opacity = opacity
        self.created_files = []
        self.finished_files = []
        if isdir(self.target_dir):
            # listdir returns bare names, so the directory test has to be made
            # relative to target_dir (a bare name would be resolved against
            # the current working directory and sub directories such as
            # finish_dir would wrongly be listed as created files).
            self.created_files = [
                entry for entry in listdir(self.target_dir)
                if not isdir(self.target_dir + sep + entry)
            ]
        finished_path = self.target_dir + sep + self.finish_dir
        if isdir(finished_path):
            self.finished_files = listdir(finished_path)

    def create(self):
        """Create all files of this task in `target_dir`.

        Every file that exists after its creation step is appended to
        `created_files` (the note file is written but not registered).

        :raises Exception: if neither the page nor the task knows a faksimile
            svg file.
        """
        # Use self.finish_dir (not Task.finish_dir) so that the directory that
        # is created here is the same one __init__ looks into.
        makedirs(self.target_dir + sep + self.finish_dir, exist_ok=True)
        if self.manual is not None and isfile(self.manual):
            shutil.copy(self.manual, self.target_dir)
        words = self.select_words(self.page.words)
        if not self.edit_transkription:
            # read-only transkription -> pdf
            transkription_file = self.target_dir + sep + self.create_file_name(self.page, is_faksimile_svg=False, suffix='.pdf')
            create_pdf_with_highlighted_words(page=self.page, highlighted_words=words,
                                              pdf_file_name=transkription_file, bg_color=self.bg_color)
        else:
            # editable transkription -> svg
            transkription_file = self.target_dir + sep + self.create_file_name(self.page, is_faksimile_svg=False)
            create_svg_with_highlighted_words(page=self.page, highlighted_words=words,
                                              svg_file_name=transkription_file, bg_color=self.bg_color)
        if self.edit_xml:
            xml_file = copy_xml_file_word_pos_only(self.page.page_tree.docinfo.URL, self.target_dir)
            self.created_files.append(xml_file)
        note = self.create_note_about_missing_words()
        if note != '':
            note_file = self.target_dir + sep + self.create_file_name(self.page, is_faksimile_svg=False, suffix='.txt')
            # Notes may contain non-ASCII characters, so do not depend on the
            # locale's default encoding. The `with` block closes the file.
            with open(note_file, 'w+', encoding='utf-8') as f:
                f.write(note)
        if isfile(transkription_file):
            self.created_files.append(transkription_file)
        # The svg file referenced by the page takes precedence over the one
        # passed to the constructor.
        source_svg_file = self.page.faksimile_svgFile if self.page.faksimile_svgFile is not None\
            else self.faksimile_svgFile
        if source_svg_file is None:
            raise Exception('source_svg_file not specified: neither page nor self have a faksimile_svgFile!')
        # Fall back to the name of the source svg file if the page has no
        # usable title/number.
        if self.page.title != '' and self.page.number != -1:
            svg_file = self.target_dir + sep + self.create_file_name(self.page)
        else:
            svg_file = self.target_dir + sep + basename(source_svg_file)
        faksimile_tree = ET.parse(source_svg_file)
        node_ids = self.get_node_ids()
        create_highlighted_svg_file(faksimile_tree, node_ids, target_file=svg_file,
                                    highlight_color=self.bg_color, opacity=self.opacity)
        if isfile(svg_file):
            self.created_files.append(svg_file)

    def create_file_name(self, page, suffix='.svg', is_faksimile_svg=True):
        """Return a file name for page.

        For a faksimile svg the name is '<title with blanks replaced by
        dashes>,<number>.svg'; `suffix` is ignored in that case. Otherwise the
        name is the basename of the page's xml file with every '.xml' replaced
        by `suffix`.
        """
        if is_faksimile_svg:
            return page.title.replace(' ', '-') + ',{}.svg'.format(str(page.number))
        return basename(page.page_tree.docinfo.URL).replace('.xml', suffix)

    def create_note_about_missing_words(self):
        """Create a note about missing words for faksimile and transkription ids.

        The base implementation returns an empty string, i.e. `create` writes
        no note file.
        """
        return ''

    def contains_file(self, file_name, is_finished=False):
        """Return whether task created a file with basename file_name.

        :param file_name: file name or path; only its basename is compared.
        :param is_finished: look in `finished_files` instead of `created_files`.
        """
        candidates = self.finished_files if is_finished else self.created_files
        wanted = basename(file_name)
        return any(basename(candidate) == wanted for candidate in candidates)

    def get_fullpath4file(self, file_name):
        """Return the registered entry for the created file with file_name.

        Entries appended by `create` carry the path they were created with;
        entries found at construction time are bare names as returned by
        listdir.

        :return: the first entry of `created_files` whose basename matches,
            or None if there is none.
        """
        wanted = basename(file_name)
        return next((created_file for created_file in self.created_files
                     if basename(created_file) == wanted), None)

    @abc.abstractmethod
    def get_node_ids(self):
        """Return node ids for faksimile svg rect.
        """
        pass

    def has_been_created(self, page):
        """Return true if task has been created.

        A task counts as created if its faksimile svg, its transkription svg
        or its xml file is registered as created or as finished.
        """
        faksimile_svg = self.create_file_name(page)
        transkription_svg = self.create_file_name(page, is_faksimile_svg=False)
        xml_file = self.create_file_name(page, is_faksimile_svg=False, suffix='.xml')
        return self.contains_file(faksimile_svg)\
            or self.contains_file(transkription_svg)\
            or self.contains_file(xml_file)\
            or self.has_been_finished(page, faksimile_svg=faksimile_svg,
                                      transkription_svg=transkription_svg, xml_file=xml_file)

    def has_been_finished(self, page, faksimile_svg=None, transkription_svg=None, xml_file=None):
        """Return true if task has been finished.

        The file names are derived from `page` unless they are passed in.
        """
        if faksimile_svg is None:
            faksimile_svg = self.create_file_name(page)
        if transkription_svg is None:
            transkription_svg = self.create_file_name(page, is_faksimile_svg=False)
        if xml_file is None:
            xml_file = self.create_file_name(page, is_faksimile_svg=False, suffix='.xml')
        return self.contains_file(faksimile_svg, is_finished=True)\
            or self.contains_file(transkription_svg, is_finished=True)\
            or self.contains_file(xml_file, is_finished=True)

    @abc.abstractmethod
    def select_words(self, words):
        """Returns selected words.
        """
        pass
class SplitFaksimileWordBoxes(Task):
    """Split faksimile word boxes according to how many boxes a word has on the transkription.
    TODO
    """
    # NOTE: the class docstring is kept as it is because Task uses
    # `self.__doc__` as the default task description.

    def __init__(self, xml_source_file, target_dir):
        """Initialise the task for xml_source_file inside target_dir.

        :param xml_source_file: xml file of the page.
        :param target_dir: directory the task files are written to.
        """
        # The module level import of STATUS_MERGED_OK is commented out at the
        # top of this file, so the name is imported here on demand.
        # NOTE(review): presumably it was disabled because of a circular
        # import -- confirm.
        from join_faksimileAndTranskription import STATUS_MERGED_OK
        # The original spelled this `__int__`, so the base initializer was
        # never reached.
        super().__init__(xml_source_file, target_dir,
                         status_contains=STATUS_MERGED_OK)

    def get_node_ids(self):
        """Return node ids for faksimile svg rect.

        Placeholder: the selection of faksimile nodes is not implemented yet,
        so no node is highlighted. Without this override the class could not
        be instantiated, because get_node_ids is abstract in Task.
        """
        # TODO return the ids of the faksimile word boxes that should be split.
        return []

    def select_words(self, words):
        """Returns selected words. TODO

        Currently all words are returned unfiltered.
        """
        # TODO create those functions!!!!
        # return [ word for word in words if word.hasParts() and word.partsMissFaksimilePostion() ]
        return words
class CorrectWords(Task):
    """Correct words from faksimile and from transkription such that they correspond.
    """
    # NOTE: the class docstring doubles as the default task description
    # (Task falls back to `self.__doc__`), hence it is left untouched.

    def __init__(self, xml_source_file, source_svg_file, target_dir, page=None, unmatched_node_ids=None, edit_xml=True):
        """Set up the task and determine the mismatching words right away.

        :param xml_source_file: xml file of the page.
        :param source_svg_file: faksimile svg file, used if the page does not
            reference one itself.
        :param target_dir: directory the task files are written to.
        :param page: an already instantiated Page (optional).
        :param unmatched_node_ids: ids of the faksimile nodes to highlight
            (a fresh list is used if omitted).
        :param edit_xml: whether the xml file is copied to the target dir.
        """
        super().__init__(
            xml_source_file,
            target_dir,
            page=page,
            faksimile_svgFile=source_svg_file,
            edit_transkription=True,
            edit_xml=edit_xml,
        )
        self.unmatched_words = []
        self.unmatched_faksimile_positions = []
        if unmatched_node_ids is None:
            unmatched_node_ids = []
        self.unmatched_node_ids = unmatched_node_ids
        # Defensive: the base initializer normally has loaded the page already.
        if self.page is None:
            self.page = Page(self.xml_source_file)
        self.init_unmatched_words()

    def init_unmatched_words(self):
        """Fill `unmatched_words` and `unmatched_faksimile_positions`.

        The words of the page are compared with the word positions of the
        first faksimile page found for the page's number.
        """
        svg_path = self.page.faksimile_svgFile
        if svg_path is None:
            svg_path = self.faksimile_svgFile
        svg_tree = ET.parse(svg_path)
        faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, page_number=str(self.page.number))
        first_page = faksimile_pages[0]
        mismatches = get_mismatching_ids(self.page.words, first_page.word_positions)
        self.unmatched_words, self.unmatched_faksimile_positions = mismatches

    def create_note_about_missing_words(self):
        """Return a (German) note listing the words that have no counterpart.

        The note always starts with a header line for the page, followed by
        one section per non-empty mismatch list.
        """
        chunks = ['{0},{1}: nicht übereinstimmende Wörter.\n'.format(self.page.title, str(self.page.number))]
        if self.unmatched_words:
            chunks.append('\nFolgende Transkription-Wörter haben keine Entsprechung bei den Wörtern auf dem Faksimile:\n')
            chunks.extend(
                '- "{0}", id="{1}", line_number: {2}\n'.format(word.text, word.id, word.line_number)
                for word in self.unmatched_words
            )
        if self.unmatched_faksimile_positions:
            chunks.append('\nFolgende Faksimile-Wörter haben keine Entsprechung bei den Wörtern der Transkription:\n')
            chunks.extend(
                '- "{0}", id: {1}\n'.format(position.text, position.id)
                for position in self.unmatched_faksimile_positions
            )
        return ''.join(chunks)

    def get_target_filepath(self, page, is_faksimile_svg=True, suffix='.svg', is_finished=False):
        """Return the path of the page's file below the target directory.

        With `is_finished` the path points into the finish directory.
        """
        path_parts = [self.target_dir]
        if is_finished:
            path_parts.append(self.finish_dir)
        path_parts.append(self.create_file_name(page, is_faksimile_svg=is_faksimile_svg, suffix=suffix))
        return sep.join(path_parts)

    def get_node_ids(self):
        """Return the ids of the faksimile nodes that will be highlighted.
        """
        return self.unmatched_node_ids

    def select_words(self, words):
        """Return the unmatched words, or all `words` if there are none.
        """
        if self.unmatched_words:
            return self.unmatched_words
        return words
def usage(func_name):
    """Print usage information for the script.

    Despite its name, `func_name` is expected to be an object with a
    docstring (e.g. a function); its `__doc__` is printed.
    """
    help_text = func_name.__doc__
    print(help_text)
def main_correct_words(argv):
    """This program can be used to create the task 'CorrectWords' in directory ./correct-words.
    svgscripts/create_task.py [OPTIONS] <xml_source_file>
        <xml_source_file>
        OPTIONS:
        -h|--help: show help
        -r|--refdir=dir reference directory
        :return: exit code (int)
    """
    # NOTE: the docstring above is the usage text printed by `usage`.
    # Exit codes: 2 for invalid options or missing arguments, 0 otherwise
    # (skipped files do not change the exit code).
    tmp_dir = './correct-words'
    ref_dir = None
    try:
        opts, args = getopt.getopt(argv, "hr:", ["help", "refdir="])
    except getopt.GetoptError:
        usage(main_correct_words)
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(main_correct_words)
            return 0
        elif opt in ('-r', '--refdir'):
            ref_dir = arg
    if not args:
        usage(main_correct_words)
        return 2
    exit_status = 0
    for xml_source_file in args:
        if not isfile(xml_source_file):
            # Report instead of silently ignoring the argument.
            print('Skipping {0}. File does not exist!'.format(xml_source_file))
            continue
        page = Page(xml_source_file)
        if ref_dir is not None and isdir(ref_dir):
            # If the reference directory contains a file of the same name,
            # its words replace the words of the page.
            ref_file = ref_dir + sep + basename(xml_source_file)
            if isfile(ref_file):
                page.words = Page(ref_file).words
        if page.faksimile_svgFile is not None and isfile(page.faksimile_svgFile):
            correct_words = CorrectWords(xml_source_file, page.faksimile_svgFile, tmp_dir, page=page)
            # Highlight every faksimile position that has no counterpart.
            for faksimile_position in correct_words.unmatched_faksimile_positions:
                correct_words.unmatched_node_ids.append(faksimile_position.id)
            correct_words.create()
        else:
            print('Skipping {0}. File does not contain a valid faksimile_svgFile reference!'.format(xml_source_file))
    return exit_status
def main(argv):
    """Entry point of the script: delegate to main_correct_words.

    :param argv: command line arguments without the program name.
    :return: the exit code returned by main_correct_words.
    """
    exit_code = main_correct_words(argv)
    return exit_code
# Run only when executed as a script; the return value of main becomes the
# process exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))

Event Timeline