Page MenuHomec4science

create_blank_svg_files.py
No OneTemporary

File Metadata

Created
Wed, Jun 19, 14:41

create_blank_svg_files.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to create svg files with a rect for the text_field.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
import csv
import getopt
import json
import lxml.etree as ET
import shutil
import subprocess
import sys
import os
import wget
from os import listdir, sep, path, setpgrp, devnull, mkdir, remove
from os.path import exists, isfile, isdir, dirname, basename
from progress.bar import Bar
import warnings
from fix_old_data import save_page
from get_text_field import get_text_field_on_image, get_text_field_on_thumb
sys.path.append('svgscripts')
from datatypes.faksimile_image import FaksimileImage
from datatypes.faksimile import FaksimilePage
from datatypes.archival_manuscript import ArchivalManuscriptUnity
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.matrix import Matrix
from util import back_up, back_up_svg_file, copy_faksimile_update_image_location, copy_faksimile_svg_file
from process_files import update_svgposfile_status
from process_words_post_merging import update_faksimile_line_positions, MERGED_DIR
sys.path.append('shared_util')
from myxmlwriter import copy_to_bak_dir, write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from main_util import create_function_dictionary
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
# NOTE(review): not referenced anywhere in this file -- presumably a tolerance
# for svg x/y coordinates; confirm whether it is still needed.
MAX_SVG_XY_THRESHOLD = 10
# Value looked for in the "status" attribute of <page> elements when the
# default process selects the pages it creates svg files for.
BLANK_STATUS = 'blank'
class SPARQLQuery:
    """Constants describing the curl request that asks the SPARQL endpoint
    for the average left, top, width and height of all textfields.
    """
    # Address of the SPARQL service.
    ENDPOINT = 'https://nietzsche.fuseki.services.dasch.swiss/nietzsche'
    # curl option/value pair selecting the HTTP method.
    REQUEST_OPTION = '--request'
    REQUEST_VALUE = 'POST'
    # curl option/value pair asking for a JSON result set.
    HEADER_OPTION = '--header'
    HEADER_VALUE = 'Accept: application/sparql-results+json,*/*;q=0.9'
    # URL-encoded request body; the query selects avg_left, avg_top,
    # avg_width and avg_height.
    QUERY = 'query=PREFIX+tln%3A+%3Chttp%3A%2F%2Fwww.nie.org%2Fontology%2Fnietzsche%23%3E%0ASELECT+(AVG(%3Fleft)+as+%3Favg_left)+(AVG(%3Ftop)+as+%3Favg_top)+(AVG(%3Fwidth)+as+%3Favg_width)+(AVG(%3Fheight)+as+%3Favg_height)+WHERE+%7B%0A+%3Fimage+a+tln%3AFaksimileImage%3B+tln%3AhasTextField+%3Ftextfield.%0A+%3Ftextfield+tln%3AhasLeft+%3Fleft%3B+tln%3AhasTop+%3Ftop%3B+tln%3AhasWidth+%3Fwidth%3B+tln%3AhasHeight+%3Fheight.%7D+'
class OldSVGFileCreator:
    """This class can be used in order to create svg files with textfield rects.

    On construction the paths of the external programs "curl" and "inkscape"
    are resolved and the average textfield dimensions are read from the
    response of the SPARQL endpoint, which is cached in the file RESPONSE.
    """
    UNITTESTING = False
    # File in the current working directory that caches the endpoint response.
    RESPONSE = 'response.json'

    def __init__(self, title, faksimile_dir, endpoint=SPARQLQuery.ENDPOINT, target_dir='./tmp'):
        # -1 marks "not yet known"; _init_averages overwrites these values.
        self.avg_left = -1
        self.avg_top = -1
        self.avg_height = -1
        self.avg_width = -1
        self.endpoint = endpoint
        self.faksimile_dir = faksimile_dir
        self.target_dir = target_dir
        self.title = title
        self.curl = self._get_ext_program_path('curl')
        self.inkscape = self._get_ext_program_path('inkscape')
        self.namespaces = None
        self._init_averages()

    def _init_averages(self):
        """Initialize average dimension of textfield based on the data from the endpoint.

        The endpoint is only queried if the cached response file does not exist.
        """
        if not isfile(self.RESPONSE):
            subprocess.run([
                self.curl, self.endpoint,
                SPARQLQuery.REQUEST_OPTION, SPARQLQuery.REQUEST_VALUE,
                '--data', SPARQLQuery.QUERY,
                SPARQLQuery.HEADER_OPTION, SPARQLQuery.HEADER_VALUE,
                '-o', self.RESPONSE,
            ], check=True)
        with open(self.RESPONSE) as json_file:
            data = json.load(json_file)
        # Every variable of the result head becomes an attribute of the same name.
        for key in data['head']['vars']:
            for item in data['results']['bindings']:
                setattr(self, key, float(item[key]['value']))

    def _get_ext_program_path(self, program_name) -> str:
        """Return path to external program.

        [raise] FileNotFoundError if "which" reports no usable file,
                subprocess.CalledProcessError if "which" itself fails.
        """
        program_path = None
        error_msg = f'External command "{program_name}" not found!\nPlease install "{program_name}", check the output of "which {program_name}" and retry.'
        try:
            cp = subprocess.run(["which", program_name], stdout=subprocess.PIPE, check=True)
            program_path = cp.stdout.decode().strip()
            if not bool(program_path) or not isfile(program_path):
                raise FileNotFoundError(error_msg)
        except subprocess.CalledProcessError:
            print(error_msg)
            raise
        return program_path

    def create_svg_file(self, page: ET.Element) -> int:
        """ Create a svg file.

        The faksimile image of the page is downloaded if it is missing; inkscape
        then converts it to a svg file to which the textfield rect is attached.

        [return] exit_status (0 on success, 2 on error)
        """
        number = page.get('number')
        page_file = page.get('output')
        alias = page.get('alias')
        faksimile_file = self.faksimile_dir + sep + alias + '.jpg'\
                if alias is not None\
                else None
        page_id = self.title.replace(' ', '_') + '_' + number
        if bool(alias) and not isfile(faksimile_file):
            wget.download(FaksimileImage.NIETZSCHE_SOURCES_URL + alias, out=faksimile_file)
        if bool(alias) and isfile(faksimile_file) and page_file is not None and isfile(page_file):
            target_file = basename(page_file).replace('.xml', '.svg')
            # Nothing to do if the svg file already exists in the target directory.
            if not isfile(self.target_dir + sep + target_file):
                prog_list = [self.inkscape, '-z', '-l', target_file, faksimile_file]
                subprocess.run(prog_list, check=True)
                svg_tree = ET.parse(target_file)
                # lxml maps the default namespace to None, xpath needs a real prefix.
                namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
                images = svg_tree.xpath('//ns:image', namespaces=namespaces)
                if len(images) > 0:
                    image_file = dirname(target_file) + sep + images[0].get('{%s}href' % namespaces['xlink'])
                    image_width = float(images[0].get('width')) if bool(images[0].get('width')) else 0
                    image_height = float(images[0].get('height')) if bool(images[0].get('height')) else 0
                    text_field = get_text_field_on_image(image_file, image_width, image_height, id=page_id)
                    text_field.attach_as_rect(svg_tree.getroot())
                    copy_faksimile_update_image_location(faksimile_tree=svg_tree, target_directory=self.target_dir)
                    # The intermediate file written by inkscape is no longer needed.
                    remove(target_file)
                else:
                    print(f'There has been an error: could not find an image in {target_file}!')
                    return 2
            return 0
        else:
            print(faksimile_file, page_file)
            return 2

    def update_textfield_of_svg_file(self, svg_file: str) -> int:
        """ Update the textfield of the svg file by using image analysis.

        [return] exit_status (0 on success, 2 if rect or image are missing)
        """
        # Parse the file that was passed in (the name "target_file" used
        # previously is not defined in this method).
        svg_tree = ET.parse(svg_file)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        rects = svg_tree.xpath('//ns:rect[not(contains(@id, "rect"))]', namespaces=namespaces)
        images = svg_tree.xpath('//ns:image', namespaces=namespaces)
        if len(rects) > 0 and len(images) > 0:
            image_file = dirname(svg_file) + sep + images[0].get('{%s}href' % namespaces['xlink'])
            image_width = float(images[0].get('width')) if bool(images[0].get('width')) else 0
            image_height = float(images[0].get('height')) if bool(images[0].get('height')) else 0
            text_field = get_text_field_on_image(image_file, image_width, image_height)
            text_field.attach_as_rect(rects[0])
            copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
            return 0
        print(f'There has been an error: could not find a rect and an image in {svg_file}!')
        return 2
class SVGFileCreator:
    """This class can be used in order to create svg files with textfield rects.

    The svg files are produced by the external program "inkscape", whose path
    is resolved on construction.
    """
    UNITTESTING = False

    def __init__(self, title=None, faksimile_dir=None, target_dir='./tmp'):
        self.faksimile_dir = faksimile_dir
        self.target_dir = target_dir
        self.title = title
        self.inkscape = self._get_ext_program_path('inkscape')
        self.namespaces = None

    def _get_ext_program_path(self, program_name) -> str:
        """Return path to external program.

        [raise] FileNotFoundError if "which" reports no usable file,
                subprocess.CalledProcessError if "which" itself fails.
        """
        program_path = None
        error_msg = f'External command "{program_name}" not found!\nPlease install "{program_name}", check the output of "which {program_name}" and retry.'
        try:
            cp = subprocess.run(["which", program_name], stdout=subprocess.PIPE, check=True)
            program_path = cp.stdout.decode().strip()
            if not bool(program_path) or not isfile(program_path):
                raise FileNotFoundError(error_msg)
        except subprocess.CalledProcessError:
            print(error_msg)
            raise
        return program_path

    def create_svg_file(self, page: ET.Element) -> int:
        """ Create a svg file for a page of a manuscript.

        The faksimile image of the page is downloaded if it is missing.

        [return] exit_status (0 on success, 2 on error)
        """
        number = page.get('number')
        page_file = page.get('output')
        alias = page.get('alias')
        faksimile_file = self.faksimile_dir + sep + alias + '.jpg'\
                if alias is not None\
                else None
        page_id = self.title.replace(' ', '_') + '_' + number
        if bool(alias) and not isfile(faksimile_file):
            wget.download(FaksimileImage.NIETZSCHE_SOURCES_URL + alias, out=faksimile_file)
        if bool(alias) and isfile(faksimile_file) and page_file is not None and isfile(page_file):
            target_file = basename(page_file).replace('.xml', '.svg')
            return self._create_svg_file(target_file, faksimile_file, page_id)
        else:
            print(faksimile_file, page_file)
            return 2

    def _create_svg_file(self, target_file, faksimile_file, page_id=0) -> int:
        """ Create a svg file from a faksimile image and attach the textfield rect.

        [return] exit_status (0 on success, 2 if the svg contains no image)
        """
        # Nothing to do if the svg file already exists in the target directory.
        if not isfile(self.target_dir + sep + target_file):
            prog_list = [self.inkscape, '-z', '-l', target_file, faksimile_file]
            subprocess.run(prog_list, check=True)
            svg_tree = ET.parse(target_file)
            # lxml maps the default namespace to None, xpath needs a real prefix.
            namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
            images = svg_tree.xpath('//ns:image', namespaces=namespaces)
            if len(images) > 0:
                image_file = dirname(target_file) + sep + images[0].get('{%s}href' % namespaces['xlink']).replace('file://', '')
                image_width = float(images[0].get('width')) if bool(images[0].get('width')) else 0
                image_height = float(images[0].get('height')) if bool(images[0].get('height')) else 0
                print(image_file, image_width, image_height)
                text_field = get_text_field_on_image(image_file, image_width, image_height, id=page_id)
                text_field.attach_as_rect(svg_tree.getroot())
                copy_faksimile_update_image_location(faksimile_tree=svg_tree, target_directory=self.target_dir)
                # The intermediate file written by inkscape is no longer needed.
                remove(target_file)
            else:
                print(f'There has been an error: could not find an image in {target_file}!')
                return 2
        return 0

    def create_rotation_svg_file(self, page_file: str) -> int:
        """ Create a svg file for a thumb faksimile that needs rotation.

        page_file: path of the xml file of a page; its first
                   faksimile-image/@transform is applied to the thumb svg.

        [return] exit_status (0 on success, 2 on error)
        """
        page_tree = ET.parse(page_file)
        transform = page_tree.xpath('//faksimile-image/@transform')[0]
        # An explicit @thumb attribute overrides the name derived from @file-name.
        thumbs = page_tree.xpath('//faksimile-image/@thumb')
        if len(thumbs) > 0:
            thumb = thumbs[0]
        else:
            thumb = page_tree.xpath('//faksimile-image/@file-name')[0].replace('.jpg', '_thumb.svg')
        thumb_file = self.faksimile_dir + sep + thumb
        faksimile_file = thumb_file.replace('.svg', '.jpg')
        if not isfile(thumb_file) and isfile(faksimile_file):
            # Let inkscape write the missing thumb svg (the name "target_file"
            # used previously is not defined in this method).
            prog_list = [self.inkscape, '-z', '-l', thumb_file, faksimile_file]
            subprocess.run(prog_list, check=True)
        elif not isfile(thumb_file) and not isfile(faksimile_file):
            print(f'There has been an error: could not find the faksimile_file {faksimile_file}!')
            return 2
        if isfile(thumb_file):
            svg_tree = ET.parse(thumb_file)
            namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
            images = svg_tree.xpath('//ns:image', namespaces=namespaces)
            if len(images) > 0:
                matrix = Matrix(transform)
                if matrix.matrix[matrix.B] != 0.0 and matrix.matrix[matrix.C] != 0.0:
                    width = float(svg_tree.getroot().get('width'))
                    height = float(svg_tree.getroot().get('height'))
                    matrix.updateOffset(height, width)
                # NOTE(review): href and transform are written for every matrix,
                # not only when the offset was updated -- confirm this is intended.
                images[0].set('{%s}href' % namespaces['xlink'], basename(faksimile_file))
                svg_tree.getroot().set('transform', matrix.toString())
                copy_faksimile_svg_file(target_file=thumb_file, faksimile_tree=svg_tree)
            else:
                print(f'There has been an error: could not find an image in {thumb_file}!')
                return 2
            return 0
        else:
            print(thumb_file, page_file)
            return 2

    def create_svg_thumb_file(self, faksimile_file: str) -> int:
        """ Create a svg thumb file that shows only the textfield of the thumb image.

        [return] exit_status (0 on success, 2 on error)
        """
        if not isfile(faksimile_file):
            # Report the missing file (this branch used to reference the
            # undefined name "page_file").
            print(f'There has been an error: could not find the faksimile_file {faksimile_file}!')
            return 2
        target_file = faksimile_file.replace('.jpg', '.svg')
        if not isfile(target_file):
            prog_list = [self.inkscape, '-z', '-l', target_file, faksimile_file]
            subprocess.run(prog_list, check=True)
        svg_tree = ET.parse(target_file)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        images = svg_tree.xpath('//ns:image', namespaces=namespaces)
        if len(images) == 0:
            print(f'There has been an error: could not find an image in {target_file}!')
            return 2
        text_field, width, height = get_text_field_on_thumb(faksimile_file)
        images[0].set('width', str(width))
        images[0].set('height', str(height))
        images[0].set('{%s}href' % namespaces['xlink'], basename(faksimile_file))
        # The svg gets the size of the textfield and its viewBox selects the textfield.
        root = svg_tree.getroot()
        root.set('width', str(text_field.width))
        root.set('height', str(text_field.height))
        root.set('viewBox', f'{text_field.left} {text_field.top} {text_field.width} {text_field.height}')
        copy_faksimile_svg_file(target_file=target_file, faksimile_tree=svg_tree)
        return 0

    @staticmethod
    def UPDATE_TEXTFIELD_OF_SVG_FILE(svg_file: str) -> int:
        """ Update the textfield of the svg file by using image analysis.

        [return] exit_status (0 on success, 2 if rect or image are missing)
        """
        svg_tree = ET.parse(svg_file)
        namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
        rects = svg_tree.xpath('//ns:rect[not(contains(@id, "rect"))]', namespaces=namespaces)
        images = svg_tree.xpath('//ns:image', namespaces=namespaces)
        if len(rects) > 0 and len(images) > 0:
            image_file = dirname(svg_file) + sep + images[0].get('{%s}href' % namespaces['xlink'])
            image_width = float(images[0].get('width')) if bool(images[0].get('width')) else 0
            image_height = float(images[0].get('height')) if bool(images[0].get('height')) else 0
            text_field = get_text_field_on_image(image_file, image_width, image_height)
            text_field.update_rect(rects[0])
            copy_faksimile_svg_file(target_file=svg_file, faksimile_tree=svg_tree)
            return 0
        print(f'There has been an error: could not find a rect and an image in {svg_file}!')
        return 2
class CSVSVGFileCreator(SVGFileCreator):
    """Creator of svg files with textfield rects for faksimiles listed in a csv file.
    """
    # Column names of the csv file.
    FOLIO = "Blatt-ID"
    LABEL = "Label"
    ALIAS = "Alias"
    URL = "Nietzschesource-URL"
    CONTENT = "Inhalt"

    def __init__(self, title=None, faksimile_dir=None, target_dir='./tmp'):
        super().__init__(title=title, faksimile_dir=faksimile_dir, target_dir=target_dir)

    def create_svg_file_from_csv_input(self, faksimile_file, page_id) -> int:
        """Create the svg file for one faksimile image named in the csv file.

        [return] exit_status of _create_svg_file
        """
        svg_name = basename(faksimile_file).replace('.jpg', '.svg')
        return self._create_svg_file(svg_name, faksimile_file, page_id)
def process_update(args) -> int:
    """ Process option update

    Updates the textfield of every svg file in the directory args[0].

    [return] exit_status (2 if the directory argument is missing, else 0)
    [raise] FileNotFoundError if args[0] is not a directory
    """
    # The directory argument is mandatory ("len(args) < 0" could never be true).
    if not args:
        usage()
        return 2
    svg_dir = args[0]
    if not isdir(svg_dir):
        raise FileNotFoundError(f'Directory {svg_dir} does not exist!')
    counter = 0
    svg_files = [ svg_dir + sep + file_name for file_name in listdir(svg_dir)\
            if isfile(svg_dir + sep + file_name) and file_name.endswith('.svg') ]
    for svg_file in svg_files:
        if not SVGFileCreator.UNITTESTING:
            print(Fore.CYAN + f'Updating svg file {svg_file} ...' + Style.RESET_ALL)
        # Keep a backup before the svg file is overwritten.
        copy_to_bak_dir(svg_file)
        if SVGFileCreator.UPDATE_TEXTFIELD_OF_SVG_FILE(svg_file) == 0:
            counter += 1
    if not SVGFileCreator.UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages created]')
    return 0
def process_rotate(args) -> int:
    """ Process option rotate

    args[0] is a xml manuscript file or a svg_pos_file, args[1] the faksimile
    directory. A rotation svg file is created for every selected page whose
    page file contains a faksimile-image with a transform attribute.

    [return] exit_status (2 if arguments are missing, else 0)
    [raise] FileNotFoundError if the faksimile directory or the manuscript
            file of a svg_pos_file does not exist
    """
    if len(args) < 2:
        usage()
        return 2
    xml_file = args[0]
    faksimile_dir = args[1]
    if not isdir(faksimile_dir):
        raise FileNotFoundError(f'Directory {faksimile_dir} does not exist!')
    xpath = '//page'
    source_tree = ET.parse(xml_file)
    if len(xml_file.split('_')) > 2: # svg_pos_file
        # The pages are listed in the manuscript file the svg_pos_file belongs to.
        manuscript_file = '_'.join(xml_file.split('_')[0:2]) + '.xml'
        if isfile(manuscript_file):
            source_tree = ET.parse(manuscript_file)
            xpath = f'//page[contains(@output,"{xml_file}")]'
        else:
            raise FileNotFoundError(f'There is no manuscript file {manuscript_file} for svg_pos_file {xml_file}!')
    counter = 0
    svg_creator = SVGFileCreator('', faksimile_dir)
    for page in source_tree.xpath(xpath):
        page_file = page.get('output')
        # Pages without an existing page file cannot be inspected or rotated.
        if page_file is None or not isfile(page_file):
            continue
        if len(ET.parse(page_file).xpath('//faksimile-image/@transform')) == 0:
            continue
        if not SVGFileCreator.UNITTESTING:
            number = page.get('number')
            print(Fore.CYAN + f'Rotating svg file for page {number} ...' + Style.RESET_ALL)
        # create_rotation_svg_file expects the path of the page file, not the element.
        if svg_creator.create_rotation_svg_file(page_file) == 0:
            counter += 1
    if not SVGFileCreator.UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages created]')
    return 0
def process_thumb(args) -> int:
    """ Process option thumb

    args[0] is the faksimile directory, the optional args[1] a title that the
    names of the thumb files have to start with.

    [return] exit_status (2 if the directory argument is missing, else 0)
    """
    if not args:
        usage()
        return 2
    faksimile_dir = args[0]
    if len(args) > 1:
        title = args[1]
    else:
        title = ''
    svg_creator = SVGFileCreator('', faksimile_dir)
    print(faksimile_dir, title)
    if not isdir(faksimile_dir):
        raise FileNotFoundError(f'Directory {faksimile_dir} does not exist!')
    # Collect all thumb images of the directory before any of them is processed.
    thumb_files = []
    for file_name in listdir(faksimile_dir):
        thumb_path = faksimile_dir + sep + file_name
        if isfile(thumb_path)\
                and basename(file_name).startswith(title)\
                and file_name.endswith('_thumb.jpg'):
            thumb_files.append(thumb_path)
    created = 0
    for faksimile_file in thumb_files:
        if not SVGFileCreator.UNITTESTING:
            print(Fore.CYAN + f'Creating a svg for faksimile file {faksimile_file} ...' + Style.RESET_ALL)
        if svg_creator.create_svg_thumb_file(faksimile_file) == 0:
            created += 1
    if not SVGFileCreator.UNITTESTING:
        print(Style.RESET_ALL + f'[{created} thumb svg files created]')
    return 0
def process_csv(csv_file_name, faksimile_dir, target_dir, downloadOnly=False) -> int:
    """ Process a csv file with faksimile information.

    Every row whose label is not a thumb is downloaded to faksimile_dir if the
    image is missing; unless downloadOnly is set, a svg file is created for it.

    [return] exit_status (0 on success, 2 if any row could not be processed)
    """
    title = basename(csv_file_name).replace('.csv','').replace('_', ' ')
    file_creator = CSVSVGFileCreator(title, faksimile_dir, target_dir)
    exit_status = 0
    with open(csv_file_name, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            faksimile_name = row[CSVSVGFileCreator.LABEL]
            if '_thumb' in faksimile_name:
                continue
            faksimile_url = row[CSVSVGFileCreator.URL]
            page_id = row[CSVSVGFileCreator.ALIAS].replace(',', '').replace(' ', '_')
            faksimile_file = faksimile_dir + sep + faksimile_name
            if not isfile(faksimile_file):
                try:
                    wget.download(faksimile_url, out=faksimile_file)
                except (OSError, ValueError) as error:
                    # A failed download must not abort the remaining rows
                    # (urllib errors derive from OSError, a malformed url
                    # raises ValueError).
                    exit_status = 2
                    print(f'There has been an error on downloading {faksimile_url}!', error)
                    continue
            if downloadOnly:
                continue
            if isfile(faksimile_file):
                print(f'processing {faksimile_file} ...')
                if file_creator.create_svg_file_from_csv_input(faksimile_file, page_id) > 0:
                    exit_status = 2
            else:
                # The download raised no error but did not produce the file.
                exit_status = 2
                print(f'There has been an error on downloading {faksimile_url}!')
    return exit_status
def process_default(args) -> int:
    """ Default process

    args: input file (xml manuscript file, svg_pos_file or csv file),
          faksimile directory and target directory.

    [return] exit_status
    [raise] FileNotFoundError if the faksimile directory, the input file or
            the manuscript file of a svg_pos_file does not exist
    """
    if len(args) < 3:
        usage()
        return 2
    input_file, faksimile_dir, target_dir = args[0], args[1], args[2]
    if not isdir(target_dir):
        mkdir(target_dir)
    if input_file.endswith('.csv'):
        return process_csv(input_file, faksimile_dir, target_dir)
    xml_file = input_file
    # Report a missing directory before a missing file.
    if not isdir(faksimile_dir):
        raise FileNotFoundError(f'Directory {faksimile_dir} does not exist!')
    if not isfile(xml_file):
        raise FileNotFoundError('File {} does not exist!'.format(xml_file))
    created = 0
    name_parts = xml_file.split('_')
    if len(name_parts) > 2: # svg_pos_file
        # The page is looked up in the manuscript file the svg_pos_file belongs to.
        manuscript_file = '_'.join(name_parts[0:2]) + '.xml'
        if not isfile(manuscript_file):
            raise FileNotFoundError(f'There is no manuscript file {manuscript_file} for svg_pos_file {xml_file}!')
        source_tree = ET.parse(manuscript_file)
        xpath = f'//page[contains(@output,"{xml_file}")]'
    else:
        xpath = f'//page[contains(@status, "{BLANK_STATUS}")]'
        source_tree = ET.parse(xml_file)
    title = source_tree.getroot().get('title')
    svg_creator = SVGFileCreator(title, faksimile_dir, target_dir=target_dir)
    for page in source_tree.xpath(xpath):
        if not SVGFileCreator.UNITTESTING:
            number = page.get('number')
            print(Fore.CYAN + f'Creating a svg file for {title}, {number} ...' + Style.RESET_ALL)
        exit_status = svg_creator.create_svg_file(page)
        if exit_status == 0:
            created += 1
    if not SVGFileCreator.UNITTESTING:
        print(Style.RESET_ALL + f'[{created} pages created]')
    return 0
def usage():
    """Print the help text of the script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to create svg files with a rect for the text_field.

    fixes/create_blank_svg_files.py [OPTIONS] <xmlManuscriptFile|svg_pos_file|csv_file> <faksimile-dir> <target-dir>
    fixes/create_blank_svg_files.py -r|--rotate-thumb <xmlManuscriptFile|svg_pos_file> <faksimile-dir>
    fixes/create_blank_svg_files.py -u|--update <svg-dir>
    fixes/create_blank_svg_files.py -t|--thumb <faksimile-dir> [<title>]

        <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
        <svg_pos_file> a xml file about a page, containing information about svg word positions.
        <csv_file> a csv file with faksimile information.
        <faksimile-dir> a directory containing faksimile images
        <target-dir> the directory where the files should be saved to

        OPTIONS:
        -h|--help show help
        -r|--rotate-thumb use image analysis in order to create a svg file for the thumb faksimile files that need rotation.
        -u|--update update svg_files: use image analysis in order to update the textfield of the svg_files
        -t|--thumb use image analysis in order to create a svg file for the thumb faksimile files that need cropping.

        :return: exit code (int)
    """
    update = False
    thumb = False
    # Must start as False: otherwise every call without -u ends up in
    # process_rotate and the thumb and default processes are unreachable.
    rotate = False
    try:
        opts, args = getopt.getopt(argv, "hrut", ["help","rotate-thumb","update", "thumb"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-r', '--rotate-thumb'):
            rotate = True
        elif opt in ('-u', '--update'):
            update = True
        elif opt in ('-t', '--thumb'):
            thumb = True
    # If several options are given, update wins over rotate, rotate over thumb.
    if update:
        return process_update(args)
    elif rotate:
        return process_rotate(args)
    elif thumb:
        return process_thumb(args)
    return process_default(args)
# Script entry point: pass the command line arguments (without the program
# name) to main and use its return value as the exit status of the process.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

Event Timeline