Page MenuHomec4science

convertPDF2SVG4Web.py
No OneTemporary

File Metadata

Created
Sat, Jun 1, 00:17

convertPDF2SVG4Web.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert pdf to svg files with the external program 'pdf2svg'.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import getopt
import re
import subprocess
import sys
import PyPDF2
from os import system, sep, listdir, mkdir, path
from os.path import exists, isfile, isdir
# Module metadata: authorship, contact, development status, license and version.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Converter:
    """Convert pdf files to svg files with the external program 'pdf2svg'.

    Args:
        target_dir (str): optional target directory for the svg files. It is
            created if it does not exist yet. If omitted, an existing 'svg'
            directory is used, otherwise no directory prefix is used.
        title (str): optional title used as first part of the target file
            name (blanks are replaced by underscores).
        add_to_page_number (int): correction added to the page number of the
            source file (only applied if greater than 0).

    Raises:
        subprocess.CalledProcessError: if 'which pdf2svg' exits with an error.
        FileNotFoundError: if 'which pdf2svg' does not report an existing file.
    """

    _PDF2SVG_NOT_FOUND = ("External command 'pdf2svg' not found!\n"
                          "Please install 'pdf2svg', check the output of 'which pdf2svg' and retry.")

    def __init__(self, target_dir=None, title=None, add_to_page_number=0):
        if target_dir:
            self.target_dir = target_dir
            if not isdir(self.target_dir):
                mkdir(self.target_dir)
        else:
            self.target_dir = 'svg' if isdir('svg') else ''
        self.title = title.replace(' ', '_') if title else None
        self.page_number = None
        self.add_to_page_number = add_to_page_number
        try:
            cp = subprocess.run(["which", "pdf2svg"], stdout=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError:
            print(self._PDF2SVG_NOT_FOUND)
            raise
        self.path_to_pdf2svg = cp.stdout.decode().strip()
        if not self.path_to_pdf2svg or not isfile(self.path_to_pdf2svg):
            raise FileNotFoundError(self._PDF2SVG_NOT_FOUND)

    def _dir_prefix(self):
        """Return the target directory followed by the path separator, or '' if there is none."""
        return self.target_dir + sep if self.target_dir else ''

    @staticmethod
    def _stem(file_name):
        """Return the base name of file_name without a trailing '.pdf' extension (any case)."""
        base_name = path.basename(file_name)
        stem, extension = path.splitext(base_name)
        return stem if extension.lower() == '.pdf' else base_name

    @staticmethod
    def _count_pages(pdf_file):
        """Return the number of pages of the pdf opened as binary file object pdf_file."""
        # Newer PyPDF2 releases expose PdfReader/pages and have deprecated the
        # PdfFileReader/numPages API (see the PyPDF2 migration guide); older
        # releases only know the latter, so both are supported here.
        if hasattr(PyPDF2, 'PdfReader'):
            return len(PyPDF2.PdfReader(pdf_file, strict=False).pages)
        return PyPDF2.PdfFileReader(pdf_file, strict=False).numPages

    def get_page_number(self, file_name, page_number=None):
        """Return the page number as a string, left-padded with zeros to at least 3 characters.

        If no page_number is given, the last group of digits found in file_name is used.
        An empty string is returned if no page number can be determined.
        """
        if not page_number and re.search(r'\d', file_name):
            # Use the last run of digits of the file name as page number.
            page_number = re.findall(r'\d+', file_name)[-1]
        if not page_number:
            # Nothing to correct or to pad (previously this crashed on int(None)
            # whenever add_to_page_number was set).
            return ''
        page_number = str(page_number)  # accept int page numbers as well
        if self.add_to_page_number > 0:
            page_number = str(self.add_to_page_number + int(page_number))
        return page_number.rjust(3, '0')

    def get_file_name(self, file_name, is_part_of_multi_page_doc=False, page_number=None):
        """Return the file name of the target svg file.

        For a multi-page document the returned name contains the pattern '%03d'
        in place of the page number.
        """
        dir_name = self._dir_prefix()
        if is_part_of_multi_page_doc:
            if self.title:
                return dir_name + self.title.replace(' ', '_') + '_page%03d_web.svg'
            return dir_name + self._stem(file_name) + '_page%03d_web.svg'
        if self.title:
            return dir_name + self.title + '_page' + self.get_page_number(file_name, page_number=page_number) + '_web.svg'
        return dir_name + self._stem(file_name) + '_web.svg'

    def pdf2svg(self, file_name, page_number=None, name_dictionary=None):
        """Convert a pdf to svg file(s) using the external program 'pdf2svg'.

        Args:
            file_name (str): the pdf file to convert.
            page_number (str): page number used for the target name of a single-page pdf.
            name_dictionary (dict): for a multi-page pdf, maps page numbers to target
                file names; 'TITLE' in a name is replaced by the title, if there is one.

        :returns: return_code (int) of subprocess executing pdf2svg

        Raises:
            FileNotFoundError: if file_name is not an existing file.
            subprocess.CalledProcessError: if pdf2svg exits with an error.
        """
        if not isfile(file_name):
            raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
        # NOTE: the value is stored as str(page_number), i.e. None becomes 'None'.
        self.page_number = str(page_number)
        # The file is only needed for counting pages; 'with' closes it even on errors.
        with open(file_name, 'rb') as pdf_file:
            number_of_pages = self._count_pages(pdf_file)
        if number_of_pages == 1:
            svg_file_name = self.get_file_name(file_name, page_number=page_number)
            cp = subprocess.run([self.path_to_pdf2svg, file_name, svg_file_name], check=True)
            return cp.returncode
        dir_name = self._dir_prefix()
        targets = {}
        for index, svg_file_name in (name_dictionary or {}).items():
            if self.title:
                svg_file_name = svg_file_name.replace('TITLE', self.title)
            targets[index] = dir_name + svg_file_name.replace('.svg', '') + '.svg'
        if not targets:
            # No explicit page selection: hand the page key 'all' and a '%03d' name
            # pattern to pdf2svg (presumably expanded per page by pdf2svg itself).
            targets = {"all": self.get_file_name(file_name, True)}
        return_code = 0
        for index, svg_file_name in targets.items():
            cp = subprocess.run([self.path_to_pdf2svg, file_name, svg_file_name, str(index)], check=True)
            return_code = cp.returncode
        return return_code
def usage():
    """Print the command line help, i.e. the docstring of main."""
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program converts pdf to svg files with the help of the external program 'pdf2svg'.

    svgscripts/convertPDF2SVG4Web.py [-h|--help, -a|--add-to-page-number=value, -d|--dir=targetDir -t|--title=title] <file|dir> ...
    svgscripts/convertPDF2SVG4Web.py [-h|--help, -a|--add-to-page-number=value, -t|--title=title, -p|--page=pageNumber,
                                      -d|--dir=targetDir -n|--name-dict='{"pageNumber": "file_name", ...}'] <file>

        -h|--help:                      show help
        -a|--add-to-page-number=value:  value to add to the page number specification of the pdf file that will be used as the
                                        file name of the target svg file, e.g. -a 2 TITLE_page001.pdf -> TITLE_page003.svg
        -d|--dir=targetDir:             target directory for the svg file(s)
        -t|--title=title:               title that will be used as part of the target svg file(s)' filename
        -p|--page=pageNumber:           page number of the target svg file. For use with _one_ file only.
        -n|--name-dict='{"pageNumber": "file_name", ...}': For a multipage pdf, --name-dict can be used to pass a dictionary with
                                        page numbers (str) as keys and file names (str) as values.
                                        The script will extract only those pages for which there are keys.
                                        E.g. -n '{"3":"TITLE_page001","5":"TITLE_page003"}' TITLE_multipage.pdf -> TITLE_page001.svg
                                                                                                                  TITLE_page003.svg

    :return: exit code (int)
    """
    # argv: the command line arguments without the program name (sys.argv[1:]).
    # The docstring above doubles as help text (it is printed by usage()), hence
    # implementation notes are kept in comments.
    import ast  # local import: only needed for parsing --name-dict

    target_dir = ".{}svg".format(sep)
    title = None
    page_number = None
    add_to_page_number = 0
    name_dictionary = {}
    try:
        opts, args = getopt.getopt(argv, "ha:d:t:p:n:", ["help", "add-to-page-number=", "dir=", "title=", "page=", "name-dict="])
    except getopt.GetoptError:
        usage()
        return 2
    if not args:
        usage()
        # Kept for backward compatibility: options without any input file
        # result in exit code 0, no arguments at all in exit code 2.
        return 0 if opts else 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-a', '--add-to-page-number'):
            try:
                add_to_page_number = int(arg)
            except ValueError:
                print("ERROR: option --add-to-page-number expects an integer, got '{}'!".format(arg))
                return 2
        elif opt in ('-d', '--dir'):
            target_dir = arg
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-n', '--name-dict'):
            # SECURITY: this used to be eval(arg), which executes arbitrary code
            # passed on the command line. literal_eval only accepts literals.
            try:
                name_dictionary = ast.literal_eval(arg)
            except (ValueError, SyntaxError, TypeError):
                name_dictionary = None
            if not isinstance(name_dictionary, dict):
                print("ERROR: option --name-dict expects a dictionary, e.g. '{\"3\": \"TITLE_page001\"}'!")
                return 2
    files_to_process = []
    for arg in args:
        if isfile(arg):
            files_to_process.append(arg)
        elif isdir(arg):
            # listdir returns bare names: join them with the directory so that
            # the files are found regardless of the current working directory.
            pdf_files = (path.join(arg, name) for name in sorted(listdir(arg)) if name.lower().endswith('.pdf'))
            files_to_process.extend(pdf_file for pdf_file in pdf_files if isfile(pdf_file))
        else:
            print("'{}' does not exist!".format(arg))
            return 2
    # Validate before creating the Converter, which may create the target directory.
    if len(files_to_process) > 1 and (page_number or name_dictionary):
        print("ERROR: too many input files: option --page and --name-dict presuppose one input file!")
        usage()
        return 2
    converter = Converter(target_dir=target_dir, title=title, add_to_page_number=add_to_page_number)
    for pdf_file in files_to_process:
        converter.pdf2svg(pdf_file, name_dictionary=name_dictionary, page_number=page_number)
    return 0
# Script entry point: run main with the command line arguments (without the
# program name) and hand its return value to the shell as exit code.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

Event Timeline