Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F65123029
convertPDF2SVG4Web.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jun 1, 00:17
Size
9 KB
Mime Type
text/x-python
Expires
Mon, Jun 3, 00:17 (2 d)
Engine
blob
Format
Raw Data
Handle
18007376
Attached To
rNIETZSCHEPYTHON nietzsche-python
convertPDF2SVG4Web.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert pdf to svg files with the external program 'pdf2svg'.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import
getopt
import
re
import
subprocess
import
sys
import
PyPDF2
from
os
import
system
,
sep
,
listdir
,
mkdir
,
path
from
os.path
import
exists
,
isfile
,
isdir
# Module metadata (author, maintainer, copyright holder, contact, status, license, version).
__author__ = "Christian Steiner"
__maintainer__ = __author__  # the maintainer is the same person as the author
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Converter:
    """Convert pdf files to svg files with the external program 'pdf2svg'.

    Args:
        target_dir (str): optional target directory for the svg files; it is
            created if it does not exist yet. Without it, an existing 'svg'
            directory in the current working directory is used, otherwise the
            current working directory itself.
        title (str): optional title used as first part of the target file
            name (blanks are replaced by underscores).
        add_to_page_number (int): correction that is added to the page number
            of the source file (only positive corrections are applied).

    Raises:
        subprocess.CalledProcessError: if 'which pdf2svg' exits with an error.
        FileNotFoundError: if 'which pdf2svg' does not point to an existing file.
    """

    # Message shown whenever the external program cannot be located.
    PDF2SVG_NOT_FOUND_MESSAGE = (
        "External command 'pdf2svg' not found!\n"
        "Please install 'pdf2svg', check the output of 'which pdf2svg' and retry."
    )

    def __init__(self, target_dir=None, title=None, add_to_page_number=0):
        if target_dir:
            self.target_dir = target_dir
            if not isdir(self.target_dir):
                mkdir(self.target_dir)
        else:
            self.target_dir = 'svg' if isdir('svg') else ''
        self.title = title.replace(' ', '_') if title else None
        self.page_number = None
        self.add_to_page_number = add_to_page_number
        try:
            cp = subprocess.run(["which", "pdf2svg"], stdout=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError:
            print(self.PDF2SVG_NOT_FOUND_MESSAGE)
            raise
        self.path_to_pdf2svg = cp.stdout.decode().strip()
        if not self.path_to_pdf2svg or not isfile(self.path_to_pdf2svg):
            raise FileNotFoundError(self.PDF2SVG_NOT_FOUND_MESSAGE)

    def _dir_prefix(self):
        """Return the target directory followed by the path separator, or '' if there is none."""
        return self.target_dir + sep if self.target_dir else ''

    @staticmethod
    def _count_pages(pdf_file):
        """Return the number of pages of the pdf document in the open binary file pdf_file."""
        # PyPDF2 >= 1.28 offers PdfReader; the older PdfFileReader stopped working in 3.0.
        if hasattr(PyPDF2, 'PdfReader'):
            return len(PyPDF2.PdfReader(pdf_file, strict=False).pages)
        return PyPDF2.PdfFileReader(pdf_file, strict=False).numPages

    def get_page_number(self, file_name, page_number=None):
        """Return the page number as a string, left-padded with zeros to three characters.

        If no page_number is passed, the last group of digits in file_name is
        used. If add_to_page_number is positive, it is added to the page number.

        Args:
            file_name (str): name of the source file.
            page_number (str|int): optional explicit page number.

        Returns:
            str: the padded page number, or '' if no page number is available.
        """
        if not page_number:
            digit_groups = re.findall(r'\d+', file_name)
            if not digit_groups:
                # Nothing to pad or to correct: avoid calling int() on a missing value.
                return ''
            page_number = digit_groups[-1]
        page_number = str(page_number)
        if self.add_to_page_number > 0:
            page_number = str(self.add_to_page_number + int(page_number))
        return page_number.zfill(3)

    def get_file_name(self, file_name, is_part_of_multi_page_doc=False, page_number=None):
        """Return the file name of the target svg file.

        Args:
            file_name (str): name of the source pdf file.
            is_part_of_multi_page_doc (bool): if True, the result contains the
                placeholder '%03d' instead of a concrete page number.
            page_number (str): optional page number for single page documents.

        Returns:
            str: target file name, prefixed with the target directory if there is one.
        """
        dir_name = self._dir_prefix()
        base_name = path.basename(file_name)
        if is_part_of_multi_page_doc:
            if self.title:
                return dir_name + self.title.replace(' ', '_') + '_page%03d_web.svg'
            return dir_name + base_name.replace('.pdf', '_page%03d_web.svg')
        if self.title:
            return dir_name + self.title + '_page'\
                + self.get_page_number(file_name, page_number=page_number) + '_web.svg'
        return '{}{}'.format(dir_name, base_name.replace('.pdf', '_web.svg'))

    def pdf2svg(self, file_name, page_number=None, name_dictionary=None):
        """Convert a pdf file to svg file(s) using the external program 'pdf2svg'.

        Args:
            file_name (str): the pdf file to convert.
            page_number (str): optional page number used for the target file
                name of a single page document.
            name_dictionary (dict): optional mapping for multi page documents
                with page specifications as keys and target file names as
                values; 'TITLE' in a target file name is replaced by the title
                of this converter, if it has one. If it is empty, all pages
                are converted.

        Returns:
            int: return code of the (last) subprocess executing pdf2svg.

        Raises:
            FileNotFoundError: if file_name is not an existing file.
            subprocess.CalledProcessError: if pdf2svg exits with an error.
        """
        if not isfile(file_name):
            raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
        self.page_number = str(page_number)
        # The file is only needed for counting pages; 'with' closes it even on errors.
        with open(file_name, 'rb') as pdf_file:
            number_of_pages = self._count_pages(pdf_file)
        if number_of_pages == 1:
            svg_file_name = self.get_file_name(file_name, page_number=page_number)
            cp = subprocess.run([self.path_to_pdf2svg, file_name, svg_file_name], check=True)
            return cp.returncode
        dir_name = self._dir_prefix()
        targets = {}
        for index, svg_file_name in (name_dictionary or {}).items():
            if self.title:
                svg_file_name = svg_file_name.replace('TITLE', self.title)
            # Normalize the name so that it ends with exactly one '.svg'.
            targets[index] = dir_name + svg_file_name.replace('.svg', '') + '.svg'
        if not targets:
            targets = {"all": self.get_file_name(file_name, True)}
        return_code = 0
        for index, svg_file_name in targets.items():
            cp = subprocess.run(
                [self.path_to_pdf2svg, file_name, svg_file_name, str(index)], check=True)
            return_code = cp.returncode
        return return_code
def usage():
    """Print the usage information of this script, i.e. the docstring of main."""
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program converts pdf to svg files with the help of the external program 'pdf2svg'.

    svgscripts/convertPDF2SVG4Web.py [-h|--help, -a|--add-to-page-number=value, -d|--dir=targetDir -t|--title=title] <file|dir> ...
    svgscripts/convertPDF2SVG4Web.py [-h|--help, -a|--add-to-page-number=value, -t|--title=title, -p|--page=pageNumber,
                                      -d|--dir=targetDir -n|--name-dict='{"pageNumber": "file_name", ...}'] <file>

        -h|--help: show help
        -a|--add-to-page-number=value: value to add to the page number specification of the pdf file that will be used as the
                                       file name of the target svg file, e.g. -a 2 TITLE_page001.pdf -> TITLE_page003.svg
        -d|--dir=targetDir: target directory for the svg file(s)
        -t|--title=title: title that will be used as part of the target svg file(s)' filename
        -p|--page=pageNumber: page number of the target svg file. For use with _one_ file only.
        -n|--name-dict='{"pageNumber": "file_name", ...}': For a multipage pdf, --name-dict can be used to pass a dictionary with
                                       page numbers (str) as keys and file names (str) as values.
                                       The script will extract only those pages for which there are keys.
                                       E.g. -n '{"3":"TITLE_page001","5":"TITLE_page003"}' TITLE_multipage.pdf -> TITLE_page001.svg
                                                                                                                  TITLE_page003.svg

        :return: exit code (int)
    """
    # Local import: only the parsing of --name-dict needs it.
    import ast

    target_dir = ".{}svg".format(sep)
    title = None
    page_number = None
    add_to_page_number = 0
    name_dictionary = {}

    try:
        opts, args = getopt.getopt(
            argv, "ha:d:t:p:n:",
            ["help", "add-to-page-number=", "dir=", "title=", "page=", "name-dict="])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-a', '--add-to-page-number'):
            try:
                add_to_page_number = int(arg)
            except ValueError:
                print("ERROR: option --add-to-page-number expects an integer, got '{}'!".format(arg))
                return 2
        elif opt in ('-d', '--dir'):
            target_dir = arg
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = str(arg)
        elif opt in ('-n', '--name-dict'):
            # SECURITY: this value used to be passed to eval(), which executes
            # arbitrary code from the command line. literal_eval accepts the
            # same dictionary literals but nothing executable.
            try:
                name_dictionary = ast.literal_eval(arg)
            except (ValueError, SyntaxError):
                print("ERROR: option --name-dict expects a dictionary literal, got '{}'!".format(arg))
                return 2
            if not isinstance(name_dictionary, dict):
                print("ERROR: option --name-dict expects a dictionary literal, got '{}'!".format(arg))
                return 2

    # Missing input is an error, no matter whether options have been passed.
    if not args:
        usage()
        return 2

    files_to_process = []
    for arg in args:
        if isfile(arg):
            files_to_process.append(arg)
        elif isdir(arg):
            # listdir yields bare names: join them with the directory so that
            # the files can be found from the current working directory.
            files_to_process.extend(
                path.join(arg, name) for name in sorted(listdir(arg)) if name.endswith('.pdf'))
        else:
            print("'{}' does not exist!".format(arg))
            return 2

    # Validate before creating the converter, which may create the target directory.
    if len(files_to_process) > 1 and (page_number or name_dictionary):
        print("ERROR: too many input files: option --page and --name-dict presuppose one input file!")
        usage()
        return 2

    converter = Converter(target_dir=target_dir, title=title, add_to_page_number=add_to_page_number)
    for file_name in files_to_process:
        converter.pdf2svg(file_name, name_dictionary=name_dictionary, page_number=page_number)
    return 0
if __name__ == "__main__":
    # Run the command line interface and use its return value as exit status.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment