Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91988614
plotextractor.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Nov 16, 10:12
Size
50 KB
Mime Type
text/x-python
Expires
Mon, Nov 18, 10:12 (2 d)
Engine
blob
Format
Raw Data
Handle
22359439
Attached To
R3600 invenio-infoscience
plotextractor.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import
sys
import
os
import
getopt
import
re
import
time
from
invenio.shellutils
import
run_shell_command
,
Timeout
,
run_process_with_timeout
from
invenio.invenio_connector
import
InvenioConnector
from
invenio.textutils
import
wrap_text_in_a_box
,
\
wait_for_user
from
invenio.config
import
CFG_TMPDIR
,
CFG_SITE_URL
from
invenio.plotextractor_config
import
CFG_PLOTEXTRACTOR_DISALLOWED_TEX
,
\
CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT
,
\
CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT
,
\
CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT
from
invenio.bibtask
import
task_low_level_submission
from
invenio.plotextractor_getter
import
get_list_of_all_matching_files
,
\
parse_and_download
,
\
make_single_directory
,
\
tarballs_by_recids
,
\
tarballs_by_arXiv_id
from
invenio.plotextractor_converter
import
untar
,
extract_text
,
\
convert_images
from
invenio.plotextractor_output_utils
import
assemble_caption
,
\
find_open_and_close_braces
,
\
create_MARC
,
get_tex_location
,
\
get_image_location
,
\
create_contextfiles
,
\
prepare_image_data
,
\
write_message
,
remove_dups
from
tempfile
import
mkstemp
"""
This programme will take a tarball from arXiv, untar it, convert all its
associated images to PNG, find the captions to the images detailed in the
included TeX document, and write MARCXML that reflects these associations.
"""
# Prefix that arXiv-style tarball names start with (checked in
# get_reference_number).
ARXIV_HEADER = 'arXiv:'
# Suffix appended to the tarball name to form the name of its per-tarball
# working directory (used in get_defaults).
PLOTS_DIR = 'plots'
# Positions inside the two-element [main, sub] lists that hold captions and
# images while a figure is being assembled.
MAIN_CAPTION_OR_IMAGE = 0
SUB_CAPTION_OR_IMAGE = 1
def main():
    """
    The main program loop.

    Parses the command line, gathers the tarballs to work on from every
    source that was named (a single tarball, a directory, an input file,
    record ids, an arXiv id) and runs process_single() on each of them.
    With --squash the MARCXML of all tarballs is collected in one file,
    which is uploaded at the end when --call-bibupload was given.

    Exits with status 2 on a malformed command line and with status 1 when
    there is nothing to process.
    """
    help_param = 'help'
    tarball_param = 'tarball'
    tardir_param = 'tdir'
    infile_param = 'input'
    sdir_param = 'sdir'
    extract_text_param = 'extract-text'
    force_param = 'force'
    upload_param = 'call-bibupload'
    yes_i_know_param = 'yes-i-know'
    recid_param = 'recid'
    arXiv_param = 'arXiv'
    squash_param = 'squash'
    refno_url_param = 'refno-url'
    refno_param = 'skip-refno'
    clean_param = 'clean'

    # -h is a plain flag; -r carries the record ids, so it takes a value
    param_abbrs = 'ht:d:s:i:a:l:xfuyr:qck'
    params = [help_param, tarball_param + '=', tardir_param + '=',
              sdir_param + '=', infile_param + '=', arXiv_param + '=',
              refno_url_param + '=', recid_param + '=',
              extract_text_param, force_param, upload_param,
              yes_i_know_param, squash_param, clean_param, refno_param]

    try:
        opts, dummy_args = getopt.getopt(sys.argv[1:], param_abbrs, params)
    except getopt.GetoptError as err:
        write_message(str(err))
        usage()
        sys.exit(2)

    tarball = None
    sdir = None
    infile = None
    tdir = None
    xtract_text = False
    upload_plots = False
    force = False
    squash = False
    squash_path = ""
    yes_i_know = False
    recids = None
    arXiv = None
    clean = False
    refno_url = CFG_SITE_URL
    skip_refno = False

    for opt, arg in opts:
        # getopt reports long options with their leading '--'
        if opt in ('-h', '--' + help_param):
            usage()
            sys.exit()
        elif opt in ('-t', '--' + tarball_param):
            tarball = arg
        elif opt in ('-d', '--' + tardir_param):
            tdir = arg
        elif opt in ('-i', '--' + infile_param):
            infile = arg
        elif opt in ('-r', '--' + recid_param):
            recids = arg
        elif opt in ('-a', '--' + arXiv_param):
            arXiv = arg
        elif opt in ('-s', '--' + sdir_param):
            sdir = arg
        elif opt in ('-x', '--' + extract_text_param):
            xtract_text = True
        elif opt in ('-f', '--' + force_param):
            force = True
        elif opt in ('-u', '--' + upload_param):
            upload_plots = True
        elif opt in ('-q', '--' + squash_param):
            squash = True
        elif opt in ('-y', '--' + yes_i_know_param):
            yes_i_know = True
        elif opt in ('-c', '--' + clean_param):
            clean = True
        elif opt in ('-l', '--' + refno_url_param):
            refno_url = arg
        elif opt in ('-k', '--' + refno_param):
            skip_refno = True
        else:
            usage()
            sys.exit()

    if sdir is None:
        sdir = CFG_TMPDIR
    elif not os.path.isdir(sdir):
        try:
            os.makedirs(sdir)
        except OSError:
            # best effort: fall back to the default scratch directory
            write_message('Error: We can\'t use this sdir. using ' +
                          'CFG_TMPDIR')
            sdir = CFG_TMPDIR

    if skip_refno:
        # an empty URL is what switches the reference-number lookup off
        refno_url = ""

    tars_and_gzips = []

    if tarball is not None:
        tars_and_gzips.append(tarball)
    if tdir is not None:
        filetypes = ['gzip compressed', 'tar archive', 'Tar archive']  # FIXME
        write_message('Currently processing any tarballs in ' + tdir)
        tars_and_gzips.extend(get_list_of_all_matching_files(tdir, filetypes))
    if infile is not None:
        tars_and_gzips.extend(parse_and_download(infile, sdir))
    if recids is not None:
        tars_and_gzips.extend(tarballs_by_recids(recids, sdir))
    if arXiv is not None:
        tars_and_gzips.extend(tarballs_by_arXiv_id([arXiv], sdir))
    if not tars_and_gzips:
        write_message('Error: no tarballs to process!')
        sys.exit(1)

    if squash:
        # open the shared collection; every process_single() call appends
        # its records and the closing tag is written once all are done
        squash_fd, squash_path = mkstemp(
            suffix="_" + time.strftime("%Y%m%d%H%M%S") + ".xml",
            prefix="plotextractor_", dir=sdir)
        with os.fdopen(squash_fd, 'w') as squash_file:
            squash_file.write(
                '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')

    for tarball in tars_and_gzips:
        process_single(tarball, sdir=sdir, xtract_text=xtract_text,
                       upload_plots=upload_plots, force=force,
                       squash=squash_path, yes_i_know=yes_i_know,
                       refno_url=refno_url, clean=clean)

    if squash:
        with open(squash_path, "a") as squash_file:
            squash_file.write("</collection>\n")
        write_message("generated %s" % (squash_path,))
        if upload_plots:
            upload_to_site(squash_path, yes_i_know)
def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False,
                   upload_plots=False, force=False, squash="",
                   yes_i_know=False, refno_url="",
                   clean=False):
    """
    Processes one tarball end-to-end.

    @param: tarball (string): the absolute location of the tarball we wish
        to process
    @param: sdir (string): where we should put all the intermediate files for
        the processing.  if you're uploading, this directory should be one
        of the ones specified in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, else
        the upload won't work
    @param: xtract_text (boolean): true iff you want to run pdftotext on the
        pdf versions of the tarfiles.  this programme assumes that the pdfs
        are named the same as the tarballs but with a .pdf extension.
    @param: upload_plots (boolean): true iff you want to bibupload the plots
        extracted by this process
    @param: force (boolean): force creation of new xml file
    @param: squash: write MARCXML output into a specified 'squash' file
        instead of single files.
    @param: yes_i_know: if True, no user interaction if upload_plots is True
    @param: refno_url: URL to the invenio-instance to query for refno.
    @param: clean: if True, everything except the original tarball, plots and
        context- files will be removed

    @return: marc_name(string): path to generated marcxml file; None when
        the extraction timed out or the tarball held no TeX files
    """
    sub_dir, refno = get_defaults(tarball, sdir, refno_url)

    if not squash:
        marc_name = os.path.join(sub_dir, '%s.xml' % (refno,))
        if force or not os.path.exists(marc_name):
            with open(marc_name, 'w') as marc_fd:
                marc_fd.write(
                    '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')
        # NOTE(review): when the file already exists and force is False the
        # records below are appended after whatever it contains - confirm
        # this is intended.
    else:
        marc_name = squash

    if xtract_text:
        extract_text(tarball)

    try:
        extracted_files_list, image_list, tex_files = untar(tarball, sub_dir)
    except Timeout:
        write_message('Timeout during tarball extraction on %s' % (tarball,))
        return

    if not tex_files:
        write_message('%s is not a tarball' % (os.path.split(tarball)[-1],))
        run_shell_command('rm -r %s', (sub_dir,))
        return

    converted_image_list = convert_images(image_list)
    write_message('converted %d of %d images found for %s' %
                  (len(converted_image_list), len(image_list),
                   os.path.basename(tarball)))

    extracted_image_data = []
    for tex_file in tex_files:
        # Extract images, captions and labels
        partly_extracted_image_data = extract_captions(tex_file, sub_dir,
                                                       converted_image_list)
        if partly_extracted_image_data != []:
            # Add proper filepaths and do various cleaning
            cleaned_image_data = prepare_image_data(
                partly_extracted_image_data, tex_file, converted_image_list)
            # Using prev. extracted info, get contexts for each image found
            extracted_image_data.extend(
                extract_context(tex_file, cleaned_image_data))
    extracted_image_data = remove_dups(extracted_image_data)

    if extracted_image_data == []:
        # NOTE(review): in non-squash mode a freshly created file is left
        # with an unclosed <collection> element in this case.
        write_message('No plots detected in %s' % (refno,))
    else:
        if refno_url == "":
            # the reference-number lookup was skipped, so none is passed on
            refno = None
        create_contextfiles(extracted_image_data)
        marc_xml = create_MARC(extracted_image_data, tarball, refno)
        if not squash:
            # a per-tarball file is complete here; the squash file is
            # closed by the caller once every tarball has been handled
            marc_xml += "\n</collection>"
        if marc_name is not None:
            with open(marc_name, 'a') as marc_fd:
                marc_fd.write('%s\n' % (marc_xml,))
            if not squash:
                write_message('generated %s' % (marc_name,))
                if upload_plots:
                    upload_to_site(marc_name, yes_i_know)

    if clean:
        clean_up(extracted_files_list, image_list)
    write_message('work complete on %s' % (os.path.split(tarball)[-1],))
    return marc_name
def clean_up(extracted_files_list, image_list):
    """
    Removes all the intermediate stuff.

    Every extracted path that is neither one of the images to keep nor a
    directory entry (a path ending in os.sep) is deleted with 'rm'.

    @param: extracted_files_list ([string, string, ...]): list of all
        extracted files
    @param: image_list ([string, string, ...]): list of the images to keep
    """
    images_to_keep = set(image_list)
    for extracted_file in extracted_files_list:
        if not extracted_file:
            # nothing to remove for an empty entry
            continue
        if extracted_file in images_to_keep:
            continue
        if extracted_file.endswith(os.sep):
            # directories are left in place
            continue
        run_shell_command('rm %s', (extracted_file,))
def get_defaults(tarball, sdir, refno_url):
    """
    Works out the working directory and the reference number for a tarball.

    @param: tarball (string): the location of the tarball to be extracted
    @param: sdir (string): the location of the scratch directory for
        untarring, conversions, and the ultimate destination of the MARCXML
    @param: refno_url (string): server location on where to look for refno;
        an empty string skips the lookup

    @return sdir, refno (string, string): the per-tarball working directory
        as returned by make_single_directory, and the reference number (the
        tarball's file name when no record id could be found or the lookup
        was skipped)
    """
    tarball_dir, tarball_name = os.path.split(tarball)

    # NOTE(review): a given sdir is only tested for None here; when one is
    # supplied, the directory of the tarball is used as the base instead.
    if sdir is None:
        base_dir = CFG_TMPDIR
    else:
        base_dir = tarball_dir

    # every tarball gets its own subdirectory
    work_dir = make_single_directory(base_dir,
                                     tarball_name + '_' + PLOTS_DIR)

    if refno_url == "":
        refno = tarball_name
        write_message("Skipping ref-no check")
        return work_dir, refno

    refno = get_reference_number(tarball, refno_url)
    if refno is None:
        refno = tarball_name
        write_message('Error: can\'t find record id for %s' % (refno,))

    return work_dir, refno
def _first_arxiv_id(text):
    """Return the first arXiv-style identifier in text, or None if absent."""
    # either old style 'hep-ph/9703009' or new style '0910.0476'
    found = re.search('(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))', text)
    if found is None:
        return None
    return found.group(1)


def get_reference_number(tarball, refno_url):
    """
    Attempts to determine the reference number of the file by searching.

    @param: tarball (string): the name of the tarball as downloaded from
        arXiv
    @param: refno_url (string): url of repository to check for a
        reference number for this record. If not set; returns None

    @return: refno (string): the reference number of the paper, or None when
        no matching record was found
    """
    if not refno_url:
        return None

    server = InvenioConnector(refno_url)
    # we just need the name of the file
    tarball = os.path.split(tarball)[1]
    prefix = '037__a:'

    # the name right now looks like arXiv:hep-ph_9703009
    # or arXiv:0910.0476
    if tarball.startswith(ARXIV_HEADER):
        if len(tarball.split('_')) > 1:
            tarball = tarball.split(':')[1]
            arXiv_record = tarball.replace('_', '/')
        else:
            arXiv_record = tarball

        result = server.search(p=prefix + arXiv_record, of='id')
        if len(result) == 0:
            return None
        return str(result[0])

    # otherwise look for an identifier inside the name, first as it is and
    # then with '_' read as '/'
    for candidate in (tarball, tarball.replace('_', '/')):
        arXiv_record = _first_arxiv_id(candidate)
        if arXiv_record is None:
            continue
        result = server.search(p=prefix + arXiv_record, of='id')
        if len(result) > 0:
            return str(result[0])

    return None
def rotate_image(filename, line, sdir, image_list):
    """
    Given a filename and a line, figure out what it is that the author
    wanted to do wrt changing the rotation of the image and convert the
    file so that this rotation is reflected in its presentation.

    @param: filename (string): the name of the file as specified in the TeX
    @param: line (string): the line where the rotate command was found
    @param: sdir (string): passed on to get_image_location to find the file
    @param: image_list (list): passed on to get_image_location as well

    @output: the image file rotated in accordance with the rotate command
    @return: True if the rotation command was run, False when the line holds
        no usable angle or the image could not be located
    """
    try:
        from shlex import quote as shell_quote
    except ImportError:
        # Python 2 keeps the same helper in the pipes module
        from pipes import quote as shell_quote

    rotations = re.findall('(angle=[-\\d]+|rotate=[-\\d]+)', line)
    if not rotations:
        return False

    degrees = rotations[0].split('=')[-1].strip()
    # the whole value has to be an integer, or int() below would fail
    if not re.match('-?\\d+$', degrees):
        return False

    file_loc = get_image_location(filename, sdir, image_list)
    if file_loc is None or file_loc == 'ERROR':
        return False

    # the rotation is applied with the opposite sign of the TeX angle
    degrees = str(0 - int(degrees))

    # the path comes from the tarball, so it is quoted for the shell
    dummy, dummy, cmd_err = run_process_with_timeout(
        'mogrify -rotate %s %s' % (degrees, shell_quote(file_loc)),
        shell=True)

    # NOTE(review): cmd_err is not inspected; as before, the call counts as
    # a rotation whether or not the command reported an error.
    return True
def get_context(lines, backwards=False):
    """
    Given a relevant string from a TeX file, this function will extract text
    from it as far as it is deemed contextually relevant, either backwards or
    forwards in the text. The level of relevance allowed is configurable.
    When it reaches some point in the text that is determined to be out of
    scope from the current context, like text that is identified as a new
    paragraph, a complex TeX structure ('/begin', '/end', etc.) etc., it will
    return the previously allocated text.

    For use when extracting text with contextual value for an figure or plot.

    @param lines (string): string to examine
    @param backwards (bool): are we searching backwards?

    @return context (string): extracted context, in reading order
    """
    tex_tag = re.compile(r".*\\(\w+).*")
    sentence = re.compile(r"(?<=[.?!])[\s]+(?=[A-Z])")
    context = []

    word_list = lines.split()
    if backwards:
        word_list.reverse()

    # For each word we do the following:
    #   1. Check if we have reached word limit
    #   2. If not, see if this is a TeX tag and see if its 'illegal'
    #   3. Otherwise, add word to context
    for word in word_list:
        if len(context) >= CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT:
            break
        match = tex_tag.match(word)
        if match and match.group(1) in CFG_PLOTEXTRACTOR_DISALLOWED_TEX:
            # TeX Construct matched, return
            if backwards:
                # When reversed we need to go back and
                # remove unwanted data within brackets
                while context:
                    if '}' in context.pop():
                        break
            break
        context.append(word)

    if backwards:
        context.reverse()
    text = " ".join(context)
    sentence_list = sentence.split(text)

    if backwards:
        # put the sentences closest to the reference first, so the limit
        # keeps those
        sentence_list.reverse()
    kept = sentence_list[:CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT]
    if backwards:
        # and restore the reading order of what was kept
        kept.reverse()
    return " ".join(kept)
def extract_context(tex_file, extracted_image_data):
    """
    Given a .tex file and the images found in it, this function will extract
    the text before and after all the references made to each image's label
    in the text. The number of characters to extract before and after is
    configurable.

    @param tex_file (string): path to .tex file
    @param extracted_image_data ([(string, string, string), ...]):
        a list of (image, caption, label) tuples from this document.

    @return extracted_image_data ([(string, string, string, list),
        (string, string, string, list), ...]): the same tuples, each with a
        list of extracted contexts appended; an empty list when tex_file is
        a directory or does not exist
    """
    if os.path.isdir(tex_file) or not os.path.exists(tex_file):
        return []

    with open(tex_file) as fd:
        lines = fd.read()

    # Generate context for each image and its assoc. labels
    new_image_data = []
    for image, caption, label in extracted_image_data:
        context_list = []

        # The label is literal text, so it is escaped before it goes into
        # the pattern that finds \fig{label} and \ref{label}
        reference = re.compile(r"(\\(?:fig|ref)\{" + re.escape(label) +
                               r"\})")

        for found in reference.finditer(lines):
            startindex, endindex = found.span()

            # Text before the reference, limited to the configured window
            window_start = max(
                startindex - CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT, 0)
            text_before = lines[window_start:startindex]
            context_before = get_context(text_before, backwards=True)

            # Text after the reference, limited the same way
            window_end = endindex + CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT
            text_after = lines[endindex:window_end]
            context_after = get_context(text_after)

            context_list.append(context_before + ' \\ref{' + label + '} ' +
                                context_after)
        new_image_data.append((image, caption, label, context_list))
    return new_image_data
def _add_image(cur_image, filename):
    """Return cur_image with filename recorded as one more image."""
    if cur_image == '':
        # first image seen: keep it as a plain string
        return filename
    if isinstance(cur_image, list):
        if isinstance(cur_image[SUB_CAPTION_OR_IMAGE], list):
            cur_image[SUB_CAPTION_OR_IMAGE].append(filename)
        else:
            cur_image[SUB_CAPTION_OR_IMAGE] = [filename]
        return cur_image
    # second image: switch to the [main, [sub, ...]] form
    return ['', [cur_image, filename]]


def extract_captions(tex_file, sdir, image_list, primary=True):
    """
    Take the TeX file and the list of images in the tarball (which all,
    presumably, are used in the TeX file) and figure out which captions
    in the text are associated with which images

    @param: tex_file (string): the name of the TeX file which mentions
        the images
    @param: sdir (string): path to current sub-directory
    @param: image_list (list): list of images in tarball
    @param: primary (bool): is this the primary call to extract_caption?
        Files pulled in through an input tag are read with primary=False,
        which keeps the recursion one level deep.

    @return: images_and_captions_and_labels ([(string, string, list),
        (string, string, list), ...]):
        a list of tuples representing the names of images and their
        corresponding figure labels from the TeX file; an empty list when
        tex_file is a directory or does not exist
    """
    if os.path.isdir(tex_file) or not os.path.exists(tex_file):
        return []

    with open(tex_file) as fd:
        lines = fd.readlines()

    # possible figure lead-ins
    figure_head = '\\begin{figure'  # also matches figure*
    figure_tail = '\\end{figure'  # also matches figure*
    picture_head = '\\begin{picture}'
    displaymath_head = '\\begin{displaymath}'
    subfloat_head = '\\subfloat'
    subfig_head = '\\subfigure'
    includegraphics_head = '\\includegraphics'
    epsfig_head = '\\epsfig'
    input_head = '\\input'
    # possible caption lead-ins
    caption_head = '\\caption'
    figcaption_head = '\\figcaption'
    label_head = '\\label'

    rotate = 'rotate='
    angle = 'angle='

    eps_tail = '.eps'
    ps_tail = '.ps'

    doc_head = '\\begin{document}'
    doc_tail = '\\end{document}'

    extracted_image_data = []
    cur_image = ''
    caption = ''
    labels = []
    active_label = ""

    # blank out everything before the start of the document
    if primary:
        for line_index, raw_line in enumerate(lines):
            if raw_line.find(doc_head) < 0:
                lines[line_index] = ''
            else:
                break

    # are we using commas in filenames here?
    search_root = os.path.split(os.path.split(tex_file)[0])[0]
    commas_okay = any(',' in found_name
                      for dummy1, dummy2, found_names in os.walk(search_root)
                      for found_name in found_names)

    # a comment is a % not preceded by a \
    comment = re.compile("(?<!\\\\)%")

    # get rid of comments by splitting where the comment is
    # and keeping only the part before the %
    lines = [comment.split(raw_line)[0].strip() for raw_line in lines]

    in_figure_tag = 0

    for line_index in range(len(lines)):
        line = lines[line_index]

        if line == '':
            continue
        if line.find(doc_tail) > -1:
            return extracted_image_data

        # FIGURE -
        # structure of a figure:
        #   \begin{figure}
        #   \formatting...
        #   \includegraphics[someoptions]{FILENAME}
        #   \caption{CAPTION}  %caption and includegraphics may be switched!
        #   \end{figure}
        index = line.find(figure_head)
        if index > -1:
            in_figure_tag = 1
            # images and captions are sometimes placed outside of any
            # figure tag, so hand over whatever has been collected before
            # this figure starts
            cur_image, caption, extracted_image_data = \
                put_it_together(cur_image, caption, active_label,
                                extracted_image_data, line_index, lines)

        # inclusion commands come in many flavours; anything mentioning
        # .eps, .ps or \epsfig is searched for file names
        index = max([line.find(eps_tail), line.find(ps_tail),
                     line.find(epsfig_head)])
        if index > -1:
            ext = line.find(eps_tail) > -1 or line.find(ps_tail) > -1
            filenames = intelligently_find_filenames(line, ext=ext,
                                                     commas_okay=commas_okay)

            # try to look ahead!  sometimes there are better matches after
            if line_index < len(lines) - 1:
                filenames.extend(
                    intelligently_find_filenames(lines[line_index + 1],
                                                 commas_okay=commas_okay))
            if line_index < len(lines) - 2:
                filenames.extend(
                    intelligently_find_filenames(lines[line_index + 2],
                                                 commas_okay=commas_okay))

            for filename in filenames:
                cur_image = _add_image(cur_image, str(filename))

        # Rotate and angle
        index = max(line.find(rotate), line.find(angle))
        if index > -1:
            # which is the image associated to it?
            filenames = intelligently_find_filenames(line,
                                                     commas_okay=commas_okay)
            # try the line after and the line before
            if line_index + 1 < len(lines):
                filenames.extend(
                    intelligently_find_filenames(lines[line_index + 1],
                                                 commas_okay=commas_okay))
            if line_index > 0:
                filenames.extend(
                    intelligently_find_filenames(lines[line_index - 1],
                                                 commas_okay=commas_okay))

            already_tried = []
            for filename in filenames:
                if filename != 'ERROR' and filename not in already_tried:
                    if rotate_image(filename, line, sdir, image_list):
                        break
                    already_tried.append(filename)

        # INCLUDEGRAPHICS -
        # structure of includegraphics:
        #   \includegraphics[someoptions]{FILENAME}
        index = line.find(includegraphics_head)
        if index > -1:
            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(line_index, index, '{', lines)
            filename = lines[open_curly_line][open_curly + 1:close_curly]
            cur_image = _add_image(cur_image, filename)

        # {\input{FILENAME}}
        # \caption{CAPTION}
        #
        # This input is ambiguous, since input is also used for things like
        # inclusion of data from other LaTeX files directly.
        index = line.find(input_head)
        if index > -1:
            new_tex_names = intelligently_find_filenames(
                line, TeX=True, commas_okay=commas_okay)
            for new_tex_name in new_tex_names:
                if new_tex_name != 'ERROR':
                    new_tex_file = get_tex_location(new_tex_name, tex_file)
                    if new_tex_file is not None and primary:
                        # primary=False keeps the recursion from going on
                        extracted_image_data.extend(
                            extract_captions(new_tex_file, sdir, image_list,
                                             primary=False))

        # PICTURE
        index = line.find(picture_head)
        if index > -1:
            # structure of a picture:
            # \begin{picture}
            # ....not worrying about this now
            # FIXME
            pass

        # DISPLAYMATH
        index = line.find(displaymath_head)
        if index > -1:
            # structure of a displaymath:
            # \begin{displaymath}
            # ....not worrying about this now
            # FIXME
            pass

        # CAPTIONS -
        # structure of a caption:
        #   \caption[someoptions]{CAPTION}
        # or
        #   \caption{CAPTION}
        # or
        #   \caption{{options}{CAPTION}}
        index = max([line.find(caption_head), line.find(figcaption_head)])
        if index > -1:
            open_curly, open_curly_line, close_curly, close_curly_line = \
                find_open_and_close_braces(line_index, index, '{', lines)

            cap_begin = open_curly + 1

            cur_caption = assemble_caption(open_curly_line, cap_begin,
                                           close_curly_line, close_curly,
                                           lines)

            if caption == '':
                caption = cur_caption
            elif isinstance(caption, list):
                if isinstance(caption[SUB_CAPTION_OR_IMAGE], list):
                    caption[SUB_CAPTION_OR_IMAGE].append(cur_caption)
                else:
                    caption[SUB_CAPTION_OR_IMAGE] = [cur_caption]
            elif caption != cur_caption:
                caption = ['', [caption, cur_caption]]

        # SUBFLOATS -
        # structure of a subfloat (inside of a figure tag):
        #   \subfloat[CAPTION]{options{FILENAME}}
        #
        # also associated with the overall caption of the enclosing figure
        index = line.find(subfloat_head)
        if index > -1:
            # if we are dealing with subfloats, we need a different
            # sort of structure to keep track of captions and subcaptions
            if not isinstance(cur_image, list):
                cur_image = [cur_image, []]
            if not isinstance(caption, list):
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, cap_begin,
                                           close_square_line, close_square,
                                           lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(close_square_line, close_square,
                                           '{', lines)
            sub_image = lines[open_curly_line][open_curly + 1:close_curly]

            cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        # SUBFIGURES -
        # structure of a subfigure (inside a figure tag):
        #   \subfigure[CAPTION]{
        #   \includegraphics[options]{FILENAME}}
        #
        # also associated with the overall caption of the enclosing figure
        index = line.find(subfig_head)
        if index > -1:
            # like with subfloats, we need a different structure for
            # keeping track of this stuff
            if not isinstance(cur_image, list):
                cur_image = [cur_image, []]
            if not isinstance(caption, list):
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, cap_begin,
                                           close_square_line, close_square,
                                           lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            # find the graphics tag to get the filename, moving on to the
            # following lines when it is not on this one.  The rest of this
            # pass then works on the line where the tag was found; the
            # enclosing for loop still visits the skipped lines afterwards.
            subfig_line_index = line_index
            subfig_line = line
            index = line.find(includegraphics_head)
            while index == -1 and (line_index + 1) < len(lines):
                line_index = line_index + 1
                line = lines[line_index]
                index = line.find(includegraphics_head)

            if index == -1:
                # didn't find the image name: return to the \subfigure line
                # and record no image for this subfigure
                line_index = subfig_line_index
                line = subfig_line
            else:
                open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(line_index, index, '{', lines)
                sub_image = lines[open_curly_line][open_curly + 1:close_curly]

                cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        # LABELS -
        # structure of a label:
        #   \label{somelabelnamewhichprobablyincludesacolon}
        #
        # Labels are used to tag images and will later be used in ref tags
        # to reference them.  This is interesting because in effect the refs
        # to a plot are additional caption for it.
        #
        # Notes: labels can be used for many more things than just plots.
        # We'll have to experiment with how to best associate a label with
        # an image.. if it's in the caption, it's easy.  If it's in a
        # figure, it's still okay... but the images that aren't in figure
        # tags are numerous.
        index = line.find(label_head)
        if index > -1 and in_figure_tag:
            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(line_index, index, '{', lines)
            label = lines[open_curly_line][open_curly + 1:close_curly]
            if label not in labels:
                active_label = label
            labels.append(label)

        # FIGURE
        #
        # important: we put the check for the end of the figure at the end
        # of the loop in case someone puts everything in one line
        index = max([line.find(figure_tail), line.find(doc_tail)])
        if index > -1:
            in_figure_tag = 0
            cur_image, caption, extracted_image_data = \
                put_it_together(cur_image, caption, active_label,
                                extracted_image_data, line_index, lines)

        # END DOCUMENT
        #
        # we shouldn't look at anything after the end document tag is found
        index = line.find(doc_tail)
        if index > -1:
            break

    return extracted_image_data
def
put_it_together
(
cur_image
,
caption
,
context
,
extracted_image_data
,
line_index
,
\
lines
):
"""
Takes the current image(s) and caption(s) and assembles them into
something useful in the extracted_image_data list.
@param: cur_image (string || list): the image currently being dealt with, or
the list of images, in the case of subimages
@param: caption (string || list): the caption or captions currently in scope
@param: extracted_image_data ([(string, string), (string, string), ...]):
a list of tuples of images matched to captions from this document.
@param: line_index (int): the index where we are in the lines (for
searchback and searchforward purposes)
@param: lines ([string, string, ...]): the lines in the TeX
@return: (cur_image, caption, extracted_image_data): the same arguments it
was sent, processed appropriately
"""
if
type
(
cur_image
)
==
list
:
if
cur_image
[
MAIN_CAPTION_OR_IMAGE
]
==
'ERROR'
:
cur_image
[
MAIN_CAPTION_OR_IMAGE
]
=
''
for
image
in
cur_image
[
SUB_CAPTION_OR_IMAGE
]:
if
image
==
'ERROR'
:
cur_image
[
SUB_CAPTION_OR_IMAGE
]
.
remove
(
image
)
if
cur_image
!=
''
and
caption
!=
''
:
if
type
(
cur_image
)
==
list
and
type
(
caption
)
==
list
:
if
cur_image
[
MAIN_CAPTION_OR_IMAGE
]
!=
''
and
\
caption
[
MAIN_CAPTION_OR_IMAGE
]
!=
''
:
extracted_image_data
.
append
(
(
cur_image
[
MAIN_CAPTION_OR_IMAGE
],
caption
[
MAIN_CAPTION_OR_IMAGE
],
context
))
if
type
(
cur_image
[
MAIN_CAPTION_OR_IMAGE
])
==
list
:
# why is the main image a list?
# it's a good idea to attach the main caption to other
# things, but the main image can only be used once
cur_image
[
MAIN_CAPTION_OR_IMAGE
]
=
''
if
type
(
cur_image
[
SUB_CAPTION_OR_IMAGE
])
==
list
:
if
type
(
caption
[
SUB_CAPTION_OR_IMAGE
])
==
list
:
for
index
in
\
range
(
len
(
cur_image
[
SUB_CAPTION_OR_IMAGE
])):
if
index
<
len
(
caption
[
SUB_CAPTION_OR_IMAGE
]):
long_caption
=
\
caption
[
MAIN_CAPTION_OR_IMAGE
]
+
' : '
+
\
caption
[
SUB_CAPTION_OR_IMAGE
][
index
]
else
:
long_caption
=
\
caption
[
MAIN_CAPTION_OR_IMAGE
]
+
' : '
+
\
'Caption not extracted'
extracted_image_data
.
append
(
(
cur_image
[
SUB_CAPTION_OR_IMAGE
][
index
],
long_caption
,
context
))
else
:
long_caption
=
caption
[
MAIN_CAPTION_OR_IMAGE
]
+
\
' : '
+
caption
[
SUB_CAPTION_OR_IMAGE
]
for
sub_image
in
cur_image
[
SUB_CAPTION_OR_IMAGE
]:
extracted_image_data
.
append
(
(
sub_image
,
long_caption
,
context
))
else
:
if
type
(
caption
[
SUB_CAPTION_OR_IMAGE
])
==
list
:
long_caption
=
caption
[
MAIN_CAPTION_OR_IMAGE
]
for
sub_cap
in
caption
[
SUB_CAPTION_OR_IMAGE
]:
long_caption
=
long_caption
+
' : '
+
sub_cap
extracted_image_data
.
append
(
(
cur_image
[
SUB_CAPTION_OR_IMAGE
],
long_caption
,
context
))
else
:
#wtf are they lists for?
extracted_image_data
.
append
(
(
cur_image
[
SUB_CAPTION_OR_IMAGE
],
caption
[
SUB_CAPTION_OR_IMAGE
],
context
))
elif
type
(
cur_image
)
==
list
:
if
cur_image
[
MAIN_CAPTION_OR_IMAGE
]
!=
''
:
extracted_image_data
.
append
(
(
cur_image
[
MAIN_CAPTION_OR_IMAGE
],
caption
,
context
))
if
type
(
cur_image
[
SUB_CAPTION_OR_IMAGE
])
==
list
:
for
image
in
cur_image
[
SUB_CAPTION_OR_IMAGE
]:
extracted_image_data
.
append
((
image
,
caption
,
context
))
else
:
extracted_image_data
.
append
(
(
cur_image
[
SUB_CAPTION_OR_IMAGE
],
caption
,
context
))
elif
type
(
caption
)
==
list
:
if
caption
[
MAIN_CAPTION_OR_IMAGE
]
!=
''
:
extracted_image_data
.
append
(
(
cur_image
,
caption
[
MAIN_CAPTION_OR_IMAGE
],
context
))
if
type
(
caption
[
SUB_CAPTION_OR_IMAGE
])
==
list
:
# multiple caps for one image:
long_caption
=
caption
[
MAIN_CAPTION_OR_IMAGE
]
for
subcap
in
caption
[
SUB_CAPTION_OR_IMAGE
]:
if
long_caption
!=
''
:
long_caption
+=
' : '
long_caption
+=
subcap
extracted_image_data
.
append
((
cur_image
,
long_caption
,
context
))
else
:
extracted_image_data
.
append
(
(
cur_image
,
caption
[
SUB_CAPTION_OR_IMAGE
]
.
context
))
else
:
extracted_image_data
.
append
((
cur_image
,
caption
,
context
))
elif
cur_image
!=
''
and
caption
==
''
:
# we may have missed the caption somewhere.
REASONABLE_SEARCHBACK
=
25
REASONABLE_SEARCHFORWARD
=
5
curly_no_tag_preceding
=
'(?<!
\\
w){'
for
searchback
in
range
(
REASONABLE_SEARCHBACK
):
if
line_index
-
searchback
<
0
:
continue
back_line
=
lines
[
line_index
-
searchback
]
m
=
re
.
search
(
curly_no_tag_preceding
,
back_line
)
if
m
!=
None
:
open_curly
=
m
.
start
()
open_curly
,
open_curly_line
,
close_curly
,
\
close_curly_line
=
find_open_and_close_braces
(
\
line_index
-
searchback
,
open_curly
,
'{'
,
lines
)
cap_begin
=
open_curly
+
1
caption
=
assemble_caption
(
open_curly_line
,
cap_begin
,
\
close_curly_line
,
close_curly
,
lines
)
if
type
(
cur_image
)
==
list
:
extracted_image_data
.
append
(
(
cur_image
[
MAIN_CAPTION_OR_IMAGE
],
caption
,
context
))
for
sub_img
in
cur_image
[
SUB_CAPTION_OR_IMAGE
]:
extracted_image_data
.
append
((
sub_img
,
caption
,
context
))
else
:
extracted_image_data
.
append
((
cur_image
,
caption
,
context
))
break
if
caption
==
''
:
for
searchforward
in
range
(
REASONABLE_SEARCHFORWARD
):
if
line_index
+
searchforward
>=
len
(
lines
):
break
fwd_line
=
lines
[
line_index
+
searchforward
]
m
=
re
.
search
(
curly_no_tag_preceding
,
fwd_line
)
if
m
!=
None
:
open_curly
=
m
.
start
()
open_curly
,
open_curly_line
,
close_curly
,
\
close_curly_line
=
find_open_and_close_braces
(
\
line_index
+
searchforward
,
open_curly
,
'{'
,
lines
)
cap_begin
=
open_curly
+
1
caption
=
assemble_caption
(
open_curly_line
,
\
cap_begin
,
close_curly_line
,
close_curly
,
lines
)
if
type
(
cur_image
)
==
list
:
extracted_image_data
.
append
(
(
cur_image
[
MAIN_CAPTION_OR_IMAGE
],
caption
,
context
))
for
sub_img
in
cur_image
[
SUB_CAPTION_OR_IMAGE
]:
extracted_image_data
.
append
((
sub_img
,
caption
,
context
))
else
:
extracted_image_data
.
append
((
cur_image
,
caption
,
context
))
break
if
caption
==
''
:
if
type
(
cur_image
)
==
list
:
extracted_image_data
.
append
(
(
cur_image
[
MAIN_CAPTION_OR_IMAGE
],
'No caption found'
,
context
))
for
sub_img
in
cur_image
[
SUB_CAPTION_OR_IMAGE
]:
extracted_image_data
.
append
((
sub_img
,
'No caption'
,
context
))
else
:
extracted_image_data
.
append
(
(
cur_image
,
'No caption found'
,
context
))
elif
caption
!=
''
and
cur_image
==
''
:
if
type
(
caption
)
==
list
:
long_caption
=
caption
[
MAIN_CAPTION_OR_IMAGE
]
for
subcap
in
caption
[
SUB_CAPTION_OR_IMAGE
]:
long_caption
=
long_caption
+
': '
+
subcap
else
:
long_caption
=
caption
extracted_image_data
.
append
((
''
,
'noimg'
+
long_caption
,
context
))
# if we're leaving the figure, no sense keeping the data
cur_image
=
''
caption
=
''
return
(
cur_image
,
caption
,
extracted_image_data
)
def intelligently_find_filenames(line, TeX=False, ext=False,
                                 commas_okay=False):
    """
    Find the filename(s) in the line.  We don't support all filenames!  Just
    eps and ps for now.

    The line is scanned with several regular expressions (name after '=',
    name after 'file='/'figure=', name inside quotes/braces/brackets, name
    filling the whole line, name at the start or at the end of the line).
    Every distinct hit is collected in the order it was found.  Hits that
    contain spaces or commas are kept, and their space/comma separated
    pieces are appended as additional candidates.

    @param: line (string): the line we want to get a filename out of
    @param: TeX (boolean): if True, the name may be followed by any run of
        the characters '.', 'l', 'a', 't', 'e', 'x'
    @param: ext (boolean): if True, the name must end in something matching
        '.ps'/'.eps' optionally followed by characters from 'texfi2'
    @param: commas_okay (boolean): if True, a comma counts as a filename
        character

    @return: filename ([string, ...]): what is probably the name of the
        file(s); ['ERROR'] if nothing that looks like a filename was found
    """
    # 'ERROR' is a dummy first entry: it is what the caller gets back when
    # no pattern matches, and it is cut off again otherwise.
    files_included = ['ERROR']

    def remember(candidate):
        """Append candidate unless it has already been collected."""
        if candidate not in files_included:
            files_included.append(candidate)

    if commas_okay:
        valid_for_filename = r'\s*[A-Za-z0-9\-\=\+/\\_\.,%#]+'
    else:
        valid_for_filename = r'\s*[A-Za-z0-9\-\=\+/\\_\.%#]+'

    if ext:
        valid_for_filename = valid_for_filename + r'\.e*ps[texfi2]*'

    if TeX:
        valid_for_filename = valid_for_filename + r'[\.latex]*'

    # '=FILENAME,' or '=FILENAME ': drop the '=' and the terminator.  These
    # hits are appended unconditionally (no duplicate check, no strip).
    for file_included in re.findall('=' + valid_for_filename + '[ ,]', line):
        files_included.append(file_included[1:-1])

    # 'file=FILENAME', 'psfile=FILENAME', 'figure=FILENAME', ...: the match
    # always contains an '=', so keep what follows the first one.
    keyword_pattern = '(?:[ps]*file=|figure=)' + valid_for_filename + \
                      r'[,\]} ]*'
    for file_included in re.findall(keyword_pattern, line):
        remember(file_included.split('=', 1)[1].strip())

    # name wrapped in {} or [] or "" or '': peel off the delimiters
    delimited_pattern = '["\'{\\[]' + valid_for_filename + '[}\\],"\']'
    for file_included in re.findall(delimited_pattern, line):
        remember(file_included[1:-1].strip())

    # name filling the whole line, name starting the line, name ending the
    # line: only surrounding whitespace has to go
    whole_match_patterns = (
        '^' + valid_for_filename + '$',
        '^' + valid_for_filename + r'[,\} $]',
        r'\s*' + valid_for_filename + r'\s*$',
    )
    for pattern in whole_match_patterns:
        for file_included in re.findall(pattern, line):
            remember(file_included.strip())

    if files_included != ['ERROR']:
        files_included = files_included[1:]  # cut off the dummy

    # Some hits bundle several names ('a.eps b.eps' or 'a.eps,b.eps').  Walk
    # the list by index so that pieces appended here get examined as well,
    # without removing anything from the list while it is being traversed.
    position = 0
    while position < len(files_included):
        file_included = files_included[position]
        position += 1
        for separator in (' ', ','):
            if separator not in file_included:
                continue
            for subfile in file_included.split(separator):
                # empty pieces come from leading/trailing/doubled separators
                if subfile and subfile not in files_included:
                    files_included.append(subfile)

    return files_included
def upload_to_site(marcxml, yes_i_know):
    """
    Submit the generated MARCXML through a low-level 'bibupload' task so the
    record ends up on the site.  Unless the caller has already confirmed,
    the user is shown a boxed warning and has to acknowledge it first.

    @param: marcxml (string): the absolute location of the MARCXML that was
        generated by this programme
    @param: yes_i_know (boolean): if true, no confirmation.  if false, prompt.

    @output: a new record on the invenio site

    @return: None
    """
    needs_confirmation = not yes_i_know
    if needs_confirmation:
        warning_box = wrap_text_in_a_box('You are going to upload new ' + \
                                         'plots to the server.')
        wait_for_user(warning_box)

    # arguments handed to the task submission: task name, user, then the
    # '-a' option followed by the MARCXML file
    bibupload_arguments = ('bibupload', 'admin', '-a', marcxml)
    task_low_level_submission(*bibupload_arguments)
# Command-line help text: program name, usage lines, examples and the
# description of every option.  usage() hands this string to write_message().
help_string = """
name: plotextractor
usage:
python plotextractor.py -d tar/dir -s scratch/dir
python plotextractor.py -i inputfile -u
python plotextractor.py --arXiv=arXiv_id
python plotextractor.py --recid=recids
example:
python plotextractor.py -d /some/path/with/tarballs
python plotextractor.py -i input.txt --no-sdir --extract-text
python plotextractor.py --arXiv=hep-ex/0101001
python plotextractor.py --recid=13-20,29
options:
-d, --tardir=
if you wish to do a batch of tarballs, search the tree
rooted at this directory for them
-s, --scratchdir=
the directory for scratchwork (untarring, conversion, etc.).
make sure that this directory is one of the allowed dirs in
CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS to avoid errors. with an
sdir selected, one xml file will be generated for the whole
batch of files processed, and it will live in this sdir.
-i, --input=
if you wish to give an input file for downloading files from
arXiv (or wherever), this is the pointer to that file, which
should contain urls to download, no more than 1 per line. each
line should be the url of a tarball or gzipped tarball, and
each downloaded item will then be processed.
-x, --extract-text
if there is a pdf with the same base name as the tarball for each
tarball this is being run on, running with the -x parameter will
run pdftotext on each of these pdfs and store the result in the
folder
-f, --force
if you want to overwrite everything that was done before, just
force the script to overwrite it. otherwise it will only run on
things that haven't been run on yet (for use with tardir).
-c, --clean
if you wish to do delete all non-essential files that were extracted.
-u, --call-bibupload, --yes-i-know
if you want to upload the plots, ask to call bibupload. appending
the --yes-i-know flag bypasses bibupload's prompt to upload
-l, --refno-url
Specify an URL to the invenio-instance to query for refno.
Defaults to CFG_SITE_URL.
-k, --skip-refno
allows you to skip any refno check
-r, --recid=
if you want to process the tarball of one recid, use this tag. it
will also accept ranges (i.e. --recid=13-20)
-a, --arXiv=
if you want to process the tarball of one arXiv id, use this tag.
-t, --tarball=
for processing one tarball.
-q, --squash
if you want to squash all MARC into a single MARC file (for easier
and faster bibuploading)
-h, --help
Print this help and exit.
description: extracts plots from a tarfile from arXiv and generates
MARCXML that links figures and their captions. converts all
images to PNG format.
"""
def usage():
    """
    Output the command-line help by passing the module-level help_string
    to write_message.

    @return: None
    """
    write_message(help_string)
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Event Timeline
Log In to Comment