Page MenuHomec4science

plotextractor.py
No OneTemporary

File Metadata

Created
Sat, Nov 16, 10:18

plotextractor.py

# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011,  CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import sys
import os
import getopt
import re
import time
from invenio.shellutils import run_shell_command, Timeout, run_process_with_timeout
from invenio.invenio_connector import InvenioConnector
from invenio.textutils import wrap_text_in_a_box, \
wait_for_user
from invenio.config import CFG_TMPDIR, CFG_SITE_URL
from invenio.plotextractor_config import CFG_PLOTEXTRACTOR_DISALLOWED_TEX, \
CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT, \
CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT, \
CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT
from invenio.bibtask import task_low_level_submission
from invenio.plotextractor_getter import get_list_of_all_matching_files, \
parse_and_download, \
make_single_directory, \
tarballs_by_recids, \
tarballs_by_arXiv_id
from invenio.plotextractor_converter import untar, extract_text, \
convert_images
from invenio.plotextractor_output_utils import assemble_caption, \
find_open_and_close_braces, \
create_MARC, get_tex_location, \
get_image_location, \
create_contextfiles, \
prepare_image_data, \
write_message, remove_dups
from tempfile import mkstemp
"""
This programme will take a tarball from arXiv, untar it, convert all its
associated images to PNG, find the captions to the images detailed in the
included TeX document, and write MARCXML that reflects these associations.
"""
# Prefix that get_reference_number() looks for at the start of a tarball
# file name to decide whether the name already is an arXiv identifier.
ARXIV_HEADER = 'arXiv:'
# Suffix appended (after '_') to the tarball name by get_defaults() to build
# the name of the per-tarball working directory.
PLOTS_DIR = 'plots'
# extract_captions() and put_it_together() keep figures with sub-images or
# sub-captions as two-element lists; these are the indexes into those lists:
# element 0 is the main image/caption, element 1 the sub-images/sub-captions.
MAIN_CAPTION_OR_IMAGE = 0
SUB_CAPTION_OR_IMAGE = 1
def main():
    """
    The main program loop: parses the command line, gathers the tarballs to
    work on, runs process_single() on each of them and, when squashing,
    wraps all the generated MARCXML into one collection file.

    Options (short / long form):
        -h / --help            print the usage message and exit
        -t / --tarball=        a single tarball to process
        -d / --tdir=           a directory to scan for tarballs
        -i / --input=          a file handed to parse_and_download()
        -r / --recid=          record ids handed to tarballs_by_recids()
        -a / --arXiv=          an arXiv id handed to tarballs_by_arXiv_id()
        -s / --sdir=           scratch directory (defaults to CFG_TMPDIR)
        -l / --refno-url=      site queried for the record id
                               (defaults to CFG_SITE_URL)
        -k / --skip-refno      do not query any site for the record id
        -x / --extract-text    also run the text extraction
        -f / --force           overwrite an existing MARCXML file
        -u / --call-bibupload  upload the generated MARCXML
        -y / --yes-i-know      passed on to upload_to_site()
        -q / --squash          write a single MARCXML file for all tarballs
        -c / --clean           remove intermediate files afterwards

    Exits with status 2 on malformed options and with status 1 when no
    tarball could be found to process.
    """
    help_param = 'help'
    tarball_param = 'tarball'
    tardir_param = 'tdir'
    infile_param = 'input'
    sdir_param = 'sdir'
    extract_text_param = 'extract-text'
    force_param = 'force'
    upload_param = 'call-bibupload'
    yes_i_know_param = 'yes-i-know'
    recid_param = 'recid'
    arXiv_param = 'arXiv'
    squash_param = 'squash'
    refno_url_param = 'refno-url'
    refno_param = 'skip-refno'
    clean_param = 'clean'

    # A ':' (short form) or '=' (long form) marks an option that takes a
    # value.  -h is a plain flag, whereas -r/--recid carries the record ids.
    param_abbrs = 'ht:d:s:i:a:l:r:xfuyqck'
    params = [help_param, tarball_param + '=', tardir_param + '=',
              sdir_param + '=', infile_param + '=', arXiv_param + '=',
              refno_url_param + '=', recid_param + '=',
              extract_text_param, force_param, upload_param,
              yes_i_know_param, squash_param, clean_param, refno_param]

    try:
        opts, args = getopt.getopt(sys.argv[1:], param_abbrs, params)
    except getopt.GetoptError:
        # sys.exc_info() keeps this handler valid on old and new Pythons
        err = sys.exc_info()[1]
        write_message(str(err))
        usage()
        sys.exit(2)

    tarball = None
    sdir = None
    infile = None
    tdir = None
    xtract_text = False
    upload_plots = False
    force = False
    squash = False
    squash_path = ""
    yes_i_know = False
    recids = None
    arXiv = None
    clean = False
    refno_url = CFG_SITE_URL
    skip_refno = False

    for opt, arg in opts:
        # getopt reports long options with their leading '--'
        if opt in ('-h', '--' + help_param):
            usage()
            sys.exit()
        elif opt in ('-t', '--' + tarball_param):
            tarball = arg
        elif opt in ('-d', '--' + tardir_param):
            tdir = arg
        elif opt in ('-i', '--' + infile_param):
            infile = arg
        elif opt in ('-r', '--' + recid_param):
            recids = arg
        elif opt in ('-a', '--' + arXiv_param):
            arXiv = arg
        elif opt in ('-s', '--' + sdir_param):
            sdir = arg
        elif opt in ('-x', '--' + extract_text_param):
            xtract_text = True
        elif opt in ('-f', '--' + force_param):
            force = True
        elif opt in ('-u', '--' + upload_param):
            upload_plots = True
        elif opt in ('-q', '--' + squash_param):
            squash = True
        elif opt in ('-y', '--' + yes_i_know_param):
            yes_i_know = True
        elif opt in ('-c', '--' + clean_param):
            clean = True
        elif opt in ('-l', '--' + refno_url_param):
            refno_url = arg
        elif opt in ('-k', '--' + refno_param):
            skip_refno = True
        else:
            usage()
            sys.exit()

    if sdir is None:
        sdir = CFG_TMPDIR
    elif not os.path.isdir(sdir):
        try:
            os.makedirs(sdir)
        except OSError:
            write_message('Error: We can\'t use this sdir.  using ' + \
                          'CFG_TMPDIR')
            sdir = CFG_TMPDIR

    if skip_refno:
        # an empty URL tells get_defaults() not to look up the record id
        refno_url = ""

    tars_and_gzips = []

    if tarball is not None:
        tars_and_gzips.append(tarball)
    if tdir is not None:
        filetypes = ['gzip compressed', 'tar archive', 'Tar archive'] # FIXME
        write_message('Currently processing any tarballs in ' + tdir)
        tars_and_gzips.extend(get_list_of_all_matching_files(tdir, filetypes))
    if infile is not None:
        tars_and_gzips.extend(parse_and_download(infile, sdir))
    if recids is not None:
        tars_and_gzips.extend(tarballs_by_recids(recids, sdir))
    if arXiv is not None:
        tars_and_gzips.extend(tarballs_by_arXiv_id([arXiv], sdir))
    if tars_and_gzips == []:
        write_message('Error: no tarballs to process!')
        sys.exit(1)

    if squash:
        # every record goes into one file: open the collection here and let
        # process_single() append to it
        squash_fd, squash_path = mkstemp(
            suffix="_" + time.strftime("%Y%m%d%H%M%S") + ".xml",
            prefix="plotextractor_", dir=sdir)
        squash_file = os.fdopen(squash_fd, 'w')
        try:
            squash_file.write(
                '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')
        finally:
            squash_file.close()

    for tarball in tars_and_gzips:
        process_single(tarball, sdir=sdir, xtract_text=xtract_text,
                       upload_plots=upload_plots, force=force,
                       squash=squash_path, yes_i_know=yes_i_know,
                       refno_url=refno_url, clean=clean)

    if squash:
        squash_file = open(squash_path, "a")
        try:
            squash_file.write("</collection>\n")
        finally:
            squash_file.close()
        write_message("generated %s" % (squash_path,))
        if upload_plots:
            upload_to_site(squash_path, yes_i_know)
def process_single(tarball, sdir = CFG_TMPDIR, xtract_text = False, \
                   upload_plots = False, force = False, squash = "", \
                   yes_i_know = False, refno_url = "", \
                   clean = False):
    """
    Runs the whole extraction chain on one tarball: untar, convert the
    images, match captions and contexts, and write the MARCXML.

    @param: tarball (string): the absolute location of the tarball to process
    @param: sdir (string): where the intermediate files should go.  When
        uploading, this directory should be one of those listed in
        CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, else the upload won't work
    @param: xtract_text (boolean): true iff extract_text() should be run on
        the tarball
    @param: upload_plots (boolean): true iff the generated MARCXML should be
        uploaded (only done here when not squashing)
    @param: force (boolean): overwrite an already existing MARCXML file
    @param: squash (string): path of a 'squash' file to append the MARCXML
        to, instead of writing one file per tarball; "" disables squashing
    @param: yes_i_know (boolean): passed on to upload_to_site()
    @param: refno_url (string): URL of the site to query for the record id;
        "" skips the query
    @param: clean (boolean): if True, intermediate files are removed with
        clean_up() once the work is done

    @return: marc_name (string): path to the MARCXML file, or None when the
        extraction timed out or no TeX file was found in the tarball
    """
    sub_dir, refno = get_defaults(tarball, sdir, refno_url)
    tarball_name = os.path.split(tarball)[-1]

    if squash:
        marc_name = squash
    else:
        marc_name = os.path.join(sub_dir, '%s.xml' % (refno,))
        if force or not os.path.exists(marc_name):
            # start a fresh per-tarball collection
            header_fd = open(marc_name, 'w')
            header_fd.write('<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')
            header_fd.close()

    if xtract_text:
        extract_text(tarball)

    try:
        extracted_files_list, image_list, tex_files = untar(tarball, sub_dir)
    except Timeout:
        write_message('Timeout during tarball extraction on %s' % (tarball,))
        return

    if tex_files in ([], None):
        write_message('%s is not a tarball' % (tarball_name,))
        run_shell_command('rm -r %s', (sub_dir,))
        return

    converted_image_list = convert_images(image_list)
    write_message('converted %d of %d images found for %s' % \
                  (len(converted_image_list), len(image_list), \
                   os.path.basename(tarball)))

    extracted_image_data = []
    for tex_file in tex_files:
        # images, captions and labels as found in this TeX file
        raw_image_data = extract_captions(tex_file, sub_dir, \
                                          converted_image_list)
        if raw_image_data == []:
            continue
        # proper file paths and assorted cleaning
        cleaned_image_data = prepare_image_data(raw_image_data, \
                                                tex_file, converted_image_list)
        # text surrounding every reference to each image
        extracted_image_data.extend(
            extract_context(tex_file, cleaned_image_data))

    extracted_image_data = remove_dups(extracted_image_data)

    if extracted_image_data == []:
        write_message('No plots detected in %s' % (refno,))
    else:
        if refno_url == "":
            refno = None
        create_contextfiles(extracted_image_data)
        marc_xml = create_MARC(extracted_image_data, tarball, refno)
        if not squash:
            # a per-tarball file is complete after this record
            marc_xml += "\n</collection>"
        if marc_name is not None:
            marc_fd = open(marc_name, 'a')
            marc_fd.write('%s\n' % (marc_xml,))
            marc_fd.close()
            if not squash:
                write_message('generated %s' % (marc_name,))
                if upload_plots:
                    upload_to_site(marc_name, yes_i_know)

    if clean:
        clean_up(extracted_files_list, image_list)

    write_message('work complete on %s' % (tarball_name,))
    return marc_name
def clean_up(extracted_files_list, image_list):
    """
    Removes all the intermediate stuff.

    Every extracted path is removed with 'rm', except for the paths listed
    in image_list and the paths ending with the path separator (directories).
    Empty path entries are ignored.

    @param: extracted_files_list ([string, string, ...]): list of all
        extracted files
    @param: image_list ([string, string, ...]): list of the images to keep
    """
    for extracted_file in extracted_files_list:
        if not extracted_file:
            # nothing to remove; also avoids indexing into an empty string
            continue
        if extracted_file in image_list or extracted_file.endswith(os.sep):
            continue
        run_shell_command('rm %s', (extracted_file,))
def get_defaults(tarball, sdir, refno_url):
    """
    Works out the working directory and the reference number for a tarball.

    @param: tarball (string): the location of the tarball to be extracted
    @param: sdir (string): the scratch directory.  Only None is honoured
        (CFG_TMPDIR is then used); any other value is replaced by the
        directory the tarball lives in
    @param: refno_url (string): server location on where to look for the
        reference number; "" skips the lookup

    @return: sdir, refno (string, string): the per-tarball sub-directory
        returned by make_single_directory(), and the reference number, which
        falls back to the tarball file name when no lookup is done or when
        the lookup finds nothing
    """
    tarball_dir, tarball_name = os.path.split(tarball)

    if sdir == None:
        # Missing sdir: using default directory: CFG_TMPDIR
        base_dir = CFG_TMPDIR
    else:
        base_dir = tarball_dir

    # one sub-directory per tarball
    plots_dir = make_single_directory(base_dir, tarball_name + '_' + PLOTS_DIR)

    if refno_url == "":
        refno = tarball_name
        write_message("Skipping ref-no check")
    else:
        refno = get_reference_number(tarball, refno_url)
        if refno == None:
            refno = tarball_name
            write_message('Error: can\'t find record id for %s' % (refno,))

    return plots_dir, refno
def get_reference_number(tarball, refno_url):
    """
    Attempts to determine the reference number of the file by searching.

    @param: tarball (string): the name of the tarball as downloaded from
        arXiv
    @param: refno_url (string): url of repository to check for a
        reference number for this record. If not set; returns None

    @return: refno (string): the reference number of the paper, or None
        when refno_url is not set or no record was found
    """
    if not refno_url:
        return None

    server = InvenioConnector(refno_url)
    # we just need the name of the file
    tarball = os.path.split(tarball)[1]
    prefix = '037__a:'

    # the name right now looks like arXiv:hep-ph_9703009
    # or arXiv:0910.0476
    if tarball.startswith(ARXIV_HEADER):
        if len(tarball.split('_')) > 1:
            tarball = tarball.split(':')[1]
            arXiv_record = tarball.replace('_', '/')
        else:
            arXiv_record = tarball

        result = server.search(p = prefix + arXiv_record, of = 'id')
        if len(result) == 0:
            return None
        return str(result[0])

    # Otherwise look for something shaped like an arXiv id in the file name,
    # first as it is and then with '_' turned back into '/'.  The pattern
    # uses non-capturing groups so that findall() yields plain strings that
    # can be appended to the search prefix.
    arXiv_id = re.compile(r'(?:[a-zA-Z\-]+/\d+)|(?:\d+\.\d+)')
    for candidate_name in (tarball, tarball.replace('_', '/')):
        found = arXiv_id.findall(candidate_name)
        if found:
            result = server.search(p = prefix + found[0], of = 'id')
            if len(result) > 0:
                return str(result[0])

    return None
def rotate_image(filename, line, sdir, image_list):
    """
    Given a filename and a line, figure out what it is that the author
    wanted to do wrt changing the rotation of the image and convert the
    file so that this rotation is reflected in its presentation.

    @param: filename (string): the name of the file as specified in the TeX
    @param: line (string): the line where the rotate command was found
    @param: sdir (string): passed on to get_image_location()
    @param: image_list (list): passed on to get_image_location()

    @output: the image file rotated in accordance with the rotate command
    @return: True if mogrify was run on the image, False when the line holds
        no usable angle or the image could not be located
    """
    degrees = re.findall('(angle=[-\\d]+|rotate=[-\\d]+)', line)
    if len(degrees) < 1:
        return False

    degrees = degrees[0].split('=')[-1].strip()
    # only a plain, optionally negative, integer is accepted: values such as
    # '-', '--5' or '9-0' cannot be converted by int()
    if not re.match(r'-?\d+$', degrees):
        return False

    file_loc = get_image_location(filename, sdir, image_list)
    if file_loc is None or file_loc == 'ERROR':
        return False

    # the rotation is applied with the opposite sign of the TeX angle
    degrees = str(0 - int(degrees))

    # The path comes from the contents of the tarball and the command goes
    # through a shell: single-quote it so that blanks or shell
    # metacharacters in the name are taken literally.
    quoted_loc = "'" + file_loc.replace("'", "'\\''") + "'"
    run_process_with_timeout('mogrify -rotate %s %s' % \
                             (degrees, quoted_loc), shell = True)

    # whatever mogrify printed on stderr, the rotation counts as attempted
    return True
def get_context(lines, backwards = False):
    """
    Extracts from a piece of TeX the words that are contextually relevant,
    walking either forwards from its beginning or backwards from its end.

    Words are collected until CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT is reached
    or until a word holds a TeX command listed in
    CFG_PLOTEXTRACTOR_DISALLOWED_TEX.  The collected text is then cut down to
    at most CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT sentences.

    For use when extracting text with contextual value for a figure or plot.

    @param lines (string): string to examine
    @param backwards (bool): are we searching backwards?

    @return context (string): extracted context
    """
    tex_tag = re.compile(r".*\\(\w+).*")
    sentence = re.compile(r"(?<=[.?!])[\s]+(?=[A-Z])")

    words = lines.split()
    if backwards:
        words.reverse()

    kept = []
    for word in words:
        if len(kept) >= CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT:
            break
        tag = tex_tag.match(word)
        if tag and tag.group(1) in CFG_PLOTEXTRACTOR_DISALLOWED_TEX:
            # a disallowed TeX construct ends the context
            if backwards:
                # walking backwards we have already collected what sits in
                # the brackets of the construct: drop words up to and
                # including the one holding a '}'
                while kept:
                    if '}' in kept.pop():
                        break
            break
        kept.append(word)

    if backwards:
        kept.reverse()

    sentences = sentence.split(" ".join(kept))
    if backwards:
        # NOTE(review): the sentences closest to the end of the text come
        # first in the result when searching backwards
        sentences.reverse()

    return " ".join(sentences[:CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT])
def extract_context(tex_file, extracted_image_data):
    """
    Given a .tex file and the images found in it, this function extracts,
    for every reference made to the label of an image, the text before and
    after that reference. The number of characters looked at before and
    after is CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT.

    @param tex_file (string): path to .tex file
    @param extracted_image_data ([(string, string, string), ...]):
        a list of (image, caption, label) tuples from this document.

    @return extracted_image_data ([(string, string, string, list),
        (string, string, string, list),...)]: the same tuples, each with the
        list of extracted contexts added; [] when tex_file is a directory or
        does not exist
    """
    if os.path.isdir(tex_file) or not os.path.exists(tex_file):
        return []

    fd = open(tex_file)
    try:
        lines = fd.read()
    finally:
        fd.close()

    # Generate context for each image and its assoc. labels
    new_image_data = []
    for image, caption, label in extracted_image_data:
        context_list = []

        # The label is matched literally: labels often hold characters that
        # are special in regular expressions.
        reference = re.compile(r"\\(?:fig|ref)\{" + re.escape(label) + r"\}")

        for match in reference.finditer(lines):
            startindex, endindex = match.span()

            # Text before the reference, not reaching past the file start
            i = max(startindex - CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT, 0)
            text_before = lines[i:startindex]
            context_before = get_context(text_before, backwards = True)

            # Text after the reference
            i = endindex + CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT
            text_after = lines[endindex:i]
            context_after = get_context(text_after)

            context_list.append(context_before + ' \\ref{' + label + '} ' + \
                                context_after)

        new_image_data.append((image, caption, label, context_list))

    return new_image_data
def _append_image(cur_image, filename):
    """
    Adds one more filename to the image(s) collected for the figure that is
    currently being parsed.

    @param: cur_image (string || list): '' when nothing was collected yet, a
        single filename, or a [main image, [sub-image, ...]] list
    @param: filename (string): the filename to add

    @return: cur_image (string || list): the updated collection
    """
    if cur_image == '':
        return filename
    if type(cur_image) == list:
        if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
            cur_image[SUB_CAPTION_OR_IMAGE].append(filename)
        else:
            cur_image[SUB_CAPTION_OR_IMAGE] = [filename]
        return cur_image
    return ['', [cur_image, filename]]

def extract_captions(tex_file, sdir, image_list, primary = True):
    """
    Take the TeX file and the list of images in the tarball (which all,
    presumably, are used in the TeX file) and figure out which captions
    in the text are associated with which images

    @param: tex_file (string): the name of the TeX file which mentions
        the images
    @param: sdir (string): path to current sub-directory
    @param: image_list (list): list of images in tarball
    @param: primary (bool): is this the primary call to extract_caption?
        Only the primary call drops what precedes \\begin{document} and
        follows \\input tags into other TeX files

    @return: images_and_captions_and_labels ([(string, string, string),
        (string, string, string), ...]):
        a list of tuples representing the names of images, their captions
        and their corresponding figure labels from the TeX file; [] when
        tex_file is a directory or does not exist
    """
    if os.path.isdir(tex_file) or not os.path.exists(tex_file):
        return []

    fd = open(tex_file)
    try:
        lines = fd.readlines()
    finally:
        fd.close()

    # possible figure lead-ins
    figure_head = '\\begin{figure'  # also matches figure*
    figure_tail = '\\end{figure'  # also matches figure*
    picture_head = '\\begin{picture}'
    displaymath_head = '\\begin{displaymath}'
    subfloat_head = '\\subfloat'
    subfig_head = '\\subfigure'
    includegraphics_head = '\\includegraphics'
    epsfig_head = '\\epsfig'
    input_head = '\\input'
    # possible caption lead-ins
    caption_head = '\\caption'
    figcaption_head = '\\figcaption'
    label_head = '\\label'
    rotate = 'rotate='
    angle = 'angle='
    eps_tail = '.eps'
    ps_tail = '.ps'
    doc_head = '\\begin{document}'
    doc_tail = '\\end{document}'

    extracted_image_data = []
    cur_image = ''
    caption = ''
    labels = []
    active_label = ""

    # blank out everything that comes before the doc head
    if primary:
        for line_index in range(len(lines)):
            if lines[line_index].find(doc_head) < 0:
                lines[line_index] = ''
            else:
                break

    # are we using commas in filenames here?
    # The tree two levels above the TeX file is searched; the walk stops at
    # the first file name holding a comma.
    commas_okay = False
    for dummy1, dummy2, filenames in \
            os.walk(os.path.split(os.path.split(tex_file)[0])[0]):
        for filename in filenames:
            if filename.find(',') > -1:
                commas_okay = True
                break
        if commas_okay:
            break

    # a comment is a % not preceded by a \
    comment = re.compile("(?<!\\\\)%")

    for line_index in range(len(lines)):
        # get rid of pesky comments by splitting where the comment is
        # and keeping only the part before the %
        line = comment.split(lines[line_index])[0]
        line = line.strip()
        lines[line_index] = line

    in_figure_tag = 0

    for line_index in range(len(lines)):
        line = lines[line_index]

        if line == '':
            continue
        if line.find(doc_tail) > -1:
            return extracted_image_data

        # FIGURE -
        # structure of a figure:
        # \begin{figure}
        # \formatting...
        # \includegraphics[someoptions]{FILENAME}
        # \caption{CAPTION}  %caption and includegraphics may be switched!
        # \end{figure}
        index = line.find(figure_head)
        if index > -1:
            in_figure_tag = 1
            # some people don't put things in the figure tag: flush whatever
            # was collected outside of a figure before entering this one
            cur_image, caption, extracted_image_data = \
                put_it_together(cur_image, caption, active_label, \
                                extracted_image_data, line_index, lines)

        # inclusions that cannot be told apart by their tag: anything that
        # mentions a .eps/.ps file or uses \epsfig
        index = max([line.find(eps_tail), line.find(ps_tail), \
                     line.find(epsfig_head)])
        if index > -1:
            if line.find(eps_tail) > -1 or line.find(ps_tail) > -1:
                ext = True
            else:
                ext = False
            filenames = intelligently_find_filenames(line, ext = ext,
                                                     commas_okay = commas_okay)

            # try to look ahead!  sometimes there are better matches after
            if line_index < len(lines) - 1:
                filenames.extend(\
                    intelligently_find_filenames(lines[line_index + 1],
                                                 commas_okay = commas_okay))
            if line_index < len(lines) - 2:
                filenames.extend(\
                    intelligently_find_filenames(lines[line_index + 2],
                                                 commas_okay = commas_okay))

            for filename in filenames:
                cur_image = _append_image(cur_image, str(filename))

        # Rotate and angle
        index = max(line.find(rotate), line.find(angle))
        if index > -1:
            # which is the image associated to it?
            filenames = intelligently_find_filenames(line,
                                                     commas_okay = commas_okay)
            # try the line after and the line before
            if line_index + 1 < len(lines):
                filenames.extend(\
                    intelligently_find_filenames(lines[line_index + 1],
                                                 commas_okay = commas_okay))
            if line_index > 0:
                filenames.extend(\
                    intelligently_find_filenames(lines[line_index - 1],
                                                 commas_okay = commas_okay))

            already_tried = []
            for filename in filenames:
                if filename != 'ERROR' and not filename in already_tried:
                    if rotate_image(filename, line, sdir, image_list):
                        break
                    already_tried.append(filename)

        # INCLUDEGRAPHICS -
        # structure of includegraphics:
        # \includegraphics[someoptions]{FILENAME}
        index = line.find(includegraphics_head)
        if index > -1:
            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(line_index, index, '{', lines)
            filename = lines[open_curly_line][open_curly + 1:close_curly]
            cur_image = _append_image(cur_image, filename)

        # {\input{FILENAME}}
        # \caption{CAPTION}
        #
        # This input is ambiguous, since input is also used for things like
        # inclusion of data from other LaTeX files directly.
        index = line.find(input_head)
        if index > -1:
            new_tex_names = intelligently_find_filenames(line, TeX = True, \
                                                    commas_okay = commas_okay)
            for new_tex_name in new_tex_names:
                if new_tex_name != 'ERROR':
                    new_tex_file = get_tex_location(new_tex_name, tex_file)
                    if new_tex_file != None and primary: #to kill recursion
                        extracted_image_data.extend(extract_captions(\
                            new_tex_file, sdir, image_list, primary = False))

        # PICTURE
        index = line.find(picture_head)
        if index > -1:
            # structure of a picture:
            # \begin{picture}
            # ....not worrying about this now
            #write_message('found picture tag')
            #FIXME
            pass

        # DISPLAYMATH
        index = line.find(displaymath_head)
        if index > -1:
            # structure of a displaymath:
            # \begin{displaymath}
            # ....not worrying about this now
            #write_message('found displaymath tag')
            #FIXME
            pass

        # CAPTIONS -
        # structure of a caption:
        # \caption[someoptions]{CAPTION}
        # or
        # \caption{CAPTION}
        # or
        # \caption{{options}{CAPTION}}
        index = max([line.find(caption_head), line.find(figcaption_head)])
        if index > -1:
            open_curly, open_curly_line, close_curly, close_curly_line = \
                find_open_and_close_braces(line_index, index, '{', lines)

            cap_begin = open_curly + 1

            cur_caption = assemble_caption(open_curly_line, cap_begin, \
                                    close_curly_line, close_curly, lines)

            if caption == '':
                caption = cur_caption
            elif type(caption) == list:
                if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                    caption[SUB_CAPTION_OR_IMAGE].append(cur_caption)
                else:
                    caption[SUB_CAPTION_OR_IMAGE] = [cur_caption]
            elif caption != cur_caption:
                caption = ['', [caption, cur_caption]]

        # SUBFLOATS -
        # structure of a subfloat (inside of a figure tag):
        # \subfloat[CAPTION]{options{FILENAME}}
        #
        # also associated with the overall caption of the enclosing figure
        index = line.find(subfloat_head)
        if index > -1:
            # if we are dealing with subfloats, we need a different
            # sort of structure to keep track of captions and subcaptions
            if type(cur_image) != list:
                cur_image = [cur_image, []]
            if type(caption) != list:
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, \
                    cap_begin, close_square_line, close_square, lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(close_square_line, \
                                           close_square, '{', lines)
            sub_image = lines[open_curly_line][open_curly + 1:close_curly]

            cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        # SUBFIGURES -
        # structure of a subfigure (inside a figure tag):
        # \subfigure[CAPTION]{
        # \includegraphics[options]{FILENAME}}
        #
        # also associated with the overall caption of the enclosing figure
        index = line.find(subfig_head)
        if index > -1:
            # like with subfloats, we need a different structure for keeping
            # track of this stuff
            if type(cur_image) != list:
                cur_image = [cur_image, []]
            if type(caption) != list:
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, \
                    cap_begin, close_square_line, close_square, lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            # find the graphics tag to get the filename.
            # NOTE: moving line_index and line forward here only affects the
            # rest of this iteration (the label and end-of-figure checks
            # below); the enclosing for loop resumes from its own counter,
            # so the lines scanned here are visited again.
            index = line.find(includegraphics_head)
            while index == -1 and (line_index + 1) < len(lines):
                line_index = line_index + 1
                line = lines[line_index]
                index = line.find(includegraphics_head)

            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(line_index, index, '{', lines)
            sub_image = lines[open_curly_line][open_curly + 1:close_curly]

            cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        # LABELS -
        # structure of a label:
        # \label{somelabelnamewhichprobablyincludesacolon}
        #
        # Labels are used to tag images and will later be used in ref tags
        # to reference them.  This is interesting because in effect the refs
        # to a plot are additional caption for it.
        #
        # Notes: labels can be used for many more things than just plots.
        # We'll have to experiment with how to best associate a label with an
        # image.. if it's in the caption, it's easy.  If it's in a figure,
        # it's still okay... but the images that aren't in figure tags are
        # numerous.
        index = line.find(label_head)
        if index > -1 and in_figure_tag:
            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(line_index, index, '{', lines)
            label = lines[open_curly_line][open_curly + 1:close_curly]
            if label not in labels:
                active_label = label
                labels.append(label)

        # FIGURE
        #
        # important: we put the check for the end of the figure at the end
        # of the loop in case some pathological person puts everything in one
        # line
        index = max([line.find(figure_tail), line.find(doc_tail)])
        if index > -1:
            in_figure_tag = 0
            cur_image, caption, extracted_image_data = \
                put_it_together(cur_image, caption, active_label, \
                                extracted_image_data, line_index, lines)

        # END DOCUMENT
        #
        # we shouldn't look at anything after the end document tag is found
        index = line.find(doc_tail)
        if index > -1:
            break

    return extracted_image_data
def _attach_caption(cur_image, caption, context, extracted_image_data):
    """
    Appends the image, or the main image and each of its sub-images, to
    extracted_image_data, all of them with the same caption and context.
    """
    if isinstance(cur_image, list):
        extracted_image_data.append(
            (cur_image[MAIN_CAPTION_OR_IMAGE], caption, context))
        for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
            extracted_image_data.append((sub_img, caption, context))
    else:
        extracted_image_data.append((cur_image, caption, context))

def put_it_together(cur_image, caption, context, extracted_image_data, line_index, \
                    lines):
    """
    Takes the current image(s) and caption(s) and assembles them into
    something useful in the extracted_image_data list.

    @param: cur_image (string || list): the image currently being dealt with, or
        the list of images, in the case of subimages
    @param: caption (string || list): the caption or captions currently in scope
    @param: context (string): stored as the third element of every tuple
        appended here (the callers in this file pass the active figure label)
    @param: extracted_image_data ([(string, string, string), ...]):
        a list of tuples of images matched to captions from this document.
    @param: line_index (int): the index where we are in the lines (for
        searchback and searchforward purposes)
    @param: lines ([string, string, ...]): the lines in the TeX

    @return: (cur_image, caption, extracted_image_data): cur_image and
        caption are always reset to '', extracted_image_data is the list
        that was passed in, extended in place
    """
    if isinstance(cur_image, list):
        if cur_image[MAIN_CAPTION_OR_IMAGE] == 'ERROR':
            cur_image[MAIN_CAPTION_OR_IMAGE] = ''
        sub_images = cur_image[SUB_CAPTION_OR_IMAGE]
        if isinstance(sub_images, list):
            # filter in place: removing entries while looping over the same
            # list would skip the entry following each removal
            sub_images[:] = [image for image in sub_images \
                             if image != 'ERROR']

    if cur_image != '' and caption != '':

        if isinstance(cur_image, list) and isinstance(caption, list):

            if cur_image[MAIN_CAPTION_OR_IMAGE] != '' and \
                    caption[MAIN_CAPTION_OR_IMAGE] != '':
                extracted_image_data.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE],
                     caption[MAIN_CAPTION_OR_IMAGE],
                     context))
            if isinstance(cur_image[MAIN_CAPTION_OR_IMAGE], list):
                # why is the main image a list?
                # it's a good idea to attach the main caption to other
                # things, but the main image can only be used once
                cur_image[MAIN_CAPTION_OR_IMAGE] = ''

            if isinstance(cur_image[SUB_CAPTION_OR_IMAGE], list):
                if isinstance(caption[SUB_CAPTION_OR_IMAGE], list):
                    # sub-images and sub-captions are paired by position
                    for index in \
                            range(len(cur_image[SUB_CAPTION_OR_IMAGE])):
                        if index < len(caption[SUB_CAPTION_OR_IMAGE]):
                            long_caption = \
                                caption[MAIN_CAPTION_OR_IMAGE] + ' : ' + \
                                caption[SUB_CAPTION_OR_IMAGE][index]
                        else:
                            long_caption = \
                                caption[MAIN_CAPTION_OR_IMAGE] + ' : ' + \
                                'Caption not extracted'
                        extracted_image_data.append(
                            (cur_image[SUB_CAPTION_OR_IMAGE][index],
                             long_caption, context))
                else:
                    long_caption = caption[MAIN_CAPTION_OR_IMAGE] + \
                        ' : ' + caption[SUB_CAPTION_OR_IMAGE]
                    for sub_image in cur_image[SUB_CAPTION_OR_IMAGE]:
                        extracted_image_data.append(
                            (sub_image, long_caption, context))
            else:
                if isinstance(caption[SUB_CAPTION_OR_IMAGE], list):
                    long_caption = caption[MAIN_CAPTION_OR_IMAGE]
                    for sub_cap in caption[SUB_CAPTION_OR_IMAGE]:
                        long_caption = long_caption + ' : ' + sub_cap
                    extracted_image_data.append(
                        (cur_image[SUB_CAPTION_OR_IMAGE], long_caption,
                         context))
                else:
                    # neither holds a list of subs although both are lists
                    extracted_image_data.append(
                        (cur_image[SUB_CAPTION_OR_IMAGE],
                         caption[SUB_CAPTION_OR_IMAGE], context))

        elif isinstance(cur_image, list):
            if cur_image[MAIN_CAPTION_OR_IMAGE] != '':
                extracted_image_data.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE], caption, context))
            if isinstance(cur_image[SUB_CAPTION_OR_IMAGE], list):
                for image in cur_image[SUB_CAPTION_OR_IMAGE]:
                    extracted_image_data.append((image, caption, context))
            else:
                extracted_image_data.append(
                    (cur_image[SUB_CAPTION_OR_IMAGE], caption, context))

        elif isinstance(caption, list):
            if caption[MAIN_CAPTION_OR_IMAGE] != '':
                extracted_image_data.append(
                    (cur_image, caption[MAIN_CAPTION_OR_IMAGE], context))
            if isinstance(caption[SUB_CAPTION_OR_IMAGE], list):
                # multiple caps for one image:
                long_caption = caption[MAIN_CAPTION_OR_IMAGE]
                for subcap in caption[SUB_CAPTION_OR_IMAGE]:
                    if long_caption != '':
                        long_caption += ' : '
                    long_caption += subcap
                extracted_image_data.append((cur_image, long_caption, context))
            else:
                extracted_image_data.append(
                    (cur_image, caption[SUB_CAPTION_OR_IMAGE], context))

        else:
            extracted_image_data.append((cur_image, caption, context))

    elif cur_image != '' and caption == '':
        # we may have missed the caption somewhere.
        REASONABLE_SEARCHBACK = 25
        REASONABLE_SEARCHFORWARD = 5
        # an opening brace that does not directly follow a word character,
        # i.e. one that is not the argument of a TeX command
        curly_no_tag_preceding = r'(?<!\w)\{'

        for searchback in range(REASONABLE_SEARCHBACK):
            if line_index - searchback < 0:
                continue

            back_line = lines[line_index - searchback]
            m = re.search(curly_no_tag_preceding, back_line)
            if m is not None:
                open_curly = m.start()
                open_curly, open_curly_line, close_curly, \
                    close_curly_line = find_open_and_close_braces(\
                        line_index - searchback, open_curly, '{', lines)

                cap_begin = open_curly + 1

                caption = assemble_caption(open_curly_line, cap_begin, \
                                    close_curly_line, close_curly, lines)
                _attach_caption(cur_image, caption, context,
                                extracted_image_data)
                break

        if caption == '':
            for searchforward in range(REASONABLE_SEARCHFORWARD):
                if line_index + searchforward >= len(lines):
                    break

                fwd_line = lines[line_index + searchforward]
                m = re.search(curly_no_tag_preceding, fwd_line)
                if m is not None:
                    open_curly = m.start()
                    open_curly, open_curly_line, close_curly, \
                        close_curly_line = find_open_and_close_braces(\
                            line_index + searchforward, open_curly, '{', lines)

                    cap_begin = open_curly + 1

                    caption = assemble_caption(open_curly_line, \
                            cap_begin, close_curly_line, close_curly, lines)
                    _attach_caption(cur_image, caption, context,
                                    extracted_image_data)
                    break

        if caption == '':
            if isinstance(cur_image, list):
                extracted_image_data.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE], 'No caption found',
                     context))
                for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
                    extracted_image_data.append(
                        (sub_img, 'No caption', context))
            else:
                extracted_image_data.append(
                    (cur_image, 'No caption found', context))

    elif caption != '' and cur_image == '':
        if isinstance(caption, list):
            long_caption = caption[MAIN_CAPTION_OR_IMAGE]
            for subcap in caption[SUB_CAPTION_OR_IMAGE]:
                long_caption = long_caption + ': ' + subcap
        else:
            long_caption = caption
        extracted_image_data.append(('', 'noimg' + long_caption, context))

    # if we're leaving the figure, no sense keeping the data
    cur_image = ''
    caption = ''

    return (cur_image, caption, extracted_image_data)
def _add_unique_filename(filenames, candidate):
    """
    Append candidate to the list filenames unless it is already present,
    so that first-seen order is kept and no duplicate is introduced.
    """
    if candidate not in filenames:
        filenames.append(candidate)


def intelligently_find_filenames(line, TeX=False, ext=False, commas_okay=False):
    """
    Find the filename(s) in the line.  We don't support all filenames!  Just
    eps and ps for now.

    Several regular expressions are tried in turn (text after an '=', after
    'file=' / 'figure=', text wrapped in braces, brackets or quotes, and text
    filling the start or end of the line).  Every candidate they produce is
    collected; candidates that contain spaces or commas are additionally
    split, and the individual pieces are added as candidates too.

    @param: line (string): the line we want to get a filename out of
    @param: TeX (boolean): if true, a candidate may be followed by any run of
        the characters '.', 'l', 'a', 't', 'e', 'x'
    @param: ext (boolean): if true, a candidate has to end in something that
        looks like a (e)ps extension, e.g. '.ps', '.eps', '.epsi'
    @param: commas_okay (boolean): if true, commas count as filename
        characters

    @return: filename ([string, ...]): what is probably the name of the
        file(s); the list ['ERROR'] if nothing was found
    """
    # 'ERROR' is a dummy first entry: if it is still the only entry once all
    # patterns have been tried, it is returned to signal "nothing found".
    files_included = ['ERROR']

    allowed_chars = r'A-Za-z0-9\-\=\+/\\_\.'
    if commas_okay:
        allowed_chars += ','
    valid_for_filename = r'\s*[' + allowed_chars + r'%#]+'

    if ext:
        valid_for_filename += r'\.e*ps[texfi2]*'
    if TeX:
        valid_for_filename += r'[\.latex]*'

    # looks like '=FILENAME,' or '=FILENAME '; drop the '=' and the delimiter.
    # These are taken as they are (no stripping, no duplicate check).
    for found in re.findall('=' + valid_for_filename + '[ ,]', line):
        files_included.append(found[1:-1])

    # 'file=FILENAME' / 'psfile=FILENAME' / 'figure=FILENAME': every match
    # contains an '=', so keep whatever follows the first one.
    for found in re.findall('(?:[ps]*file=|figure=)' +
                            valid_for_filename + r'[,\]} ]*', line):
        _add_unique_filename(files_included, found.split('=', 1)[1].strip())

    # still has the {} or [] or "" or '' around it
    for found in re.findall('["\'{\\[]' + valid_for_filename + '[}\\],"\']',
                            line):
        _add_unique_filename(files_included, found[1:-1].strip())

    # the candidate fills the whole line, starts it, or ends it
    bare_patterns = ('^' + valid_for_filename + '$',
                     '^' + valid_for_filename + r'[,\} $]',
                     r'\s*' + valid_for_filename + r'\s*$')
    for pattern in bare_patterns:
        for found in re.findall(pattern, line):
            _add_unique_filename(files_included, found.strip())

    if files_included != ['ERROR']:
        files_included = files_included[1:] # cut off the dummy

    # Expand candidates that hold several names separated by spaces or
    # commas.  An explicit index is used because the list grows while it is
    # walked, and the freshly added pieces have to be examined as well.
    position = 0
    while position < len(files_included):
        candidate = files_included[position]
        position += 1
        for separator in (' ', ','):
            if separator not in candidate:
                continue
            for subfile in candidate.split(separator):
                # splitting ' a' or 'a,' yields '', which is never a filename
                if subfile:
                    _add_unique_filename(files_included, subfile)

    return [filename for filename in files_included if filename]
def upload_to_site(marcxml, yes_i_know):
    """
    Hands the generated MARCXML over to bibupload so that the record ends up
    on the site.

    @param: marcxml (string): the absolute location of the MARCXML that was
        generated by this programme
    @param: yes_i_know (boolean): if true, no confirmation. if false, prompt.

    @output: a 'bibupload' task submitted with the arguments
        'admin', '-a' and marcxml
    @return: None
    """
    confirmation_needed = not yes_i_know
    if confirmation_needed:
        # show the boxed warning and wait for the user before submitting
        notice = wrap_text_in_a_box(
            'You are going to upload new plots to the server.')
        wait_for_user(notice)

    task_low_level_submission('bibupload', 'admin', '-a', marcxml)
# Help text for the command-line interface: name, usage lines, examples and
# a description of every option.  usage() passes this string, unchanged, to
# write_message().
help_string = """
name: plotextractor
usage:
python plotextractor.py -d tar/dir -s scratch/dir
python plotextractor.py -i inputfile -u
python plotextractor.py --arXiv=arXiv_id
python plotextractor.py --recid=recids
example:
python plotextractor.py -d /some/path/with/tarballs
python plotextractor.py -i input.txt --no-sdir --extract-text
python plotextractor.py --arXiv=hep-ex/0101001
python plotextractor.py --recid=13-20,29
options:
-d, --tardir=
if you wish to do a batch of tarballs, search the tree
rooted at this directory for them
-s, --scratchdir=
the directory for scratchwork (untarring, conversion, etc.).
make sure that this directory is one of the allowed dirs in
CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS to avoid errors. with an
sdir selected, one xml file will be generated for the whole
batch of files processed, and it will live in this sdir.
-i, --input=
if you wish to give an input file for downloading files from
arXiv (or wherever), this is the pointer to that file, which
should contain urls to download, no more than 1 per line. each
line should be the url of a tarball or gzipped tarball, and
each downloaded item will then be processed.
-x, --extract-text
if there is a pdf with the same base name as the tarball for each
tarball this is being run on, running with the -x parameter will
run pdftotext on each of these pdfs and store the result in the
folder
-f, --force
if you want to overwrite everything that was done before, just
force the script to overwrite it. otherwise it will only run on
things that haven't been run on yet (for use with tardir).
-c, --clean
if you wish to do delete all non-essential files that were extracted.
-u, --call-bibupload, --yes-i-know
if you want to upload the plots, ask to call bibupload. appending
the --yes-i-know flag bypasses bibupload's prompt to upload
-l, --refno-url
Specify an URL to the invenio-instance to query for refno.
Defaults to CFG_SITE_URL.
-k, --skip-refno
allows you to skip any refno check
-r, --recid=
if you want to process the tarball of one recid, use this tag. it
will also accept ranges (i.e. --recid=13-20)
-a, --arXiv=
if you want to process the tarball of one arXiv id, use this tag.
-t, --tarball=
for processing one tarball.
-q, --squash
if you want to squash all MARC into a single MARC file (for easier
and faster bibuploading)
-h, --help
Print this help and exit.
description: extracts plots from a tarfile from arXiv and generates
MARCXML that links figures and their captions. converts all
images to PNG format.
"""
def usage():
    """
    Hands the module-level help text (help_string) to write_message.

    @return: None
    """
    help_text = help_string
    write_message(help_text)
# Call main() only when this file is executed as a script; importing the
# module must not start an extraction run.
if __name__ == "__main__":
    main()

Event Timeline