plotextractor_output_utils.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, Jul 8, 22:38

plotextractor_output_utils.py
View Options

	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	import os
	import re
	import sys

	from invenio.config import CFG_TMPDIR
	from invenio.textutils import encode_for_xml
	from invenio.bibrecord import field_xml_output

	# Path of the placeholder image, located under CFG_TMPDIR.  create_MARC
	# uses it as the file ('a' subfield) for captions whose image is unknown.
	DUMMY_IMAGE_TMP = os.path.join(CFG_TMPDIR, 'plotextractor_dummy.png')

	def write_message(message):
	    """
	    Print a single message to standard output.

	    @param: message (string): the text to print
	    """
	    # The parenthesised single-argument form behaves the same under the
	    # Python 2 print statement and the Python 3 print function.
	    print(message)

	def write_messages(messages):
	    """
	    Print every message of a sequence, in order, by handing each one to
	    write_message.

	    @param: messages ([string, string, ...]): the messages to print
	    """
	    for text in messages:
	        write_message(text)



	def find_open_and_close_braces(line_index, start, brace, lines):
	    """
	    Starting at a given line and column, find the first opening brace of
	    the requested type and the closing brace that matches it.  Nested
	    braces are honoured and the pair may span several lines.

	    @param: line_index (int): the index of the line we want to start
	        searching at (must be a valid index into lines)
	    @param: start (int): the index in that line we want to start searching at
	    @param: brace (string): one of the brace characters we are looking for
	        ({, }, [, ], ( or )); either member of a pair selects that pair
	    @param: lines ([string, string, ...]): the array of lines in the file we
	        are looking in

	    @return: (start, start_line, end, end_line): (int, int, int, int): the
	        index and line of the opening brace, then the index and line of
	        its matching closing brace.  Special cases:
	        (-1, -1, -1, -1) if brace is not a supported brace character;
	        (0, line_index, 0, line_index) if no opening brace is found;
	        (start, start_line, start, start_line) if the opening brace is
	        never closed.
	    """
	    for open_brace, close_brace in (('[', ']'), ('{', '}'), ('(', ')')):
	        if brace in (open_brace, close_brace):
	            break
	    else:
	        # unacceptable brace type!
	        return (-1, -1, -1, -1)

	    # sometimes people don't put the braces on the same line as the tag,
	    # so keep looking on the following lines
	    ret_open_line = line_index
	    ret_open_index = lines[ret_open_line].find(open_brace, start)
	    while ret_open_index == -1:
	        ret_open_line = ret_open_line + 1
	        if ret_open_line >= len(lines):
	            # failed to find open braces...
	            return (0, line_index, 0, line_index)
	        ret_open_index = lines[ret_open_line].find(open_brace)

	    # Walk the text in reading order and track the nesting depth.  Looking
	    # at every brace in the order it appears (rather than one open and one
	    # close per step) keeps the count right when a line holds several
	    # closing braces before its next opening brace.
	    depth = 0
	    for current_line in range(ret_open_line, len(lines)):
	        line = lines[current_line]
	        if current_line == ret_open_line:
	            first_column = ret_open_index
	        else:
	            first_column = 0
	        for column in range(first_column, len(line)):
	            character = line[column]
	            if character == open_brace:
	                depth = depth + 1
	            elif character == close_brace:
	                depth = depth - 1
	                if depth == 0:
	                    return (ret_open_index, ret_open_line,
	                            column, current_line)

	    # hanging braces!
	    return (ret_open_index, ret_open_line, ret_open_index, ret_open_line)

	def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
	    """
	    Take information about the caption of a picture and put it all together
	    in a nice way.  If it spans multiple lines, put it on one line.  If it
	    contains a label tag, remove it.  Escape the characters that are not
	    allowed in MARCXML, and drop one pair of enclosing curly braces.

	    @param: begin_line (int): the index of the line where the caption begins
	    @param: begin_index (int): the index within the line where the caption
	        begins
	    @param: end_line (int): the index of the line where the caption ends
	    @param: end_index (int): the index within the line where the caption ends
	    @param: lines ([string, string, ...]): the line strings of the text

	    @return: caption (string): the caption, nicely formatted and pieced together
	    """

	    # stuff we don't like
	    label_head = '\\label{'

	    if end_line > begin_line:
	        # our caption spanned multiple lines: glue the pieces with spaces
	        pieces = [lines[begin_line][begin_index:]]
	        for included_line_index in range(begin_line + 1, end_line):
	            pieces.append(lines[included_line_index])
	        pieces.append(lines[end_line][:end_index])

	        caption = ' '.join(pieces)
	        caption = caption.replace('\n', ' ')
	        # a newline followed by the joining space leaves a doubled space
	        caption = caption.replace('  ', ' ')
	    else:
	        # it fit on one line
	        caption = lines[begin_line][begin_index:end_index]

	    # clean out a label tag, if there is one
	    label_begin = caption.find(label_head)
	    if label_begin > -1:
	        # the caption is a single string by now, so the label tag is
	        # searched for in a one-element list of lines
	        dummy_start, dummy_start_line, label_end, dummy_end = \
	            find_open_and_close_braces(0, label_begin, '{', [caption])
	        caption = caption[:label_begin] + caption[label_end + 1:]

	    # clean out characters not allowed in MARCXML
	    # not allowed: & < >
	    try:
	        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'),
	                                 wash=True)
	    except Exception:
	        # the encode step threw an error on astro-ph/0601014; report the
	        # caption and fall back to escaping the three characters by hand
	        sys.stderr.write(caption)
	        sys.stderr.write(' cannot be processed\n')
	        # '&' must be escaped first so the other entities stay intact
	        caption = caption.replace('&', '&amp;').replace('<', '&lt;')
	        caption = caption.replace('>', '&gt;')

	    caption = caption.strip()

	    if len(caption) > 1 and caption[0] == '{' and caption[-1] == '}':
	        caption = caption[1:-1]

	    return caption

	def prepare_image_data(extracted_image_data, tex_file, image_list):
	    """
	    Resolve the image names found in the TeX to files on disk and drop the
	    entries that cannot be used.

	    Entries whose image is 'ERROR' are skipped.  Entries with an empty image
	    name are kept unchanged.  Any other entry is kept only if
	    get_image_location finds a location for it and that location exists;
	    the image name is then replaced by the location.

	    @param: extracted_image_data ([(string, string, string), ...]): the
	        (image, caption, label) tuples, ordered
	    @param: tex_file (string): the location of the TeX (used for finding the
	        associated images; the TeX is assumed to be in the same directory
	        as the converted images)
	    @param: image_list ([string, string, ...]): a list of the converted
	        image file names
	    @return: ([(string, string, string), ...]): the cleaned list of
	        (image location, caption, label) tuples, in the original order
	    """
	    # NOTE(review): the indentation of this function was reconstructed; the
	    # empty-image branch is assumed to pair with the "image is not empty"
	    # test -- confirm against the upstream file.
	    tex_dir = os.path.split(tex_file)[0]
	    cleaned = []
	    for image, caption, label in extracted_image_data:
	        if image == 'ERROR':
	            continue
	        if image == '':
	            # caption without a known image: keep it as it is
	            cleaned.append((image, caption, label))
	            continue
	        location = get_image_location(image, tex_dir, image_list)
	        if location is not None and os.path.exists(location):
	            cleaned.append((location, caption, label))
	    return cleaned

	def remove_dups(extracted_image_data):
	    """
	    Collapse entries that refer to the same image into a single entry.

	    The distinct captions of an image are joined with ' : ' in order of
	    first appearance.  The label and contexts of the first entry for that
	    image are kept, and the relative order of the images is preserved.

	    @param: extracted_image_data ([(string, string, list, list),
	        (string, string, list, list),...]): the full list of images, captions,
	        labels and contexts extracted from this file

	    @return: extracted_image_data ([(string, string, list, list),
	        (string, string, list, list),...)]: the same list, but if there are
	        duplicate images contained in it, then their captions are condensed
	    """
	    captions_by_image = {}

	    # first pass: gather the distinct captions of every image
	    for image, caption, _label, _contexts in extracted_image_data:
	        seen_captions = captions_by_image.setdefault(image, [])
	        if caption not in seen_captions:
	            seen_captions.append(caption)

	    # second pass: emit each image once, at the position of its first entry
	    condensed = []
	    for image, _caption, label, contexts in extracted_image_data:
	        if image not in captions_by_image:
	            # this image has already been emitted
	            continue
	        joined_captions = ' : '.join(captions_by_image.pop(image))
	        condensed.append((image, joined_captions, label, contexts))

	    return condensed

	def create_contextfiles(extracted_image_data):
	    """
	    Save the contexts of each image to a file named <image>.context.

	    A file is written only for entries that have a non-empty image name and
	    at least one context.  Every context is followed by an empty line.
	    Nothing is returned.

	    @param: extracted_image_data ([(string, string, list, list), ...]):
	        a list of tuples of images matched to labels, captions and contexts
	        from this document.
	    """
	    for image, dummy2, dummy3, contexts in extracted_image_data:
	        if len(contexts) > 0 and image != "":
	            context_filepath = image + '.context'
	            # the with-block closes the file even if a write fails
	            with open(context_filepath, 'w') as context_file:
	                for context_line in contexts:
	                    context_file.write(context_line + '\n\n')

	def create_MARC(extracted_image_data, tarball, refno):
	"""
	Take the images and their captions and the name of the associated
	tarball and build a MARCXML record for them.

	@param: extracted_image_data ([(string, string, list, list), ...]):
	a list of tuples of images matched to labels, captions and contexts from
	this document.
	@param: tarball (string): the path of the tarball; it is only used to
	build the '<tarball>_plots/' prefix that is removed from the image
	locations when document names are derived
	@param: refno (string): the value for the record number field, or None;
	the field is only added when refno consists of digits

	@return: the MARCXML record as one string, its lines joined with
	newlines; it always starts with '<record>' and ends with '</record>'
	"""
	# NOTE(review): the indentation of this function was lost in this copy.
	# In particular, confirm against the upstream file whether `if contexts:`
	# and the last `index = index + 1` belong inside the preceding branches.
	# '<directory of tarball>/<name of tarball>_plots/'
	root_dir = os.path.dirname(tarball) + os.sep + os.path.basename(tarball) + \
	'_plots' + os.sep

	# For building result MARCXML
	marcxml = ['<record>']

	# Datafield := (subfields, ind1, ind2, controlfield)
	# Subfield := (code, value)

	#FIXME: Determine what to do without refno
	if refno and refno.isdigit():
	# controlfield 001 carries the record number
	field = (None, ' ', ' ', refno)
	marcxml.append(field_xml_output(field, '001'))

	# running counter that is written into the 'd' (description) subfields
	index = 0
	for (image_location, caption, dummy, contexts) in extracted_image_data:
	if image_location == '':
	# we don't know the image, but the captions are for separate things
	for cap in caption.split(' : '):
	# Add DUMMY-PLOT MARCXML per loose captions
	subfields = []
	subfields.append(('a', DUMMY_IMAGE_TMP))
	subfields.append(('t', "PlotMisc"))
	subfields.append(('d', "%05d %s" % (index, cap)))
	subfields.append(('n', "fig%05d" % (index,)))
	subfields.append(('o', "HIDDEN"))
	marcxml.append(field_xml_output((subfields, ' ', ' ', None), "FFT"))
	index = index + 1
	else:
	# Merge subfolder into docname, until root directory:
	# drop root_dir and the file extension, then turn '.' and '/' into '_'
	relative_image_path = image_location.replace(root_dir, '')
	docname = "_".join(relative_image_path.split('.')[:-1]).replace('/', '_')
	if len(caption) < 3:
	# caption shorter than three characters: hidden "PlotMisc" entry
	subfields = []
	subfields.append(('a', image_location))
	subfields.append(('t', "PlotMisc"))
	subfields.append(('d', "%05d %s" % (index, caption.replace(' : ', ''))))
	subfields.append(('n', docname))
	subfields.append(('o', "HIDDEN"))
	marcxml.append(field_xml_output((subfields, ' ', ' ', None), "FFT"))
	else:
	# Add PLOT MARCXML
	subfields = []
	subfields.append(('a', image_location))
	subfields.append(('t', "Plot"))
	subfields.append(('d', "%05d %s" % (index, caption.replace(' : ', ''))))
	subfields.append(('n', docname))
	marcxml.append(field_xml_output((subfields, ' ', ' ', None), "FFT"))
	if contexts:
	# Add CONTEXT MARCXML, pointing at '<image_location>.context'
	subfields = []
	subfields.append(('a', "%s.context" % (image_location,)))
	subfields.append(('t', "Plot"))
	subfields.append(('f', ".png;context"))
	subfields.append(('n', docname))
	subfields.append(('o', "HIDDEN"))
	marcxml.append(field_xml_output((subfields, ' ', ' ', None), "FFT"))
	index = index + 1
	marcxml.append('</record>')
	return '\n'.join(marcxml)

	def get_image_location(image, sdir, image_list, recurred=False):
	    """
	    Take a raw image name found in the TeX and a directory, and return the
	    location of the (possibly converted) image.

	    @param: image (string): the name of the raw image from the TeX
	    @param: sdir (string): the directory where everything was unzipped to
	    @param: image_list ([string, string, ...]): the list of images that
	        were extracted from the tarball and possibly converted; if None,
	        the listing of sdir is used instead
	    @param: recurred (bool): True when the function calls itself on a piece
	        of the original name; such calls do not split the name any further

	    @return: converted_image (string): the location of the (possibly
	        converted) image file, or None if it could not be found
	    """
	    if isinstance(image, list):
	        # image is a list, not good
	        return None

	    image = str(image).strip()

	    # drop a 'figure=' or 'file=' marker and an \includegraphics{ opener;
	    # str.replace returns a new string, so the result has to be kept
	    marker = re.search('(figure=|file=)', image)
	    if marker:
	        image = image.replace(marker.group(0), '')
	    image = image.replace('\\includegraphics{', '')

	    image = image.strip()

	    # a backslash, a run of word characters and a space
	    some_kind_of_tag = '\\\\\\w+ '

	    if image.startswith('./'):
	        image = image[2:]
	    if re.match(some_kind_of_tag, image):
	        image = image[len(image.split(' ')[0]) + 1:]
	    if image.startswith('='):
	        image = image[1:]

	    if len(image) <= 1:
	        # nothing usable is left of the name
	        return None

	    image = image.strip()

	    image_path = os.path.join(sdir, image)
	    converted_image_should_be = get_converted_image_name(image_path)
	    converted_file_name = os.path.split(converted_image_should_be)[-1]

	    if image_list is None:
	        image_list = os.listdir(sdir)

	    for png_image in image_list:
	        if converted_image_should_be == png_image:
	            return png_image

	    # maybe it's in a subfolder with a usual name (TeX just understands
	    # that).  The listings go into a local name so that image_list is still
	    # the caller's list when it is handed to the recursive calls below.
	    # NOTE(review): as in the original, the full expected path is compared
	    # with the bare names from os.listdir -- confirm that this is intended.
	    for subfolder in ('eps', 'fig', 'figs', 'Figures', 'Figs'):
	        subfolder_path = os.path.join(sdir, subfolder)
	        if os.path.isdir(subfolder_path):
	            for png_image in os.listdir(subfolder_path):
	                if converted_image_should_be == png_image:
	                    return os.path.join(subfolder, png_image)

	    # maybe it is actually just loose.
	    for entry in os.listdir(sdir):
	        if converted_file_name == entry:
	            return converted_image_should_be
	        entry_path = os.path.join(sdir, entry)
	        if os.path.isdir(entry_path):
	            # try that, too!  we just do two levels, because that's all
	            # that's reasonable..
	            for sub_dir_file in os.listdir(entry_path):
	                if converted_file_name == sub_dir_file:
	                    return converted_image_should_be

	    # maybe it's actually up a directory or two: this happens in nested
	    # tarballs where the TeX is stored in a different directory from the
	    # images
	    one_dir_up = os.path.split(sdir)[0]
	    two_dirs_up = os.path.split(one_dir_up)[0]
	    for upper_dir in (one_dir_up, two_dirs_up):
	        for png_image in os.listdir(upper_dir):
	            if converted_file_name == png_image:
	                return converted_image_should_be

	    if recurred:
	        return None

	    # agh, this calls for drastic measures: try every piece of the name,
	    # split on spaces, then on commas, then on equals signs
	    for separator in (' ', ',', '='):
	        for piece in image.split(separator):
	            res = get_image_location(piece, sdir, image_list, recurred=True)
	            if res is not None:
	                return res

	    #write_message('Unknown image ' + image)
	    return None

	def get_converted_image_name(image):
	    """
	    Give the name the image has after conversion to png format: the last
	    extension of the file name is replaced by '.png'.  A path that already
	    ends in '.png' is returned unchanged.

	    @param: image (string): The fullpath of the image before conversion

	    @return: converted_image (string): the fullpath of the image after convert
	    """
	    if image.endswith('.png'):
	        # it already ends in png!  we're golden
	        return image

	    folder, file_name = os.path.split(image)

	    if '.' in file_name:
	        # keep everything in front of the last dot
	        stem = file_name.rsplit('.', 1)[0]
	    else:
	        # no extension at all
	        stem = file_name

	    return os.path.join(folder, stem + '.png')

	def get_tex_location(new_tex_name, current_tex_name, recurred=False):
	    """
	    Try to match the name of a TeX file to an actual file on disk.

	    The name is cleaned of a leading 'input', a leading TeX tag and a
	    leading './'.  The file is then looked for in the directory of the
	    current TeX file, in the named subfolder of that directory, and in the
	    named subfolder of the directories one and two levels higher.  If
	    nothing is found, the search is repeated once with '.tex' appended.

	    @param: new_tex_name (string): the name of the TeX file to find
	    @param: current_tex_name (string): the location of the TeX file where we
	        found the reference
	    @param: recurred (bool): True for the second attempt with '.tex'
	        appended; no further attempt is made then

	    @return: tex_location (string): the location of the other TeX file on
	        disk or None if it is not found
	    """
	    here = os.path.split(current_tex_name)[0]

	    name = new_tex_name.strip()
	    if name.startswith('input'):
	        name = name[len('input'):]
	    # a backslash, a run of word characters and a space
	    if re.match('\\\\\\w+ ', name):
	        name = name[len(name.split(' ')[0]) + 1:]
	    if name.startswith('./'):
	        name = name[2:]
	    if not name:
	        # the name has been stripped down to nothing
	        return None
	    name = name.strip()

	    folder, file_name = os.path.split(name)
	    if folder == file_name:
	        folder = ''

	    # could be in the current directory
	    if file_name in os.listdir(here):
	        return os.path.join(here, file_name)

	    # could be in a subfolder of the current directory, or in a subfolder
	    # of one of the two directories above it
	    one_up = os.path.split(here)[0]
	    two_up = os.path.split(one_up)[0]
	    for base_dir in (here, one_up, two_up):
	        candidate_dir = os.path.join(base_dir, folder)
	        if os.path.isdir(candidate_dir):
	            if file_name in os.listdir(candidate_dir):
	                return os.path.join(candidate_dir, file_name)

	    if recurred:
	        return None

	    # last resort: the reference may have left out the extension
	    return get_tex_location(name + '.tex', current_tex_name, recurred=True)

plotextractor_output_utils.pyNo OneTemporaryActions

File Metadata

plotextractor_output_utils.pyView Options

Event Timeline

plotextractor_output_utils.py
No OneTemporary
Actions

plotextractor_output_utils.py
View Options