docextract_webinterface.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Nov 1, 00:10

docextract_webinterface.py
View Options

	# -- coding: utf-8 --
	##
	## This file is part of Invenio.
	## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""DocExtract REST and Web API

	Exposes document extraction facilities to the world
	"""

	from tempfile import NamedTemporaryFile

	from invenio.webinterface_handler import WebInterfaceDirectory
	from invenio.webuser import collect_user_info
	from invenio.webpage import page
	from invenio.config import CFG_TMPSHAREDDIR, CFG_ETCDIR
	from invenio.refextract_api import extract_references_from_file_xml, \
	extract_references_from_url_xml, \
	extract_references_from_string_xml
	from invenio.bibformat_engine import format_record


	def check_login(req):
	    """Look up the requesting user; login is not enforced yet.

	    The user information is collected from the request and the guest
	    case is detected, but nothing is done about it: the function always
	    returns None and never refuses a request.

	    Arguments:
	    req -- the request object handed to the web interface handler
	    """
	    email = collect_user_info(req)['email']
	    if email != 'guest':
	        return
	    # The user is a guest. The intended behaviour is to require a login
	    # before any upload ('Please login before uploading file.'), but
	    # that refusal is currently disabled, so guests fall through too.


	def check_url(url):
	    """Check that the url we received is not gibberish

	    A url is accepted when it starts with one of the supported schemes
	    (http, https or ftp). URL schemes are case-insensitive, so the
	    comparison ignores case ('HTTP://...' is as valid as 'http://...').

	    Arguments:
	    url -- the url string to validate

	    Returns True when the url starts with a supported scheme,
	    False otherwise (including for the empty string).
	    """
	    allowed_prefixes = ('http://', 'https://', 'ftp://')
	    # Lower-case before matching so that the scheme case does not matter
	    return url.lower().startswith(allowed_prefixes)


	def extract_from_pdf_string(pdf):
	    """Extract references from a pdf held in memory

	    The pdf content is written to a temporary file inside
	    CFG_TMPSHAREDDIR and the path of that file is handed to
	    extract_references_from_file_xml, whose result is returned.
	    A temporary file is needed because the extraction works on a
	    file on disk (pdf2text has to be run on it).

	    Arguments:
	    pdf -- the raw content of the pdf
	    """
	    # Leaving the with-block closes the temporary file, which also
	    # deletes it, whether or not the extraction succeeded.
	    with NamedTemporaryFile(prefix='docextract-pdf',
	                            dir=CFG_TMPSHAREDDIR) as pdf_file:
	        pdf_file.write(pdf)
	        # Make sure the content is on disk before it is read back
	        pdf_file.flush()
	        return extract_references_from_file_xml(pdf_file.name)


	def make_arxiv_url(arxiv_id):
	    """Build the url of the pdf of an arXiv record

	    Arguments:
	    arxiv_id -- the arxiv id of the record to link to

	    Returns the url as a string, made by inserting the id into the
	    arxiv.org pdf address.
	    """
	    url_template = "http://arxiv.org/pdf/%s.pdf"
	    return url_template % arxiv_id


	class WebInterfaceAPIDocExtract(WebInterfaceDirectory):
	    """DocExtract REST API

	    Each exported endpoint reads one field of the submitted form and
	    returns either the result of the matching extraction function or a
	    short plain-text message when the field is missing or invalid.
	    """
	    # Maps the public url component to the name of the handling method
	    _exports = [
	        ('extract-references-pdf', 'extract_references_pdf'),
	        ('extract-references-pdf-url', 'extract_references_pdf_url'),
	        ('extract-references-txt', 'extract_references_txt'),
	    ]

	    def extract_references_pdf(self, req, form):
	        """Extract references from the pdf uploaded in the 'pdf' field"""
	        check_login(req)

	        if 'pdf' in form:
	            uploaded = form['pdf']
	            return extract_from_pdf_string(uploaded.file.read())

	        return 'No PDF file uploaded'

	    def extract_references_pdf_url(self, req, form):
	        """Extract references from the pdf found at the 'url' field

	        The url is only fetched when check_url accepts it.
	        """
	        check_login(req)

	        if 'url' in form:
	            target = form['url'].value
	            if check_url(target):
	                return extract_references_from_url_xml(target)
	            return 'Invalid URL specified'

	        return 'No URL specified'

	    def extract_references_txt(self, req, form):
	        """Extract references from the plain text in the 'txt' field"""
	        check_login(req)

	        if 'txt' in form:
	            return extract_references_from_string_xml(form['txt'].value)

	        return 'No text specified'


	class WebInterfaceDocExtract(WebInterfaceDirectory):
	    """DocExtract web pages

	    Serves the interactive references extraction page, an example pdf
	    and exposes the REST API under the 'api' export.
	    """
	    _exports = ['api',
	                ('', 'extract'),
	                ('example.pdf', 'example_pdf'),
	                ]

	    # Handler object exported under the name 'api'
	    api = WebInterfaceAPIDocExtract()

	    def example_pdf(self, req, _form):
	        """Serve a test pdf for tests

	        Writes the content of docextract/example.pdf (looked up under
	        CFG_ETCDIR) to the request.
	        """
	        # The with-block guarantees the file is closed even if
	        # writing to the request fails
	        with open("%s/docextract/example.pdf" % CFG_ETCDIR, 'rb') as f:
	            req.write(f.read())

	    def extract_template(self):
	        """Template for reference extraction page

	        Returns the html of the upload form (pdf, arXiv id, url, text).
	        """
	        return """Please specify a pdf or a url or some references to parse

	<form action="" method="post"
	enctype="multipart/form-data">
	<p>PDF: <input type="file" name="pdf" /></p>
	<p>arXiv: <input type="text" name="arxiv" /></p>
	<p>URL: <input type="text" name="url" style="width: 600px;"/></p>
	<textarea name="txt" style="width: 500px; height: 500px;"></textarea>
	<p><input type="submit" /></p>
	</form>
	"""

	    def extract(self, req, form):
	        """References extraction page

	        This page can be used for authors to test their pdfs against our
	        references extraction process.

	        The first non-empty field among 'pdf', 'arxiv', 'url' and 'txt'
	        is used as the source of the references. When nothing usable was
	        submitted, the upload form is displayed instead; a url rejected
	        by check_url is reported above the form and is never fetched.

	        Arguments:
	        req -- the request object
	        form -- the submitted form fields

	        Returns the rendered page (including header and footer).
	        """
	        user_info = collect_user_info(req)
	        references_xml = None
	        error_message = None

	        # Handle the 4 POST parameters, first non-empty one wins
	        if 'pdf' in form and form['pdf'].value:
	            pdf = form['pdf'].value
	            references_xml = extract_from_pdf_string(pdf)
	        elif 'arxiv' in form and form['arxiv'].value:
	            url = make_arxiv_url(arxiv_id=form['arxiv'].value)
	            references_xml = extract_references_from_url_xml(url)
	        elif 'url' in form and form['url'].value:
	            url = form['url'].value
	            # Apply the same validation as the REST endpoint so that
	            # only urls accepted by check_url are ever fetched
	            if check_url(url):
	                references_xml = extract_references_from_url_xml(url)
	            else:
	                error_message = 'Invalid URL specified'
	        elif 'txt' in form and form['txt'].value:
	            txt = form['txt'].value.decode('utf-8', errors='ignore')
	            references_xml = extract_references_from_string_xml(txt)

	        # If we have not uploaded anything yet
	        # Display the form that allows us to do so
	        if not references_xml:
	            out = self.extract_template()
	            if error_message:
	                out = ('<p>%s</p>\n' % error_message) + out
	        else:
	            out = """
	<style type="text/css">
	#referenceinp_link { display: none; }
	</style>
	"""
	            out += format_record(0,
	                                 'hdref',
	                                 xml_record=references_xml,
	                                 user_info=user_info)

	        # Render the page (including header, footer)
	        return page(title='References Extractor',
	                    body=out,
	                    uid=user_info['uid'],
	                    req=req)

docextract_webinterface.pyNo OneTemporaryActions

File Metadata

docextract_webinterface.pyView Options

Event Timeline

docextract_webinterface.py
No OneTemporary
Actions

docextract_webinterface.py
View Options