Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90364444
docextract_webinterface.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Nov 1, 00:10
Size
6 KB
Mime Type
text/x-python
Expires
Sun, Nov 3, 00:10 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
22062661
Attached To
R3600 invenio-infoscience
docextract_webinterface.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""DocExtract REST and Web API
Exposes document extration facilities to the world
"""
from
tempfile
import
NamedTemporaryFile
from
invenio.webinterface_handler
import
WebInterfaceDirectory
from
invenio.webuser
import
collect_user_info
from
invenio.webpage
import
page
from
invenio.config
import
CFG_TMPSHAREDDIR
,
CFG_ETCDIR
from
invenio.refextract_api
import
extract_references_from_file_xml
,
\
extract_references_from_url_xml
,
\
extract_references_from_string_xml
from
invenio.bibformat_engine
import
format_record
def
check_login
(
req
):
"""Check that the user is logged in"""
user_info
=
collect_user_info
(
req
)
if
user_info
[
'email'
]
==
'guest'
:
# 1. User is guest: must login prior to upload
# return 'Please login before uploading file.'
pass
def
check_url
(
url
):
"""Check that the url we received is not gibberish"""
return
url
.
startswith
(
'http://'
)
or
\
url
.
startswith
(
'https://'
)
or
\
url
.
startswith
(
'ftp://'
)
def
extract_from_pdf_string
(
pdf
):
"""Extract references from a pdf stored in a string
Given a string representing a pdf, this function writes the string to
disk and passes it to refextract.
We need to create a temoporary file because we need to run pdf2text on it"""
# Save new record to file
tf
=
NamedTemporaryFile
(
prefix
=
'docextract-pdf'
,
dir
=
CFG_TMPSHAREDDIR
)
try
:
tf
.
write
(
pdf
)
tf
.
flush
()
refs
=
extract_references_from_file_xml
(
tf
.
name
)
finally
:
# Also deletes the file
tf
.
close
()
return
refs
def
make_arxiv_url
(
arxiv_id
):
"""Make a url we can use to download a pdf from arxiv
Arguments:
arxiv_id -- the arxiv id of the record to link to
"""
return
"http://arxiv.org/pdf/
%s
.pdf"
%
arxiv_id
class
WebInterfaceAPIDocExtract
(
WebInterfaceDirectory
):
"""DocExtract REST API"""
_exports
=
[
(
'extract-references-pdf'
,
'extract_references_pdf'
),
(
'extract-references-pdf-url'
,
'extract_references_pdf_url'
),
(
'extract-references-txt'
,
'extract_references_txt'
),
]
def
extract_references_pdf
(
self
,
req
,
form
):
"""Extract references from uploaded pdf"""
check_login
(
req
)
if
'pdf'
not
in
form
:
return
'No PDF file uploaded'
return
extract_from_pdf_string
(
form
[
'pdf'
]
.
file
.
read
())
def
extract_references_pdf_url
(
self
,
req
,
form
):
"""Extract references from the pdf pointed by the passed url"""
check_login
(
req
)
if
'url'
not
in
form
:
return
'No URL specified'
url
=
form
[
'url'
]
.
value
if
not
check_url
(
url
):
return
'Invalid URL specified'
return
extract_references_from_url_xml
(
url
)
def
extract_references_txt
(
self
,
req
,
form
):
"""Extract references from plain text"""
check_login
(
req
)
if
'txt'
not
in
form
:
return
'No text specified'
txt
=
form
[
'txt'
]
.
value
return
extract_references_from_string_xml
(
txt
)
class
WebInterfaceDocExtract
(
WebInterfaceDirectory
):
"""DocExtract API"""
_exports
=
[
'api'
,
(
''
,
'extract'
),
(
'example.pdf'
,
'example_pdf'
),
]
api
=
WebInterfaceAPIDocExtract
()
def
example_pdf
(
self
,
req
,
_form
):
"""Serve a test pdf for tests"""
f
=
open
(
"
%s
/docextract/example.pdf"
%
CFG_ETCDIR
,
'rb'
)
try
:
req
.
write
(
f
.
read
())
finally
:
f
.
close
()
def
extract_template
(
self
):
"""Template for reference extraction page"""
return
"""Please specify a pdf or a url or some references to parse
<form action="" method="post"
enctype="multipart/form-data">
<p>PDF: <input type="file" name="pdf" /></p>
<p>arXiv: <input type="text" name="arxiv" /></p>
<p>URL: <input type="text" name="url" style="width: 600px;"/></p>
<textarea name="txt" style="width: 500px; height: 500px;"></textarea>
<p><input type="submit" /></p>
</form>
"""
def
extract
(
self
,
req
,
form
):
"""Refrences extraction page
This page can be used for authors to test their pdfs against our
refrences extraction process"""
user_info
=
collect_user_info
(
req
)
# Handle the 3 POST parameters
if
'pdf'
in
form
and
form
[
'pdf'
]
.
value
:
pdf
=
form
[
'pdf'
]
.
value
references_xml
=
extract_from_pdf_string
(
pdf
)
elif
'arxiv'
in
form
and
form
[
'arxiv'
]
.
value
:
url
=
make_arxiv_url
(
arxiv_id
=
form
[
'arxiv'
]
.
value
)
references_xml
=
extract_references_from_url_xml
(
url
)
elif
'url'
in
form
and
form
[
'url'
]
.
value
:
url
=
form
[
'url'
]
.
value
references_xml
=
extract_references_from_url_xml
(
url
)
elif
'txt'
in
form
and
form
[
'txt'
]
.
value
:
txt
=
form
[
'txt'
]
.
value
.
decode
(
'utf-8'
,
errors
=
'ignore'
)
references_xml
=
extract_references_from_string_xml
(
txt
)
else
:
references_xml
=
None
# If we have not uploaded anything yet
# Display the form that allows us to do so
if
not
references_xml
:
out
=
self
.
extract_template
()
else
:
out
=
"""
<style type="text/css">
#referenceinp_link { display: none; }
</style>
"""
out
+=
format_record
(
0
,
'hdref'
,
xml_record
=
references_xml
,
user_info
=
user_info
)
# Render the page (including header, footer)
return
page
(
title
=
'References Extractor'
,
body
=
out
,
uid
=
user_info
[
'uid'
],
req
=
req
)
Event Timeline
Log In to Comment