webinterface.py (R3600 invenio-infoscience)

# This file is part of Invenio.
# Copyright (C) 2008, 2009, 2010, 2011, 2013, 2014 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibClassify's web interface.
This module is NOT standalone-safe: it is never expected to run in
standalone mode, but always inside Invenio.
"""
import os

import six

from cgi import escape

from invenio.base.i18n import gettext_set_language
from invenio.legacy.bibdocfile.api import BibRecDocs
from invenio.legacy.search_engine import get_record
from invenio.legacy.template import load
from invenio.ext.legacy.handler import wash_urlargd
import invenio.modules.access.engine as acce
from invenio.legacy.bibsched import bibtask
from invenio.legacy.bibupload.engine import open_marc_file, xml_marc_to_records
from invenio.legacy import bibrecord, dbquery
from invenio.legacy.bibclassify.engine import get_tmp_file, build_marc, \
    _parse_marc_code
from invenio.legacy.bibclassify import (config as bconfig,
                                        ontology_reader as bor)

log = bconfig.get_logger("bibclassify.webinterface")
template = load('bibclassify')


def main_page(req, recid, tabs, ln, template):
    """Generate the main page for the keyword tab.

    Url style: http://url/record/[recid]/keywords
    :param req: request object
    :param recid: int docid
    :param tabs: list of tab links
    :param ln: language id
    :param template: template object
    :return: nothing, writes using req object
    """
    form = req.form
    argd = wash_urlargd(form, {
        'generate': (str, 'no'),
        'sorting': (str, 'occurences'),
        'type': (str, 'tagcloud'),
        'numbering': (str, 'off'),
        'showall': (str, 'off'),
    })
    for k, v in argd.items():
        argd[k] = escape(v)

    req.write(template.detailed_record_container_top(recid, tabs, ln))

    # Get the keywords from MARC (if any)
    success, keywords, marcrec = record_get_keywords(recid)

    if success:
        # check for the cached file and delete it
        # (we don't need it anymore, data are in the DB)
        tmp_file = get_tmp_file(recid)
        if os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except Exception as msg:
                log.error('Error removing the cached file: %s' % tmp_file)
                log.error(msg)
    else:
        # Give user possibility to generate them ONLY if not available already
        # we may have some keywords, but they are the old ones and we want to
        # generate new
        new_found, new_keywords, marcrec = generate_keywords(req, recid, argd)
        if keywords and new_keywords:
            for key in keywords.keys():
                if key in new_keywords:
                    log.warning('The old "DESY" keyword will be overwritten '
                                'by the newly extracted one: %s' % key)
        keywords.update(new_keywords)

    if keywords:
        # Output the keywords or the generate button or some message
        # why kw not available
        write_keywords_body(keywords, req, recid, argd, marcrec=marcrec)

    req.write(template.detailed_record_container_bottom(recid, tabs, ln))


def write_keywords_body(keywords, req, recid, argd, marcrec=None):
    """Write the bibclassify keyword output into req object."""
    if not keywords:
        req.write(template.tmpl_page_no_keywords(req=req, **argd))
        return

    # test if more than half of the entries have weight (0,0) - ie. not weighted
    #if argd['type'] == 'tagcloud' and len(filter(lambda x: (0,0) in x[0], keywords.values())) > (len(keywords) * .5):
    #    argd['type'] = 'list'

    if argd['type'] == 'list':
        # Display keywords as a list.
        req.write(template.tmpl_page_list(keywords, req=req, **argd))
    elif argd['type'] == 'tagcloud':
        # Display keywords as a tag cloud.
        req.write(template.tmpl_page_tagcloud(keywords=keywords, req=req, **argd))
    elif argd['type'] == 'xml':
        if marcrec:
            marcxml = filter_marcrec(marcrec)
        else:
            marcxml = build_marc(recid, keywords, {})
        req.write(template.tmpl_page_xml_output(keywords, marcxml,
                                                req=req, **argd))
    else:
        _ = gettext_set_language(argd['ln'])
        req.write(template.tmpl_page(top=_('Unknown type: %(x_type)s',
                                           x_type=argd['type']), **argd))


def record_get_keywords(record, main_field=bconfig.CFG_MAIN_FIELD,
                        others=bconfig.CFG_OTHER_FIELDS):
    """Return a dictionary of keywordToken objects from the marc record.

    Weight is set to (0,0) if no weight can be found.
    This will load keywords from the field 653 and 695__a (which are the
    old 'DESY' keywords).
    :param record: int or marc record, if int - marc record is loaded
        from the database. If you pass record instance, keywords are
        extracted from it
    :return: tuple (found, keywords, marcrec)
        found - int indicating how many main_field keywords were found
            (the other fields are not counted)
        keywords - standard dictionary of keywordToken objects
        marcrec - marc record object loaded with data
    """
    keywords = {}

    if isinstance(main_field, six.string_types):
        main_field = [main_field]
    if isinstance(others, six.string_types):
        others = [others]

    if isinstance(record, int):
        rec = get_record(record)
    else:
        rec = record

    found = 0
    for m_field in main_field:
        tag, ind1, ind2 = _parse_marc_code(m_field)
        for field in rec.get(tag, []):
            keyword = ''
            weight = 0
            type = ''
            for subfield in field[0]:
                if subfield[0] == 'a':
                    keyword = subfield[1]
                elif subfield[0] == 'n':
                    weight = int(subfield[1])
                elif subfield[0] == '9':
                    type = subfield[1]
            if keyword:
                found += 1
                keywords[bor.KeywordToken(keyword, type=type)] = \
                    [[(0, 0) for x in range(weight)]]

    if others:
        for field_no in others:
            tag, ind1, ind2 = _parse_marc_code(field_no)
            type = 'f%s' % field_no
            for field in rec.get(tag, []):
                keyword = ''
                for subfield in field[0]:
                    if subfield[0] == 'a':
                        keyword = subfield[1]
                        keywords[bor.KeywordToken(keyword, type=type)] = [[(0, 0)]]
                        break

    return found, keywords, rec
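
# A minimal usage sketch for record_get_keywords (illustrative only: the
# recid 1234 is a made-up example value).  The returned mapping is keyed by
# ontology_reader.KeywordToken objects, and each value is a list of (0, 0)
# placeholder weights as built in the loops above:
#
#     found, keywords, rec = record_get_keywords(1234)
#     for token, weights in keywords.items():
#         ...  # token is a KeywordToken, weights e.g. [[(0, 0), (0, 0)]]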


def generate_keywords(req, recid, argd):
    """Extract keywords from the fulltexts.

    Do the extraction on the record with a recid equal to the parameter.
    It first checks whether the keywords are not already
    stored in the temp file (maybe from the previous run).
    :param req: req object.
    :param recid: record id.
    :param argd: arguments passed from web.
    :keyword store_keywords: boolean, whether to save records in the file.
    :return: standard dictionary of kw objects or {}.
    """
    ln = argd['ln']
    _ = gettext_set_language(ln)
    keywords = {}

    # check the files were not already generated
    abs_path = get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            # Try to load the data from the tmp file
            recs = xml_marc_to_records(open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except:
            pass

    # check it is allowed (for this user) to generate pages
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.info('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # register generation
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if bibdocfiles:
        # User arrived at a page, but no keywords are available
        inprogress, msg = _doc_already_submitted(recid)
        if argd['generate'] != 'yes':
            # Display a form and give them possibility to generate keywords
            if inprogress:
                req.write(template.tmpl_page_msg(
                    msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                req.write(template.tmpl_page_generate_keywords(req=req, **argd))
            return 0, keywords, None
        else:
            # after user clicked on "generate" button
            if inprogress:
                req.write(template.tmpl_page_msg(
                    msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                schedule_extraction(recid,
                                    taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
                req.write(template.tmpl_page_msg(
                    msg='<div class="warningbox">%s</div>' %
                    _('We have registered your request, the automated '
                      'keyword extraction will run after some time. '
                      'Please return back in a while.')))
    else:
        req.write(template.tmpl_page_msg(
            msg='<div class="warningbox">%s</div>' %
            _("Unfortunately, we don't have a PDF fulltext for this record "
              "in the storage, so keywords cannot be generated using an "
              "automated process.")))

    return 0, keywords, None


def upload_keywords(filename, mode='correct', recids=None):
    """Store the extracted keywords in the database.

    :param filename: fullpath to the file with marc record.
    :keyword mode: correct|replace|add|delete
        use correct to add fields if they are different
        replace all fields with fields from the file
        add - add (even duplicate) fields
        delete - delete fields which are inside the file.
    :keyword recids: list of record ids, this arg comes from
        the bibclassify daemon and it is used when the recids
        contains one entry (recid) - ie. one individual document
        was processed. We use it to mark the job title so that
        it is possible to query database if the bibclassify
        was run over that document (in case of collections with
        many recids, we simply construct a general title).
    """
    if mode == 'correct':
        m = '-c'
    elif mode == 'replace':
        m = '-r'
    elif mode == 'add':
        m = '-a'
    elif mode == 'delete':
        m = '-d'
    else:
        raise Exception('Unknown mode')

    # let's use the user column to store the information,
    # cause no better alternative in sight...
    user_title = 'bibclassify.upload'
    if recids and len(recids) == 1:
        user_title = 'extract:%d' % recids[0]

    bibtask.task_low_level_submission('bibupload', user_title, '-n', m, filename)
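
# A minimal usage sketch for upload_keywords (illustrative only: the file
# path and recid below are made-up example values).  With mode='correct' and
# a single recid, this submits a bibupload task marked 'extract:1234' and
# carrying the '-n' and '-c' arguments, as built above:
#
#     upload_keywords('/tmp/bibclassify_1234.xml', mode='correct',
#                     recids=[1234])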


def schedule_extraction(recid, taxonomy):
    """Schedule a bibclassify extraction task for the given record."""
    bibtask.task_low_level_submission('bibclassify', 'extract:%s' % recid,
                                      '-k', taxonomy,
                                      '-i', '%s' % recid)
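
# Illustrative sketch (the recid 1234 is a made-up example value): this
# submits a bibsched 'bibclassify' task marked 'extract:1234', carrying the
# '-k <taxonomy>' and '-i 1234' arguments shown above:
#
#     schedule_extraction(1234, taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)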


def _doc_already_submitted(recid):
    """Check whether extraction or upload of this record is already scheduled."""
    # check extraction was already registered
    sql = ("SELECT COUNT(proc) FROM schTASK WHERE proc='bibclassify' "
           "AND user=%s AND (status='WAITING' OR status='RUNNING')")
    if dbquery.run_sql(sql, ("extract:" + str(recid),))[0][0] > 0:
        return (True, "The automated keyword extraction for this document "
                      "has already been scheduled. Please return back in a while.")

    # check the upload is inside the scheduled tasks
    sql = ("SELECT COUNT(proc) FROM schTASK WHERE proc='bibupload' "
           "AND user=%s AND (status='WAITING' OR status='RUNNING')")
    if dbquery.run_sql(sql, ("extract:" + str(recid),))[0][0] > 0:
        return (True, 'The document was already processed, '
                      'it will take a while for it to be ingested.')

    # or the task was run and is already archived
    sql = "SELECT COUNT(proc) FROM hstTASK WHERE proc='bibupload' AND user=%s"
    if dbquery.run_sql(sql, ("extract:" + str(recid),))[0][0] > 0:
        return (True, 'The document was already processed, '
                      'at this moment, the automated extraction is not available.')

    # or the task was already run
    sql = ("SELECT COUNT(proc) FROM schTASK WHERE proc='bibclassify' "
           "AND user=%s AND (status='DONE')")
    if dbquery.run_sql(sql, ("extract:" + str(recid),))[0][0] > 0:
        return (True, 'The document was already processed, '
                      'but automated extraction identified no suitable keywords.')

    # or the extraction is in error state
    sql = ("SELECT COUNT(proc) FROM schTASK WHERE proc='bibclassify' "
           "AND user=%s AND (status='ERROR')")
    if dbquery.run_sql(sql, ("extract:" + str(recid),))[0][0] > 0:
        return (True, "The document was already scheduled, but an error "
                      "happened. This requires an administrator's "
                      "intervention. Unfortunately, for the moment we "
                      "cannot display any data.")

    return (False, None)


def filter_marcrec(marcrec, main_field=bconfig.CFG_MAIN_FIELD,
                   others=bconfig.CFG_OTHER_FIELDS):
    """Remove the unwanted fields and return XML."""
    if isinstance(main_field, six.string_types):
        main_field = [main_field]
    if isinstance(others, six.string_types):
        others = [others]
    key_map = ['001']

    for field in main_field + others:
        tag, ind1, ind2 = _parse_marc_code(field)
        key_map.append(tag)

    return bibrecord.print_rec(marcrec, 1, tags=key_map)
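
# A minimal usage sketch (illustrative only: the recid 1234 is a made-up
# example value; rec is a record structure such as the one returned by
# record_get_keywords above).  The result is MARCXML restricted to field 001
# plus the configured keyword tags:
#
#     found, keywords, rec = record_get_keywords(1234)
#     marcxml = filter_marcrec(rec)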