Page MenuHomec4science
No OneTemporary

File Metadata

Mon, Dec 2, 19:28

# This file is part of Invenio.
# Copyright (C) 2008, 2009, 2010, 2011, 2013, 2014 CERN.
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibClassify's web interface.

This module is NOT standalone safe - this component is never expected
to run in a standalone mode, but always inside invenio.
"""
import os
import six
from cgi import escape
from invenio.base.i18n import gettext_set_language
from invenio.legacy.bibdocfile.api import BibRecDocs
from invenio.legacy.search_engine import get_record
from invenio.legacy.template import load
from invenio.ext.legacy.handler import wash_urlargd
import invenio.modules.access.engine as acce
from invenio.legacy.bibsched import bibtask
from invenio.legacy.bibupload.engine import open_marc_file, xml_marc_to_records
from invenio.legacy import bibrecord, dbquery
from invenio.legacy.bibclassify.engine import get_tmp_file, build_marc, _parse_marc_code
from invenio.legacy.bibclassify import (config as bconfig,
ontology_reader as bor)
# Module-level logger, obtained through the bibclassify configuration helper.
log = bconfig.get_logger("bibclassify.webinterface")
# Template object returned by load('bibclassify'); the functions in this
# module call its tmpl_* methods to render their output.
template = load('bibclassify')
def main_page(req, recid, tabs, ln, template):
    """Generate the main page for the keyword tab.

    Url style : http://url/record/[recid]/keywords

    Keywords already stored in the MARC record are shown directly; when the
    record has no main-field keywords, ``generate_keywords`` is asked for
    cached/new ones and the result is merged into what was found.

    NOTE(review): this block arrived with several lines missing (the
    ``try``/``os.remove`` pair, the ``else:`` branch, the
    ``keywords.update`` merge and the head of the closing ``req.write``
    call). They were reconstructed from the surrounding comments and log
    messages - confirm against the upstream file.

    :param req: request object
    :param recid: int docid
    :param tabs: list of tab links
    :param ln: language id
    :param template: template object (shadows the module-level ``template``)
    :return: nothing, writes using req object
    """
    argd = wash_urlargd(req.form, {
        'generate': (str, 'no'),
        'sorting': (str, 'occurences'),
        'type': (str, 'tagcloud'),
        'numbering': (str, 'off'),
        'showall': (str, 'off'),
    })

    # Escape every washed value before it is handed on to the templates.
    for k, v in argd.items():
        argd[k] = escape(v)

    req.write(template.detailed_record_container_top(recid, tabs, ln))

    # Get the keywords from MARC (if any)
    success, keywords, marcrec = record_get_keywords(recid)

    if success:
        # The keywords are in the DB, so the cached file is no longer needed.
        tmp_file = get_tmp_file(recid)
        if os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except OSError as err:
                # Best effort only: a stale cache file must not break the page.
                log.error('Error removing the cached file: %s' % tmp_file)
                log.error(err)
    else:
        # Give user possibility to generate them ONLY if not available
        # already; we may have some keywords, but they are the old ones and
        # we want to generate new ones.
        _new_found, new_keywords, marcrec = generate_keywords(req, recid, argd)
        if keywords and new_keywords:
            for key in keywords:
                if key in new_keywords:
                    log.warning('The old "DESY" keyword will be overwritten by the newly extracted one: %s' % key)
        keywords.update(new_keywords)

    if keywords:
        # Output the keywords (generate_keywords has already written the
        # generate button or an explanatory message when none are available).
        write_keywords_body(keywords, req, recid, argd, marcrec=marcrec)

    req.write(template.detailed_record_container_bottom(recid,
                                                        tabs, ln))
def write_keywords_body(keywords, req, recid, argd, marcrec=None):
    """Write the bibclassify keyword output into req object.

    The output format is selected by ``argd['type']``: ``'list'``,
    ``'tagcloud'`` or ``'xml'``. Any other value produces an
    "Unknown type" page.

    NOTE(review): the source lost the ``return`` after the no-keywords page,
    the ``else:`` lines and the head of the XML output call. The template
    method name ``tmpl_page_xml_output`` and its first two arguments are
    reconstructed, not visible in the source - confirm against the
    bibclassify templates.

    :param keywords: dictionary of keyword objects to display
    :param req: request object the page is written to
    :param recid: record id, used to build MARC when ``marcrec`` is missing
    :param argd: washed url arguments, passed on to the templates
    :param marcrec: optional marc record the keywords were read from
    :return: nothing, writes using req object
    """
    if not keywords:
        req.write(template.tmpl_page_no_keywords(req=req, **argd))
        return

    output_type = argd['type']

    if output_type == 'list':
        # Display keywords as a list.
        req.write(template.tmpl_page_list(keywords, req=req, **argd))
    elif output_type == 'tagcloud':
        # Display keywords as a tag cloud.
        req.write(template.tmpl_page_tagcloud(keywords=keywords, req=req, **argd))
    elif output_type == 'xml':
        # Prefer the already loaded record; otherwise build MARC from the
        # keywords themselves.
        if marcrec:
            marcxml = filter_marcrec(marcrec)
        else:
            marcxml = build_marc(recid, keywords, {})
        req.write(template.tmpl_page_xml_output(keywords,
                                                marcxml,
                                                req=req, **argd))
    else:
        _ = gettext_set_language(argd['ln'])
        req.write(template.tmpl_page(top=_('Unknown type: %(x_type)s', x_type=output_type), **argd))
# NOTE(review): the second line of this signature was missing from the
# source; ``others`` is used in the body, and its default is assumed to be
# ``bconfig.CFG_OTHER_FIELDS`` - confirm against the bibclassify config.
def record_get_keywords(record, main_field=bconfig.CFG_MAIN_FIELD,
                        others=bconfig.CFG_OTHER_FIELDS):
    """Return a dictionary of keywordToken objects from the marc record.

    For ``main_field`` entries the stored value is a one-element list
    holding ``(0, 0)`` repeated ``weight`` times, where ``weight`` comes from
    subfield ``n`` (0 when absent). Keywords from the ``others`` fields
    always get ``[[(0, 0)]]``.

    This will load keywords from the field 653 and 695__a (which are the
    old 'DESY' keywords)

    :param record: int or marc record, if int - marc record is loaded
        from the database. If you pass record instance, keywords are
        extracted from it
    :param main_field: marc code (or list of codes) of the main keyword field
    :param others: marc code (or list of codes) of additional keyword fields
    :return: tuple (found, keywords, marcrec)
        found - int indicating how many main_field keywords were found
            the other fields are not counted
        keywords - standard dictionary of keywordToken objects
        marcrec - marc record object loaded with data
    """
    keywords = {}

    # Accept a single marc code as well as a list of them.
    if isinstance(main_field, six.string_types):
        main_field = [main_field]
    if isinstance(others, six.string_types):
        others = [others]

    if isinstance(record, int):
        rec = get_record(record)
    else:
        rec = record

    found = 0
    for m_field in main_field:
        tag, ind1, ind2 = _parse_marc_code(m_field)
        for field in rec.get(tag, []):
            keyword = ''
            weight = 0
            kw_type = ''
            # field[0] holds the (code, value) subfield pairs.
            for subfield in field[0]:
                if subfield[0] == 'a':
                    keyword = subfield[1]
                elif subfield[0] == 'n':
                    weight = int(subfield[1])
                elif subfield[0] == '9':
                    kw_type = subfield[1]
            if keyword:
                found += 1
                keywords[bor.KeywordToken(keyword, type=kw_type)] = [[(0, 0) for x in range(weight)]]

    if others:
        for field_no in others:
            tag, ind1, ind2 = _parse_marc_code(field_no)
            kw_type = 'f%s' % field_no
            for field in rec.get(tag, []):
                for subfield in field[0]:
                    if subfield[0] == 'a':
                        # NOTE(review): nesting reconstructed; the keyword is
                        # stored only when an 'a' subfield is present.
                        keyword = subfield[1]
                        keywords[bor.KeywordToken(keyword, type=kw_type)] = [[(0, 0)]]

    return found, keywords, rec
def generate_keywords(req, recid, argd):
    """Extract keywords from the fulltexts.

    Do the extraction on the record with a recid equal to the parameter.
    It first checks whether the keywords are not already stored in the
    temp file (maybe from the previous run). Otherwise, if the user is
    authorized and the record has files, it either shows the "generate"
    form or schedules the extraction, writing a status message to ``req``.

    NOTE(review): the source lost its ``else:`` lines and fused the
    access-denied log call into the ``if`` line. The branch layout, the
    logging level and the ``req.write`` of the access-denied message are
    reconstructed - confirm against the upstream file.

    :param req: req object.
    :param recid: record id.
    :param argd: arguments passed from web.
    :return: tuple (found, keywords, marcrec) as returned by
        ``record_get_keywords`` when a cached file could be loaded,
        otherwise ``(0, {}, None)``.
    """
    ln = argd['ln']
    _ = gettext_set_language(ln)
    keywords = {}

    # check the files were not already generated
    abs_path = get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            # Try to load the data from the tmp file
            recs = xml_marc_to_records(open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except Exception as err:
            # An unreadable or empty cache file must not break the page;
            # fall through to the normal generation path instead.
            log.error('Error loading the cached file: %s' % abs_path)
            log.error(err)

    # check it is allowed (for this user) to generate pages
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.warning('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # register generation
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if not bibdocfiles:
        req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' %
                                             _("Unfortunately, we don't have a PDF fulltext for this record in the storage, "
                                               "keywords cannot be generated using an automated process.")))
        return 0, keywords, None

    # User arrived at a page, but no keywords are available
    inprogress, msg = _doc_already_submitted(recid)
    if inprogress:
        # Something is already scheduled/processed: only report its state.
        req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' % _(msg)))
    elif argd['generate'] != 'yes':
        # Display a form and give them possibility to generate keywords
        req.write(template.tmpl_page_generate_keywords(req=req, **argd))
    else:
        # after user clicked on "generate" button
        schedule_extraction(recid, taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
        # The original literal pieces joined as "automatedkeyword"; a space
        # was added between them.
        req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' %
                                             _('We have registered your request, the automated '
                                               'keyword extraction will run after some time. Please return back in a while.')))

    return 0, keywords, None
def upload_keywords(filename, mode='correct', recids=None):
    """Store the extracted keywords in the database.

    Submits a ``bibupload`` task for ``filename`` with the flag that
    corresponds to ``mode``.

    NOTE(review): the head of the submission call was missing from the
    source; ``bibtask.task_low_level_submission('bibupload', ...)`` is
    reconstructed from the trailing arguments and from the schTASK queries
    in ``_doc_already_submitted`` - confirm against the upstream file.

    :param filename: fullpath to the file with marc record.
    :keyword mode: correct|replace|add|delete
        use correct to add fields if they are different
        replace all fields with fields from the file
        add - add (even duplicate) fields
        delete - delete fields which are inside the file.
    :keyword recids: list of record ids, this arg comes from
        the bibclassify daemon and it is used when the recids
        contains one entry (recid) - ie. one individual document
        was processed. We use it to mark the job title so that
        it is possible to query database if the bibclassify
        was run over that document (in case of collections with
        many recids, we simply construct a general title).
    :raise ValueError: if ``mode`` is not one of the four known modes
        (``ValueError`` is an ``Exception``, so existing handlers still work).
    """
    mode_flags = {
        'correct': '-c',
        'replace': '-r',
        'add': '-a',
        'delete': '-d',
    }
    try:
        m = mode_flags[mode]
    except (KeyError, TypeError):
        raise ValueError('Unknown mode: %r' % (mode,))

    # let's use the user column to store the information, cause no better
    # alternative in sight...
    user_title = 'bibclassify.upload'
    if recids and len(recids) == 1:
        user_title = 'extract:%d' % recids[0]
    bibtask.task_low_level_submission('bibupload',
                                      user_title, '-n', m, filename)
def schedule_extraction(recid, taxonomy):
    """Schedule a bibclassify keyword extraction for one record.

    The task is registered under the user title ``extract:<recid>``, which
    is the value ``_doc_already_submitted`` looks for in the task tables.

    NOTE(review): the head of the call was missing from the source;
    ``bibtask.task_low_level_submission('bibclassify', ...)`` is
    reconstructed from the remaining arguments - confirm upstream.

    :param recid: record id to extract keywords for.
    :param taxonomy: taxonomy name passed to the task with ``-k``.
    """
    bibtask.task_low_level_submission('bibclassify',
                                      'extract:%s' % recid, '-k', taxonomy,
                                      '-i', '%s' % recid)
def _doc_already_submitted(recid):
    """Tell whether keyword extraction was already requested for a record.

    Looks in the ``schTASK`` and ``hstTASK`` tables for bibclassify and
    bibupload tasks whose ``user`` column equals ``extract:<recid>``.

    :param recid: record id.
    :return: tuple (in_progress, message); ``(False, None)`` when no
        matching task exists, otherwise ``True`` and an untranslated
        English message describing the state.
    """
    user_title = "extract:" + str(recid)

    def _has_task(sql):
        # True when at least one task matches the given COUNT query.
        return dbquery.run_sql(sql, (user_title,))[0][0] > 0

    # The SQL is split with implicit string concatenation (not a backslash
    # continuation) so that the space before AND cannot be lost.

    # check extraction was already registered
    if _has_task("SELECT COUNT(proc) FROM schTASK WHERE proc='bibclassify' AND user=%s "
                 "AND (status='WAITING' OR status='RUNNING')"):
        return (True, "The automated keyword extraction "
                      "for this document has been already scheduled. Please return back in a while.")

    # check the upload is inside the scheduled tasks
    if _has_task("SELECT COUNT(proc) FROM schTASK WHERE proc='bibupload' AND user=%s "
                 "AND (status='WAITING' OR status='RUNNING')"):
        return (True, 'The document was already processed, '
                      'it will take a while for it to be ingested.')

    # or the task was run and is already archived
    if _has_task("SELECT COUNT(proc) FROM hstTASK WHERE proc='bibupload' AND user=%s"):
        return (True, 'The document was already processed, '
                      'at this moment, the automated extraction is not available.')

    # or the task was already ran
    if _has_task("SELECT COUNT(proc) FROM schTASK WHERE proc='bibclassify' AND user=%s "
                 "AND (status='DONE')"):
        return (True, 'The document was already processed, '
                      'but automated extraction identified no suitable keywords.')

    # or the extraction is in error stat
    if _has_task("SELECT COUNT(proc) FROM schTASK WHERE proc='bibclassify' AND user=%s "
                 "AND (status='ERROR')"):
        # The original pieces joined as "anadministrator's"; a space was
        # added after "an".
        return (True, 'The document was already scheduled, '
                      'but an error happened. This requires an '
                      'administrator\'s intervention. Unfortunately, '
                      'for the moment we cannot display any data.')

    return (False, None)
# NOTE(review): the second line of this signature was missing from the
# source; ``others`` is used in the body, and its default is assumed to be
# ``bconfig.CFG_OTHER_FIELDS`` - confirm against the bibclassify config.
def filter_marcrec(marcrec, main_field=bconfig.CFG_MAIN_FIELD,
                   others=bconfig.CFG_OTHER_FIELDS):
    """Remove the unwanted fields and return xml.

    Only tag ``001`` and the tags named by ``main_field`` and ``others``
    are passed to ``bibrecord.print_rec``.

    NOTE(review): the line adding each parsed tag to the tag list was
    missing from the source (``tag`` was computed but unused); it is
    reconstructed here.

    :param marcrec: marc record object to print.
    :param main_field: marc code (or list of codes) of the main keyword field
    :param others: marc code (or list of codes) of additional keyword fields
    :return: the value returned by ``bibrecord.print_rec`` for the kept tags.
    """
    # Accept a single marc code as well as a list of them.
    if isinstance(main_field, six.string_types):
        main_field = [main_field]
    if isinstance(others, six.string_types):
        others = [others]

    key_map = ['001']
    for field in main_field + others:
        tag, ind1, ind2 = _parse_marc_code(field)
        key_map.append(tag)

    return bibrecord.print_rec(marcrec, 1, tags=key_map)

Event Timeline