Page MenuHomec4science

classification.py
No OneTemporary

File Metadata

Created
Mon, Oct 14, 15:29

classification.py

# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2014 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Set of tasks for classification."""
def classify_paper(obj, eng, callback, data,
                   taxonomy, rebuild_cache=False, no_cache=False,
                   output_mode='text', output_limit=20, spires=False,
                   match_mode='full', with_author_keywords=False,
                   extract_acronyms=False, only_core_tags=False,
                   fast_mode=False):
    """Run ``callback`` over ``data`` and record its output on ``obj``.

    :param obj: object exposing ``log.error(...)`` and
        ``update_task_results(name, results)``.
    :param eng: accepted for signature compatibility; not used here.
    :param callback: callable invoked positionally as
        ``callback(data, taxonomy, rebuild_cache, no_cache, output_mode,
        output_limit, spires, match_mode, with_author_keywords,
        extract_acronyms, only_core_tags)``. Its return value must
        support item assignment, because the ``"fast_mode"`` key is
        written into it.
    :param data: input handed to ``callback``; a falsy value aborts the
        task after logging an error.
    :param taxonomy: forwarded to ``callback`` unchanged, as are all the
        remaining options up to ``only_core_tags``.
    :param fast_mode: stored in the result and used to pick the task
        result name (``classification_fast`` or ``classification_full``).
    :return: always ``None``; the outcome is stored through
        ``obj.update_task_results``.
    """
    from invenio.modules.classifier.errors import TaxonomyError

    if not data:
        obj.log.error("No classification done due to missing data.")
        return

    try:
        keywords = callback(
            data, taxonomy, rebuild_cache, no_cache, output_mode,
            output_limit, spires, match_mode, with_author_keywords,
            extract_acronyms, only_core_tags
        )
    except TaxonomyError as taxonomy_error:
        # Only taxonomy problems are handled here: they are logged and
        # the task ends without recording any result.
        obj.log.error(taxonomy_error)
        return

    keywords["fast_mode"] = fast_mode
    task_name = "classification_{0}".format("fast" if fast_mode else "full")
    entry = {
        "name": task_name,
        "result": keywords,
        "template": "workflows/results/classifier.html"
    }
    obj.update_task_results(task_name, [entry])
def classify_paper_with_oaiharvester(taxonomy, rebuild_cache=False,
                                     no_cache=False, output_mode='text',
                                     output_limit=20, spires=False,
                                     match_mode='full',
                                     with_author_keywords=False,
                                     extract_acronyms=False,
                                     only_core_tags=False,
                                     fast_mode=False):
    """Build a task that classifies a harvested record.

    The returned callable takes ``(obj, eng)``. Unless ``fast_mode`` is
    set, it looks for ``obj.extra_data["_result"]["pdf"]`` and, when that
    value is truthy, classifies it with ``bibclassify_exhaustive_call``.
    In every other case it classifies the title and abstract summary
    read from ``obj.data`` with ``bibclassify_exhaustive_call_text``.

    All parameters are captured by the closure and forwarded unchanged
    to ``classify_paper``.

    :return: the ``(obj, eng)`` callable described above.
    """
    from invenio.legacy.bibclassify.api import (
        bibclassify_exhaustive_call,
        bibclassify_exhaustive_call_text,
    )

    def _classify_paper_with_oaiharvester(obj, eng):
        document = None
        if not fast_mode:
            has_pdf = ("_result" in obj.extra_data and
                       "pdf" in obj.extra_data["_result"])
            if has_pdf:
                document = obj.extra_data["_result"]["pdf"]
            else:
                # NOTE(review): despite this message, classification still
                # runs below on the title/abstract fallback.
                obj.log.error("No classification done due to missing file.")

        if document:
            payload = document
            extractor = bibclassify_exhaustive_call
        else:
            # Fast mode, or no usable PDF: fall back to record metadata.
            payload = [obj.data.get("title", {}).get("title", ""),
                       obj.data.get("abstract", {}).get("summary", "")]
            extractor = bibclassify_exhaustive_call_text

        classify_paper(
            obj, eng, extractor, payload, taxonomy,
            rebuild_cache=rebuild_cache,
            no_cache=no_cache,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            with_author_keywords=with_author_keywords,
            extract_acronyms=extract_acronyms,
            only_core_tags=only_core_tags,
            fast_mode=fast_mode
        )

    return _classify_paper_with_oaiharvester
def classify_paper_with_deposit(taxonomy, rebuild_cache=False, no_cache=False,
                                output_mode='text', output_limit=20,
                                spires=False, match_mode='full',
                                with_author_keywords=False,
                                extract_acronyms=False, only_core_tags=False,
                                fast_mode=False):
    """Build a task that classifies a deposition.

    The returned callable takes ``(obj, eng)`` and wraps ``obj`` in a
    ``Deposition``. Unless ``fast_mode`` is set, the first deposition
    file whose name contains ``.pdf`` (case-insensitive) is classified
    with ``bibclassify_exhaustive_call``. If no such file yields a
    truthy path, or in fast mode, the title and abstract summary from
    the latest SIP's metadata are classified with
    ``bibclassify_exhaustive_call_text``.

    If the SIP metadata cannot be read (``AttributeError``), the error
    is logged and the task returns without classifying.

    All parameters are captured by the closure and forwarded unchanged
    to ``classify_paper``.

    :return: the ``(obj, eng)`` callable described above.
    """
    from invenio.legacy.bibclassify.api import (
        bibclassify_exhaustive_call,
        bibclassify_exhaustive_call_text,
    )

    def _classify_paper_with_deposit(obj, eng):
        from invenio.modules.deposit.models import Deposition

        deposition = Deposition(obj)
        data = None
        if not fast_mode:
            for f in deposition.files:
                # NOTE(review): this is a substring match, so a name such
                # as "paper.pdf.bak" is also accepted.
                if f.name and ".pdf" in f.name.lower():
                    data = f.get_syspath()
                    break
            callback = bibclassify_exhaustive_call

        if not data:
            try:
                metadata = deposition.get_latest_sip().metadata
            except AttributeError as err:
                obj.log.error("Error getting data: {0}".format(err))
                # Stop here: continuing would read the unbound local
                # ``metadata`` and raise UnboundLocalError. Logging and
                # returning matches how classify_paper treats missing data.
                return
            data = [metadata.get("title", {}).get("title", ""),
                    metadata.get("abstract", {}).get("summary", "")]
            callback = bibclassify_exhaustive_call_text

        classify_paper(obj, eng, callback, data,
                       taxonomy, rebuild_cache,
                       no_cache, output_mode, output_limit,
                       spires, match_mode, with_author_keywords,
                       extract_acronyms, only_core_tags, fast_mode)

    return _classify_paper_with_deposit

Event Timeline