Page MenuHomec4science

BibIndexJournalTokenizer.py
No OneTemporary

File Metadata

Created
Mon, Nov 18, 19:25

BibIndexJournalTokenizer.py

# -*- coding:utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2010, 2011, 2012, 2014 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibIndexJournalTokenizer: useful for journal index.
Aggregates info about journal in a specific way given by its variable
journal_pubinfo_standard_form.
Behaves in the same way for all index table types:
- Words
- Pairs
- Phrases
"""
from invenio.legacy.dbquery import run_sql
from invenio.modules.indexer.tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer
from invenio.config import \
CFG_CERN_SITE, \
CFG_INSPIRE_SITE
from invenio.legacy.bibindex.engine_utils import get_values_recursively
from invenio.modules.records.api import get_record
# Site-dependent journal settings, assigned as a triple:
#   CFG_JOURNAL_TAG
#       tag pattern handed to the tokenizer (used there in a SQL LIKE clause);
#   CFG_JOURNAL_PUBINFO_STANDARD_FORM
#       template in which every subfield tag is replaced by its value;
#   CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK
#       regular expression describing a string in that standard form
#       (only stored by the tokenizer in the code visible here).
if CFG_CERN_SITE:
    (CFG_JOURNAL_TAG,
     CFG_JOURNAL_PUBINFO_STANDARD_FORM,
     CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK) = (
        '773__%',
        "773__p 773__v (773__y) 773__c",
        r'^\w.*\s\w.*\s\(\d+\)\s\w.*$',
    )
elif CFG_INSPIRE_SITE:
    (CFG_JOURNAL_TAG,
     CFG_JOURNAL_PUBINFO_STANDARD_FORM,
     CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK) = (
        '773__%',
        "773__p,773__v,773__c",
        r'^\w.*,\w.*,\w.*$',
    )
else:
    # Neither CERN nor INSPIRE: journal data lives in the 909C4 field.
    (CFG_JOURNAL_TAG,
     CFG_JOURNAL_PUBINFO_STANDARD_FORM,
     CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK) = (
        '909C4%',
        "909C4p 909C4v (909C4y) 909C4c",
        r'^\w.*\s\w.*\s\(\d+\)\s\w.*$',
    )
class BibIndexJournalTokenizer(BibIndexMultiFieldTokenizer):
    """Tokenizer for the journal index.

    For every journal field instance of a record it yields the individual
    subfield values and, when all required subfields are present, the joined
    title/volume/year/page string built from the configured standard form.
    The tokenizer works on multiple tags; for more information on tokenizers
    working on a per-record basis see the BibIndexMultiFieldTokenizer base
    class.
    """

    def __init__(self,
                 stemming_language=None,
                 remove_stopwords=False,
                 remove_html_markup=False,
                 remove_latex_markup=False):
        """Store the journal tag and standard-form settings on the instance.

        The four keyword parameters are accepted but not used by this
        constructor.
        """
        self.tag = CFG_JOURNAL_TAG
        self.nonmarc_tag = 'journal_info'
        self.journal_pubinfo_standard_form = CFG_JOURNAL_PUBINFO_STANDARD_FORM
        self.journal_pubinfo_standard_form_regexp_check = \
            CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK

    def tokenize(self, recID):
        """Extract journal words for record ``recID`` from the MARC tables.

        Returns a list holding every journal subfield value of the record
        and, for each field instance in which no subfield of the standard
        form is missing, the title/volume/year/page string in the standard
        form that is also used for citations.
        """
        # The table names are derived from the first two characters of the tag.
        table = "bib%s%sx" % (self.tag[0], self.tag[1])
        link_table = "bibrec_" + table
        query = (
            "SELECT bb.field_number,b.tag,b.value FROM %s AS b, %s AS bb\n"
            "WHERE bb.id_bibrec=%%s\n"
            "AND bb.id_bibxxx=b.id AND tag LIKE %%s"
        ) % (table, link_table)
        rows = run_sql(query, (recID, self.tag))

        # Group the subfield values by field instance number.
        instances = {}
        for field_number, subfield_tag, value in rows:
            if subfield_tag.endswith("c"):
                # Keep only the start page when value is pagestart-pageend.
                # FIXME: pages may not be in 'c' subfield
                value = value.partition('-')[0]
            instances.setdefault(field_number, {})[subfield_tag] = value

        # A tag prefix left in the filled template means a subfield was missing.
        unfilled_marker = self.tag[:-1]
        tokens = []
        for subfields in instances.values():
            # Index every journal subfield on its own ...
            tokens.extend(subfields.values())
            # ... and the standard form, if it could be filled in completely.
            standard_form = self.journal_pubinfo_standard_form
            for subfield_tag, value in subfields.items():
                standard_form = standard_form.replace(subfield_tag, value)
            if unfilled_marker not in standard_form:
                tokens.append(standard_form)
        return tokens

    def tokenize_via_recjson(self, recID):
        """Tokenize the journal info of record ``recID`` through bibfield.

        Reads the ``self.nonmarc_tag`` field of the record, collects its
        values with ``get_values_recursively`` and returns, for each value,
        the part preceding the first '-'.
        """
        collected = []
        journal_info = get_record(recID).get(self.nonmarc_tag)
        get_values_recursively(journal_info, collected)
        return [phrase.split("-", 1)[0] for phrase in collected]

    def tokenize_for_words(self, recID):
        """Return the same tokens as ``tokenize``."""
        return self.tokenize(recID)

    def tokenize_for_pairs(self, recID):
        """Return the same tokens as ``tokenize``."""
        return self.tokenize(recID)

    def tokenize_for_phrases(self, recID):
        """Return the same tokens as ``tokenize``."""
        return self.tokenize(recID)

    def get_tokenizing_function(self, wordtable_type):
        """Return ``self.tokenize`` whatever the value of ``wordtable_type``."""
        return self.tokenize

    def get_nonmarc_tokenizing_function(self, table_type):
        """Return ``self.tokenize_via_recjson`` whatever ``table_type`` is."""
        return self.tokenize_via_recjson

Event Timeline