BibIndexTokenizer.py

# -*- coding:utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2010, 2011, 2012 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibIndexTokenizer: generic, not implemented tokenizer for inheritance
Inheritance tree for tokenizers in Invenio:
BibIndexTokenizer
^
|
|----BibIndexStringTokenizer<---|
| |
| BibIndexDefaultTokenizer<---|
| |
| BibIndexAuthorTokenizer
| BibIndexExactAuthorTokenizer
| (...)
|
|----BibIndexRecJsonTokenizer<---|
| |
| BibIndexFiletypeTokenizer
| (...)
|
|----BibIndexMultiFieldTokenizer<---|
|
BibIndexJournalTokenizer
BibIndexAuthorCountTokenizer
(...)
"""
class BibIndexTokenizer(object):
    """Base class for tokenizers.

    Tokenizers are components that extract the terms which need to be
    indexed and stored in the database.
    Different types of tokenizers work in different ways.
    Tokenizers are divided into three groups:
     - tokenizers that take a string as input and split it into
       tokens/terms which are later indexed
     - tokenizers that take the recID of a record and find terms
       by processing many fields/tags of that record
     - tokenizers that use the bibfield module and its functions,
       which precompute the terms to index
    """

    #words part
    def scan_string_for_words(self, s):
        """Return an intermediate representation of the tokens in s.

        Every tokenizer should have a scan_string function, which scans the
        input string and lexically tags its components. These units are
        grouped together sequentially. The output of scan_string is usually
        something like:
            {
             'TOKEN_TAG_LIST' : a list of valid keys in this output set,
             'key1' : [val1, val2, val3] - where the key describes the values
                      in some meaningful way
            }

        @param s: the input to be lexically tagged
        @type s: string

        @return: dict of lexically tagged input items
            In a sample tokenizer where scan_string simply splits s on
            space, scan_string might output the following for
            "Assam and Darjeeling":
            {
             'TOKEN_TAG_LIST' : 'word_list',
             'word_list' : ['Assam', 'and', 'Darjeeling']
            }
        @rtype: dict
        """
        raise NotImplementedError

    def parse_scanned_for_words(self, o):
        """Calculate the token list from the intermediate representation o.

        While this should be an interesting computation over the intermediate
        representation generated by scan_string, obviously in the split-on-
        space example we need only return o['word_list'].

        @param o: a dictionary with a 'word_list' key
        @type o: dict

        @return: the token items from 'word_list'
        @rtype: list of string
        """
        raise NotImplementedError

    def tokenize_for_words(self, s):
        """Main entry point. Return token list from input string s.

        Simply composes the functionality above.

        @param s: the input to be lexically tagged
        @type s: string

        @return: the token items derived from s
        @rtype: list of string
        """
        raise NotImplementedError

    #pairs part
    def scan_string_for_pairs(self, s):
        """See: scan_string_for_words"""
        raise NotImplementedError

    def parse_scanned_for_pairs(self, o):
        """See: parse_scanned_for_words"""
        raise NotImplementedError

    def tokenize_for_pairs(self, s):
        """See: tokenize_for_words"""
        raise NotImplementedError

    #phrases part
    def scan_string_for_phrases(self, s):
        """See: scan_string_for_words"""
        raise NotImplementedError

    def parse_scanned_for_phrases(self, o):
        """See: parse_scanned_for_words"""
        raise NotImplementedError

    def tokenize_for_phrases(self, s):
        """See: tokenize_for_words"""
        raise NotImplementedError

    def get_tokenizing_function(self, wordtable_type):
        """Chooses tokenize_for_words, tokenize_for_phrases or tokenize_for_pairs
        depending on the type of tokenization we want to perform."""
        raise NotImplementedError

    def get_nonmarc_tokenizing_function(self, table_type):
        """Chooses the best tokenizing function
        depending on the type of tokenization we want to perform.
        Non-MARC version.
        """
        raise NotImplementedError

    @property
    def implemented(self):
        """True if get_tokenizing_function is implemented by this tokenizer."""
        try:
            self.get_tokenizing_function("")
        except NotImplementedError:
            return False
        except AttributeError:
            return False
        return True

    @property
    def implemented_nonmarc(self):
        """True if get_nonmarc_tokenizing_function is implemented by this tokenizer."""
        try:
            self.get_nonmarc_tokenizing_function("")
        except NotImplementedError:
            return False
        except AttributeError:
            return False
        return True
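

# ----------------------------------------------------------------------
# Illustrative sketch (not part of Invenio): a minimal concrete subclass
# showing how the scan/parse/tokenize pipeline documented above is meant
# to compose. The class name, the 'Words' table-type string and the
# whitespace-splitting behaviour are assumptions made for demonstration
# only; real subclasses such as BibIndexDefaultTokenizer live in their
# own modules.

class _ExampleWhitespaceTokenizer(BibIndexTokenizer):
    """Toy tokenizer that splits the input string on whitespace."""

    def scan_string_for_words(self, s):
        # Lexically tag the input; the only tag here is 'word_list'.
        return {'TOKEN_TAG_LIST': ['word_list'],
                'word_list': s.split()}

    def parse_scanned_for_words(self, o):
        # Extract the final tokens from the intermediate representation.
        return o['word_list']

    def tokenize_for_words(self, s):
        # Main entry point: compose scanning and parsing.
        return self.parse_scanned_for_words(self.scan_string_for_words(s))

    def get_tokenizing_function(self, wordtable_type):
        # Assumed mapping from index table type to tokenizing method; the
        # real table-type values come from Invenio's configuration.
        if wordtable_type == 'Words':
            return self.tokenize_for_words
        return None


# Example usage of the sketch above:
#   tokenizer = _ExampleWhitespaceTokenizer()
#   tokenizer.tokenize_for_words("Assam and Darjeeling")
#       -> ['Assam', 'and', 'Darjeeling']
#   tokenizer.implemented
#       -> True, because the overridden get_tokenizing_function no longer
#          raises NotImplementedError when probed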
