BibIndexTokenizer.py

# -*- coding:utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibIndexTokenizer: generic, not implemented tokenizer for inheritance
"""


class BibIndexTokenizer(object):
    """Base class for the tokenizers.

    Tokenizers act as filters which turn input strings into lists of
    strings representing the indexable components of that string.
    """
    #words part
    def scan_string_for_words(self, s):
        """Return an intermediate representation of the tokens in s.

        Every tokenizer should have a scan_string function, which scans the
        input string and lexically tags its components. These units are
        grouped together sequentially. The output of scan_string is usually
        something like:
        {
         'TOKEN_TAG_LIST' : a list of valid keys in this output set,
         'key1' : [val1, val2, val3] - where the key describes the values
                  in some meaningful way
        }

        @param s: the input to be lexically tagged
        @type s: string

        @return: dict of lexically tagged input items
            In a sample tokenizer where scan_string simply splits s on
            space, scan_string might output the following for
            "Assam and Darjeeling":
            {
             'TOKEN_TAG_LIST' : 'word_list',
             'word_list' : ['Assam', 'and', 'Darjeeling']
            }
        @rtype: dict
        """
        raise NotImplementedError

    def parse_scanned_for_words(self, o):
        """Calculate the token list from the intermediate representation o.

        While this should be an interesting computation over the intermediate
        representation generated by scan_string, in the split-on-space example
        we obviously need only return o['word_list'].

        @param o: a dictionary with a 'word_list' key
        @type o: dict

        @return: the token items from 'word_list'
        @rtype: list of string
        """
        raise NotImplementedError

    def tokenize_for_words(self, s):
        """Main entry point. Return token list from input string s.

        Simply composes the functionality above.

        @param s: the input to be lexically tagged
        @type s: string

        @return: the token items derived from s
        @rtype: list of string
        """
        raise NotImplementedError

    #pairs part
    def scan_string_for_pairs(self, s):
        """See: scan_string_for_words"""
        raise NotImplementedError

    def parse_scanned_for_pairs(self, o):
        """See: parse_scanned_for_words"""
        raise NotImplementedError

    def tokenize_for_pairs(self, s):
        """See: tokenize_for_words"""
        raise NotImplementedError

    #phrases part
    def scan_string_for_phrases(self, s):
        """See: scan_string_for_words"""
        raise NotImplementedError

    def parse_scanned_for_phrases(self, o):
        """See: parse_scanned_for_words"""
        raise NotImplementedError

    def tokenize_for_phrases(self, s):
        """See: tokenize_for_words"""
        raise NotImplementedError

    def get_tokenizing_function(self, wordtable_type):
        """Choose tokenize_for_words, tokenize_for_phrases or
        tokenize_for_pairs depending on the type of tokenization
        we want to perform."""
        raise NotImplementedError
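
# The class below is not part of the original Invenio code: it is a minimal
# sketch of a hypothetical subclass implementing the split-on-space behaviour
# described in the docstrings above. The class name and the 'Words' value
# checked in get_tokenizing_function are illustrative assumptions.
class BibIndexWhitespaceTokenizer(BibIndexTokenizer):
    """Example tokenizer that splits its input on whitespace."""

    def scan_string_for_words(self, s):
        # Tag every whitespace-separated unit as a word. TOKEN_TAG_LIST
        # lists the valid keys of this output set, per the contract above.
        return {'TOKEN_TAG_LIST': ['word_list'],
                'word_list': s.split()}

    def parse_scanned_for_words(self, o):
        # The intermediate representation already holds a flat token list.
        return o['word_list']

    def tokenize_for_words(self, s):
        # Compose scanning and parsing, as the base class documents.
        return self.parse_scanned_for_words(self.scan_string_for_words(s))

    def get_tokenizing_function(self, wordtable_type):
        # Only word-level tokenization is implemented in this sketch.
        if wordtable_type == 'Words':
            return self.tokenize_for_words
        raise NotImplementedError

# Usage:
#   BibIndexWhitespaceTokenizer().tokenize_for_words("Assam and Darjeeling")
# returns ['Assam', 'and', 'Darjeeling'].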
