BibIndexTokenizer.py

# -*- coding:utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibIndexTokenizer: generic, not implemented tokenizer for inheritance
"""


class BibIndexTokenizer(object):
    """Base class for the tokenizers.

    Tokenizers act as filters which turn input strings into lists of
    strings representing the indexable components of that string.
    """
    #words part
    def scan_string_for_words(self, s):
        """Return an intermediate representation of the tokens in s.

        Every tokenizer should have a scan_string function, which scans the
        input string and lexically tags its components. These units are
        grouped together sequentially. The output of scan_string is usually
        something like:
        {
         'TOKEN_TAG_LIST' : a list of valid keys in this output set,
         'key1' : [val1, val2, val3] - where the key describes the values
                  in some meaningful way
        }

        @param s: the input to be lexically tagged
        @type s: string

        @return: dict of lexically tagged input items
            In a sample tokenizer where scan_string simply splits s on
            space, scan_string might output the following for
            "Assam and Darjeeling":
            {
             'TOKEN_TAG_LIST' : 'word_list',
             'word_list' : ['Assam', 'and', 'Darjeeling']
            }
        @rtype: dict
        """
        raise NotImplementedError

    def parse_scanned_for_words(self, o):
        """Calculate the token list from the intermediate representation o.

        While this should be an interesting computation over the intermediate
        representation generated by scan_string, in the split-on-space example
        we obviously need only return o['word_list'].

        @param o: a dictionary with a 'word_list' key
        @type o: dict

        @return: the token items from 'word_list'
        @rtype: list of string
        """
        raise NotImplementedError

    def tokenize_for_words(self, s):
        """Main entry point. Return token list from input string s.

        Simply composes the functionality above.

        @param s: the input to be lexically tagged
        @type s: string

        @return: the token items derived from s
        @rtype: list of string
        """
        raise NotImplementedError

    #pairs part
    def scan_string_for_pairs(self, s):
        """See: scan_string_for_words"""
        raise NotImplementedError

    def parse_scanned_for_pairs(self, o):
        """See: parse_scanned_for_words"""
        raise NotImplementedError

    def tokenize_for_pairs(self, s):
        """See: tokenize_for_words"""
        raise NotImplementedError

    #phrases part
    def scan_string_for_phrases(self, s):
        """See: scan_string_for_words"""
        raise NotImplementedError

    def parse_scanned_for_phrases(self, o):
        """See: parse_scanned_for_words"""
        raise NotImplementedError

    def tokenize_for_phrases(self, s):
        """See: tokenize_for_words"""
        raise NotImplementedError

    def get_tokenizing_function(self, wordtable_type):
        """Choose tokenize_for_words, tokenize_for_phrases or
        tokenize_for_pairs depending on the type of tokenization
        we want to perform."""
        raise NotImplementedError
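
# The class below is not part of the original Invenio code: it is a minimal
# sketch of a hypothetical subclass implementing the split-on-space behaviour
# described in the docstrings above. The class name and the 'Words' value
# checked in get_tokenizing_function are illustrative assumptions.
class BibIndexWhitespaceTokenizer(BibIndexTokenizer):
    """Example tokenizer that splits its input on whitespace."""

    def scan_string_for_words(self, s):
        # Tag every whitespace-separated unit as a word. TOKEN_TAG_LIST
        # lists the valid keys of this output set, per the contract above.
        return {'TOKEN_TAG_LIST': ['word_list'],
                'word_list': s.split()}

    def parse_scanned_for_words(self, o):
        # The intermediate representation already holds a flat token list.
        return o['word_list']

    def tokenize_for_words(self, s):
        # Compose scanning and parsing, as the base class documents.
        return self.parse_scanned_for_words(self.scan_string_for_words(s))

    def get_tokenizing_function(self, wordtable_type):
        # Only word-level tokenization is implemented in this sketch.
        if wordtable_type == 'Words':
            return self.tokenize_for_words
        raise NotImplementedError

# Usage:
#   BibIndexWhitespaceTokenizer().tokenize_for_words("Assam and Darjeeling")
# returns ['Assam', 'and', 'Darjeeling'].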
