BibIndexTokenizer.py

# -*- coding:utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2010, 2011, 2012 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibIndexTokenizer: generic, not implemented tokenizer for inheritance
Inheritance tree for tokenizers in Invenio:
BibIndexTokenizer
^
|
|----BibIndexStringTokenizer<---|
| |
| BibIndexDefaultTokenizer<---|
| |
| BibIndexAuthorTokenizer
| BibIndexExactAuthorTokenizer
| (...)
|
|----BibIndexRecJsonTokenizer<---|
| |
| BibIndexFiletypeTokenizer
| (...)
|
|----BibIndexMultiFieldTokenizer<---|
|
BibIndexJournalTokenizer
BibIndexAuthorCountTokenizer
(...)
"""
class BibIndexTokenizer(object):
    """Base class for tokenizers.

    Tokenizers are components that extract the terms which need to be
    indexed and stored in the database.
    Different types of tokenizers work in different ways.
    Tokenizers are divided into three groups:
     - tokenizers that take a string as input and split it into
       tokens/terms which are later indexed
     - tokenizers that take the recID of a record and find terms
       by processing many fields/tags of that record
     - tokenizers that use the bibfield module and its functions,
       which precompute the terms to index
    """

    #words part
    def scan_string_for_words(self, s):
        """Return an intermediate representation of the tokens in s.

        Every tokenizer should have a scan_string function, which scans the
        input string and lexically tags its components. These units are
        grouped together sequentially. The output of scan_string is usually
        something like:
            {
             'TOKEN_TAG_LIST' : a list of valid keys in this output set,
             'key1' : [val1, val2, val3] - where the key describes the values
                      in some meaningful way
            }

        @param s: the input to be lexically tagged
        @type s: string

        @return: dict of lexically tagged input items
            In a sample tokenizer where scan_string simply splits s on
            space, scan_string might output the following for
            "Assam and Darjeeling":
            {
             'TOKEN_TAG_LIST' : 'word_list',
             'word_list' : ['Assam', 'and', 'Darjeeling']
            }
        @rtype: dict
        """
        raise NotImplementedError

    def parse_scanned_for_words(self, o):
        """Calculate the token list from the intermediate representation o.

        While this should be an interesting computation over the intermediate
        representation generated by scan_string, obviously in the split-on-
        space example we need only return o['word_list'].

        @param o: a dictionary with a 'word_list' key
        @type o: dict

        @return: the token items from 'word_list'
        @rtype: list of string
        """
        raise NotImplementedError

    def tokenize_for_words(self, s):
        """Main entry point. Return token list from input string s.

        Simply composes the functionality above.

        @param s: the input to be lexically tagged
        @type s: string

        @return: the token items derived from s
        @rtype: list of string
        """
        raise NotImplementedError

    #pairs part
    def scan_string_for_pairs(self, s):
        """See: scan_string_for_words"""
        raise NotImplementedError

    def parse_scanned_for_pairs(self, o):
        """See: parse_scanned_for_words"""
        raise NotImplementedError

    def tokenize_for_pairs(self, s):
        """See: tokenize_for_words"""
        raise NotImplementedError

    #phrases part
    def scan_string_for_phrases(self, s):
        """See: scan_string_for_words"""
        raise NotImplementedError

    def parse_scanned_for_phrases(self, o):
        """See: parse_scanned_for_words"""
        raise NotImplementedError

    def tokenize_for_phrases(self, s):
        """See: tokenize_for_words"""
        raise NotImplementedError

    def get_tokenizing_function(self, wordtable_type):
        """Chooses tokenize_for_words, tokenize_for_phrases or tokenize_for_pairs
        depending on the type of tokenization we want to perform."""
        raise NotImplementedError

    def get_nonmarc_tokenizing_function(self, table_type):
        """Chooses the best tokenizing function
        depending on the type of tokenization we want to perform.
        Non-MARC version.
        """
        raise NotImplementedError

    @property
    def implemented(self):
        """True if get_tokenizing_function is implemented by this tokenizer."""
        try:
            self.get_tokenizing_function("")
        except NotImplementedError:
            return False
        except AttributeError:
            return False
        return True

    @property
    def implemented_nonmarc(self):
        """True if get_nonmarc_tokenizing_function is implemented by this tokenizer."""
        try:
            self.get_nonmarc_tokenizing_function("")
        except NotImplementedError:
            return False
        except AttributeError:
            return False
        return True
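

# ----------------------------------------------------------------------
# Illustrative sketch (not part of Invenio): a minimal concrete subclass
# showing how the scan/parse/tokenize pipeline documented above is meant
# to compose. The class name, the 'Words' table-type string and the
# whitespace-splitting behaviour are assumptions made for demonstration
# only; real subclasses such as BibIndexDefaultTokenizer live in their
# own modules.

class _ExampleWhitespaceTokenizer(BibIndexTokenizer):
    """Toy tokenizer that splits the input string on whitespace."""

    def scan_string_for_words(self, s):
        # Lexically tag the input; the only tag here is 'word_list'.
        return {'TOKEN_TAG_LIST': ['word_list'],
                'word_list': s.split()}

    def parse_scanned_for_words(self, o):
        # Extract the final tokens from the intermediate representation.
        return o['word_list']

    def tokenize_for_words(self, s):
        # Main entry point: compose scanning and parsing.
        return self.parse_scanned_for_words(self.scan_string_for_words(s))

    def get_tokenizing_function(self, wordtable_type):
        # Assumed mapping from index table type to tokenizing method; the
        # real table-type values come from Invenio's configuration.
        if wordtable_type == 'Words':
            return self.tokenize_for_words
        return None


# Example usage of the sketch above:
#   tokenizer = _ExampleWhitespaceTokenizer()
#   tokenizer.tokenize_for_words("Assam and Darjeeling")
#       -> ['Assam', 'and', 'Darjeeling']
#   tokenizer.implemented
#       -> True, because the overridden get_tokenizing_function no longer
#          raises NotImplementedError when probed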
