# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibIndexDefaultTokenizer: useful for almost all indexes.
It performs standard tokenization. It splits phrases into words/pairs or doesnt split at all, strips accents,
removes alphanumeric characters and html and latex markup if we want to. Also can stem words for a given language.
"""
from invenio.legacy.bibindex.engine_config import \
    CFG_BIBINDEX_INDEX_TABLE_TYPE
from invenio.utils.html import remove_html_markup
from invenio.utils.text import wash_for_utf8, strip_accents
from invenio.legacy.bibindex.engine_washer import \
    lower_index_term, remove_latex_markup, \
    apply_stemming, remove_stopwords, length_check
from invenio.legacy.bibindex.engine_utils import latex_formula_re, \
    re_block_punctuation_begin, \
    re_block_punctuation_end, \
    re_punctuation, \
    re_separators, \
    re_arxiv
from invenio.modules.indexer.tokenizers.BibIndexStringTokenizer import \
    BibIndexStringTokenizer

class BibIndexDefaultTokenizer(BibIndexStringTokenizer):
    """A standard tokenizer, useful for most indexes.

    Its behaviour depends on the stemming language and on the
    remove_stopwords, remove_html_markup and remove_latex_markup
    parameters.
    """

    def __init__(self, stemming_language=None, remove_stopwords=False,
                 remove_html_markup=False, remove_latex_markup=False):
        """Initialization."""
        self.stemming_language = stemming_language
        self.remove_stopwords = remove_stopwords
        self.remove_html_markup = remove_html_markup
        self.remove_latex_markup = remove_latex_markup

    def get_tokenizing_function(self, wordtable_type):
        """Pick the tokenize_for_xxx function that matches the requested
        type of tokenization (wordtable_type)."""
        if wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"]:
            return self.tokenize_for_words
        elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"]:
            return self.tokenize_for_pairs
        elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"]:
            return self.tokenize_for_phrases
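    # Illustrative note (an addition, not upstream code): with the standard
    # table-type keys from engine_config, a caller would select a tokenizer
    # roughly like
    #   tokenize = tokenizer.get_tokenizing_function(
    #       CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"])
    # and tokenize('dark matter halo') would then be expected to yield pairs
    # such as 'dark matter' and 'matter halo'.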

    def tokenize_for_words(self, phrase):
        """Return list of words found in PHRASE. Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
        """
        words = {}
        formulas = []
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = remove_stopwords(block, self.remove_stopwords)
                stemmed_block = length_check(stemmed_block)
                stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                    stemmed_subblock = length_check(stemmed_subblock)
                    stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                        stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
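    # Worked example (an addition, not upstream code): with stemming off and
    # stopword removal disabled, tokenize_for_words('Higgs boson, arXiv:1007.5048')
    # would be expected to produce tokens such as 'higgs', 'boson',
    # 'arxiv:1007.5048' and, via the arXiv special case above, '1007.5048';
    # the exact subblock tokens ('arxiv', '1007', '5048', ...) depend on the
    # re_punctuation and re_separators definitions in engine_utils.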

    def tokenize_for_pairs(self, phrase):
        """Return list of word pairs found in PHRASE. Note that the phrase
        is split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
        """
        words = {}
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                block = remove_stopwords(block, self.remove_stopwords)
                block = length_check(block)
                block = apply_stemming(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    subblock = remove_stopwords(subblock, self.remove_stopwords)
                    subblock = length_check(subblock)
                    subblock = apply_stemming(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(subblock):
                            alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                            alphanumeric_group = length_check(alphanumeric_group)
                            alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' % (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
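    # Worked example (an addition, not upstream code): tokenize_for_pairs
    # keeps a sliding window over consecutive alphanumeric groups, so
    # tokenize_for_pairs('dark matter halo') would be expected to return the
    # pairs 'dark matter' and 'matter halo'. Note that last_word also carries
    # across block boundaries, so pairs can span punctuation.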

    def tokenize_for_phrases(self, phrase):
        """Return PHRASE, washed for UTF-8, as a single-element list.
        Unlike the word and pair tokenizers, this one does not split
        the phrase at all.
        """
        phrase = wash_for_utf8(phrase)
        return [phrase]
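    # For example (an addition, not upstream code):
    #   tokenize_for_phrases('CERN Large Hadron Collider')
    # would return ['CERN Large Hadron Collider'] unchanged, apart from
    # UTF-8 washing.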

    def get_nonmarc_tokenizing_function(self, table_type):
        """Pick the correct tokenize_for_xxx function, depending on the
        type of tokenization, for non-MARC standards.
        """
        return self.get_tokenizing_function(table_type)
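
# ----------------------------------------------------------------------
# Minimal usage sketch (an addition, not part of the upstream module).
# It assumes a configured Invenio legacy installation on the Python
# path, since the washers and regexps above come from
# invenio.legacy.bibindex.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    tokenizer = BibIndexDefaultTokenizer(remove_html_markup=True)
    tokenize = tokenizer.get_tokenizing_function(
        CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"])
    # HTML markup is stripped before tokenization, so the <b> tags
    # below do not end up in the index terms.
    print(tokenize('Standard Model <b>Higgs</b> searches'))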
