bibindex_engine_tokenizer.py
# -*- coding:utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""bibindex_engine_tokenizer: a set of classes implementing index tokenization
The idea is that Tokenizer classes provide a method, tokenize(), which turns
input strings into lists of strings. The output strings are calculated based
on the input string as tokens suitable for word or phrase indexing.
"""
import re

from invenio.config import \
     CFG_BIBINDEX_REMOVE_HTML_MARKUP, \
     CFG_BIBINDEX_REMOVE_LATEX_MARKUP, \
     CFG_BIBINDEX_CHARS_PUNCTUATION, \
     CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS
from invenio.htmlutils import remove_html_markup
from invenio.textutils import wash_for_utf8, strip_accents
from invenio.bibindex_engine_washer import \
     lower_index_term, remove_latex_markup, \
     apply_stemming_and_stopwords_and_length_check, \
     wash_author_name

latex_formula_re = re.compile(r'\$.*?\$|\\\[.*?\\\]')
phrase_delimiter_re = re.compile(r'[\.:;\?\!]')
space_cleaner_re = re.compile(r'\s+')
re_block_punctuation_begin = re.compile(r"^" + CFG_BIBINDEX_CHARS_PUNCTUATION + "+")
re_block_punctuation_end = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION + "+$")
re_punctuation = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION)
re_separators = re.compile(CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS)
re_arxiv = re.compile(r'^arxiv:\d\d\d\d\.\d\d\d\d')

re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.]')
# FIXME: re_pattern_fuzzy_author_trigger could be removed and a
# BibAuthorID API function could be called instead after we
# double-check that there are no circular imports.
re_pattern_author_canonical_id = re.compile(r'\.[0-9]+$')


def author_name_requires_phrase_search(p):
    """
    Detect whether author query pattern p requires phrase search.
    Notably, look for presence of spaces and commas.
    """
    if re_pattern_fuzzy_author_trigger.search(p):
        return True
    return False
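
# Illustrative examples (not exercised by the module itself): the trigger
# pattern above matches whitespace, commas and dots, so patterns that look
# like full author names fall back to phrase search.
#
#     author_name_requires_phrase_search('Ellis, J')   # -> True  (comma, space)
#     author_name_requires_phrase_search('ellis')      # -> False (single bare token)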

class BibIndexTokenizer(object):
    """Base class for the tokenizers

    Tokenizers act as filters which turn input strings into lists of strings
    which represent the indexable components of that string.
    """

    def scan_string(self, s):
        """Return an intermediate representation of the tokens in s.

        Every tokenizer should have a scan_string function, which scans the
        input string and lexically tags its components.  These units are
        grouped together sequentially.  The output of scan_string is usually
        something like:
        {
            'TOKEN_TAG_LIST' : a list of valid keys in this output set,
            'key1' : [val1, val2, val3] - where each key describes its
                values in some meaningful way
        }

        @param s: the input to be lexically tagged
        @type s: string

        @return: dict of lexically tagged input items

            In a sample Tokenizer where scan_string simply splits s on
            space, scan_string might output the following for
            "Assam and Darjeeling":
            {
                'TOKEN_TAG_LIST' : 'word_list',
                'word_list' : ['Assam', 'and', 'Darjeeling']
            }
        @rtype: dict
        """
        raise NotImplementedError

    def parse_scanned(self, o):
        """Calculate the token list from the intermediate representation o.

        While this should be an interesting computation over the intermediate
        representation generated by scan_string, obviously in the split-on-
        space example we need only return o['word_list'].

        @param o: a dictionary with a 'word_list' key
        @type o: dict

        @return: the token items from 'word_list'
        @rtype: list of string
        """
        raise NotImplementedError

    def tokenize(self, s):
        """Main entry point.  Return token list from input string s.

        Simply composes the functionality above.

        @param s: the input to be lexically tagged
        @type s: string

        @return: the token items derived from s
        @rtype: list of string
        """
        raise NotImplementedError

class BibIndexPhraseTokenizer(BibIndexTokenizer):
    """The original phrase is returned"""

    def __init__(self, stemming_language=None):
        self.stemming_language = stemming_language

    def tokenize(self, phrase):
        """Return list of phrases found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
        """
        phrase = wash_for_utf8(phrase)
        return [phrase]
        ## Note that we don't break phrases, they are used for exact style
        ## of searching.
        words = {}
        phrase = strip_accents(phrase)
        # 1st split phrase into blocks according to whitespace
        for block1 in phrase_delimiter_re.split(strip_accents(phrase)):
            block1 = block1.strip()
            if block1 and self.stemming_language:
                new_words = []
                for block2 in re_punctuation.split(block1):
                    block2 = block2.strip()
                    if block2:
                        for block3 in block2.split():
                            block3 = block3.strip()
                            if block3:
                                # Note that we don't stem phrases, they
                                # are used for exact style of searching.
                                new_words.append(block3)
                block1 = ' '.join(new_words)
            if block1:
                words[block1] = 1
        return words.keys()
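
# Illustrative example (not exercised by the module itself): because of the
# early return above, the whole washed phrase comes back as a single token
# (for plain ASCII input, wash_for_utf8 leaves the string unchanged).
#
#     BibIndexPhraseTokenizer().tokenize("Electroweak symmetry breaking.")
#     # -> ['Electroweak symmetry breaking.']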

class BibIndexWordTokenizer(BibIndexTokenizer):
    """A phrase is split into words"""

    def __init__(self, stemming_language=None):
        self.stemming_language = stemming_language

    def tokenize(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
        """
        words = {}
        formulas = []
        if CFG_BIBINDEX_REMOVE_HTML_MARKUP and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if CFG_BIBINDEX_REMOVE_LATEX_MARKUP:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = apply_stemming_and_stopwords_and_length_check(block, self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = apply_stemming_and_stopwords_and_length_check(subblock, self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = apply_stemming_and_stopwords_and_length_check(alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
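
# Illustrative example (not exercised by the module itself): the exact token
# set depends on CFG_BIBINDEX_CHARS_PUNCTUATION, the stopword list and the
# chosen stemming language, so the output sketched below is only indicative.
#
#     BibIndexWordTokenizer().tokenize("Dark-matter searches, arXiv:1007.5048")
#     # might yield tokens such as 'dark-matter', 'dark', 'matter',
#     # 'searches', 'arxiv:1007.5048' and '1007.5048'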

class BibIndexPairTokenizer(BibIndexTokenizer):
    """A phrase is split into pairs of words"""

    def __init__(self, stemming_language=None):
        self.stemming_language = stemming_language

    def tokenize(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
        """
        words = {}
        if CFG_BIBINDEX_REMOVE_HTML_MARKUP and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if CFG_BIBINDEX_REMOVE_LATEX_MARKUP:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                if self.stemming_language:
                    block = apply_stemming_and_stopwords_and_length_check(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    if self.stemming_language:
                        subblock = apply_stemming_and_stopwords_and_length_check(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(subblock):
                            if self.stemming_language:
                                alphanumeric_group = apply_stemming_and_stopwords_and_length_check(alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' % (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
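
# Illustrative example (not exercised by the module itself): consecutive
# alphanumeric groups are chained through last_word, so with no stemming
# language and a default punctuation configuration the result is roughly:
#
#     BibIndexPairTokenizer().tokenize("higgs boson decay channels")
#     # might yield ['higgs boson', 'boson decay', 'decay channels']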

class BibIndexExactNameTokenizer(BibIndexTokenizer):
    """
    Human name exact tokenizer.
    """

    def tokenize(self, s):
        """
        Main API.
        """
        return [wash_author_name(s)]

class BibIndexFuzzyNameTokenizer(BibIndexTokenizer):
    """Human name tokenizer.

    Human names are divided into three classes of tokens:
    'lastnames', i.e., family, tribal or group identifiers,
    'nonlastnames', i.e., personal names distinguishing individuals,
    'titles', both incidental and permanent, e.g., 'VIII', '(ed.)', 'Msc'
    """

    def __init__(self):
        self.single_initial_re = re.compile('^\w\.$')
        self.split_on_re = re.compile('[\.\s-]')
        # lastname_stopwords describes terms which should not be used for indexing,
        # in multiple-word last names.  These are purely conjunctions, serving the
        # same function as the American hyphen, but using linguistic constructs.
        self.lastname_stopwords = set(['y', 'of', 'and', 'de'])

    def scan(self, s):
        """Scan a name string and output an object representing its structure.

        @param s: the input to be lexically tagged
        @type s: string

        @return: dict of lexically tagged input items.

            Sample output for the name 'Jingleheimer Schmitt, John Jacob, XVI.' is:
            {
                'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'],
                'lastnames' : ['Jingleheimer', 'Schmitt'],
                'nonlastnames' : ['John', 'Jacob'],
                'titles' : ['XVI.'],
                'raw' : 'Jingleheimer Schmitt, John Jacob, XVI.'
            }
        @rtype: dict
        """
        retval = {'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'],
                  'lastnames' : [],
                  'nonlastnames' : [],
                  'titles' : [],
                  'raw' : s}
        l = s.split(',')
        if len(l) < 2:
            # No commas means a simple name
            new = s.strip()
            new = new.split(' ')
            if len(new) == 1:
                retval['lastnames'] = new        # rare single-name case
            else:
                retval['lastnames'] = new[-1:]
                retval['nonlastnames'] = new[:-1]
                for tag in ['lastnames', 'nonlastnames']:
                    retval[tag] = [x.strip() for x in retval[tag]]
                    retval[tag] = [re.split(self.split_on_re, x) for x in retval[tag]]
                    # flatten sublists
                    retval[tag] = [item for sublist in retval[tag] for item in sublist]
                    retval[tag] = [x for x in retval[tag] if x != '']
        else:
            # Handle lastname-first multiple-names case
            retval['titles'] = l[2:]             # no titles? no problem
            retval['nonlastnames'] = l[1]
            retval['lastnames'] = l[0]
            for tag in ['lastnames', 'nonlastnames']:
                retval[tag] = retval[tag].strip()
                retval[tag] = re.split(self.split_on_re, retval[tag])
                # filter empty strings
                retval[tag] = [x for x in retval[tag] if x != '']
            retval['titles'] = [x.strip() for x in retval['titles'] if x != '']

        return retval

    def parse_scanned(self, scanned):
        """Return all the indexable variations for a tagged token dictionary.

        Does this via the combinatoric expansion of the following rules:
        - Expands first names as name, first initial with period, first initial
            without period.
        - Expands compound last names as each of their non-stopword subparts.
        - Titles are treated literally, but applied serially.

        Please note that titles will be applied to complete last names only.
        So for example, if there is a compound last name of the form,
        "Ibanez y Gracia", with the title, "(ed.)", then only the combination
        of those two strings will do, not "Ibanez" and not "Gracia".

        @param scanned: lexically tagged input items in the form of the output
            from scan()
        @type scanned: dict

        @return: combinatorically expanded list of strings for indexing
        @rtype: list of string
        """

        def _fully_expanded_last_name(first, lastlist, title=None):
            """Return a list of all of the first / last / title combinations.

            @param first: one possible non-last name
            @type first: string

            @param lastlist: the strings of the tokens in the (possibly compound) last name
            @type lastlist: list of string

            @param title: one possible title
            @type title: string
            """
            retval = []
            title_word = ''
            if title != None:
                title_word = ', ' + title

            last = ' '.join(lastlist)
            retval.append(first + ' ' + last + title_word)
            retval.append(last + ', ' + first + title_word)
            for last in lastlist:
                if last in self.lastname_stopwords:
                    continue
                retval.append(first + ' ' + last + title_word)
                retval.append(last + ', ' + first + title_word)

            return retval

        last_parts = scanned['lastnames']
        first_parts = scanned['nonlastnames']
        titles = scanned['titles']
        raw = scanned['raw']

        if len(first_parts) == 0:                       # rare single-name case
            return scanned['lastnames']

        expanded = []
        for exp in self.__expand_nonlastnames(first_parts):
            expanded.extend(_fully_expanded_last_name(exp, last_parts, None))
            for title in titles:
                # Drop titles which are parenthesized.  This eliminates (ed.) from the index, but
                # leaves XI, for example.  This gets rid of the surprising behavior that searching
                # for 'author:ed' retrieves people who have been editors, but whose names aren't
                # Ed.
                # TODO: Make editorship and other special statuses a MARC field.
                if title.find('(') != -1:
                    continue
                # XXX: remember to document that titles can only be applied to complete last names
                expanded.extend(_fully_expanded_last_name(exp, [' '.join(last_parts)], title))

        return sorted(list(set(expanded)))

    def __expand_nonlastnames(self, namelist):
        """Generate every expansion of a series of human non-last names.

        Example:
        "Michael Edward" -> "Michael Edward", "Michael E.", "Michael E", "M. Edward", "M Edward",
        "M. E.", "M. E", "M E.", "M E", "M.E."
        ...but never:
        "ME"

        @param namelist: a collection of names
        @type namelist: list of string

        @return: a greatly expanded collection of names
        @rtype: list of string
        """

        def _expand_name(name):
            """Lists [name, initial, empty]"""
            if name == None:
                return []
            return [name, name[0]]

        def _pair_items(head, tail):
            """Lists every combination of head with each and all of tail"""
            if len(tail) == 0:
                return [head]
            l = []
            l.extend([head + ' ' + tail[0]])
            #l.extend([head + '-' + tail[0]])
            l.extend(_pair_items(head, tail[1:]))
            return l

        def _collect(head, tail):
            """Brings together combinations of things"""

            def _cons(a, l):
                l2 = l[:]
                l2.insert(0, a)
                return l2

            if len(tail) == 0:
                return [head]
            l = []
            l.extend(_pair_items(head, _expand_name(tail[0])))
            l.extend([' '.join(_cons(head, tail)).strip()])
            #l.extend(['-'.join(_cons(head, tail)).strip()])
            l.extend(_collect(head, tail[1:]))
            return l

        def _expand_contract(namelist):
            """Runs collect with every head in namelist and its tail"""
            val = []
            for i in range(len(namelist)):
                name = namelist[i]
                for expansion in _expand_name(name):
                    val.extend(_collect(expansion, namelist[i+1:]))
            return val

        def _add_squashed(namelist):
            """Finds cases like 'M. E.' and adds 'M.E.'"""
            val = namelist

            def __check_parts(parts):
                if len(parts) < 2:
                    return False
                for part in parts:
                    if not self.single_initial_re.match(part):
                        return False
                return True

            for name in namelist:
                parts = name.split(' ')
                if not __check_parts(parts):
                    continue
                val.extend([''.join(parts)])

            return val

        return _add_squashed(_expand_contract(namelist))

    def tokenize(self, s):
        """Main entry point.  Output the list of strings expanding s.

        Does this via the combinatoric expansion of the following rules:
        - Expands first names as name, first initial with period, first initial
            without period.
        - Expands compound last names as each of their non-stopword subparts.
        - Titles are treated literally, but applied serially.

        Please note that titles will be applied to complete last names only.
        So for example, if there is a compound last name of the form,
        "Ibanez y Gracia", with the title, "(ed.)", then only the combination
        of those two strings will do, not "Ibanez" and not "Gracia".

        @param s: the input to be lexically tagged
        @type s: string

        @return: combinatorically expanded list of strings for indexing
        @rtype: list of string

        @note: A simple wrapper around scan and parse_scanned.
        """
        return self.parse_scanned(self.scan(s))

if __name__ == "__main__":
    """Trivial manual test framework"""
    import sys
    args = sys.argv[1:]

    test_str = ''
    if len(args) == 0:
        test_str = "Michael Peskin"
    elif len(args) == 1:
        test_str = args[0]
    else:
        test_str = ' '.join(args)

    tokenizer = BibIndexFuzzyNameTokenizer()
    print "Tokenizes as:", tokenizer.tokenize(test_str)
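
# A similarly trivial manual check for the other tokenizers could look like
# the sketch below (illustrative only, not part of the module; it assumes a
# configured Invenio installation, since the CFG_BIBINDEX_* settings are read
# at import time):
#
#     for cls in (BibIndexPhraseTokenizer, BibIndexWordTokenizer,
#                 BibIndexPairTokenizer):
#         print cls.__name__, cls().tokenize(test_str)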