bibindex_engine_tokenizer.py
# -*- coding:utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""bibindex_engine_tokenizer: a set of classes implementing index tokenization
The idea is that Tokenizer classes provide a method, tokenize(), which turns
input strings into lists of strings. The output strings are calculated based
on the input string as tokens suitable for word or phrase indexing.
"""

import re

re_pattern_fuzzy_author_dots = re.compile(r'[\.\-]+')
re_pattern_fuzzy_author_spaces = re.compile(r'\s+')
re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.\-]')

def wash_author_name(p):
    """
    Wash author name suitable for author searching. Notably, replace
    dots and hyphens with spaces, and collapse spaces.
    """
    out = re_pattern_fuzzy_author_dots.sub(" ", p)
    return re_pattern_fuzzy_author_spaces.sub(" ", out)

def author_name_requires_phrase_search(p):
    """
    Detect whether author query pattern p requires phrase search.
    Notably, look for presence of spaces and commas.
    """
    if re_pattern_fuzzy_author_trigger.search(p):
        return True
    return False
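
# Illustrative examples (not part of the original module; the inputs are made
# up).  Assuming the regexes defined at the top of this file, the two helpers
# above behave roughly as follows:
#
#   wash_author_name('Le Blanc-Smith,   M')        -> 'Le Blanc Smith, M'
#   author_name_requires_phrase_search('Ellis')    -> False
#   author_name_requires_phrase_search('Ellis, J') -> True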

class BibIndexTokenizer(object):
    """Base class for the tokenizers

    Tokenizers act as filters which turn input strings into lists of strings
    which represent the indexable components of that string.
    """

    def scan_string(self, s):
        """Return an intermediate representation of the tokens in s.

        Every tokenizer should have a scan_string function, which scans the
        input string and lexically tags its components. These units are
        grouped together sequentially. The output of scan_string is usually
        something like:
        {
          'TOKEN_TAG_LIST' : a list of valid keys in this output set,
          'key1' : [val1, val2, val3] - where the key describes the values
            in some meaningful way
        }

        @param s: the input to be lexically tagged
        @type s: string
        @return: dict of lexically tagged input items

        In a sample Tokenizer where scan_string simply splits s on
        space, scan_string might output the following for
        "Assam and Darjeeling":
        {
          'TOKEN_TAG_LIST' : 'word_list',
          'word_list' : ['Assam', 'and', 'Darjeeling']
        }

        @rtype: dict
        """
        raise NotImplementedError

    def parse_scanned(self, o):
        """Calculate the token list from the intermediate representation o.

        While this should be an interesting computation over the intermediate
        representation generated by scan_string, obviously in the split-on-
        space example we need only return o['word_list'].

        @param o: a dictionary with a 'word_list' key
        @type o: dict
        @return: the token items from 'word_list'
        @rtype: list of string
        """
        raise NotImplementedError

    def tokenize(self, s):
        """Main entry point. Return token list from input string s.

        Simply composes the functionality above.

        @param s: the input to be lexically tagged
        @type s: string
        @return: the token items derived from s
        @rtype: list of string
        """
        raise NotImplementedError

class BibIndexExactNameTokenizer(BibIndexTokenizer):
    """
    Human name exact tokenizer.
    """

    def tokenize(self, s):
        """
        Main API.
        """
        return [wash_author_name(s)]
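
# Illustrative example (not from the original module; the input is made up):
# the exact tokenizer produces a single washed phrase token, e.g.
#
#   BibIndexExactNameTokenizer().tokenize('Ellis,   J')  ->  ['Ellis, J']
#
# so the whole name is indexed as one phrase rather than as separate words.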

class BibIndexFuzzyNameTokenizer(BibIndexTokenizer):
    """Human name tokenizer.

    Human names are divided into three classes of tokens:
    'lastnames', i.e., family, tribal or group identifiers,
    'nonlastnames', i.e., personal names distinguishing individuals,
    'titles', both incidental and permanent, e.g., 'VIII', '(ed.)', 'Msc'
    """

    def __init__(self):
        self.single_initial_re = re.compile(r'^\w\.$')
        self.split_on_re = re.compile(r'[\.\s-]')
        # lastname_stopwords describes terms which should not be used for
        # indexing, in multiple-word last names.  These are purely
        # conjunctions, serving the same function as the American hyphen,
        # but using linguistic constructs.
        self.lastname_stopwords = set(['y', 'of', 'and', 'de'])

    def scan(self, s):
        """Scan a name string and output an object representing its structure.

        @param s: the input to be lexically tagged
        @type s: string
        @return: dict of lexically tagged input items.

        Sample output for the name 'Jingleheimer Schmitt, John Jacob, XVI.' is:
        {
          'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles'],
          'lastnames' : ['Jingleheimer', 'Schmitt'],
          'nonlastnames' : ['John', 'Jacob'],
          'titles' : ['XVI.']
        }

        @rtype: dict
        """
        retval = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
                  'lastnames': [],
                  'nonlastnames': [],
                  'titles': []}

        l = s.split(',')
        if len(l) < 2:
            # No commas means a simple name
            new = s.strip()
            new = new.split(' ')
            if len(new) == 1:
                retval['lastnames'] = new        # rare single-name case
            else:
                retval['lastnames'] = new[-1:]
                retval['nonlastnames'] = new[:-1]
                for tag in ['lastnames', 'nonlastnames']:
                    retval[tag] = [x.strip() for x in retval[tag]]
                    retval[tag] = [re.split(self.split_on_re, x)
                                   for x in retval[tag]]
                    # flatten sublists
                    retval[tag] = [item for sublist in retval[tag]
                                   for item in sublist]
                    retval[tag] = [x for x in retval[tag] if x != '']
        else:
            # Handle lastname-first multiple-names case
            retval['titles'] = l[2:]             # no titles? no problem
            retval['nonlastnames'] = l[1]
            retval['lastnames'] = l[0]
            for tag in ['lastnames', 'nonlastnames']:
                retval[tag] = retval[tag].strip()
                retval[tag] = re.split(self.split_on_re, retval[tag])
                # filter empty strings
                retval[tag] = [x for x in retval[tag] if x != '']
            retval['titles'] = [x.strip() for x in retval['titles'] if x != '']

        return retval
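
    # Illustrative example (not from the original module; the name is made up):
    # for an input without commas, scan() treats the final word as the last
    # name and everything before it as non-last names, e.g.
    #
    #   BibIndexFuzzyNameTokenizer().scan('John Smith')
    #   ->  {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
    #        'lastnames': ['Smith'], 'nonlastnames': ['John'], 'titles': []}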

    def parse_scanned(self, scanned):
        """Return all the indexable variations for a tagged token dictionary.

        Does this via the combinatoric expansion of the following rules:
        - Expands first names as name, first initial with period, first
          initial without period.
        - Expands compound last names as each of their non-stopword subparts.
        - Titles are treated literally, but applied serially.

        Please note that titles will be applied to complete last names only.
        So for example, if there is a compound last name of the form,
        "Ibanez y Gracia", with the title, "(ed.)", then only the combination
        of those two strings will do, not "Ibanez" and not "Gracia".

        @param scanned: lexically tagged input items in the form of the output
            from scan()
        @type scanned: dict
        @return: combinatorically expanded list of strings for indexing
        @rtype: list of string
        """

        def _fully_expanded_last_name(first, lastlist, title=None):
            """Return a list of all of the first / last / title combinations.

            @param first: one possible non-last name
            @type first: string
            @param lastlist: the strings of the tokens in the (possibly
                compound) last name
            @type lastlist: list of string
            @param title: one possible title
            @type title: string
            """
            retval = []
            title_word = ''
            if title is not None:
                title_word = ', ' + title

            last = ' '.join(lastlist)
            retval.append(first + ' ' + last + title_word)
            retval.append(last + ', ' + first + title_word)
            for last in lastlist:
                if last in self.lastname_stopwords:
                    continue
                retval.append(first + ' ' + last + title_word)
                retval.append(last + ', ' + first + title_word)
            return retval

        last_parts = scanned['lastnames']
        first_parts = scanned['nonlastnames']
        titles = scanned['titles']

        if len(first_parts) == 0:                # rare single-name case
            return scanned['lastnames']

        expanded = []
        for exp in self.__expand_nonlastnames(first_parts):
            expanded.extend(_fully_expanded_last_name(exp, last_parts, None))
            for title in titles:
                # Drop titles which are parenthesized.  This eliminates (ed.)
                # from the index, but leaves XI, for example.  This gets rid
                # of the surprising behavior that searching for 'author:ed'
                # retrieves people who have been editors, but whose names
                # aren't Ed.
                # TODO: Make editorship and other special statuses a MARC field.
                if title.find('(') != -1:
                    continue
                # XXX: remember to document that titles can only be applied
                # to complete last names
                expanded.extend(_fully_expanded_last_name(exp,
                                                          [' '.join(last_parts)],
                                                          title))

        return sorted(list(set(expanded)))
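
    # Illustrative example (not from the original module; the name is made up):
    # combining scan() and parse_scanned() on 'Schmitt, John' expands the name
    # according to the rules above, yielding roughly
    #
    #   ['J Schmitt', 'John Schmitt', 'Schmitt, J', 'Schmitt, John']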

    def __expand_nonlastnames(self, namelist):
        """Generate every expansion of a series of human non-last names.

        Example:
        "Michael Edward" -> "Michael Edward", "Michael E.", "Michael E",
            "M. Edward", "M Edward", "M. E.", "M. E", "M E.", "M E", "M.E."
        ...but never:
        "ME"

        @param namelist: a collection of names
        @type namelist: list of string
        @return: a greatly expanded collection of names
        @rtype: list of string
        """

        def _expand_name(name):
            """Lists [name, initial, empty]"""
            if name is None:
                return []
            return [name, name[0]]

        def _pair_items(head, tail):
            """Lists every combination of head with each and all of tail"""
            if len(tail) == 0:
                return [head]
            l = []
            l.extend([head + ' ' + tail[0]])
            #l.extend([head + '-' + tail[0]])
            l.extend(_pair_items(head, tail[1:]))
            return l

        def _collect(head, tail):
            """Brings together combinations of things"""

            def _cons(a, l):
                l2 = l[:]
                l2.insert(0, a)
                return l2

            if len(tail) == 0:
                return [head]
            l = []
            l.extend(_pair_items(head, _expand_name(tail[0])))
            l.extend([' '.join(_cons(head, tail)).strip()])
            #l.extend(['-'.join(_cons(head, tail)).strip()])
            l.extend(_collect(head, tail[1:]))
            return l

        def _expand_contract(namelist):
            """Runs collect with every head in namelist and its tail"""
            val = []
            for i in range(len(namelist)):
                name = namelist[i]
                for expansion in _expand_name(name):
                    val.extend(_collect(expansion, namelist[i + 1:]))
            return val

        def _add_squashed(namelist):
            """Finds cases like 'M. E.' and adds 'M.E.'"""
            val = namelist

            def __check_parts(parts):
                if len(parts) < 2:
                    return False
                for part in parts:
                    if not self.single_initial_re.match(part):
                        return False
                return True

            for name in namelist:
                parts = name.split(' ')
                if not __check_parts(parts):
                    continue
                val.extend([''.join(parts)])

            return val

        return _add_squashed(_expand_contract(namelist))

    def tokenize(self, s):
        """Main entry point. Output the list of strings expanding s.

        Does this via the combinatoric expansion of the following rules:
        - Expands first names as name, first initial with period, first
          initial without period.
        - Expands compound last names as each of their non-stopword subparts.
        - Titles are treated literally, but applied serially.

        Please note that titles will be applied to complete last names only.
        So for example, if there is a compound last name of the form,
        "Ibanez y Gracia", with the title, "(ed.)", then only the combination
        of those two strings will do, not "Ibanez" and not "Gracia".

        @param s: the input to be lexically tagged
        @type s: string
        @return: combinatorically expanded list of strings for indexing
        @rtype: list of string
        @note: A simple wrapper around scan and parse_scanned.
        """
        return self.parse_scanned(self.scan(s))

if __name__ == "__main__":
    """Trivial manual test framework"""
    import sys
    args = sys.argv[1:]
    test_str = ''
    if len(args) == 0:
        test_str = "Michael Peskin"
    elif len(args) == 1:
        test_str = args[0]
    else:
        test_str = ' '.join(args)

    tokenizer = BibIndexFuzzyNameTokenizer()
    print "Tokenizes as:", tokenizer.tokenize(test_str)
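
# Illustrative run (not from the original module): invoking the script with no
# arguments exercises the default test string "Michael Peskin" and is expected
# to print something like (the ordering comes from sorted(set(...)) above):
#
#   $ python bibindex_engine_tokenizer.py
#   Tokenizes as: ['M Peskin', 'Michael Peskin', 'Peskin, M', 'Peskin, Michael']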