# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""bibindex_engine_tokenizer: a set of classes implementing index tokenization
The idea is that Tokenizer classes provide a method, tokenize(), which turns
input strings into lists of strings. The output strings are calculated based
on the input string as tokens suitable for word or phrase indexing.
"""
import re
re_pattern_fuzzy_author_dots = re.compile(r'[\.\-]+')
re_pattern_fuzzy_author_spaces = re.compile(r'\s+')
re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.\-]')
def wash_author_name(p):
"""
Wash the author name p into a form suitable for author searching. Notably,
replace dots and hyphens with spaces, and collapse runs of whitespace.
"""
out = re_pattern_fuzzy_author_dots.sub(" ", p)
return re_pattern_fuzzy_author_spaces.sub(" ", out)
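# Hedged illustration (not part of the original module): with the patterns
# above, washing a dotted/hyphenated initials form behaves roughly like this
# in an interactive session:
#   >>> wash_author_name("J.-R. Ellis")
#   'J R Ellis'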
def author_name_requires_phrase_search(p):
"""
Detect whether author query pattern p requires phrase search.
Notably, look for presence of spaces and commas.
"""
if re_pattern_fuzzy_author_trigger.search(p):
return True
return False
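# Hedged illustration (not part of the original module): the trigger pattern
# fires on any space, comma, dot or hyphen, so for example:
#   >>> author_name_requires_phrase_search("Ellis")
#   False
#   >>> author_name_requires_phrase_search("Ellis, J R")
#   True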
class BibIndexTokenizer(object):
"""Base class for the tokenizers
Tokenizers act as filters which turn input strings into lists of strings
representing the indexable components of that string.
"""
def scan_string(self, s):
"""Return an intermediate representation of the tokens in s.
Every tokenizer should have a scan_string function, which scans the
input string and lexically tags its components. These units are
grouped together sequentially. The output of scan_string is usually
something like:
{
'TOKEN_TAG_LIST' : a list of valid keys in this output set,
'key1' : [val1, val2, val3], where the key describes the values in
some meaningful way
}
@param s: the input to be lexically tagged
@type s: string
@return: dict of lexically tagged input items
In a sample Tokenizer where scan_string simply splits s on
space, scan_string might output the following for
"Assam and Darjeeling":
{
'TOKEN_TAG_LIST' : ['word_list'],
'word_list' : ['Assam', 'and', 'Darjeeling']
}
@rtype: dict
"""
raise NotImplementedError
def parse_scanned(self, o):
"""Calculate the token list from the intermediate representation o.
While this should be an interesting computation over the intermediate
representation generated by scan_string, obviously in the split-on-
space example we need only return o['word_list'].
@param o: a dictionary with a 'word_list' key
@type o: dict
@return: the token items from 'word_list'
@rtype: list of string
"""
raise NotImplementedError
def tokenize(self, s):
"""Main entry point. Return token list from input string s.
Simply composes the functionality above.
@param s: the input to be lexically tagged
@type s: string
@return: the token items derived from s
@rtype: list of string
"""
raise NotImplementedError
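# Illustrative sketch, not part of the original module: a minimal subclass
# implementing the split-on-space example described in the docstrings above,
# to show how the scan_string/parse_scanned/tokenize contract fits together.
class _ExampleWhitespaceTokenizer(BibIndexTokenizer):
    """Toy tokenizer: every whitespace-separated word is a token."""
    def scan_string(self, s):
        # Tag the single recognized unit type, 'word_list'.
        return {'TOKEN_TAG_LIST': ['word_list'],
                'word_list': s.split()}
    def parse_scanned(self, o):
        # The token list is simply the scanned word list.
        return o['word_list']
    def tokenize(self, s):
        # Compose scanning and parsing, as tokenize() is meant to do.
        return self.parse_scanned(self.scan_string(s))
# For "Assam and Darjeeling" this would yield ['Assam', 'and', 'Darjeeling'].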
class BibIndexExactNameTokenizer(BibIndexTokenizer):
"""
Human name exact tokenizer.
"""
def tokenize(self, s):
"""
Main API.
"""
return [wash_author_name(s)]
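# Hedged illustration (not part of the original module): the exact tokenizer
# simply washes the whole name into a single token, e.g.:
#   >>> BibIndexExactNameTokenizer().tokenize("J.-R. Ellis")
#   ['J R Ellis']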
class BibIndexFuzzyNameTokenizer(BibIndexTokenizer):
"""Human name tokenizer.
Human names are divided into three classes of tokens:
'lastnames', i.e., family, tribal or group identifiers,
'nonlastnames', i.e., personal names distinguishing individuals,
'titles', both incidental and permanent, e.g., 'VIII', '(ed.)', 'Msc'
"""
def __init__(self):
self.single_initial_re = re.compile(r'^\w\.$')
self.split_on_re = re.compile(r'[\.\s-]')
# lastname_stopwords describes terms which should not be used for indexing,
# in multiple-word last names. These are purely conjunctions, serving the
# same function as the American hyphen, but using linguistic constructs.
self.lastname_stopwords = set(['y', 'of', 'and', 'de'])
def scan(self, s):
"""Scan a name string and output an object representing its structure.
@param s: the input to be lexically tagged
@type s: string
@return: dict of lexically tagged input items.
Sample output for the name 'Jingleheimer Schmitt, John Jacob, XVI.' is:
{
'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles'],
'lastnames' : ['Jingleheimer', 'Schmitt'],
'nonlastnames' : ['John', 'Jacob'],
'titles' : ['XVI.']
}
@rtype: dict
"""
retval = {'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles'],
'lastnames' : [],
'nonlastnames' : [],
'titles' : []}
l = s.split(',')
if len(l) < 2:
# No commas means a simple name
new = s.strip()
new = new.split(' ')
if len(new) == 1:
retval['lastnames'] = new # rare single-name case
else:
retval['lastnames'] = new[-1:]
retval['nonlastnames'] = new[:-1]
for tag in ['lastnames', 'nonlastnames']:
retval[tag] = [x.strip() for x in retval[tag]]
retval[tag] = [re.split(self.split_on_re, x) for x in retval[tag]]
# flatten sublists
retval[tag] = [item for sublist in retval[tag] for item in sublist]
retval[tag] = [x for x in retval[tag] if x != '']
else:
# Handle lastname-first multiple-names case
retval['titles'] = l[2:] # no titles? no problem
retval['nonlastnames'] = l[1]
retval['lastnames'] = l[0]
for tag in ['lastnames', 'nonlastnames']:
retval[tag] = retval[tag].strip()
retval[tag] = re.split(self.split_on_re, retval[tag])
# filter empty strings
retval[tag] = [x for x in retval[tag] if x != '']
retval['titles'] = [x.strip() for x in retval['titles'] if x != '']
return retval
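# Hedged illustration (not part of the original module): for a name with no
# comma, such as "John Smith", scan() should return roughly:
#   {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
#    'lastnames': ['Smith'], 'nonlastnames': ['John'], 'titles': []}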
def parse_scanned(self, scanned):
"""Return all the indexable variations for a tagged token dictionary.
Does this via the combinatoric expansion of the following rules:
- Expands first names as name, first initial with period, first initial
without period.
- Expands compound last names as each of their non-stopword subparts.
- Titles are treated literally, but applied serially.
Please note that titles will be applied to complete last names only.
So for example, if there is a compound last name of the form,
"Ibanez y Gracia", with the title, "(ed.)", then only the combination
of those two strings will do, not "Ibanez" and not "Gracia".
@param scanned: lexically tagged input items in the form of the output
from scan()
@type scanned: dict
@return: combinatorically expanded list of strings for indexing
@rtype: list of string
"""
def _fully_expanded_last_name(first, lastlist, title = None):
"""Return a list of all of the first / last / title combinations.
@param first: one possible non-last name
@type first: string
@param lastlist: the strings of the tokens in the (possibly compound) last name
@type lastlist: list of string
@param title: one possible title
@type title: string
"""
retval = []
title_word = ''
if title is not None:
title_word = ', ' + title
last = ' '.join(lastlist)
retval.append(first + ' ' + last + title_word)
retval.append(last + ', ' + first + title_word)
for last in lastlist:
if last in self.lastname_stopwords:
continue
retval.append(first + ' ' + last + title_word)
retval.append(last + ', ' + first + title_word)
return retval
last_parts = scanned['lastnames']
first_parts = scanned['nonlastnames']
titles = scanned['titles']
if len(first_parts) == 0: # rare single-name case
return scanned['lastnames']
expanded = []
for exp in self.__expand_nonlastnames(first_parts):
expanded.extend(_fully_expanded_last_name(exp, last_parts, None))
for title in titles:
# Drop titles which are parenthesized. This eliminates (ed.) from the index, but
# leaves XI, for example. This gets rid of the surprising behavior that searching
# for 'author:ed' retrieves people who have been editors, but whose names aren't
# Ed.
# TODO: Make editorship and other special statuses a MARC field.
if title.find('(') != -1:
continue
# XXX: remember to document that titles can only be applied to complete last names
expanded.extend(_fully_expanded_last_name(exp, [' '.join(last_parts)], title))
return sorted(list(set(expanded)))
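# Hedged illustration (not part of the original module): for the scanned form
# of a name like "Ibanez y Gracia, M", parse_scanned() should produce both the
# whole-lastname and per-part variants, e.g. 'M Ibanez y Gracia',
# 'Ibanez y Gracia, M', 'M Ibanez', 'Ibanez, M', 'M Gracia', 'Gracia, M',
# but never a variant built on the stopword 'y' alone.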
def __expand_nonlastnames(self, namelist):
"""Generate every expansion of a series of human non-last names.
Example:
"Michael Edward" -> "Michael Edward", "Michael E.", "Michael E", "M. Edward", "M Edward",
"M. E.", "M. E", "M E.", "M E", "M.E."
...but never:
"ME"
@param namelist: a collection of names
@type namelist: list of string
@return: a greatly expanded collection of names
@rtype: list of string
"""
def _expand_name(name):
"""Lists [name, initial, empty]"""
if name == None:
return []
return [name, name[0]]
def _pair_items(head, tail):
"""Lists every combination of head with each and all of tail"""
if len(tail) == 0:
return [head]
l = []
l.extend([head + ' ' + tail[0]])
#l.extend([head + '-' + tail[0]])
l.extend(_pair_items(head, tail[1:]))
return l
def _collect(head, tail):
"""Brings together combinations of things"""
def _cons(a, l):
l2 = l[:]
l2.insert(0, a)
return l2
if len(tail) == 0:
return [head]
l = []
l.extend(_pair_items(head, _expand_name(tail[0])))
l.extend([' '.join(_cons(head, tail)).strip()])
#l.extend(['-'.join(_cons(head, tail)).strip()])
l.extend(_collect(head, tail[1:]))
return l
def _expand_contract(namelist):
"""Runs collect with every head in namelist and its tail"""
val = []
for i in range(len(namelist)):
name = namelist[i]
for expansion in _expand_name(name):
val.extend(_collect(expansion, namelist[i+1:]))
return val
def _add_squashed(namelist):
"""Finds cases like 'M. E.' and adds 'M.E.'"""
val = namelist
def __check_parts(parts):
if len(parts) < 2:
return False
for part in parts:
if not self.single_initial_re.match(part):
return False
return True
for name in namelist:
parts = name.split(' ')
if not __check_parts(parts):
continue
val.extend([''.join(parts)])
return val
return _add_squashed(_expand_contract(namelist))
def tokenize(self, s):
"""Main entry point. Output the list of strings expanding s.
Does this via the combinatoric expansion of the following rules:
- Expands first names as name, first initial with period, first initial
without period.
- Expands compound last names as each of their non-stopword subparts.
- Titles are treated literally, but applied serially.
Please note that titles will be applied to complete last names only.
So for example, if there is a compound last name of the form,
"Ibanez y Gracia", with the title, "(ed.)", then only the combination
of those two strings will do, not "Ibanez" and not "Gracia".
@param s: the input to be lexically tagged
@type s: string
@return: combinatorically expanded list of strings for indexing
@rtype: list of string
@note: A simple wrapper around scan and parse_scanned.
"""
return self.parse_scanned(self.scan(s))
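# Hedged illustration (not part of the original module): end to end, the fuzzy
# tokenizer expands a simple lastname-first input into its indexable variants,
# e.g. something like:
#   >>> BibIndexFuzzyNameTokenizer().tokenize("Dupont, Jean")
#   ['Dupont, J', 'Dupont, Jean', 'J Dupont', 'Jean Dupont']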
if __name__ == "__main__":
"""Trivial manual test framework"""
import sys
args = sys.argv[1:]
test_str = ''
if len(args) == 0:
test_str = "Michael Peskin"
elif len(args) == 1:
test_str = args[0]
else:
test_str = ' '.join(args)
tokenizer = BibIndexFuzzyNameTokenizer()
print "Tokenizes as:", tokenizer.tokenize(test_str)
