bibindex_engine_tokenizer.py

# -*- coding:utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""bibindex_engine_tokenizer: a set of classes implementing index tokenization
The idea is that Tokenizer classes provide a method, tokenize(), which turns
input strings into lists of strings. The output strings are tokens derived
from the input string, suitable for word or phrase indexing.
"""
import re
from invenio.config import \
CFG_BIBINDEX_REMOVE_HTML_MARKUP, \
CFG_BIBINDEX_REMOVE_LATEX_MARKUP, \
CFG_BIBINDEX_CHARS_PUNCTUATION, \
CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS
from invenio.htmlutils import remove_html_markup
from invenio.textutils import wash_for_utf8, strip_accents
from invenio.bibindex_engine_washer import \
lower_index_term, remove_latex_markup, \
apply_stemming_and_stopwords_and_length_check, \
wash_author_name
latex_formula_re = re.compile(r'\$.*?\$|\\\[.*?\\\]')
phrase_delimiter_re = re.compile(r'[\.:;\?\!]')
space_cleaner_re = re.compile(r'\s+')
re_block_punctuation_begin = re.compile(r"^" + CFG_BIBINDEX_CHARS_PUNCTUATION + "+")
re_block_punctuation_end = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION + "+$")
re_punctuation = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION)
re_separators = re.compile(CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS)
re_arxiv = re.compile(r'^arxiv:\d\d\d\d\.\d\d\d\d')
re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.]')
# FIXME: re_pattern_fuzzy_author_trigger could be removed and a
# BibAuthorID API function could be called instead after we
# double-check that there are no circular imports.
re_pattern_author_canonical_id = re.compile(r'\.[0-9]+$')
def author_name_requires_phrase_search(p):
"""
Detect whether author query pattern p requires phrase search.
Notably, look for presence of spaces and commas.
"""
if re_pattern_fuzzy_author_trigger.search(p):
return True
return False
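# Illustrative examples (not part of the original code): any whitespace,
# comma or period in the pattern triggers phrase search, a bare surname
# does not:
#
#   author_name_requires_phrase_search("Ellis, J")  # -> True
#   author_name_requires_phrase_search("Ellis")     # -> False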
class BibIndexTokenizer(object):
"""Base class for the tokenizers
Tokenizers act as filters which turn input strings into lists of strings
which represent the indexable components of that string.
"""
def scan_string(self, s):
"""Return an intermediate representation of the tokens in s.
Every tokenizer should have a scan_string function, which scans the
input string and lexically tags its components. These units are
grouped together sequentially. The output of scan_string is usually
something like:
{
'TOKEN_TAG_LIST' : a list of valid keys in this output set,
'key1' : [val1, val2, val3] - where the key describes the values in
some meaningful way
}
@param s: the input to be lexically tagged
@type s: string
@return: dict of lexically tagged input items
In a sample Tokenizer where scan_string simply splits s on
space, scan_string might output the following for
"Assam and Darjeeling":
{
'TOKEN_TAG_LIST' : 'word_list',
'word_list' : ['Assam', 'and', 'Darjeeling']
}
@rtype: dict
"""
raise NotImplementedError
def parse_scanned(self, o):
"""Calculate the token list from the intermediate representation o.
While this should be an interesting computation over the intermediate
representation generated by scan_string, obviously in the split-on-
space example we need only return o['word_list'].
@param o: a dictionary with a 'word_list' key
@type o: dict
@return: the token items from 'word_list'
@rtype: list of string
"""
raise NotImplementedError
def tokenize(self, s):
"""Main entry point. Return token list from input string s.
Simply composes the functionality above.
@param s: the input to be lexically tagged
@type s: string
@return: the token items derived from s
@rtype: list of string
"""
raise NotImplementedError
class BibIndexPhraseTokenizer(BibIndexTokenizer):
"""The original phrase is returned"""
def __init__(self, stemming_language = None):
self.stemming_language = stemming_language
def tokenize(self, phrase):
"""Return list of phrases found in PHRASE. Note that the phrase is
split into groups depending on the alphanumeric characters and
punctuation characters definition present in the config file.
"""
phrase = wash_for_utf8(phrase)
return [phrase]
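# NOTE: the code below this early return is never executed; it appears to
# be a leftover of the word-splitting logic and is kept verbatim.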
## Note that we don't break phrases; they are used for the exact style
## of searching.
words = {}
phrase = strip_accents(phrase)
# 1st split phrase into blocks according to the phrase delimiters (.:;?!)
for block1 in phrase_delimiter_re.split(strip_accents(phrase)):
block1 = block1.strip()
if block1 and self.stemming_language:
new_words = []
for block2 in re_punctuation.split(block1):
block2 = block2.strip()
if block2:
for block3 in block2.split():
block3 = block3.strip()
if block3:
# Note that we don't stem phrases, they
# are used for exact style of searching.
new_words.append(block3)
block1 = ' '.join(new_words)
if block1:
words[block1] = 1
return words.keys()
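# Illustrative usage (not part of the original code; assumes the input is
# already valid UTF-8, so wash_for_utf8() passes it through unchanged):
#
#   BibIndexPhraseTokenizer().tokenize("Standard Model: a review")
#   # -> ['Standard Model: a review']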
class BibIndexWordTokenizer(BibIndexTokenizer):
"""A phrase is split into words"""
def __init__(self, stemming_language = None):
self.stemming_language = stemming_language
def tokenize(self, phrase):
"""Return list of words found in PHRASE. Note that the phrase is
split into groups depending on the alphanumeric characters and
punctuation characters definition present in the config file.
"""
words = {}
formulas = []
if CFG_BIBINDEX_REMOVE_HTML_MARKUP and phrase.find("</") > -1:
phrase = remove_html_markup(phrase)
if CFG_BIBINDEX_REMOVE_LATEX_MARKUP:
formulas = latex_formula_re.findall(phrase)
phrase = remove_latex_markup(phrase)
phrase = latex_formula_re.sub(' ', phrase)
phrase = wash_for_utf8(phrase)
phrase = lower_index_term(phrase)
# 1st split phrase into blocks according to whitespace
for block in strip_accents(phrase).split():
# 2nd remove leading/trailing punctuation and add block:
block = re_block_punctuation_begin.sub("", block)
block = re_block_punctuation_end.sub("", block)
if block:
stemmed_block = apply_stemming_and_stopwords_and_length_check(block, self.stemming_language)
if stemmed_block:
words[stemmed_block] = 1
if re_arxiv.match(block):
# special case for blocks like `arXiv:1007.5048' where
# we would like to index the part after the colon
# regardless of dot or other punctuation characters:
words[block.split(':', 1)[1]] = 1
# 3rd break each block into subblocks according to punctuation and add subblocks:
for subblock in re_punctuation.split(block):
stemmed_subblock = apply_stemming_and_stopwords_and_length_check(subblock, self.stemming_language)
if stemmed_subblock:
words[stemmed_subblock] = 1
# 4th break each subblock into alphanumeric groups and add groups:
for alphanumeric_group in re_separators.split(subblock):
stemmed_alphanumeric_group = apply_stemming_and_stopwords_and_length_check(alphanumeric_group, self.stemming_language)
if stemmed_alphanumeric_group:
words[stemmed_alphanumeric_group] = 1
for block in formulas:
words[block] = 1
return words.keys()
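# Illustrative sketch (not part of the original code; the exact output
# depends on CFG_BIBINDEX_CHARS_PUNCTUATION, the separator characters and
# the stopword/stemming setup, and dictionary key order is arbitrary).
# For the arXiv special case handled above one would expect roughly:
#
#   sorted(BibIndexWordTokenizer().tokenize("arXiv:1007.5048"))
#   # -> ['1007', '1007.5048', '5048', 'arxiv', 'arxiv:1007.5048']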
class BibIndexPairTokenizer(BibIndexTokenizer):
"""A phrase is split into pairs of words"""
def __init__(self, stemming_language = None):
self.stemming_language = stemming_language
def tokenize(self, phrase):
"""Return list of words found in PHRASE. Note that the phrase is
split into groups depending on the alphanumeric characters and
punctuation characters definition present in the config file.
"""
words = {}
if CFG_BIBINDEX_REMOVE_HTML_MARKUP and phrase.find("</") > -1:
phrase = remove_html_markup(phrase)
if CFG_BIBINDEX_REMOVE_LATEX_MARKUP:
phrase = remove_latex_markup(phrase)
phrase = latex_formula_re.sub(' ', phrase)
phrase = wash_for_utf8(phrase)
phrase = lower_index_term(phrase)
# 1st split phrase into blocks according to whitespace
last_word = ''
for block in strip_accents(phrase).split():
# 2nd remove leading/trailing punctuation and add block:
block = re_block_punctuation_begin.sub("", block)
block = re_block_punctuation_end.sub("", block)
if block:
if self.stemming_language:
block = apply_stemming_and_stopwords_and_length_check(block, self.stemming_language)
# 3rd break each block into subblocks according to punctuation and add subblocks:
for subblock in re_punctuation.split(block):
if self.stemming_language:
subblock = apply_stemming_and_stopwords_and_length_check(subblock, self.stemming_language)
if subblock:
# 4th break each subblock into alphanumeric groups and add groups:
for alphanumeric_group in re_separators.split(subblock):
if self.stemming_language:
alphanumeric_group = apply_stemming_and_stopwords_and_length_check(alphanumeric_group, self.stemming_language)
if alphanumeric_group:
if last_word:
words['%s %s' % (last_word, alphanumeric_group)] = 1
last_word = alphanumeric_group
return words.keys()
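# Illustrative sketch (not part of the original code; no stemming, default
# punctuation, key order arbitrary):
#
#   sorted(BibIndexPairTokenizer().tokenize("quantum field theory"))
#   # -> ['field theory', 'quantum field']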
class BibIndexExactNameTokenizer(BibIndexTokenizer):
"""
Human name exact tokenizer.
"""
def tokenize(self, s):
"""
Main API.
"""
return [wash_author_name(s)]
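# Illustrative usage (not part of the original code): the single token is
# whatever wash_author_name() makes of the input, e.g.
#
#   BibIndexExactNameTokenizer().tokenize("Ellis, John R.")
#   # -> a one-element list with the washed form of "Ellis, John R."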
class BibIndexFuzzyNameTokenizer(BibIndexTokenizer):
"""Human name tokenizer.
Human names are divided into three classes of tokens:
'lastnames', i.e., family, tribal or group identifiers,
'nonlastnames', i.e., personal names distinguishing individuals,
'titles', both incidental and permanent, e.g., 'VIII', '(ed.)', 'Msc'
"""
def __init__(self):
self.single_initial_re = re.compile(r'^\w\.$')
self.split_on_re = re.compile(r'[\.\s-]')
# lastname_stopwords describes terms which should not be used for indexing
# in multiple-word last names. These are purely conjunctions, serving the
# same function as the American hyphen, but using linguistic constructs.
self.lastname_stopwords = set(['y', 'of', 'and', 'de'])
def scan(self, s):
"""Scan a name string and output an object representing its structure.
@param s: the input to be lexically tagged
@type s: string
@return: dict of lexically tagged input items.
Sample output for the name 'Jingleheimer Schmitt, John Jacob, XVI.' is:
{
'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'],
'lastnames' : ['Jingleheimer', 'Schmitt'],
'nonlastnames' : ['John', 'Jacob'],
'titles' : ['XVI.'],
'raw' : 'Jingleheimer Schmitt, John Jacob, XVI.'
}
@rtype: dict
"""
retval = {'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'],
'lastnames' : [],
'nonlastnames' : [],
'titles' : [],
'raw' : s}
l = s.split(',')
if len(l) < 2:
# No commas means a simple name
new = s.strip()
new = new.split(' ')
if len(new) == 1:
retval['lastnames'] = new # rare single-name case
else:
retval['lastnames'] = new[-1:]
retval['nonlastnames'] = new[:-1]
for tag in ['lastnames', 'nonlastnames']:
retval[tag] = [x.strip() for x in retval[tag]]
retval[tag] = [re.split(self.split_on_re, x) for x in retval[tag]]
# flatten sublists
retval[tag] = [item for sublist in retval[tag] for item in sublist]
retval[tag] = [x for x in retval[tag] if x != '']
else:
# Handle lastname-first multiple-names case
retval['titles'] = l[2:] # no titles? no problem
retval['nonlastnames'] = l[1]
retval['lastnames'] = l[0]
for tag in ['lastnames', 'nonlastnames']:
retval[tag] = retval[tag].strip()
retval[tag] = re.split(self.split_on_re, retval[tag])
# filter empty strings
retval[tag] = [x for x in retval[tag] if x != '']
retval['titles'] = [x.strip() for x in retval['titles'] if x != '']
return retval
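# Illustrative sketch of the no-comma branch (not part of the original
# code):
#
#   BibIndexFuzzyNameTokenizer().scan('Michael Peskin')
#   # -> {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'],
#   #     'lastnames': ['Peskin'], 'nonlastnames': ['Michael'],
#   #     'titles': [], 'raw': 'Michael Peskin'}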
def parse_scanned(self, scanned):
"""Return all the indexable variations for a tagged token dictionary.
Does this via the combinatoric expansion of the following rules:
- Expands first names as name, first initial with period, first initial
without period.
- Expands compound last names as each of their non-stopword subparts.
- Titles are treated literally, but applied serially.
Please note that titles will be applied to complete last names only.
So for example, if there is a compound last name of the form,
"Ibanez y Gracia", with the title, "(ed.)", then only the combination
of those two strings will do, not "Ibanez" and not "Gracia".
@param scanned: lexically tagged input items in the form of the output
from scan()
@type scanned: dict
@return: combinatorically expanded list of strings for indexing
@rtype: list of string
"""
def _fully_expanded_last_name(first, lastlist, title = None):
"""Return a list of all of the first / last / title combinations.
@param first: one possible non-last name
@type first: string
@param lastlist: the strings of the tokens in the (possibly compound) last name
@type lastlist: list of string
@param title: one possible title
@type title: string
"""
retval = []
title_word = ''
if title is not None:
title_word = ', ' + title
last = ' '.join(lastlist)
retval.append(first + ' ' + last + title_word)
retval.append(last + ', ' + first + title_word)
for last in lastlist:
if last in self.lastname_stopwords:
continue
retval.append(first + ' ' + last + title_word)
retval.append(last + ', ' + first + title_word)
return retval
last_parts = scanned['lastnames']
first_parts = scanned['nonlastnames']
titles = scanned['titles']
raw = scanned['raw']
if len(first_parts) == 0: # rare single-name case
return scanned['lastnames']
expanded = []
for exp in self.__expand_nonlastnames(first_parts):
expanded.extend(_fully_expanded_last_name(exp, last_parts, None))
for title in titles:
# Drop titles which are parenthesized. This eliminates (ed.) from the index, but
# leaves XI, for example. This gets rid of the surprising behavior that searching
# for 'author:ed' retrieves people who have been editors, but whose names aren't
# Ed.
# TODO: Make editorship and other special statuses a MARC field.
if title.find('(') != -1:
continue
# XXX: remember to document that titles can only be applied to complete last names
expanded.extend(_fully_expanded_last_name(exp, [' '.join(last_parts)], title))
return sorted(list(set(expanded)))
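# Illustrative sketch (not part of the original code): for the scan() output
# of 'Ibanez y Gracia, L, (ed.)' the stopword 'y' survives only inside the
# full compound surname and the parenthesized title is dropped, giving
# roughly:
#
#   ['Gracia, L', 'Ibanez y Gracia, L', 'Ibanez, L',
#    'L Gracia', 'L Ibanez', 'L Ibanez y Gracia']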
def __expand_nonlastnames(self, namelist):
"""Generate every expansion of a series of human non-last names.
Example:
"Michael Edward" -> "Michael Edward", "Michael E.", "Michael E", "M. Edward", "M Edward",
"M. E.", "M. E", "M E.", "M E", "M.E."
...but never:
"ME"
@param namelist: a collection of names
@type namelist: list of string
@return: a greatly expanded collection of names
@rtype: list of string
"""
def _expand_name(name):
"""Lists [name, initial, empty]"""
if name == None:
return []
return [name, name[0]]
def _pair_items(head, tail):
"""Lists every combination of head with each and all of tail"""
if len(tail) == 0:
return [head]
l = []
l.extend([head + ' ' + tail[0]])
#l.extend([head + '-' + tail[0]])
l.extend(_pair_items(head, tail[1:]))
return l
def _collect(head, tail):
"""Brings together combinations of things"""
def _cons(a, l):
l2 = l[:]
l2.insert(0, a)
return l2
if len(tail) == 0:
return [head]
l = []
l.extend(_pair_items(head, _expand_name(tail[0])))
l.extend([' '.join(_cons(head, tail)).strip()])
#l.extend(['-'.join(_cons(head, tail)).strip()])
l.extend(_collect(head, tail[1:]))
return l
def _expand_contract(namelist):
"""Runs collect with every head in namelist and its tail"""
val = []
for i in range(len(namelist)):
name = namelist[i]
for expansion in _expand_name(name):
val.extend(_collect(expansion, namelist[i+1:]))
return val
def _add_squashed(namelist):
"""Finds cases like 'M. E.' and adds 'M.E.'"""
val = namelist
def __check_parts(parts):
if len(parts) < 2:
return False
for part in parts:
if not self.single_initial_re.match(part):
return False
return True
for name in namelist:
parts = name.split(' ')
if not __check_parts(parts):
continue
val.extend([''.join(parts)])
return val
return _add_squashed(_expand_contract(namelist))
def tokenize(self, s):
"""Main entry point. Output the list of strings expanding s.
Does this via the combinatoric expansion of the following rules:
- Expands first names as name, first initial with period, first initial
without period.
- Expands compound last names as each of their non-stopword subparts.
- Titles are treated literally, but applied serially.
Please note that titles will be applied to complete last names only.
So for example, if there is a compound last name of the form,
"Ibanez y Gracia", with the title, "(ed.)", then only the combination
of those two strings will do, not "Ibanez" and not "Gracia".
@param s: the input to be lexically tagged
@type s: string
@return: combinatorically expanded list of strings for indexing
@rtype: list of string
@note: A simple wrapper around scan and parse_scanned.
"""
return self.parse_scanned(self.scan(s))
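# Illustrative sketch (not part of the original code): for a simple
# two-part name the expansion reduces to full-name and initial variants:
#
#   BibIndexFuzzyNameTokenizer().tokenize('Michael Peskin')
#   # -> roughly ['M Peskin', 'Michael Peskin', 'Peskin, M', 'Peskin, Michael']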
if __name__ == "__main__":
"""Trivial manual test framework"""
import sys
args = sys.argv[1:]
test_str = ''
if len(args) == 0:
test_str = "Michael Peskin"
elif len(args) == 1:
test_str = args[0]
else:
test_str = ' '.join(args)
tokenizer = BibIndexFuzzyNameTokenizer()
print "Tokenizes as:", tokenizer.tokenize(test_str)
