bibindex_engine_tokenizer.py
# -*- coding:utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""bibindex_engine_tokenizer: a set of classes implementing index tokenization
The idea is that Tokenizer classes provide a method, tokenize(), which turns
input strings into lists of strings. The output strings are calculated based
on the input string as tokens suitable for word or phrase indexing.
"""
import re

from invenio.config import \
     CFG_BIBINDEX_REMOVE_HTML_MARKUP, \
     CFG_BIBINDEX_REMOVE_LATEX_MARKUP, \
     CFG_BIBINDEX_CHARS_PUNCTUATION, \
     CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS
from invenio.htmlutils import remove_html_markup
from invenio.textutils import wash_for_utf8, strip_accents
from invenio.bibindex_engine_washer import \
     lower_index_term, remove_latex_markup, \
     apply_stemming_and_stopwords_and_length_check, \
     wash_author_name

latex_formula_re = re.compile(r'\$.*?\$|\\\[.*?\\\]')
phrase_delimiter_re = re.compile(r'[\.:;\?\!]')
space_cleaner_re = re.compile(r'\s+')
re_block_punctuation_begin = re.compile(r"^" + CFG_BIBINDEX_CHARS_PUNCTUATION + "+")
re_block_punctuation_end = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION + "+$")
re_punctuation = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION)
re_separators = re.compile(CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS)
re_arxiv = re.compile(r'^arxiv:\d\d\d\d\.\d\d\d\d')

re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.]')
# FIXME: re_pattern_fuzzy_author_trigger could be removed and a
# BibAuthorID API function could be called instead after we
# double-check that there are no circular imports.
re_pattern_author_canonical_id = re.compile(r'\.[0-9]+$')


def author_name_requires_phrase_search(p):
    """
    Detect whether author query pattern p requires phrase search.
    Notably, look for presence of spaces and commas.
    """
    if re_pattern_fuzzy_author_trigger.search(p):
        return True
    return False
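
# Illustrative examples (not exercised by the module itself): the trigger
# pattern above matches whitespace, commas and dots, so patterns that look
# like full author names fall back to phrase search.
#
#     author_name_requires_phrase_search('Ellis, J')   # -> True  (comma, space)
#     author_name_requires_phrase_search('ellis')      # -> False (single bare token)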

class BibIndexTokenizer(object):
    """Base class for the tokenizers

    Tokenizers act as filters which turn input strings into lists of strings
    which represent the indexable components of that string.
    """

    def scan_string(self, s):
        """Return an intermediate representation of the tokens in s.

        Every tokenizer should have a scan_string function, which scans the
        input string and lexically tags its components.  These units are
        grouped together sequentially.  The output of scan_string is usually
        something like:
        {
            'TOKEN_TAG_LIST' : a list of valid keys in this output set,
            'key1' : [val1, val2, val3] - where each key describes its
                values in some meaningful way
        }

        @param s: the input to be lexically tagged
        @type s: string

        @return: dict of lexically tagged input items

            In a sample Tokenizer where scan_string simply splits s on
            space, scan_string might output the following for
            "Assam and Darjeeling":
            {
                'TOKEN_TAG_LIST' : 'word_list',
                'word_list' : ['Assam', 'and', 'Darjeeling']
            }
        @rtype: dict
        """
        raise NotImplementedError

    def parse_scanned(self, o):
        """Calculate the token list from the intermediate representation o.

        While this should be an interesting computation over the intermediate
        representation generated by scan_string, obviously in the split-on-
        space example we need only return o['word_list'].

        @param o: a dictionary with a 'word_list' key
        @type o: dict

        @return: the token items from 'word_list'
        @rtype: list of string
        """
        raise NotImplementedError

    def tokenize(self, s):
        """Main entry point.  Return token list from input string s.

        Simply composes the functionality above.

        @param s: the input to be lexically tagged
        @type s: string

        @return: the token items derived from s
        @rtype: list of string
        """
        raise NotImplementedError

class BibIndexPhraseTokenizer(BibIndexTokenizer):
    """The original phrase is returned"""

    def __init__(self, stemming_language=None):
        self.stemming_language = stemming_language

    def tokenize(self, phrase):
        """Return list of phrases found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
        """
        phrase = wash_for_utf8(phrase)
        return [phrase]
        ## Note that we don't break phrases, they are used for exact style
        ## of searching.
        words = {}
        phrase = strip_accents(phrase)
        # 1st split phrase into blocks according to whitespace
        for block1 in phrase_delimiter_re.split(strip_accents(phrase)):
            block1 = block1.strip()
            if block1 and self.stemming_language:
                new_words = []
                for block2 in re_punctuation.split(block1):
                    block2 = block2.strip()
                    if block2:
                        for block3 in block2.split():
                            block3 = block3.strip()
                            if block3:
                                # Note that we don't stem phrases, they
                                # are used for exact style of searching.
                                new_words.append(block3)
                block1 = ' '.join(new_words)
            if block1:
                words[block1] = 1
        return words.keys()
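
# Illustrative example (not exercised by the module itself): because of the
# early return above, the whole washed phrase comes back as a single token
# (for plain ASCII input, wash_for_utf8 leaves the string unchanged).
#
#     BibIndexPhraseTokenizer().tokenize("Electroweak symmetry breaking.")
#     # -> ['Electroweak symmetry breaking.']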

class BibIndexWordTokenizer(BibIndexTokenizer):
    """A phrase is split into words"""

    def __init__(self, stemming_language=None):
        self.stemming_language = stemming_language

    def tokenize(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
        """
        words = {}
        formulas = []
        if CFG_BIBINDEX_REMOVE_HTML_MARKUP and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if CFG_BIBINDEX_REMOVE_LATEX_MARKUP:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = apply_stemming_and_stopwords_and_length_check(block, self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = apply_stemming_and_stopwords_and_length_check(subblock, self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = apply_stemming_and_stopwords_and_length_check(alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
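
# Illustrative example (not exercised by the module itself): the exact token
# set depends on CFG_BIBINDEX_CHARS_PUNCTUATION, the stopword list and the
# chosen stemming language, so the output sketched below is only indicative.
#
#     BibIndexWordTokenizer().tokenize("Dark-matter searches, arXiv:1007.5048")
#     # might yield tokens such as 'dark-matter', 'dark', 'matter',
#     # 'searches', 'arxiv:1007.5048' and '1007.5048'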

class BibIndexPairTokenizer(BibIndexTokenizer):
    """A phrase is split into pairs of words"""

    def __init__(self, stemming_language=None):
        self.stemming_language = stemming_language

    def tokenize(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
        """
        words = {}
        if CFG_BIBINDEX_REMOVE_HTML_MARKUP and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if CFG_BIBINDEX_REMOVE_LATEX_MARKUP:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                if self.stemming_language:
                    block = apply_stemming_and_stopwords_and_length_check(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    if self.stemming_language:
                        subblock = apply_stemming_and_stopwords_and_length_check(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(subblock):
                            if self.stemming_language:
                                alphanumeric_group = apply_stemming_and_stopwords_and_length_check(alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' % (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
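
# Illustrative example (not exercised by the module itself): consecutive
# alphanumeric groups are chained through last_word, so with no stemming
# language and a default punctuation configuration the result is roughly:
#
#     BibIndexPairTokenizer().tokenize("higgs boson decay channels")
#     # might yield ['higgs boson', 'boson decay', 'decay channels']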

class BibIndexExactNameTokenizer(BibIndexTokenizer):
    """
    Human name exact tokenizer.
    """

    def tokenize(self, s):
        """
        Main API.
        """
        return [wash_author_name(s)]

class BibIndexFuzzyNameTokenizer(BibIndexTokenizer):
    """Human name tokenizer.

    Human names are divided into three classes of tokens:
    'lastnames', i.e., family, tribal or group identifiers,
    'nonlastnames', i.e., personal names distinguishing individuals,
    'titles', both incidental and permanent, e.g., 'VIII', '(ed.)', 'Msc'
    """

    def __init__(self):
        self.single_initial_re = re.compile('^\w\.$')
        self.split_on_re = re.compile('[\.\s-]')
        # lastname_stopwords describes terms which should not be used for indexing,
        # in multiple-word last names.  These are purely conjunctions, serving the
        # same function as the American hyphen, but using linguistic constructs.
        self.lastname_stopwords = set(['y', 'of', 'and', 'de'])

    def scan(self, s):
        """Scan a name string and output an object representing its structure.

        @param s: the input to be lexically tagged
        @type s: string

        @return: dict of lexically tagged input items.

            Sample output for the name 'Jingleheimer Schmitt, John Jacob, XVI.' is:
            {
                'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'],
                'lastnames' : ['Jingleheimer', 'Schmitt'],
                'nonlastnames' : ['John', 'Jacob'],
                'titles' : ['XVI.'],
                'raw' : 'Jingleheimer Schmitt, John Jacob, XVI.'
            }
        @rtype: dict
        """
        retval = {'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'],
                  'lastnames' : [],
                  'nonlastnames' : [],
                  'titles' : [],
                  'raw' : s}
        l = s.split(',')
        if len(l) < 2:
            # No commas means a simple name
            new = s.strip()
            new = new.split(' ')
            if len(new) == 1:
                retval['lastnames'] = new        # rare single-name case
            else:
                retval['lastnames'] = new[-1:]
                retval['nonlastnames'] = new[:-1]
                for tag in ['lastnames', 'nonlastnames']:
                    retval[tag] = [x.strip() for x in retval[tag]]
                    retval[tag] = [re.split(self.split_on_re, x) for x in retval[tag]]
                    # flatten sublists
                    retval[tag] = [item for sublist in retval[tag] for item in sublist]
                    retval[tag] = [x for x in retval[tag] if x != '']
        else:
            # Handle lastname-first multiple-names case
            retval['titles'] = l[2:]             # no titles? no problem
            retval['nonlastnames'] = l[1]
            retval['lastnames'] = l[0]
            for tag in ['lastnames', 'nonlastnames']:
                retval[tag] = retval[tag].strip()
                retval[tag] = re.split(self.split_on_re, retval[tag])
                # filter empty strings
                retval[tag] = [x for x in retval[tag] if x != '']
            retval['titles'] = [x.strip() for x in retval['titles'] if x != '']

        return retval

    def parse_scanned(self, scanned):
        """Return all the indexable variations for a tagged token dictionary.

        Does this via the combinatoric expansion of the following rules:
        - Expands first names as name, first initial with period, first initial
            without period.
        - Expands compound last names as each of their non-stopword subparts.
        - Titles are treated literally, but applied serially.

        Please note that titles will be applied to complete last names only.
        So for example, if there is a compound last name of the form,
        "Ibanez y Gracia", with the title, "(ed.)", then only the combination
        of those two strings will do, not "Ibanez" and not "Gracia".

        @param scanned: lexically tagged input items in the form of the output
            from scan()
        @type scanned: dict

        @return: combinatorically expanded list of strings for indexing
        @rtype: list of string
        """

        def _fully_expanded_last_name(first, lastlist, title=None):
            """Return a list of all of the first / last / title combinations.

            @param first: one possible non-last name
            @type first: string

            @param lastlist: the strings of the tokens in the (possibly compound) last name
            @type lastlist: list of string

            @param title: one possible title
            @type title: string
            """
            retval = []
            title_word = ''
            if title != None:
                title_word = ', ' + title

            last = ' '.join(lastlist)
            retval.append(first + ' ' + last + title_word)
            retval.append(last + ', ' + first + title_word)
            for last in lastlist:
                if last in self.lastname_stopwords:
                    continue
                retval.append(first + ' ' + last + title_word)
                retval.append(last + ', ' + first + title_word)

            return retval

        last_parts = scanned['lastnames']
        first_parts = scanned['nonlastnames']
        titles = scanned['titles']
        raw = scanned['raw']

        if len(first_parts) == 0:                       # rare single-name case
            return scanned['lastnames']

        expanded = []
        for exp in self.__expand_nonlastnames(first_parts):
            expanded.extend(_fully_expanded_last_name(exp, last_parts, None))
            for title in titles:
                # Drop titles which are parenthesized.  This eliminates (ed.) from the index, but
                # leaves XI, for example.  This gets rid of the surprising behavior that searching
                # for 'author:ed' retrieves people who have been editors, but whose names aren't
                # Ed.
                # TODO: Make editorship and other special statuses a MARC field.
                if title.find('(') != -1:
                    continue
                # XXX: remember to document that titles can only be applied to complete last names
                expanded.extend(_fully_expanded_last_name(exp, [' '.join(last_parts)], title))

        return sorted(list(set(expanded)))

    def __expand_nonlastnames(self, namelist):
        """Generate every expansion of a series of human non-last names.

        Example:
        "Michael Edward" -> "Michael Edward", "Michael E.", "Michael E", "M. Edward", "M Edward",
        "M. E.", "M. E", "M E.", "M E", "M.E."
        ...but never:
        "ME"

        @param namelist: a collection of names
        @type namelist: list of string

        @return: a greatly expanded collection of names
        @rtype: list of string
        """

        def _expand_name(name):
            """Lists [name, initial, empty]"""
            if name == None:
                return []
            return [name, name[0]]

        def _pair_items(head, tail):
            """Lists every combination of head with each and all of tail"""
            if len(tail) == 0:
                return [head]
            l = []
            l.extend([head + ' ' + tail[0]])
            #l.extend([head + '-' + tail[0]])
            l.extend(_pair_items(head, tail[1:]))
            return l

        def _collect(head, tail):
            """Brings together combinations of things"""

            def _cons(a, l):
                l2 = l[:]
                l2.insert(0, a)
                return l2

            if len(tail) == 0:
                return [head]
            l = []
            l.extend(_pair_items(head, _expand_name(tail[0])))
            l.extend([' '.join(_cons(head, tail)).strip()])
            #l.extend(['-'.join(_cons(head, tail)).strip()])
            l.extend(_collect(head, tail[1:]))
            return l

        def _expand_contract(namelist):
            """Runs collect with every head in namelist and its tail"""
            val = []
            for i in range(len(namelist)):
                name = namelist[i]
                for expansion in _expand_name(name):
                    val.extend(_collect(expansion, namelist[i+1:]))
            return val

        def _add_squashed(namelist):
            """Finds cases like 'M. E.' and adds 'M.E.'"""
            val = namelist

            def __check_parts(parts):
                if len(parts) < 2:
                    return False
                for part in parts:
                    if not self.single_initial_re.match(part):
                        return False
                return True

            for name in namelist:
                parts = name.split(' ')
                if not __check_parts(parts):
                    continue
                val.extend([''.join(parts)])

            return val

        return _add_squashed(_expand_contract(namelist))

    def tokenize(self, s):
        """Main entry point.  Output the list of strings expanding s.

        Does this via the combinatoric expansion of the following rules:
        - Expands first names as name, first initial with period, first initial
            without period.
        - Expands compound last names as each of their non-stopword subparts.
        - Titles are treated literally, but applied serially.

        Please note that titles will be applied to complete last names only.
        So for example, if there is a compound last name of the form,
        "Ibanez y Gracia", with the title, "(ed.)", then only the combination
        of those two strings will do, not "Ibanez" and not "Gracia".

        @param s: the input to be lexically tagged
        @type s: string

        @return: combinatorically expanded list of strings for indexing
        @rtype: list of string

        @note: A simple wrapper around scan and parse_scanned.
        """
        return self.parse_scanned(self.scan(s))

if __name__ == "__main__":
    """Trivial manual test framework"""
    import sys
    args = sys.argv[1:]

    test_str = ''
    if len(args) == 0:
        test_str = "Michael Peskin"
    elif len(args) == 1:
        test_str = args[0]
    else:
        test_str = ' '.join(args)

    tokenizer = BibIndexFuzzyNameTokenizer()
    print "Tokenizes as:", tokenizer.tokenize(test_str)
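
# A similarly trivial manual check for the other tokenizers could look like
# the sketch below (illustrative only, not part of the module; it assumes a
# configured Invenio installation, since the CFG_BIBINDEX_* settings are read
# at import time):
#
#     for cls in (BibIndexPhraseTokenizer, BibIndexWordTokenizer,
#                 BibIndexPairTokenizer):
#         print cls.__name__, cls().tokenize(test_str)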