bibindex_engine_tokenizer.py
# -*- coding:utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""bibindex_engine_tokenizer: a set of classes implementing index tokenization
The idea is that Tokenizer classes provide a method, tokenize(), which turns
input strings into lists of strings. The output strings are calculated based
on the input string as tokens suitable for word or phrase indexing.
"""

import re

re_pattern_fuzzy_author_dots = re.compile(r'[\.\-]+')
re_pattern_fuzzy_author_spaces = re.compile(r'\s+')
re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.\-]')

def wash_author_name(p):
    """
    Wash author name suitable for author searching. Notably, replace
    dots and hyphens with spaces, and collapse spaces.
    """
    out = re_pattern_fuzzy_author_dots.sub(" ", p)
    return re_pattern_fuzzy_author_spaces.sub(" ", out)

def author_name_requires_phrase_search(p):
    """
    Detect whether author query pattern p requires phrase search.
    Notably, look for presence of spaces and commas.
    """
    if re_pattern_fuzzy_author_trigger.search(p):
        return True
    return False
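
# Illustrative examples (not part of the original module; the inputs are made
# up).  Assuming the regexes defined at the top of this file, the two helpers
# above behave roughly as follows:
#
#   wash_author_name('Le Blanc-Smith,   M')        -> 'Le Blanc Smith, M'
#   author_name_requires_phrase_search('Ellis')    -> False
#   author_name_requires_phrase_search('Ellis, J') -> True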

class BibIndexTokenizer(object):
    """Base class for the tokenizers

    Tokenizers act as filters which turn input strings into lists of strings
    which represent the indexable components of that string.
    """

    def scan_string(self, s):
        """Return an intermediate representation of the tokens in s.

        Every tokenizer should have a scan_string function, which scans the
        input string and lexically tags its components. These units are
        grouped together sequentially. The output of scan_string is usually
        something like:
        {
          'TOKEN_TAG_LIST' : a list of valid keys in this output set,
          'key1' : [val1, val2, val3] - where the key describes the values
            in some meaningful way
        }

        @param s: the input to be lexically tagged
        @type s: string
        @return: dict of lexically tagged input items

        In a sample Tokenizer where scan_string simply splits s on
        space, scan_string might output the following for
        "Assam and Darjeeling":
        {
          'TOKEN_TAG_LIST' : 'word_list',
          'word_list' : ['Assam', 'and', 'Darjeeling']
        }

        @rtype: dict
        """
        raise NotImplementedError

    def parse_scanned(self, o):
        """Calculate the token list from the intermediate representation o.

        While this should be an interesting computation over the intermediate
        representation generated by scan_string, obviously in the split-on-
        space example we need only return o['word_list'].

        @param o: a dictionary with a 'word_list' key
        @type o: dict
        @return: the token items from 'word_list'
        @rtype: list of string
        """
        raise NotImplementedError

    def tokenize(self, s):
        """Main entry point. Return token list from input string s.

        Simply composes the functionality above.

        @param s: the input to be lexically tagged
        @type s: string
        @return: the token items derived from s
        @rtype: list of string
        """
        raise NotImplementedError

class BibIndexExactNameTokenizer(BibIndexTokenizer):
    """
    Human name exact tokenizer.
    """

    def tokenize(self, s):
        """
        Main API.
        """
        return [wash_author_name(s)]
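
# Illustrative example (not from the original module; the input is made up):
# the exact tokenizer produces a single washed phrase token, e.g.
#
#   BibIndexExactNameTokenizer().tokenize('Ellis,   J')  ->  ['Ellis, J']
#
# so the whole name is indexed as one phrase rather than as separate words.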

class BibIndexFuzzyNameTokenizer(BibIndexTokenizer):
    """Human name tokenizer.

    Human names are divided into three classes of tokens:
    'lastnames', i.e., family, tribal or group identifiers,
    'nonlastnames', i.e., personal names distinguishing individuals,
    'titles', both incidental and permanent, e.g., 'VIII', '(ed.)', 'Msc'
    """

    def __init__(self):
        self.single_initial_re = re.compile(r'^\w\.$')
        self.split_on_re = re.compile(r'[\.\s-]')
        # lastname_stopwords describes terms which should not be used for
        # indexing, in multiple-word last names.  These are purely
        # conjunctions, serving the same function as the American hyphen,
        # but using linguistic constructs.
        self.lastname_stopwords = set(['y', 'of', 'and', 'de'])

    def scan(self, s):
        """Scan a name string and output an object representing its structure.

        @param s: the input to be lexically tagged
        @type s: string
        @return: dict of lexically tagged input items.

        Sample output for the name 'Jingleheimer Schmitt, John Jacob, XVI.' is:
        {
          'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles'],
          'lastnames' : ['Jingleheimer', 'Schmitt'],
          'nonlastnames' : ['John', 'Jacob'],
          'titles' : ['XVI.']
        }

        @rtype: dict
        """
        retval = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
                  'lastnames': [],
                  'nonlastnames': [],
                  'titles': []}

        l = s.split(',')
        if len(l) < 2:
            # No commas means a simple name
            new = s.strip()
            new = new.split(' ')
            if len(new) == 1:
                retval['lastnames'] = new        # rare single-name case
            else:
                retval['lastnames'] = new[-1:]
                retval['nonlastnames'] = new[:-1]
                for tag in ['lastnames', 'nonlastnames']:
                    retval[tag] = [x.strip() for x in retval[tag]]
                    retval[tag] = [re.split(self.split_on_re, x)
                                   for x in retval[tag]]
                    # flatten sublists
                    retval[tag] = [item for sublist in retval[tag]
                                   for item in sublist]
                    retval[tag] = [x for x in retval[tag] if x != '']
        else:
            # Handle lastname-first multiple-names case
            retval['titles'] = l[2:]             # no titles? no problem
            retval['nonlastnames'] = l[1]
            retval['lastnames'] = l[0]
            for tag in ['lastnames', 'nonlastnames']:
                retval[tag] = retval[tag].strip()
                retval[tag] = re.split(self.split_on_re, retval[tag])
                # filter empty strings
                retval[tag] = [x for x in retval[tag] if x != '']
            retval['titles'] = [x.strip() for x in retval['titles'] if x != '']

        return retval
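
    # Illustrative example (not from the original module; the name is made up):
    # for an input without commas, scan() treats the final word as the last
    # name and everything before it as non-last names, e.g.
    #
    #   BibIndexFuzzyNameTokenizer().scan('John Smith')
    #   ->  {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
    #        'lastnames': ['Smith'], 'nonlastnames': ['John'], 'titles': []}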

    def parse_scanned(self, scanned):
        """Return all the indexable variations for a tagged token dictionary.

        Does this via the combinatoric expansion of the following rules:
        - Expands first names as name, first initial with period, first
          initial without period.
        - Expands compound last names as each of their non-stopword subparts.
        - Titles are treated literally, but applied serially.

        Please note that titles will be applied to complete last names only.
        So for example, if there is a compound last name of the form,
        "Ibanez y Gracia", with the title, "(ed.)", then only the combination
        of those two strings will do, not "Ibanez" and not "Gracia".

        @param scanned: lexically tagged input items in the form of the output
            from scan()
        @type scanned: dict
        @return: combinatorically expanded list of strings for indexing
        @rtype: list of string
        """

        def _fully_expanded_last_name(first, lastlist, title=None):
            """Return a list of all of the first / last / title combinations.

            @param first: one possible non-last name
            @type first: string
            @param lastlist: the strings of the tokens in the (possibly
                compound) last name
            @type lastlist: list of string
            @param title: one possible title
            @type title: string
            """
            retval = []
            title_word = ''
            if title is not None:
                title_word = ', ' + title

            last = ' '.join(lastlist)
            retval.append(first + ' ' + last + title_word)
            retval.append(last + ', ' + first + title_word)
            for last in lastlist:
                if last in self.lastname_stopwords:
                    continue
                retval.append(first + ' ' + last + title_word)
                retval.append(last + ', ' + first + title_word)
            return retval

        last_parts = scanned['lastnames']
        first_parts = scanned['nonlastnames']
        titles = scanned['titles']

        if len(first_parts) == 0:                # rare single-name case
            return scanned['lastnames']

        expanded = []
        for exp in self.__expand_nonlastnames(first_parts):
            expanded.extend(_fully_expanded_last_name(exp, last_parts, None))
            for title in titles:
                # Drop titles which are parenthesized.  This eliminates (ed.)
                # from the index, but leaves XI, for example.  This gets rid
                # of the surprising behavior that searching for 'author:ed'
                # retrieves people who have been editors, but whose names
                # aren't Ed.
                # TODO: Make editorship and other special statuses a MARC field.
                if title.find('(') != -1:
                    continue
                # XXX: remember to document that titles can only be applied
                # to complete last names
                expanded.extend(_fully_expanded_last_name(exp,
                                                          [' '.join(last_parts)],
                                                          title))

        return sorted(list(set(expanded)))
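
    # Illustrative example (not from the original module; the name is made up):
    # combining scan() and parse_scanned() on 'Schmitt, John' expands the name
    # according to the rules above, yielding roughly
    #
    #   ['J Schmitt', 'John Schmitt', 'Schmitt, J', 'Schmitt, John']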

    def __expand_nonlastnames(self, namelist):
        """Generate every expansion of a series of human non-last names.

        Example:
        "Michael Edward" -> "Michael Edward", "Michael E.", "Michael E",
            "M. Edward", "M Edward", "M. E.", "M. E", "M E.", "M E", "M.E."
        ...but never:
        "ME"

        @param namelist: a collection of names
        @type namelist: list of string
        @return: a greatly expanded collection of names
        @rtype: list of string
        """

        def _expand_name(name):
            """Lists [name, initial, empty]"""
            if name is None:
                return []
            return [name, name[0]]

        def _pair_items(head, tail):
            """Lists every combination of head with each and all of tail"""
            if len(tail) == 0:
                return [head]
            l = []
            l.extend([head + ' ' + tail[0]])
            #l.extend([head + '-' + tail[0]])
            l.extend(_pair_items(head, tail[1:]))
            return l

        def _collect(head, tail):
            """Brings together combinations of things"""

            def _cons(a, l):
                l2 = l[:]
                l2.insert(0, a)
                return l2

            if len(tail) == 0:
                return [head]
            l = []
            l.extend(_pair_items(head, _expand_name(tail[0])))
            l.extend([' '.join(_cons(head, tail)).strip()])
            #l.extend(['-'.join(_cons(head, tail)).strip()])
            l.extend(_collect(head, tail[1:]))
            return l

        def _expand_contract(namelist):
            """Runs collect with every head in namelist and its tail"""
            val = []
            for i in range(len(namelist)):
                name = namelist[i]
                for expansion in _expand_name(name):
                    val.extend(_collect(expansion, namelist[i + 1:]))
            return val

        def _add_squashed(namelist):
            """Finds cases like 'M. E.' and adds 'M.E.'"""
            val = namelist

            def __check_parts(parts):
                if len(parts) < 2:
                    return False
                for part in parts:
                    if not self.single_initial_re.match(part):
                        return False
                return True

            for name in namelist:
                parts = name.split(' ')
                if not __check_parts(parts):
                    continue
                val.extend([''.join(parts)])

            return val

        return _add_squashed(_expand_contract(namelist))

    def tokenize(self, s):
        """Main entry point. Output the list of strings expanding s.

        Does this via the combinatoric expansion of the following rules:
        - Expands first names as name, first initial with period, first
          initial without period.
        - Expands compound last names as each of their non-stopword subparts.
        - Titles are treated literally, but applied serially.

        Please note that titles will be applied to complete last names only.
        So for example, if there is a compound last name of the form,
        "Ibanez y Gracia", with the title, "(ed.)", then only the combination
        of those two strings will do, not "Ibanez" and not "Gracia".

        @param s: the input to be lexically tagged
        @type s: string
        @return: combinatorically expanded list of strings for indexing
        @rtype: list of string
        @note: A simple wrapper around scan and parse_scanned.
        """
        return self.parse_scanned(self.scan(s))

if __name__ == "__main__":
    """Trivial manual test framework"""
    import sys
    args = sys.argv[1:]
    test_str = ''
    if len(args) == 0:
        test_str = "Michael Peskin"
    elif len(args) == 1:
        test_str = args[0]
    else:
        test_str = ' '.join(args)

    tokenizer = BibIndexFuzzyNameTokenizer()
    print "Tokenizes as:", tokenizer.tokenize(test_str)
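
# Illustrative run (not from the original module): invoking the script with no
# arguments exercises the default test string "Michael Peskin" and is expected
# to print something like (the ordering comes from sorted(set(...)) above):
#
#   $ python bibindex_engine_tokenizer.py
#   Tokenizes as: ['M Peskin', 'Michael Peskin', 'Peskin, M', 'Peskin, Michael']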