Page MenuHomec4science

bibindex_engine_tokenizer_tests.py
No OneTemporary

File Metadata

Created
Thu, May 30, 23:25

bibindex_engine_tokenizer_tests.py

# -*- coding:utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""bibindex_engine_tokenizer_tests - unit tests for bibindex_engine_tokenizer
There should always be at least one test class for each class in b_e_t.
"""
import unittest
from invenio.testutils import make_test_suite, run_test_suite
import bibindex_engine_tokenizer as tokenizer_lib
class TestFuzzyNameTokenizerScanning(unittest.TestCase):
"""Test BibIndex name tokenization"""
def setUp(self):
self.tokenizer = tokenizer_lib.BibIndexFuzzyNameTokenizer()
self.scan = self.tokenizer.scan
def test_bifnt_scan_single(self):
"""BibIndexFuzzyNameTokenizer - scanning single names like 'Dido'"""
teststr = "Dido"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'], 'lastnames': ['Dido'], 'nonlastnames': [], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_simple_western_forward(self):
"""BibIndexFuzzyNameTokenizer - scanning simple Western-style: first last"""
teststr = "Ringo Starr"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_simple_western_reverse(self):
"""BibIndexFuzzyNameTokenizer - scanning simple Western-style: last, first"""
teststr = "Starr, Ringo"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_multiname_forward(self):
"""BibIndexFuzzyNameTokenizer - scanning multiword: first middle last"""
teststr = "Michael Edward Peskin"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Peskin'], 'nonlastnames': ['Michael', 'Edward'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_multiname_dotcrammed(self):
"""BibIndexFuzzyNameTokenizer - scanning multiword: f.m. last"""
teststr = "M.E. Peskin"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Peskin'], 'nonlastnames': ['M', 'E'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_multiname_dotcrammed_reversed(self):
"""BibIndexFuzzyNameTokenizer - scanning multiword: last, f.m."""
teststr = "Peskin, M.E."
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Peskin'], 'nonlastnames': ['M', 'E'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_multiname_dashcrammed(self):
"""BibIndexFuzzyNameTokenizer - scanning multiword: first-middle last"""
teststr = "Jean-Luc Picard"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Picard'], 'nonlastnames': ['Jean', 'Luc'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_multiname_dashcrammed_reversed(self):
"""BibIndexFuzzyNameTokenizer - scanning multiword: last, first-middle"""
teststr = "Picard, Jean-Luc"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Picard'], 'nonlastnames': ['Jean', 'Luc'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_compound_lastname_dashes(self):
"""BibIndexFuzzyNameTokenizer - scanning multiword: first middle last-last"""
teststr = "Cantina Octavia Jones-Smith"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Jones', 'Smith'], 'nonlastnames': ['Cantina', 'Octavia'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_compound_lastname_dashes_reverse(self):
"""BibIndexFuzzyNameTokenizer - scanning multiword: last-last, first middle"""
teststr = "Jones-Smith, Cantina Octavia"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Jones', 'Smith'], 'nonlastnames': ['Cantina', 'Octavia'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_compound_lastname_reverse(self):
"""BibIndexFuzzyNameTokenizer - scanning compound last: last last, first"""
teststr = "Alvarez Gaume, Joachim"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Alvarez', 'Gaume'], 'nonlastnames': ['Joachim'], 'titles': []}
self.assertEqual(output, anticipated)
def test_bifnt_scan_titled(self):
"""BibIndexFuzzyNameTokenizer - scanning title-bearing: last, first, title"""
teststr = "Epstein, Brian, The Fifth Beatle"
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Epstein'], 'nonlastnames': ['Brian'], 'titles': ['The Fifth Beatle']}
self.assertEqual(output, anticipated)
def test_bifnt_scan_wildly_interesting(self):
"""BibIndexFuzzyNameTokenizer - scanning last last last, first first, title, title"""
teststr = "Ibanez y Gracia, Maria Luisa, II., ed."
output = self.scan(teststr)
anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Ibanez', 'y', 'Gracia'], 'nonlastnames': ['Maria', 'Luisa'], 'titles': ['II.', 'ed.']}
self.assertEqual(output, anticipated)
class TestFuzzyNameTokenizerTokens(unittest.TestCase):
"""Test BibIndex name variant token generation from scanned and tagged sets"""
def setUp(self):
self.tokenizer = tokenizer_lib.BibIndexFuzzyNameTokenizer()
self.get_index_tokens = self.tokenizer.parse_scanned
def test_bifnt_tokenize_single(self):
"""BibIndexFuzzyNameTokenizer - tokens for single-word name
Ronaldo
"""
tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Ronaldo'], 'nonlastnames': [], 'titles': []}
output = self.get_index_tokens(tagged_data)
anticipated = ['Ronaldo']
self.assertEqual(output, anticipated)
def test_bifnt_tokenize_simple_forward(self):
"""BibIndexFuzzyNameTokenizer - tokens for first last
Ringo Starr
"""
tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': []}
output = self.get_index_tokens(tagged_data)
anticipated = ['R Starr', 'Ringo Starr', 'Starr', 'Starr, R', 'Starr, Ringo']
self.assertEqual(output, anticipated)
def test_bifnt_tokenize_simple_reverse(self):
"""BibIndexFuzzyNameTokenizer - tokens for last, first
Starr, Ringo
"""
tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': []}
output = self.get_index_tokens(tagged_data)
anticipated = ['R Starr', 'Ringo Starr', 'Starr', 'Starr, R', 'Starr, Ringo']
self.assertEqual(output, anticipated)
def test_bifnt_tokenize_twoname_forward(self):
"""BibIndexFuzzyNameTokenizer - tokens for first middle last
Michael Edward Peskin
"""
tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Peskin'], 'nonlastnames': ['Michael', 'Edward'], 'titles': []}
output = self.get_index_tokens(tagged_data)
anticipated = ['E Peskin', 'Edward Peskin', 'M E Peskin', 'M Edward Peskin', 'M Peskin',
'Michael E Peskin', 'Michael Edward Peskin', 'Michael Peskin',
'Peskin', 'Peskin, E', 'Peskin, Edward', 'Peskin, M',
'Peskin, M E', 'Peskin, M Edward', 'Peskin, Michael',
'Peskin, Michael E', 'Peskin, Michael Edward']
self.assertEqual(output, anticipated)
def test_bifnt_tokenize_compound_last(self):
"""BibIndexFuzzyNameTokenizer - tokens for last last, first
Alvarez Gaume, Joachim
"""
tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Alvarez', 'Gaume'], 'nonlastnames': ['Joachim'], 'titles': []}
output = self.get_index_tokens(tagged_data)
anticipated = ['Alvarez', 'Alvarez Gaume', 'Alvarez Gaume, J', 'Alvarez Gaume, Joachim',
'Alvarez, J', 'Alvarez, Joachim', 'Gaume', 'Gaume, J',
'Gaume, Joachim', 'J Alvarez', 'J Alvarez Gaume', 'J Gaume', 'Joachim Alvarez',
'Joachim Alvarez Gaume', 'Joachim Gaume']
self.assertEqual(output, anticipated)
def test_bifnt_tokenize_titled(self):
"""BibIndexFuzzyNameTokenizer - tokens for last, first, title
Epstein, Brian, The Fifth Beatle
"""
tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Epstein'], 'nonlastnames': ['Brian'], 'titles': ['The Fifth Beatle']}
output = self.get_index_tokens(tagged_data)
anticipated = ['B Epstein', 'B Epstein, The Fifth Beatle', 'Brian Epstein',
'Brian Epstein, The Fifth Beatle', 'Epstein', 'Epstein, B',
'Epstein, B, The Fifth Beatle', 'Epstein, Brian',
'Epstein, Brian, The Fifth Beatle']
self.assertEqual(output, anticipated)
def test_bifnt_tokenize_wildly_interesting(self):
"""BibIndexFuzzyNameTokenizer - tokens for last last last, first first, title, title
Ibanez y Gracia, Maria Luisa, II, (ed.)
"""
tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Ibanez', 'y', 'Gracia'], 'nonlastnames': ['Maria', 'Luisa'], 'titles': ['II', '(ed.)']}
output = self.get_index_tokens(tagged_data)
anticipated = ['Gracia', 'Gracia, L', 'Gracia, Luisa', 'Gracia, M', 'Gracia, M L', 'Gracia, M Luisa',
'Gracia, Maria', 'Gracia, Maria L', 'Gracia, Maria Luisa', 'Ibanez', 'Ibanez y Gracia',
'Ibanez y Gracia, L', 'Ibanez y Gracia, L, II',
'Ibanez y Gracia, Luisa', 'Ibanez y Gracia, Luisa, II',
'Ibanez y Gracia, M', 'Ibanez y Gracia, M L', 'Ibanez y Gracia, M L, II',
'Ibanez y Gracia, M Luisa', 'Ibanez y Gracia, M Luisa, II',
'Ibanez y Gracia, M, II',
'Ibanez y Gracia, Maria',
'Ibanez y Gracia, Maria L', 'Ibanez y Gracia, Maria L, II',
'Ibanez y Gracia, Maria Luisa', 'Ibanez y Gracia, Maria Luisa, II',
'Ibanez y Gracia, Maria, II',
'Ibanez, L', 'Ibanez, Luisa',
'Ibanez, M', 'Ibanez, M L', 'Ibanez, M Luisa', 'Ibanez, Maria',
'Ibanez, Maria L', 'Ibanez, Maria Luisa', 'L Gracia', 'L Ibanez',
'L Ibanez y Gracia', 'L Ibanez y Gracia, II', 'Luisa Gracia', 'Luisa Ibanez',
'Luisa Ibanez y Gracia', 'Luisa Ibanez y Gracia, II', 'M Gracia',
'M Ibanez', 'M Ibanez y Gracia', 'M Ibanez y Gracia, II', 'M L Gracia',
'M L Ibanez', 'M L Ibanez y Gracia', 'M L Ibanez y Gracia, II',
'M Luisa Gracia', 'M Luisa Ibanez', 'M Luisa Ibanez y Gracia', 'M Luisa Ibanez y Gracia, II',
'Maria Gracia',
'Maria Ibanez', 'Maria Ibanez y Gracia', 'Maria Ibanez y Gracia, II',
'Maria L Gracia', 'Maria L Ibanez', 'Maria L Ibanez y Gracia', 'Maria L Ibanez y Gracia, II',
'Maria Luisa Gracia', 'Maria Luisa Ibanez', 'Maria Luisa Ibanez y Gracia',
'Maria Luisa Ibanez y Gracia, II']
self.assertEqual(output, anticipated)
def test_bifnt_tokenize_multimiddle_forward(self):
"""BibIndexFuzzyNameTokenizer - tokens for first middle middle last
W K H Panofsky
"""
tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles'],
'lastnames': ['Panofsky'], 'nonlastnames': ['W', 'K', 'H'], 'titles': []}
output = self.get_index_tokens(tagged_data)
anticipated = ['H Panofsky', 'K H Panofsky', 'K Panofsky', 'Panofsky',
'Panofsky, H', 'Panofsky, K', 'Panofsky, K H',
'Panofsky, W', 'Panofsky, W H', 'Panofsky, W K',
'Panofsky, W K H', 'W H Panofsky',
'W K H Panofsky', 'W K Panofsky', 'W Panofsky']
self.assertEqual(output, anticipated)
def test_tokenize(self):
"""BibIndexFuzzyNameTokenizer - check tokenize()
Ringo Starr
"""
teststr = "Ringo Starr"
output = self.tokenizer.tokenize(teststr)
anticipated = ['R Starr', 'Ringo Starr', 'Starr', 'Starr, R', 'Starr, Ringo']
self.assertEqual(output, anticipated)
class TestExactNameTokenizer(unittest.TestCase):
"""Test exact author name tokenizer."""
def setUp(self):
"""setup"""
self.tokenizer = tokenizer_lib.BibIndexExactNameTokenizer()
def test_exact_author_name_tokenizer_bare(self):
"""BibIndexExactNameTokenizer - bare name"""
self.assertEqual(self.tokenizer.tokenize('John Doe'),
['John Doe'])
def test_exact_author_name_tokenizer_dots(self):
"""BibIndexExactNameTokenizer - name with dots"""
self.assertEqual(self.tokenizer.tokenize('J. Doe'),
['J Doe'])
self.assertEqual(self.tokenizer.tokenize('J.R. Doe'),
['J R Doe'])
self.assertEqual(self.tokenizer.tokenize('J. R. Doe'),
['J R Doe'])
def test_exact_author_name_tokenizer_hyphens(self):
"""BibIndexExactNameTokenizer - name with hyphens"""
self.assertEqual(self.tokenizer.tokenize('Doe, Jean-Pierre'),
['Doe, Jean Pierre'])
TEST_SUITE = make_test_suite(TestFuzzyNameTokenizerScanning,
TestFuzzyNameTokenizerTokens,
TestExactNameTokenizer,)
if __name__ == '__main__':
#unittest.main()
run_test_suite(TEST_SUITE)

Event Timeline