Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F86044443
bibindex_regression_tests.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Oct 3, 22:05
Size
76 KB
Mime Type
text/x-python
Expires
Sat, Oct 5, 22:05 (2 d)
Engine
blob
Format
Raw Data
Handle
21331932
Attached To
R3600 invenio-infoscience
bibindex_regression_tests.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2006, 2007, 2008, 2010, 2011, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibIndex Regression Test Suite."""
__revision__
=
"$Id$"
from
invenio.testutils
import
InvenioTestCase
import
os
import
re
from
datetime
import
timedelta
from
time
import
sleep
from
invenio.bibindex_engine
import
WordTable
,
\
VirtualIndexTable
,
\
get_word_tables
,
\
find_affected_records_for_index
,
\
get_recIDs_by_date_authority
,
\
get_recIDs_by_date_bibliographic
,
\
create_range_list
,
\
beautify_range_list
,
\
get_last_updated_all_indexes
,
\
re_prefix
from
invenio.bibindex_engine_utils
import
get_index_id_from_index_name
,
\
get_index_tags
,
\
get_tag_indexes
,
\
get_all_indexes
,
\
make_prefix
from
invenio.bibindex_engine_config
import
CFG_BIBINDEX_ADDING_RECORDS_STARTED_STR
,
\
CFG_BIBINDEX_INDEX_TABLE_TYPE
,
\
CFG_BIBINDEX_UPDATE_MESSAGE
from
invenio.bibtask
import
task_low_level_submission
from
invenio.config
import
CFG_BINDIR
,
CFG_LOGDIR
from
invenio.testutils
import
make_test_suite
,
run_test_suite
,
nottest
from
invenio.dbquery
import
run_sql
,
deserialize_via_marshal
from
invenio.intbitset
import
intbitset
from
invenio.search_engine
import
get_record
from
invenio.search_engine_utils
import
get_fieldvalues
from
invenio.bibauthority_engine
import
(
get_index_strings_by_control_no
,
get_control_nos_from_recID
)
from
invenio.bibindex_engine_utils
import
run_sql_drop_silently
from
invenio.bibupload
import
bibupload
,
xml_marc_to_records
from
invenio.bibupload_regression_tests
import
wipe_out_record_from_all_tables
from
invenio.bibrecord
import
record_get_field_value
from
invenio.bibsort_engine
import
get_max_recid
from
invenio.bibtask
import
task_log_path
from
invenio.dbquery
import
get_table_update_time
from
invenio.search_engine
import
get_index_stemming_language
as
gis
def reindex_for_type_with_bibsched(index_name, force_all=False, *other_options):
    """Submit a bibindex task for one index, launch it and return its id.

    @param index_name: name of the index to reindex (passed after '-w')
    @param force_all: when true, "--force" is appended as the last task
        argument so that all records are reindexed, not just affected ones
    @param other_options: extra task arguments, placed before "--force"
    @return: the value returned by task_low_level_submission
    """
    task_args = ['bibindex', 'bibindex_regression_tests', '-w', index_name,
                 '-u', 'admin'] + list(other_options)
    if force_all:
        task_args.append("--force")
    task_id = task_low_level_submission(*task_args)

    # Invoke the bibindex executable with the task id; os.system blocks until
    # the command returns, and both output streams are discarded.
    bibindex_binary = os.path.join(CFG_BINDIR, 'bibindex')
    os.system("%s %s > /dev/null 2> /dev/null" % (bibindex_binary, str(task_id)))
    return task_id
def prepare_for_index_update(index_id, parameters=None):
    """Build the SQL statement that updates one row of the idxINDEX table.

    Every key of `parameters` whose value is not None becomes a
    ``key='value'`` assignment in the SET clause.  The parameters meant to be
    changed this way are remove_stopwords, remove_html_markup,
    remove_latex_markup, tokenizer and last_updated:

      - remove_html_markup and remove_latex_markup accept 'Yes' or 'No';
      - remove_stopwords takes 'No' or the name of a stopwords knowledge base
        file from the CFG_ETCDIR/bibrank/ directory (e.g. 'stopwords.kb');
      - tokenizer takes the name of the tokenizer;
      - last_updated takes a date formatted like '2013-01-31 00:00:00'.

    NOTE: values are interpolated into the statement verbatim (no escaping),
    so this helper must only be fed trusted, test-controlled values.

    @param index_id: id of the index to change
    @param parameters: dict with names of parameters and their new values;
        entries whose value is None are skipped
    @return: the UPDATE statement, or '' when there is nothing to update
        (no parameters at all, or every value is None)
    """
    if not parameters:
        return ''

    assignments = ["%s='%s'" % (key, value)
                   for key, value in parameters.items()
                   if value is not None]
    if not assignments:
        # An UPDATE with an empty SET clause is not valid SQL; report
        # "nothing to do" the same way as for an empty dict.
        return ''

    return "UPDATE idxINDEX SET %s WHERE id=%s" % (", ".join(assignments), index_id)
@nottest
def reindex_word_tables_into_testtables(index_name, recids=None, prefix='test_', parameters={}, turn_off_virtual_indexes=True):
    """Set up a test environment by reindexing one index into prefixed tables.

    The idxINDEX row of the index is optionally modified (see
    'prepare_for_index_update'), then the records are indexed through a
    WordTable that writes to <prefix>idxWORDxxF / <prefix>idxWORDxxR.
    Only idxWORDxxx tables are handled.

    @param index_name: name of the index we want to reindex
    @param recids: falsy means "work out the records to index from the
        index itself"; otherwise the value is handed to add_recIDs as is
    @param prefix: prefix for the new tables; the tables are dropped and
        re-created unless prefix is the empty string
    @param parameters: dict with parameters and their new values; for a more
        specific description take a look at 'prepare_for_index_update'
    @param turn_off_virtual_indexes: if True, turn_off_virtual_indexes() is
        called on the WordTable before indexing
    @return: the last_updated value of the index read before any change
    """
    index_id = get_index_id_from_index_name(index_name)
    update_sql = prepare_for_index_update(index_id, parameters)
    # Read the timestamp before the idxINDEX row is touched below, so that
    # the caller can put it back afterwards.
    previous_last_updated = run_sql("""SELECT last_updated FROM idxINDEX WHERE id=%s""" % index_id)[0][0]

    table_base = "%sidxWORD%02d" % (prefix, index_id)
    drop_forward_sql = """DROP TABLE IF EXISTS %sF""" % table_base
    drop_reversed_sql = """DROP TABLE IF EXISTS %sR""" % table_base
    create_forward_sql = """CREATE TABLE %sF (
id mediumint(9) unsigned NOT NULL auto_increment,
term varchar(50) default NULL,
hitlist longblob,
PRIMARY KEY (id),
UNIQUE KEY term (term)
) ENGINE=MyISAM""" % table_base
    create_reversed_sql = """CREATE TABLE %sR (
id_bibrec mediumint(9) unsigned NOT NULL,
termlist longblob,
type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT',
PRIMARY KEY (id_bibrec,type)
) ENGINE=MyISAM""" % table_base

    if prefix != "":
        # Start from empty test tables.
        run_sql_drop_silently(drop_forward_sql)
        run_sql_drop_silently(drop_reversed_sql)
        run_sql(create_forward_sql)
        run_sql(create_reversed_sql)

    if update_sql:
        run_sql(update_sql)

    word_table = WordTable(index_name=index_name,
                           fields_to_index=get_index_tags(index_name),
                           table_prefix=prefix,
                           table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"],
                           tag_to_tokenizer_map={'8564_u': "BibIndexEmptyTokenizer"},
                           wash_index_terms=50)
    if turn_off_virtual_indexes:
        word_table.turn_off_virtual_indexes()

    if not recids:
        # No explicit records given: intersect the records affected for this
        # index with those reported by the two date-based lookups.
        affected = find_affected_records_for_index([index_name], [[1, get_max_recid()]], True)
        bibliographic = get_recIDs_by_date_bibliographic([], index_name)
        authority = get_recIDs_by_date_authority([], index_name)
        selected = set(bibliographic | authority) & set(affected[index_name])
        recids = beautify_range_list(create_range_list(list(selected)))
    word_table.add_recIDs(recids, 10000)

    return previous_last_updated
@nottest
def remove_reindexed_word_testtables(index_name, prefix='test_'):
    """Drop the <prefix>idxWORDxxF and <prefix>idxWORDxxR tables of an index.

    @param index_name: name of the index
    @param prefix: prefix for the tables
    """
    table_base = "%sidxWORD%02d" % (prefix, get_index_id_from_index_name(index_name))
    # Forward table first, then the reversed one.
    run_sql("""DROP TABLE IF EXISTS %sF""" % table_base)
    run_sql("""DROP TABLE IF EXISTS %sR""" % table_base)
class BibIndexRemoveStopwordsTest(InvenioTestCase):
    """Tests the remove_stopwords parameter of an index.

    Changes it in the database and reindexes from scratch into a new table to
    see the difference brought by the change. Uses the 'title' index.

    setUp and tearDown are class methods: the reindexing flag and the counter
    of finished tests live on the class, so the reindexing runs once and the
    clean-up runs after the fourth tearDown call.
    NOTE(review): the clean-up threshold (4) must match the number of tests
    in this class; it is not reached when only a subset of tests is run.
    """

    test_counter = 0
    reindexed = False

    @classmethod
    def setUp(cls):
        """Reindex into the new table (first call only)."""
        if not cls.reindexed:
            cls.last_updated = reindex_word_tables_into_testtables(
                'title',
                parameters={'remove_stopwords': 'stopwords.kb',
                            'last_updated': '0000-00-00 00:00:00'})
            cls.reindexed = True

    @classmethod
    def tearDown(cls):
        """Clean up once the last test has finished."""
        cls.test_counter += 1
        if cls.test_counter == 4:
            remove_reindexed_word_testtables('title')
            reverse_changes = prepare_for_index_update(
                get_index_id_from_index_name('title'),
                parameters={'remove_stopwords': 'No',
                            'last_updated': cls.last_updated})
            run_sql(reverse_changes)

    def _title_hitlist_rows(self, term, prefix=''):
        """Return the rows holding the hitlist of `term` in the forward table
        of the 'title' index; `prefix` selects the test tables.

        The table number is looked up instead of being hard-coded, because
        the test tables themselves are named after the looked-up index id.
        """
        query = "SELECT hitlist FROM %sidxWORD%02dF WHERE term='%s'" % \
                (prefix, get_index_id_from_index_name('title'), term)
        return run_sql(query)

    def _compare_hit_counts(self, term):
        """Assert that `term` has as many hits in the test table as in the
        original table."""
        # The two placeholders have different lengths (12 and 13), so the
        # assertion fails whenever the term is missing from either table.
        iset_removed = "iset_removed"
        iset_original = "iset_original"
        res = self._title_hitlist_rows(term, 'test_')
        if res:
            iset_removed = intbitset(res[0][0])
        res = self._title_hitlist_rows(term)
        if res:
            iset_original = intbitset(res[0][0])
        self.assertEqual(len(iset_removed), len(iset_original))

    def test_check_occurrences_of_stopwords_in_testable_word_of(self):
        """Tests if term 'of' is in the new reindexed table"""
        res = self._title_hitlist_rows('of', 'test_')
        self.assertEqual(0, len(res))

    def test_check_occurrences_of_stopwords_in_testable_word_everything(self):
        """Tests if term 'everything' is in the new reindexed table"""
        res = self._title_hitlist_rows('everything', 'test_')
        self.assertEqual(0, len(res))

    def test_compare_non_stopwords_occurrences_in_original_and_test_tables_word_theory(self):
        """Checks if stopwords removing has no influence on indexation of word 'theory' """
        # 'theori', not 'theory', because of default stemming for title index
        self._compare_hit_counts("theori")

    def test_compare_non_stopwords_occurrences_in_original_and_test_tables_word_on(self):
        """Checks if stopwords removing has no influence on indexation of word 'o(n)' """
        self._compare_hit_counts("o(n)")
class BibIndexRemoveLatexTest(InvenioTestCase):
    """Tests the remove_latex_markup parameter of an index.

    Changes it in the database and reindexes from scratch into a new table to
    see the difference brought by the change. Uses the 'abstract' index.

    setUp and tearDown are class methods, so the reindexing flag and the
    counter of finished tests are shared by all tests of the class; the
    clean-up runs after the fourth tearDown call.
    """

    test_counter = 0
    reindexed = False

    @classmethod
    def setUp(cls):
        """Reindex into the new table (first call only)."""
        if cls.reindexed:
            return
        cls.last_updated = reindex_word_tables_into_testtables(
            'abstract',
            parameters={'remove_latex_markup': 'Yes',
                        'last_updated': '0000-00-00 00:00:00'})
        cls.reindexed = True

    @classmethod
    def tearDown(cls):
        """Clean up once the last test has finished."""
        cls.test_counter += 1
        if cls.test_counter != 4:
            return
        remove_reindexed_word_testtables('abstract')
        reverse_changes = prepare_for_index_update(
            get_index_id_from_index_name('abstract'),
            parameters={'remove_latex_markup': 'No',
                        'last_updated': cls.last_updated})
        run_sql(reverse_changes)

    def _recids_or(self, query, default):
        """Run `query` and return the first hitlist as a list of record ids,
        or `default` when the query yields no row."""
        rows = run_sql(query)
        if not rows:
            return default
        return intbitset(rows[0][0]).tolist()

    def test_check_occurrences_after_latex_removal_word_u1(self):
        """Tests how many times expression 'u(1)' occurs"""
        word = "u(1)"
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('abstract'), word)
        rows = run_sql(query)
        # Placeholder of length 11: makes the assertion fail without a row.
        hits = "iset_change"
        if rows:
            hits = intbitset(rows[0][0])
        self.assertEqual(3, len(hits))

    def test_check_exact_occurrences_after_latex_removal_word_theta(self):
        """Tests where expression 'theta' occurs"""
        word = "theta"
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('abstract'), word)
        self.assertEqual([12], self._recids_or(query, []))

    def test_compare_occurrences_after_and_before_latex_removal_math_expression(self):
        """Checks if latex removal has no influence on indexation of expression 's(u(n_1)*u(n_2))' """
        word = 's(u(n_1)*u(n_2))'
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('abstract'), word)
        in_test_table = self._recids_or(query, [])
        query = "SELECT hitlist FROM idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('abstract'), word)
        # The two defaults differ, so a term missing from both tables fails.
        in_original_table = self._recids_or(query, ["default_not_equal"])
        self.assertEqual(in_original_table, in_test_table)

    def test_check_occurrences_latex_expression_with_u1(self):
        """Tests influence of latex removal on record 80"""
        word = '%over u(1)%'
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term LIKE '%s'" % \
                (get_index_id_from_index_name('abstract'), word)
        self.assertEqual([80], self._recids_or(query, []))
class BibIndexRemoveHtmlTest(InvenioTestCase):
    """Tests the remove_html_markup parameter of an index.

    Changes it in the database and reindexes from scratch into a new table to
    see the difference brought by the change. Uses the 'abstract' index.

    setUp and tearDown are class methods, so their state is shared by all
    tests of the class; the clean-up runs after the second tearDown call.
    """

    test_counter = 0
    reindexed = False

    @classmethod
    def setUp(cls):
        """Reindex into the new table (first call only)."""
        if cls.reindexed:
            return
        cls.last_updated = reindex_word_tables_into_testtables(
            'abstract',
            parameters={'remove_html_markup': 'Yes',
                        'last_updated': '0000-00-00 00:00:00'})
        cls.reindexed = True

    @classmethod
    def tearDown(cls):
        """Clean up once the last test has finished."""
        cls.test_counter += 1
        if cls.test_counter != 2:
            return
        remove_reindexed_word_testtables('abstract')
        reverse_changes = prepare_for_index_update(
            get_index_id_from_index_name('abstract'),
            parameters={'remove_html_markup': 'No',
                        'last_updated': cls.last_updated})
        run_sql(reverse_changes)

    def _recids(self, query):
        """Run `query` and return the first hitlist as a list of record ids
        (an empty list when the query yields no row)."""
        rows = run_sql(query)
        if not rows:
            return []
        return intbitset(rows[0][0]).tolist()

    def test_check_occurrences_after_html_removal_tag_p(self):
        """Tests if expression 'water-hog</p>' is not indexed after html markup removal"""
        word = 'water-hog</p>'
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('abstract'), word)
        self.assertEqual(0, len(self._recids(query)))

    def test_check_occurrences_after_and_before_html_removal_word_style(self):
        """Tests html markup removal influence on expression 'style="width' """
        word = 'style="width'
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('abstract'), word)
        in_test_table = self._recids(query)
        query = "SELECT hitlist FROM idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('abstract'), word)
        in_original_table = self._recids(query)
        self.assertNotEqual(in_original_table, in_test_table)
class BibIndexYearIndexTest(InvenioTestCase):
    """
    Checks the year index. These tests differ from those inside the WebSearch
    module because they only test content and reindexation, not the search
    itself.

    setUp and tearDown are class methods, so their state is shared by all
    tests of the class; the clean-up runs after the third tearDown call.
    """

    test_counter = 0
    reindexed = False

    @classmethod
    def setUp(cls):
        """Reindex into the new table (first call only)."""
        if cls.reindexed:
            return
        cls.last_updated = reindex_word_tables_into_testtables(
            'year',
            parameters={'last_updated': '0000-00-00 00:00:00'})
        cls.reindexed = True

    @classmethod
    def tearDown(cls):
        """Clean up once the last test has finished."""
        cls.test_counter += 1
        if cls.test_counter != 3:
            return
        remove_reindexed_word_testtables('year')
        reverse_changes = prepare_for_index_update(
            get_index_id_from_index_name('year'),
            parameters={'last_updated': cls.last_updated})
        run_sql(reverse_changes)

    def _recids_in_test_table(self, word):
        """Return the record ids stored for `word` in the reindexed forward
        table (an empty list when the term is absent)."""
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('year'), word)
        rows = run_sql(query)
        if not rows:
            return []
        return intbitset(rows[0][0]).tolist()

    def test_occurrences_in_year_index_1973(self):
        """checks content of year index for year 1973"""
        self.assertEqual([34], self._recids_in_test_table('1973'))

    def test_occurrences_in_year_index_2001(self):
        """checks content of year index for year 2001"""
        self.assertEqual([2, 11, 12, 15], self._recids_in_test_table('2001'))

    def test_comparison_for_number_of_items(self):
        """checks the reindexation of year index"""
        query_test = "SELECT count(*) FROM test_idxWORD%02dF" % get_index_id_from_index_name('year')
        query_orig = "SELECT count(*) FROM idxWORD%02dF" % get_index_id_from_index_name('year')
        # Different defaults: the comparison fails if a query returns no row.
        num_test = 1
        num_orig = 0
        rows = run_sql(query_test)
        if rows:
            num_test = rows[0][0]
        rows = run_sql(query_orig)
        if rows:
            num_orig = rows[0][0]
        self.assertEqual(num_orig, num_test)
class BibIndexAuthorCountIndexTest(InvenioTestCase):
    """
    Checks the author count index. These tests differ from those inside the
    WebSearch module because they only test content and reindexation, not the
    search itself.

    setUp and tearDown are class methods, so their state is shared by all
    tests of the class; the clean-up runs after the second tearDown call.
    """

    test_counter = 0
    reindexed = False

    @classmethod
    def setUp(cls):
        """Reindex into the new table (first call only)."""
        if cls.reindexed:
            return
        cls.last_updated = reindex_word_tables_into_testtables(
            'authorcount',
            parameters={'last_updated': '0000-00-00 00:00:00'})
        cls.reindexed = True

    @classmethod
    def tearDown(cls):
        """Clean up once the last test has finished."""
        cls.test_counter += 1
        if cls.test_counter != 2:
            return
        remove_reindexed_word_testtables('authorcount')
        reverse_changes = prepare_for_index_update(
            get_index_id_from_index_name('authorcount'),
            parameters={'last_updated': cls.last_updated})
        run_sql(reverse_changes)

    def test_occurrences_in_authorcount_index(self):
        """checks content of authorcount index for papers with 4 authors"""
        word = '4'
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('authorcount'), word)
        rows = run_sql(query)
        recids = []
        if rows:
            recids = intbitset(rows[0][0]).tolist()
        self.assertEqual([51, 54, 59, 66, 92, 96], recids)

    def test_comparison_for_number_of_items(self):
        """checks the reindexation of authorcount index"""
        query_test = "SELECT count(*) FROM test_idxWORD%02dF" % get_index_id_from_index_name('authorcount')
        query_orig = "SELECT count(*) FROM idxWORD%02dF" % get_index_id_from_index_name('authorcount')
        # Different defaults: the comparison fails if a query returns no row.
        num_test = 1
        num_orig = 0
        rows = run_sql(query_test)
        if rows:
            num_test = rows[0][0]
        rows = run_sql(query_orig)
        if rows:
            num_orig = rows[0][0]
        self.assertEqual(num_orig, num_test)
class BibIndexItemCountIndexTest(InvenioTestCase):
    """
    Checks the item count index: the number of copies of books for given
    records, as well as occurrences of a particular number of copies in the
    test data.
    """

    def _assert_termlist(self, query, expected):
        """Run `query` and compare the unmarshalled termlist of its first row
        with `expected` (the query is expected to return a row)."""
        rows = run_sql(query)
        self.assertEqual(deserialize_via_marshal(rows[0][0]), expected)

    def test_occurrences_in_itemcount_index_two_copies(self):
        """checks content of itemcount index for records with two copies of a book"""
        word = '2'
        query = "SELECT hitlist FROM idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('itemcount'), word)
        rows = run_sql(query)
        recids = []
        if rows:
            recids = intbitset(rows[0][0]).tolist()
        self.assertEqual([31, 34], recids)

    def test_records_for_number_of_copies_record1(self):
        """checks content of itemcount index for record: 1"""
        query = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=1" \
                % get_index_id_from_index_name('itemcount')
        self._assert_termlist(query, ['0'])

    def test_records_for_number_of_copies_record30(self):
        """checks content of itemcount index for record: 30"""
        query = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=30" \
                % get_index_id_from_index_name('itemcount')
        self._assert_termlist(query, ['1'])

    def test_records_for_number_of_copies_record32(self):
        """checks content of itemcount index for record: 32"""
        query = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=32" \
                % get_index_id_from_index_name('itemcount')
        self._assert_termlist(query, ['3'])
class BibIndexFiletypeIndexTest(InvenioTestCase):
    """
    Checks the filetype index. These tests differ from those inside the
    WebSearch module because they only test content and indexation, not the
    search itself.
    """

    def test_occurances_of_tif_filetype(self):
        """tests which records have a file with the 'tif' extension"""
        query = "SELECT hitlist FROM idxWORD%02dF where term='tif'" \
                % get_index_id_from_index_name('filetype')
        rows = run_sql(query)
        recids = []
        if rows:
            recids = intbitset(rows[0][0]).tolist()
        self.assertEqual(sorted(recids), [66, 71])

    def test_filetypes_of_records(self):
        """tests file extensions of records 1 and 77"""
        query1 = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=1" \
                 % get_index_id_from_index_name('filetype')
        query2 = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=77" \
                 % get_index_id_from_index_name('filetype')
        # Both queries are executed before either result is decoded.
        rows1 = run_sql(query1)
        rows2 = run_sql(query2)
        extensions1 = deserialize_via_marshal(rows1[0][0])
        extensions2 = deserialize_via_marshal(rows2[0][0])
        self.assertEqual(extensions1, ['gif', 'jpg'])
        self.assertEqual(extensions2, ['pdf', 'ps.gz'])
class BibIndexJournalIndexTest(InvenioTestCase):
    """
    Checks the journal index. These tests differ from those inside the
    WebSearch module because they only test content and reindexation, not the
    search itself.

    setUp and tearDown are class methods, so their state is shared by all
    tests of the class; the clean-up runs after the second tearDown call.
    """

    test_counter = 0
    reindexed = False

    @classmethod
    def setUp(cls):
        """Reindex into the new table (first call only)."""
        if cls.reindexed:
            return
        cls.last_updated = reindex_word_tables_into_testtables(
            'journal',
            parameters={'last_updated': '0000-00-00 00:00:00'})
        cls.reindexed = True

    @classmethod
    def tearDown(cls):
        """Clean up once the last test has finished."""
        cls.test_counter += 1
        if cls.test_counter != 2:
            return
        remove_reindexed_word_testtables('journal')
        reverse_changes = prepare_for_index_update(
            get_index_id_from_index_name('journal'),
            parameters={'last_updated': cls.last_updated})
        run_sql(reverse_changes)

    def test_occurrences_in_journal_index(self):
        """checks content of journal index for phrase: 'prog. theor. phys.' """
        word = 'prog. theor. phys.'
        query = "SELECT hitlist FROM test_idxWORD%02dF WHERE term='%s'" % \
                (get_index_id_from_index_name('journal'), word)
        rows = run_sql(query)
        recids = []
        if rows:
            recids = intbitset(rows[0][0]).tolist()
        self.assertEqual([86], recids)

    def test_comparison_for_number_of_items(self):
        """checks the reindexation of journal index"""
        query_test = "SELECT count(*) FROM test_idxWORD%02dF" % get_index_id_from_index_name('journal')
        query_orig = "SELECT count(*) FROM idxWORD%02dF" % get_index_id_from_index_name('journal')
        # Different defaults: the comparison fails if a query returns no row.
        num_test = 1
        num_orig = 0
        rows = run_sql(query_test)
        if rows:
            num_test = rows[0][0]
        rows = run_sql(query_orig)
        if rows:
            num_orig = rows[0][0]
        self.assertEqual(num_orig, num_test)
class BibIndexCJKTokenizerTitleIndexTest(InvenioTestCase):
    """
    Checks CJK tokenization on the title index.

    setUp and tearDown are class methods, so their state is shared by all
    tests of the class; the clean-up runs after the second tearDown call and
    sets the tokenizer to 'BibIndexDefaultTokenizer'.
    """

    test_counter = 0
    reindexed = False

    @classmethod
    def setUp(cls):
        """Reindex into the new table (first call only)."""
        if cls.reindexed:
            return
        cls.last_updated = reindex_word_tables_into_testtables(
            'title',
            parameters={'tokenizer': 'BibIndexCJKTokenizer',
                        'last_updated': '0000-00-00 00:00:00'})
        cls.reindexed = True

    @classmethod
    def tearDown(cls):
        """Clean up once the last test has finished."""
        cls.test_counter += 1
        if cls.test_counter != 2:
            return
        remove_reindexed_word_testtables('title')
        reverse_changes = prepare_for_index_update(
            get_index_id_from_index_name('title'),
            parameters={'tokenizer': 'BibIndexDefaultTokenizer',
                        'last_updated': cls.last_updated})
        run_sql(reverse_changes)

    def test_splliting_and_indexing_CJK_characters_forward_table(self):
        """CJK Tokenizer - searching for a CJK term in title index, forward table"""
        query = "SELECT * from test_idxWORD%02dF where term='\xe6\x95\xac'" % get_index_id_from_index_name('title')
        rows = run_sql(query)
        recids = []
        if rows:
            # SELECT * on the forward table: the hitlist is the third column.
            recids = intbitset(rows[0][2]).tolist()
        self.assertEqual(recids, [104])

    def test_splliting_and_indexing_CJK_characters_reversed_table(self):
        """CJK Tokenizer - comparing terms for record with chinese poetry in title index, reverse table"""
        query = "SELECT * from test_idxWORD%02dR where id_bibrec='104'" % get_index_id_from_index_name('title')
        rows = run_sql(query)
        terms = []
        if rows:
            # SELECT * on the reversed table: the termlist is the second column.
            terms = deserialize_via_marshal(rows[0][1])
        self.assertEqual(terms, ['\xe6\x95\xac', '\xe7\x8d\xa8', '\xe4\xba\xad', '\xe5\x9d\x90'])
class BibIndexAuthorityRecordTest(InvenioTestCase):
    """Test if BibIndex correctly knows when to update the index for a
    bibliographic record if it is dependent upon an authority record changed
    within the given date range"""

    def test_authority_record_recently_updated(self):
        """bibindex - reindexing after recently changed authority record"""
        authRecID = 118
        index_name = 'author'
        table = "idxWORD%02dF" % get_index_id_from_index_name(index_name)
        reindex_for_type_with_bibsched(index_name)
        # Touch the authority record so that it counts as recently modified.
        run_sql("UPDATE bibrec SET modification_date = now() WHERE id = %s", (authRecID,))
        # run bibindex again
        task_id = reindex_for_type_with_bibsched(index_name, force_all=True)
        filename = task_log_path(task_id, 'log')
        # The log is a small file, read it in one go; try/finally makes sure
        # the handle is released even if reading fails.
        _file = open(filename)
        try:
            text = _file.read()
        finally:
            _file.close()
        self.assertTrue(CFG_BIBINDEX_UPDATE_MESSAGE in text)
        self.assertTrue((CFG_BIBINDEX_ADDING_RECORDS_STARTED_STR % (table, 1, get_max_recid())) in text)

    def test_authority_record_enriched_index(self):
        """bibindex - test whether reverse index for bibliographic record
        contains words from referenced authority records"""
        bibRecID = 9
        authority_string = 'jonathan'
        index_name = 'author'
        table = "idxWORD%02dR" % get_index_id_from_index_name(index_name)

        reindex_for_type_with_bibsched(index_name, force_all=True)
        termlist = deserialize_via_marshal(
            run_sql("SELECT termlist FROM %s WHERE id_bibrec = %s" % (table, bibRecID))[0][0])
        self.assertTrue(authority_string in termlist)

    def test_indexing_of_deleted_authority_record(self):
        """bibindex - no info for indexing from deleted authority record"""
        recID = 119  # deleted record
        control_nos = get_control_nos_from_recID(recID)
        info = get_index_strings_by_control_no(control_nos[0])
        self.assertEqual([], info)

    def test_authority_record_get_values_by_bibrecID_from_tag(self):
        """bibindex - find authors in authority records for given bibrecID"""
        tags = ['100__a']
        bibRecID = 9
        values = []
        for tag in tags:
            # The control numbers are read from subfield 0 of the same field.
            authority_tag = tag[0:3] + "__0"
            control_nos = get_fieldvalues(bibRecID, authority_tag)
            for control_no in control_nos:
                new_strings = get_index_strings_by_control_no(control_no)
                values.extend(new_strings)
        self.assertTrue('Ellis, Jonathan Richard' in values)
def insert_record_one_and_second_revision():
    """Insert test record no. 1 and upload a second revision of it.

    The first revision is inserted and indexed in every non-virtual word
    index; the second revision (other author and title) is uploaded in
    'correct' mode and deliberately left unindexed.

    @return: the id of the inserted record
    """
    first_revision = """<record>
<controlfield tag="001">123456789</controlfield>
<controlfield tag="005">20110101000000.0</controlfield>
<datafield tag ="100" ind1=" " ind2=" ">
<subfield code="a">Close, John</subfield>
<subfield code="u">DESY</subfield>
</datafield>
<datafield tag="245" ind1=" " ind2=" ">
<subfield code="a">Particles world</subfield>
</datafield>
</record>"""
    # The record to insert must not carry the 001 and 005 control fields.
    insertable = first_revision \
        .replace('<controlfield tag="001">123456789</controlfield>', '') \
        .replace('<controlfield tag="005">20110101000000.0</controlfield>', '')
    second_revision = first_revision \
        .replace('<subfield code="a">Close, John</subfield>',
                 '<subfield code="a">Dawkins, Richard</subfield>') \
        .replace('Particles world', 'Particles universe')

    upload_result = bibupload(xml_marc_to_records(insertable)[0], opt_mode='insert')
    recid = upload_result[1]
    revision_stamp = record_get_field_value(get_record(recid), '005', '', '')

    # need to index for the first time
    for _unused_id, name, tags in get_word_tables(get_all_indexes(virtual=False)):
        word_table = WordTable(index_name=name,
                               fields_to_index=tags,
                               table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"],
                               tag_to_tokenizer_map={'8564_u': "BibIndexEmptyTokenizer"},
                               wash_index_terms=50)
        word_table.add_recIDs([[recid, recid]], 10000)

    # upload the second revision, but don't index
    correction = second_revision \
        .replace('123456789', str(recid)) \
        .replace('20110101000000.0', revision_stamp)
    bibupload(xml_marc_to_records(correction)[0], opt_mode='correct')

    return recid
def insert_record_two_and_second_revision():
    """Insert test record no. 2 and upload a second revision of it.

    The first revision is inserted and indexed in every non-virtual word
    index; the second revision (other title) is uploaded in 'correct' mode
    and deliberately left unindexed.

    @return: the id of the inserted record
    """
    first_revision = """<record>
<controlfield tag="001">123456789</controlfield>
<controlfield tag="005">20110101000000.0</controlfield>
<datafield tag ="100" ind1=" " ind2=" ">
<subfield code="a">Locke, John</subfield>
<subfield code="u">UNITRA</subfield>
</datafield>
<datafield tag="245" ind1=" " ind2=" ">
<subfield code="a">Collision course</subfield>
</datafield>
</record>"""
    # The record to insert must not carry the 001 and 005 control fields.
    insertable = first_revision \
        .replace('<controlfield tag="001">123456789</controlfield>', '') \
        .replace('<controlfield tag="005">20110101000000.0</controlfield>', '')
    second_revision = first_revision.replace('Collision course', 'Course of collision')

    upload_result = bibupload(xml_marc_to_records(insertable)[0], opt_mode='insert')
    recid = upload_result[1]
    revision_stamp = record_get_field_value(get_record(recid), '005', '', '')

    # need to index for the first time
    for _unused_id, name, tags in get_word_tables(get_all_indexes(virtual=False)):
        word_table = WordTable(index_name=name,
                               fields_to_index=tags,
                               table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"],
                               tag_to_tokenizer_map={'8564_u': "BibIndexEmptyTokenizer"},
                               wash_index_terms=50)
        word_table.add_recIDs([[recid, recid]], 10000)

    # upload the second revision, but don't index
    correction = second_revision \
        .replace('123456789', str(recid)) \
        .replace('20110101000000.0', revision_stamp)
    bibupload(xml_marc_to_records(correction)[0], opt_mode='correct')

    return recid
def create_index_tables(index_id):
    """Create the idxWORDxxF, idxWORDxxR and idxWORDxxQ tables of an index,
    unless they already exist.

    @param index_id: id of the index; formatted with two digits into the
        table names
    """
    table_templates = (
        # forward table
        """CREATE TABLE IF NOT EXISTS idxWORD%02dF (
id mediumint(9) unsigned NOT NULL auto_increment,
term varchar(50) default NULL,
hitlist longblob,
PRIMARY KEY (id),
UNIQUE KEY term (term)
) ENGINE=MyISAM""",
        # reversed table
        """CREATE TABLE IF NOT EXISTS idxWORD%02dR (
id_bibrec mediumint(9) unsigned NOT NULL,
termlist longblob,
type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT',
PRIMARY KEY (id_bibrec,type)
) ENGINE=MyISAM""",
        # queue table
        """CREATE TABLE IF NOT EXISTS idxWORD%02dQ (
id mediumint(10) unsigned NOT NULL auto_increment,
runtime datetime NOT NULL default '0000-00-00 00:00:00',
id_bibrec_low mediumint(9) unsigned NOT NULL,
id_bibrec_high mediumint(9) unsigned NOT NULL,
index_name varchar(50) NOT NULL default '',
mode varchar(50) NOT NULL default 'update',
PRIMARY KEY (id),
INDEX (index_name),
INDEX (runtime)
) ENGINE=MyISAM;""",
    )
    for template in table_templates:
        run_sql(template % index_id)
def drop_index_tables(index_id):
    """Drops the forward (F), reversed (R) and queue (Q) word tables
       of the index with the given numeric id, if they exist."""
    for table_suffix in ("F", "R", "Q"):
        run_sql("""DROP TABLE IF EXISTS idxWORD%02d%s""" % (index_id, table_suffix))
def create_virtual_index(index_id, dependent_indexes):
    """Registers a new index named 'testindex' under the given id, binds it
       to every index named in dependent_indexes and creates its word tables.

       Returns the name of the new index.
    """
    index_name = 'testindex'
    run_sql("""INSERT INTO idxINDEX (id, name, tokenizer) VALUES (%s, '%s', 'BibIndexDefaultTokenizer')""" % (index_id, index_name))
    binding_query = """INSERT INTO idxINDEX_idxINDEX VALUES (%s, %s)"""
    for dependent_name in dependent_indexes:
        dependent_id = get_index_id_from_index_name(dependent_name)
        run_sql(binding_query % (index_id, dependent_id))
    create_index_tables(index_id)
    return index_name
def remove_virtual_index(index_id):
    """Drops the word tables of the index with the given id and deletes
       its rows from idxINDEX and idxINDEX_idxINDEX."""
    drop_index_tables(index_id)
    cleanup_queries = ("""DELETE FROM idxINDEX WHERE id=%s""",
                       """DELETE FROM idxINDEX_idxINDEX WHERE id_virtual=%s""")
    for cleanup_query in cleanup_queries:
        run_sql(cleanup_query % index_id)
class BibIndexFindingAffectedIndexes(InvenioTestCase):
    """
    Checks if function 'find_affected_records_for_index'
    works correctly.
    """

    # number of tests finished so far; shared by the whole class because
    # setUp/tearDown are class methods
    counter = 0
    # indexes whose 'last_updated' date is moved back for the tests
    indexes = ['global', 'fulltext', 'caption', 'journal',
               'miscellaneous', 'reportnumber', 'year']

    @classmethod
    def setUp(self):
        # NOTE: 'self' is the class here (classmethod)
        if self.counter != 0:
            # the fixture is prepared before the first test only
            return
        # remember the current dates, they are restored in tearDown
        self.last_updated = dict(get_last_updated_all_indexes())
        history = run_sql("SELECT job_date FROM hstRECORD WHERE id_bibrec=10 AND affected_fields<>''")
        self.hst_date = history[0][0]
        # one second before the history entry of record 10
        date_to_set = self.hst_date - timedelta(seconds=1)
        for index in self.indexes:
            run_sql("""UPDATE idxINDEX SET last_updated=%s
WHERE name=%s""", (str(date_to_set), index))

    @classmethod
    def tearDown(self):
        self.counter += 1
        if self.counter < 8:
            return
        # all eight tests are done: put the original dates back
        for index in self.indexes:
            run_sql("""UPDATE idxINDEX SET last_updated=%s
WHERE name=%s""", (self.last_updated[index], index))

    def _affected_in_regular_indexes(self, low, high):
        """returns result of 'find_affected_records_for_index' for all
           non-virtual indexes and the single recID range [low, high]"""
        regular_indexes = get_all_indexes(virtual=False)
        return find_affected_records_for_index(regular_indexes, [[low, high]])

    def test_find_proper_indexes(self):
        """bibindex - checks if affected indexes are found correctly"""
        records_for_indexes = self._affected_in_regular_indexes(1, 20)
        expected = ['miscellaneous', 'fulltext', 'caption',
                    'journal', 'reportnumber', 'year']
        self.assertEqual(sorted(expected), sorted(records_for_indexes.keys()))

    def test_find_proper_recrods_for_miscellaneous_index(self):
        """bibindex - checks if affected recids are found correctly for miscellaneous index"""
        records_for_indexes = self._affected_in_regular_indexes(1, 20)
        self.assertEqual(records_for_indexes['miscellaneous'], [10, 12])

    def test_find_proper_records_for_year_index(self):
        """bibindex - checks if affected recids are found correctly for year index"""
        records_for_indexes = self._affected_in_regular_indexes(1, 20)
        self.assertEqual(records_for_indexes['year'], [10, 12])

    def test_find_proper_records_for_caption_index(self):
        """bibindex - checks if affected recids are found correctly for caption index"""
        records_for_indexes = self._affected_in_regular_indexes(1, 100)
        self.assertEqual(records_for_indexes['caption'], [10, 12, 55, 98])

    def test_find_proper_records_for_journal_index(self):
        """bibindex - checks if affected recids are found correctly for journal index"""
        records_for_indexes = self._affected_in_regular_indexes(1, 100)
        self.assertEqual(records_for_indexes['journal'], [10])

    def test_find_proper_records_specified_only_year(self):
        """bibindex - checks if affected recids are found correctly for year index if we specify only year index as input"""
        records_for_indexes = find_affected_records_for_index(["year"], [[1, 100]])
        self.assertEqual(records_for_indexes["year"], [10, 12, 55])

    def test_find_proper_records_force_all(self):
        """bibindex - checks if all recids will be assigned to all specified indexes"""
        records_for_indexes = find_affected_records_for_index(["year", "title"],
                                                              [[10, 15]],
                                                              True)
        year_records = records_for_indexes["year"]
        self.assertEqual(year_records, records_for_indexes["title"])
        self.assertEqual(records_for_indexes["year"], [10, 11, 12, 13, 14, 15])

    def test_find_proper_records_nothing_for_title_index(self):
        """bibindex - checks if nothing was found for title index in range of records: 1 - 20"""
        records_for_indexes = find_affected_records_for_index(["title"], [[1, 20]])
        # there must be no 'title' entry in the result at all
        self.assertRaises(KeyError, lambda: records_for_indexes["title"])
class BibIndexIndexingAffectedIndexes(InvenioTestCase):
    """Checks the content of the title, author and global indexes after
       two freshly inserted test records (each with a second revision)
       have been indexed.

       The fixture is built once, before the first test, and removed
       after the third one; state is kept on the class because
       setUp/tearDown are class methods.
    """

    started = False  # True once the test records are inserted and indexed
    records = []     # recIDs of the two test records, in insertion order
    counter = 0      # number of setUp calls so far

    @staticmethod
    def _word_table(index_name, index_tags):
        """creates WordTable of type "Words" for the given index"""
        return WordTable(index_name=index_name,
                         fields_to_index=index_tags,
                         table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"],
                         tag_to_tokenizer_map={'8564_u': "BibIndexEmptyTokenizer"},
                         wash_index_terms=50)

    @classmethod
    def setUp(self):
        # NOTE: 'self' is the class here (classmethod)
        self.counter += 1
        if not self.started:
            self.records.append(insert_record_one_and_second_revision())
            self.records.append(insert_record_two_and_second_revision())
            # self.records doubles as a [low, high] recID range
            records_for_indexes = find_affected_records_for_index(get_all_indexes(virtual=False),
                                                                  [self.records])
            wtabs = get_word_tables(records_for_indexes.keys())
            for index_id, index_name, index_tags in wtabs:
                wordTable = self._word_table(index_name, index_tags)
                wordTable.add_recIDs([self.records], 10000)
            # propagate the changes to the virtual index
            vit = VirtualIndexTable('global', CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"])
            vit.run_update()
            self.started = True

    @classmethod
    def tearDown(self):
        # clean up only after the last (third) test
        if self.counter == 3:
            for rec in self.records:
                wipe_out_record_from_all_tables(rec)
            indexes = get_all_indexes(virtual=False)
            wtabs = get_word_tables(indexes)
            for index_id, index_name, index_tags in wtabs:
                wordTable = self._word_table(index_name, index_tags)
                wordTable.del_recIDs([self.records])
            vit = VirtualIndexTable('global', CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"])
            vit.run_update()

    def _termlists_of_test_records(self, index_name):
        """returns pair of deserialized termlists (first test record,
           second test record) taken from the reversed word table
           of the given index"""
        index_id = get_index_id_from_index_name(index_name)
        query = """SELECT termlist FROM idxWORD%02dR WHERE id_bibrec IN (""" % (index_id,)
        # ORDER BY makes sure that rows come back in the order of insertion
        # of the records; without it the order of rows is not guaranteed
        query = query + ", ".join(map(str, self.records)) + ") ORDER BY id_bibrec"
        resp = run_sql(query)
        return (deserialize_via_marshal(resp[0][0]),
                deserialize_via_marshal(resp[1][0]))

    def test_proper_content_in_title_index(self):
        """bibindex - checks reindexation of title index for test records.."""
        title_rec1, title_rec2 = self._termlists_of_test_records('title')
        self.assertEqual(['univers', 'particl'], title_rec1)
        self.assertEqual(['of', 'cours', 'collis'], title_rec2)

    def test_proper_content_in_author_index(self):
        """bibindex - checks reindexation of author index for test records.."""
        author_rec1, author_rec2 = self._termlists_of_test_records('author')
        self.assertEqual(['dawkins', 'richard', ], author_rec1)
        self.assertEqual(['john', 'locke'], author_rec2)

    def test_proper_content_in_global_index(self):
        """bibindex - checks reindexation of global index for test records.."""
        global_rec1, global_rec2 = self._termlists_of_test_records('global')
        misc_prefix = make_prefix("miscellaneous")
        title_prefix = make_prefix("title")
        self.assertEqual(True, misc_prefix + 'dawkin' in global_rec1)
        self.assertEqual(False, misc_prefix + 'close' in global_rec1)
        self.assertEqual(True, title_prefix + 'univers' in global_rec1)
        self.assertEqual(True, misc_prefix + 'john' in global_rec2)
        self.assertEqual(False, misc_prefix + 'john' in global_rec1)
class BibIndexFindingIndexesForTags(InvenioTestCase):
    """ Tests function 'get_tag_indexes' """

    def _index_names(self, tag, **kwargs):
        """returns tuple with the second item of every entry returned by
           'get_tag_indexes' (compared against index names in the tests).

           Extra keyword arguments (e.g. virtual=False) are passed on
           unchanged. Unlike 'zip(*result)[1]' this gives an empty tuple
           when nothing is found, so the test fails on its assertion
           instead of breaking with IndexError, and it does not rely on
           'zip' returning a list.
        """
        return tuple(entry[1] for entry in get_tag_indexes(tag, **kwargs))

    def test_fulltext_tag_virtual_indexes_on(self):
        """bibindex - checks if 'get_tag_indexes' for tag 8564_u will find only 'fulltext' index"""
        self.assertEqual(('fulltext',), self._index_names('8564_u'))

    def test_title_tag_virtual_indexes_on(self):
        """bibindex - checks if 'get_tag_indexes' for tag 245__% will find also 'global' index"""
        self.assertEqual(('title', 'exacttitle', 'global'),
                         self._index_names('245__%'))

    def test_title_tag_virtual_indexes_off(self):
        """bibindex - checks if 'get_tag_indexes' for tag 245__% wont find 'global' index (with virtual=False)"""
        self.assertEqual(('title', 'exacttitle'),
                         self._index_names('245__%', virtual=False))

    def test_author_tag_virtual_indexes_on(self):
        """bibindex - checks 'get_tag_indexes' for tag '100'"""
        self.assertEqual(('author', 'affiliation', 'exactauthor', 'firstauthor',
                          'exactfirstauthor', 'authorcount', 'authorityauthor',
                          'miscellaneous', 'global'),
                         self._index_names('100'))

    def test_author_exact_tag_virtual_indexes_off(self):
        """bibindex - checks 'get_tag_indexes' for tag '100__a'"""
        self.assertEqual(('author', 'exactauthor', 'firstauthor',
                          'exactfirstauthor', 'authorcount',
                          'authorityauthor', 'miscellaneous'),
                         self._index_names('100__a', virtual=False))

    def test_wide_tag_virtual_indexes_off(self):
        """bibindex - checks 'get_tag_indexes' for tag like '86%'"""
        self.assertEqual(('miscellaneous',),
                         self._index_names('86%', virtual=False))

    def test_909_tags_in_misc_index(self):
        """bibindex - checks connection between misc index and tags: 909C1%, 909C4%"""
        self.assertEqual(('miscellaneous',),
                         self._index_names('909C1%', virtual=False))
        self.assertEqual('miscellaneous' in self._index_names('909C4%', virtual=False),
                         False)

    def test_year_tag_virtual_indexes_on(self):
        """bibindex - checks 'get_tag_indexes' for tag 909C0y"""
        self.assertEqual(('year', 'global'), self._index_names('909C0y'))

    def test_wide_tag_authority_index_virtual_indexes_off(self):
        """bibindex - checks 'get_tag_indexes' for tag like '15%'"""
        self.assertEqual(('authoritysubject', 'miscellaneous'),
                         self._index_names('15%', virtual=False))
class BibIndexFindingTagsForIndexes(InvenioTestCase):
    """ Tests function 'get_index_tags' """

    def test_tags_for_author_index(self):
        """bibindex - checks if 'get_index_tags' find proper tags for 'author' index """
        author_tags = get_index_tags('author')
        self.assertEqual(author_tags, ['100__a', '700__a'])

    def test_tags_for_global_index_virtual_indexes_off(self):
        """bibindex - checks if 'get_index_tags' find proper tags for 'global' index """
        global_tags = get_index_tags('global', virtual=False)
        self.assertEqual(global_tags, [])

    def test_tags_for_global_index_virtual_indexes_on(self):
        """bibindex - checks if 'get_index_tags' find proper tags for 'global' index """
        tags = get_index_tags('global')
        # each of these tags has to be reported for the 'global' index
        for expected_tag in ('86%', '100__a', '245__%'):
            self.assertEqual(expected_tag in tags, True)
class BibIndexGlobalIndexContentTest(InvenioTestCase):
    """ Tests if virtual global index is correctly indexed"""

    def is_part_of(self, container, content):
        """checks if content is a part of container"""
        return set(content).issubset(set(container))

    def _reversed_termlist(self, index_id, recid):
        """returns deserialized termlist of the record taken from
           the reversed word table of the index with given id"""
        query = """SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=%s""" % (index_id, recid)
        rows = run_sql(query)
        return deserialize_via_marshal(rows[0][0])

    def _check_reversed_table(self, index_name, recids):
        """for every record checks that all terms of the given index,
           with the prefix of that index added, are among the terms
           of the 'global' index"""
        global_id = get_index_id_from_index_name('global')
        dependent_id = get_index_id_from_index_name(index_name)
        prefix = make_prefix(index_name)
        for rec in recids:
            termlist_dependent = [prefix + item
                                  for item in self._reversed_termlist(dependent_id, rec)]
            termlist_global = self._reversed_termlist(global_id, rec)
            self.assertEqual(self.is_part_of(termlist_global, termlist_dependent), True)

    def _check_forward_table(self, query_template, index_name):
        """checks that every term in the forward table of the given index
           is also in the corresponding forward table of the 'global' index;
           query_template selects terms and expects an index id"""
        global_id = get_index_id_from_index_name('global')
        dependent_id = get_index_id_from_index_name(index_name)
        # first column of the result, i.e. the terms themselves
        res = zip(*run_sql(query_template % dependent_id))[0]
        glob = zip(*run_sql(query_template % global_id))[0]
        self.assertEqual(self.is_part_of(glob, res), True)

    def test_title_index_compatibility_reversed_table(self):
        """bibindex - checks if the same words are in title and global index, reversed table"""
        self._check_reversed_table('title', range(1, 4))

    def test_abstract_index_compatibility_reversed_table(self):
        """bibindex - checks if the same words are in abstract and global index, reversed table"""
        self._check_reversed_table('abstract', range(6, 9))

    def test_misc_index_compatibility_reversed_table(self):
        """bibindex - checks if the same words are in misc and global index, reversed table"""
        self._check_reversed_table('miscellaneous', range(10, 14))

    def test_journal_index_compatibility_forward_table(self):
        """bibindex - checks if the same words are in journal and global index, forward table"""
        self._check_forward_table("""SELECT term FROM idxWORD%02dF""", 'journal')

    def test_keyword_index_compatibility_forward_table(self):
        """bibindex - checks if the same pairs are in keyword and global index, forward table"""
        self._check_forward_table("""SELECT term FROM idxPAIR%02dF""", 'keyword')

    def test_affiliation_index_compatibility_forward_table(self):
        """bibindex - checks if the same phrases are in affiliation and global index, forward table"""
        self._check_forward_table("""SELECT term FROM idxPHRASE%02dF""", 'affiliation')
class BibIndexVirtualIndexAlsoChangesTest(InvenioTestCase):
    """ Tests if virtual index changes after changes in dependent index"""

    # The tests depend on each other: the number in a test name fixes the
    # order of execution and 'counter' (number of setUp calls so far)
    # decides what setUp/tearDown do around each of them.
    counter = 0
    indexes = ["title"]
    _id = 39
    new_index_name = ""

    @classmethod
    def _index_and_update(self, tokenizer_function=None):
        """indexes records 1-10 for dependent indexes (with the given
           tokenizer function when there is one) and then updates
           the new virtual index and the 'global' index"""
        wtabs = get_word_tables(self.indexes)
        for index_id, index_name, index_tags in wtabs:
            wordTable = WordTable(index_name=index_name,
                                  fields_to_index=index_tags,
                                  table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"],
                                  tag_to_tokenizer_map={'8564_u': "BibIndexEmptyTokenizer"},
                                  wash_index_terms=50)
            if tokenizer_function is not None:
                wordTable.default_tokenizer_function = tokenizer_function
            wordTable.add_recIDs([[1, 10]], 1000)
        for virtual_name in (self.new_index_name, 'global'):
            vit = VirtualIndexTable(virtual_name,
                                    CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"])
            vit.run_update()

    @classmethod
    def prepare_virtual_index(self):
        """creates new virtual index and binds it to specific normal index"""
        self.new_index_name = create_virtual_index(self._id, self.indexes)
        self._index_and_update()

    @classmethod
    def reindex_virtual_index(self, special_tokenizer=False):
        """reindexes virtual and dependent indexes with different tokenizer"""
        def tokenize_for_words(phrase):
            return phrase.split(" ")

        if special_tokenizer == True:
            self._index_and_update(tokenize_for_words)
        else:
            self._index_and_update()

    @classmethod
    def setUp(self):
        # NOTE: 'self' is the class here (classmethod)
        self.counter += 1
        if self.counter == 1:
            self.prepare_virtual_index()
        elif self.counter == 2:
            self.reindex_virtual_index(special_tokenizer=True)

    @classmethod
    def tearDown(self):
        if self.counter == 3:
            # back to the regular tokenizer before the last test
            self.reindex_virtual_index()
        elif self.counter == 4:
            remove_virtual_index(self._id)

    def _termlist_of_record(self, query):
        """runs the query and returns deserialized termlist from its first row"""
        return deserialize_via_marshal(run_sql(query)[0][0])

    def test_virtual_index_1_has_10_records(self):
        """bibindex - checks if virtual index was filled with only ten records from title index"""
        query = "SELECT count(*) FROM idxWORD%02dR" % self._id
        number_of_records = run_sql(query)[0][0]
        self.assertEqual(10, number_of_records)

    def test_virtual_index_2_correct_content_record_1(self):
        """bibindex - after reindexing with different tokenizer virtual index also changes - record 1"""
        query = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=%s" % (self._id, 1)
        prefix = make_prefix("title")
        terms = self._termlist_of_record(query)
        self.assertEqual(prefix + 'Higgs' in terms, True)

    def test_virtual_index_3_correct_content_record_3(self):
        """bibindex - after reindexing with different tokenizer virtual index also changes - record 3"""
        query = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=%s" % (self._id, 3)
        prefix = make_prefix("title")
        expected = [prefix + item
                    for item in ('Conference', 'Biology', 'Molecular', 'European')]
        self.assertEqual(expected, self._termlist_of_record(query))

    def test_virtual_index_4_cleaned_up(self):
        """bibindex - after reindexing with normal title tokenizer everything is back to normal"""
        #this is version of test for installation with PyStemmer package
        #without this package word 'biology' is stemmed differently
        query = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=%s" % (self._id, 3)
        prefix = make_prefix("title")
        expected = [prefix + item
                    for item in ('biolog', 'molecular', 'confer', 'european')]
        self.assertEqual(expected, self._termlist_of_record(query))
class BibIndexVirtualIndexRemovalTest(InvenioTestCase):
    """Tests content of a virtual index after its dependent indexes
       are removed from it.

       The virtual index is created before the first test and removed
       after the ninth one; 'counter' holds the number of setUp calls.
    """

    counter = 0
    indexes = ["authorcount", "journal", "year"]
    _id = 40
    new_index_name = ""

    @classmethod
    def setUp(self):
        # NOTE: 'self' is the class here (classmethod)
        self.counter += 1
        if self.counter != 1:
            return
        self.new_index_name = create_virtual_index(self._id, self.indexes)
        wtabs = get_word_tables(self.indexes)
        for index_id, index_name, index_tags in wtabs:
            wordTable = WordTable(index_name=index_name,
                                  fields_to_index=index_tags,
                                  table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"],
                                  tag_to_tokenizer_map={'8564_u': "BibIndexFulltextTokenizer"},
                                  wash_index_terms=50)
            wordTable.add_recIDs([[1, 113]], 1000)
        vit = VirtualIndexTable(self.new_index_name,
                                CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"])
        vit.run_update()
        #removal part
        vit.remove_dependent_index("authorcount")

    @classmethod
    def tearDown(self):
        if self.counter == 9:
            remove_virtual_index(self._id)

    def _query_virtual_index(self, query_template):
        """runs the query on tables of the virtual index under test;
           query_template expects the id of the index"""
        return run_sql(query_template % self._id)

    def _terms_without_prefix(self, query_template):
        """returns termlist from the first row of the query result,
           with everything matched by re_prefix cut out of each term"""
        res = self._query_virtual_index(query_template)
        terms = deserialize_via_marshal(res[0][0])
        return [re.sub(re_prefix, '', term) for term in terms]

    def test_authorcount_removal_number_of_items(self):
        """bibindex - checks virtual index after authorcount index removal - number of items"""
        res = self._query_virtual_index("""SELECT count(*) FROM idxWORD%02dF""")
        self.assertEqual(157, res[0][0])

    def test_authorcount_removal_common_terms_intact(self):
        """bibindex - checks virtual index after authorcount index removal - common terms"""
        res = self._query_virtual_index("""SELECT term FROM idxWORD%02dF WHERE term IN ('10', '2', '4', '7')""")
        self.assertEqual(4, len(res))

    def test_authorcount_removal_no_315_term(self):
        """bibindex - checks virtual index after authorcount index removal - no '315' term in virtual index"""
        res = self._query_virtual_index("""SELECT term FROM idxWORD%02dF WHERE term='315'""")
        self.assertEqual(0, len(res))

    def test_authorcount_removal_term_10_hitlist(self):
        """bibindex - checks virtual index after authorcount index removal - hitlist for '10' term"""
        res = self._query_virtual_index("""SELECT hitlist FROM idxWORD%02dF WHERE term='10'""")
        hitlist = intbitset(res[0][0])
        self.assertEqual([80, 92], hitlist.tolist())

    def test_authorcount_removal_term_1985_hitlist(self):
        """bibindex - checks virtual index after authorcount index removal - hitlist for '1985' term"""
        res = self._query_virtual_index("""SELECT hitlist FROM idxWORD%02dF WHERE term='1985'""")
        hitlist = intbitset(res[0][0])
        self.assertEqual([16, 18], hitlist.tolist())

    def test_authorcount_removal_record_16_hitlist(self):
        """bibindex - checks virtual index after authorcount index removal - termlist for record 16"""
        terms = self._terms_without_prefix("""SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=16""")
        self.assertEqual(['1985'], terms)

    def test_authorcount_removal_record_10_hitlist(self):
        """bibindex - checks virtual index after authorcount index removal - termlist for record 10"""
        terms = self._terms_without_prefix("""SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=10""")
        self.assertEqual(sorted(['2002', 'Eur. Phys. J., C']), sorted(terms))

    def test_year_removal_number_of_items(self):
        """bibindex - checks virtual index after year removal - number of items"""
        #must be run after: tearDown
        vit = VirtualIndexTable(self.new_index_name,
                                CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"])
        vit.remove_dependent_index("year")
        res = self._query_virtual_index("""SELECT count(*) FROM idxWORD%02dF""")
        self.assertEqual(134, res[0][0])

    def test_year_removal_record_18_hitlist(self):
        """bibindex - checks virtual index after year removal - termlist for record 18"""
        #must be run after: tearDown, test_year_removal_number_of_items
        terms = self._terms_without_prefix("""SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=18""")
        expected = ['151', '357', '1985', 'Phys. Lett., B 151 (1985) 357', 'Phys. Lett., B']
        self.assertEqual(sorted(expected), sorted(terms))
class BibIndexCLICallTest(InvenioTestCase):
    """Tests if calls to bibindex from CLI (bibsched daemon) are run correctly"""

    def _reindex_and_read_log(self, index_name, **kwargs):
        """calls 'reindex_for_type_with_bibsched' for the given index
           (extra keyword arguments are passed on unchanged) and returns
           the whole content of the log file of the resulting task"""
        task_id = reindex_for_type_with_bibsched(index_name, **kwargs)
        filename = task_log_path(task_id, 'log')
        fl = open(filename)
        try:
            # small file, can be read in one go
            return fl.read()
        finally:
            # close the file also when reading fails
            fl.close()

    def test_correct_message_for_wrong_index_names(self):
        """bibindex - checks if correct message for wrong index appears"""
        text = self._reindex_and_read_log("titlexrg", force_all=True)
        self.assertTrue("Specified indexes can't be found." in text)

    def test_correct_message_for_up_to_date_indexes(self):
        """bibindex - checks if correct message for index up to date appears"""
        text = self._reindex_and_read_log("abstract")
        self.assertTrue("Selected indexes/recIDs are up to date." in text)
class BibIndexCommonWordsInVirtualIndexTest(InvenioTestCase):
    """Tests if WordTable indexes virtual index correctly in case when
       two or more dependent indexes have common words and we change
       only one of them
    """

    # The tests depend on each other: the number in a test name fixes the
    # order of execution. Before the third test stemming is switched off
    # for the 'title' index and the index is reindexed; after the eighth
    # test stemming is switched back to 'en'.
    counter = 0
    index_name = 'title'
    prefix = make_prefix("title")

    @classmethod
    def _reindex_with_stemming_language(self, stemming_language):
        """sets stemming language of the index under test, reindexes its
           words (records 1-9) and pairs (records 6-9) and updates
           the 'global' virtual index after each of the two steps"""
        index_id = get_index_id_from_index_name(self.index_name)
        index_tags = get_index_tags(self.index_name)
        # tests are too fast for DataCacher timestamp_verifier to notice the difference
        sleep(1)
        # use id of the index found above instead of a hard-coded number
        run_sql("""UPDATE idxINDEX SET stemming_language=%s WHERE id=%s""",
                (stemming_language, index_id))
        for table_type_name, recid_range in (("Words", [1, 9]),
                                             ("Pairs", [6, 9])):
            table_type = CFG_BIBINDEX_INDEX_TABLE_TYPE[table_type_name]
            wordTable = WordTable(index_name=self.index_name,
                                  fields_to_index=index_tags,
                                  table_type=table_type,
                                  tag_to_tokenizer_map={'8564_u': "BibIndexEmptyTokenizer"},
                                  wash_index_terms=50)
            wordTable.add_recIDs([recid_range], 1000)
            vit = VirtualIndexTable('global', table_type)
            vit.run_update()

    @classmethod
    def setUp(self):
        # NOTE: 'self' is the class here (classmethod)
        self.counter += 1
        if self.counter == 3:
            self._reindex_with_stemming_language('')

    def tearDown(self):
        if self.counter == 8:
            self._reindex_with_stemming_language('en')

    def _termlist(self, query, strip_prefix=False):
        """returns deserialized termlist from the first row of the query
           result; with strip_prefix everything matched by re_prefix
           is cut out of each term"""
        terms = deserialize_via_marshal(run_sql(query)[0][0])
        if strip_prefix:
            terms = [re.sub(re_prefix, '', term) for term in terms]
        return terms

    def test_1_initial_state_of_record_1(self):
        """bibindex - checks if record 1 has proper initial state for word: experiment"""
        terms = self._termlist("""SELECT termlist FROM idxWORD08R WHERE id_bibrec=1""")
        self.assertEqual(terms.count('experi'), 1)
        terms = self._termlist("""SELECT termlist FROM idxWORD01R WHERE id_bibrec=1""",
                               strip_prefix=True)
        self.assertEqual(terms.count('experi'), 2)
        self.assertEqual(terms.count('experiment'), 1)

    def test_2_initial_state_of_record_3(self):
        """bibindex - checks if record 3 has proper initial state for word: biology"""
        terms = self._termlist("""SELECT termlist FROM idxWORD08R WHERE id_bibrec=3""")
        self.assertEqual(terms.count('biolog'), 1)
        self.assertEqual(terms.count('biology'), 0)
        terms = self._termlist("""SELECT termlist FROM idxWORD01R WHERE id_bibrec=3""",
                               strip_prefix=True)
        self.assertEqual(terms.count('biolog'), 2)

    def test_3_experiment_in_record_1(self):
        """bibindex - checks count of 'experiment' and 'experi' words in global virtual index"""
        terms = self._termlist("""SELECT termlist FROM idxWORD01R WHERE id_bibrec=1""",
                               strip_prefix=True)
        self.assertEqual(terms.count('experi'), 1)
        self.assertEqual(terms.count('experiment'), 2)

    def test_4_boson_in_record_1(self):
        """bibindex - checks count of 'boson' - it doesn't change"""
        terms = self._termlist("""SELECT termlist FROM idxWORD01R WHERE id_bibrec=1""",
                               strip_prefix=True)
        self.assertEqual(terms.count('boson'), 3)

    def test_5_biology_in_record_3(self):
        """bibindex - checks count of 'biology' word in record 3"""
        terms = self._termlist("""SELECT termlist FROM idxWORD01R WHERE id_bibrec=3""",
                               strip_prefix=True)
        self.assertEqual(terms.count('biology'), 2)
        self.assertEqual(terms.count('biolog'), 1)
        terms = self._termlist("""SELECT termlist FROM idxWORD08R WHERE id_bibrec=3""")
        self.assertEqual(terms.count('biolog'), 0)

    def test_6_supersymmetry_in_record_9(self):
        """bibindex - checks count of 'supersymmetry' word in record 9"""
        terms = self._termlist("""SELECT termlist FROM idxWORD01R WHERE id_bibrec=9""",
                               strip_prefix=True)
        # the stemmed form must be gone from the virtual index
        self.assertEqual(terms.count('supersymmetri'), 0)

    def test_7_biology_in_record_3_forward_table(self):
        """bibindex - checks if 'biolog' word is in forward table"""
        query = """SELECT term FROM idxWORD01F WHERE term='biolog'"""
        res = run_sql(query)
        self.assertEqual('biolog', res[0][0])

    def test_8_nobel_prizewinners_pair_in_record_6(self):
        """bibindex - checks if 'nobel prizewinners' is in virtual index"""
        terms = self._termlist("""SELECT termlist FROM idxPAIR08R WHERE id_bibrec=6""")
        self.assertEqual('nobel prizewinners' in terms, True)
        terms = self._termlist("""SELECT termlist FROM idxPAIR01R WHERE id_bibrec=6""",
                               strip_prefix=True)
        self.assertEqual('nobel prizewinn' in terms, True)
        self.assertEqual('nobel prizewinners' in terms, True)
class BibIndexVirtualIndexQueueTableTest(InvenioTestCase):
    """Tests communication through Queue tables between virtual index and
       dependent indexes.

       Every test that indexes a dependent index finishes by triggering an
       update of the virtual 'global' index.  That update is run from a
       ``finally`` clause so that a failing assertion does not leave rows
       behind in the queue tables for the tests that follow.
    """

    @classmethod
    def index_dependent_index(cls, index_name, records_range, table_type):
        """indexes a dependent index for given record range

        @param index_name: name of the dependent index, e.g. 'title'
        @param records_range: list of [low, high] record id pairs that is
            handed over to WordTable.add_recIDs
        @param table_type: one of the CFG_BIBINDEX_INDEX_TABLE_TYPE values
        """
        word_table = WordTable(index_name=index_name,
                               fields_to_index=get_index_tags(index_name),
                               table_type=table_type,
                               tag_to_tokenizer_map={'8564_u': "BibIndexEmptyTokenizer"},
                               wash_index_terms=50)
        # NOTE(review): 10000 is the second positional argument of
        # add_recIDs; presumably a flush/chunk size -- confirm against
        # bibindex_engine.WordTable.
        word_table.add_recIDs(records_range, 10000)

    @classmethod
    def run_update_for_virtual_index(cls, table_type):
        """triggers an update in virtual 'global' index

        @param table_type: one of the CFG_BIBINDEX_INDEX_TABLE_TYPE values
        """
        vit = VirtualIndexTable('global', table_type)
        vit.run_update()

    def test_1_correct_entry_in_queue_for_word_table(self):
        """bibindex - checks correct entry in queue table for words"""
        table_type = CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"]
        try:
            self.index_dependent_index('title', [[10, 14]], table_type)
            res = run_sql("SELECT * FROM idxWORD01Q")
            # columns 2 and 3 of the first row are expected to carry the
            # record range that was just indexed
            self.assertEqual((10, 14), (res[0][2], res[0][3]))
        finally:
            # always consume the queue, even when an assertion failed
            self.run_update_for_virtual_index(table_type)

    def test_2_correct_entry_in_queue_for_pair_table(self):
        """bibindex - checks correct entry in queue table for pairs"""
        table_type = CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"]
        try:
            self.index_dependent_index('collection', [[1, 5], [20, 21]], table_type)
            res = run_sql("SELECT * FROM idxPAIR01Q ORDER BY runtime,id DESC")
            # one queue row per indexed range
            self.assertEqual(2, len(res))
            self.assertEqual((20, 21), (res[0][2], res[0][3]))
            self.assertEqual('update', res[0][5])
        finally:
            # always consume the queue, even when an assertion failed
            self.run_update_for_virtual_index(table_type)

    def test_3_correct_entry_in_queue_for_phrase_table(self):
        """bibindex - checks correct entry in queue table for phrases"""
        table_type = CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"]
        try:
            self.index_dependent_index('keyword', [[19, 19]], table_type)
            res = run_sql("SELECT * FROM idxPHRASE01Q")
            self.assertEqual((19, 19), (res[0][2], res[0][3]))
            # column 4 is expected to carry the name of the dependent index
            self.assertEqual('keyword', res[0][4])
        finally:
            # always consume the queue, even when an assertion failed
            self.run_update_for_virtual_index(table_type)

    def test_4_no_entries_in_queue_table(self):
        """bibindex - checks if virtual index removes entries from queue table after update"""
        # NOTE(review): this test does not trigger an update itself; it
        # presumably relies on the earlier tests of this class having
        # drained idxWORD01Q -- confirm the intended execution order.
        res = run_sql("SELECT * FROM idxWORD01Q")
        empty = tuple()
        self.assertEqual(empty, res)

    def test_5_remove_duplicates_in_queue_table(self):
        """bibindex - checks if duplicates are removed"""
        index_name = 'title'
        table_type = CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"]
        try:
            # the range [10, 14] is queued twice on purpose, with a
            # different range in between
            self.index_dependent_index(index_name, [[10, 14]], table_type)
            self.index_dependent_index(index_name, [[20, 23]], table_type)
            self.index_dependent_index(index_name, [[10, 14]], table_type)
            query = """SELECT id_bibrec_low, id_bibrec_high, mode FROM idx%s01Q
WHERE index_name='%s' ORDER BY runtime ASC""" % (table_type, index_name)
            entries_before = run_sql(query)
            vit = VirtualIndexTable('global', table_type)
            entries_after = vit.remove_duplicates(entries_before)
            self.assertEqual(len(entries_before), 3)
            self.assertEqual(len(entries_after), 2)
            # assertEqual reports both values on failure
            self.assertEqual(entries_before[1], entries_after[1])
        finally:
            # always consume the queue, even when an assertion failed
            self.run_update_for_virtual_index(table_type)
# Collect every test case class of this module into a single suite.
TEST_SUITE = make_test_suite(
    BibIndexRemoveStopwordsTest,
    BibIndexRemoveLatexTest,
    BibIndexRemoveHtmlTest,
    BibIndexYearIndexTest,
    BibIndexAuthorCountIndexTest,
    BibIndexItemCountIndexTest,
    BibIndexFiletypeIndexTest,
    BibIndexJournalIndexTest,
    BibIndexCJKTokenizerTitleIndexTest,
    BibIndexAuthorityRecordTest,
    BibIndexFindingAffectedIndexes,
    BibIndexIndexingAffectedIndexes,
    BibIndexFindingIndexesForTags,
    BibIndexFindingTagsForIndexes,
    BibIndexGlobalIndexContentTest,
    BibIndexVirtualIndexAlsoChangesTest,
    BibIndexVirtualIndexRemovalTest,
    BibIndexCLICallTest,
    BibIndexCommonWordsInVirtualIndexTest,
    BibIndexVirtualIndexQueueTableTest,
)

# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    run_test_suite(TEST_SUITE, warn_user=True)
Event Timeline
Log In to Comment