Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91069699
xapianutils_bibrank_searcher.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Nov 7, 13:34
Size
6 KB
Mime Type
text/x-python
Expires
Sat, Nov 9, 13:34 (2 d)
Engine
blob
Format
Raw Data
Handle
22188347
Attached To
R3600 invenio-infoscience
xapianutils_bibrank_searcher.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Xapian utilities.
"""
from
invenio.config
import
CFG_XAPIAN_ENABLED
from
invenio.intbitset
import
intbitset
from
invenio.xapianutils_config
import
INDEXES
,
XAPIAN_DIR
if CFG_XAPIAN_ENABLED:
    import xapian

    class MatchDecider(xapian.MatchDecider):
        """
        Match decider that only accepts documents whose docid belongs to a
        given collection of ids.
        """

        def __init__(self, ids):
            """
            Keeps the collection of accepted document ids.
            ids - container supporting the "in" operator
            """
            xapian.MatchDecider.__init__(self)
            self.ids = ids

        def __call__(self, document):
            """
            Tells whether the docid of the given document is part of the
            accepted ids.
            """
            docid = document.get_docid()
            return docid in self.ids
# Maps an index name to its opened Xapian database object; filled by
# xapian_init_databases() and read by xapian_get_ranked_index().
DATABASES = {}
def xapian_get_ranked_index(index, pattern, params, hitset, ranked_result_amount):
    """
    Queries a Xapian index.
    index - key of the database to query in DATABASES
    pattern - query string
    params - dictionary providing the "weight" every score is multiplied
             with and, optionally, "avoid_phrase_search_threshold"
    hitset - record ids the matches are restricted to
    ranked_result_amount - maximum number of matches requested from Xapian
    Returns: a list of ranked record ids [(recid, score), ...] contained in hitset
    and an intbitset of record ids contained in hitset.
    Raises KeyError if index is not in DATABASES or params has no "weight".
    """
    database = DATABASES[index]
    enquire = xapian.Enquire(database)

    qp = xapian.QueryParser()
    qp.set_stemmer(xapian.Stem("english"))
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

    # Avoids phrase search to increase performance
    if "avoid_phrase_search_threshold" in params \
       and len(hitset) >= params["avoid_phrase_search_threshold"] \
       and pattern.startswith('"'):
        # Drops the opening quote, and the closing one only if it is really
        # there, so that an unbalanced pattern does not lose a character.
        phrase = pattern[1:]
        if phrase.endswith('"'):
            phrase = phrase[:-1]
        # split() without argument ignores repeated whitespace; splitting on
        # a single space would produce empty terms ("a AND  AND b").
        query_string = ' AND '.join(phrase.split())
        query = qp.parse_query(query_string)
    else:
        query = qp.parse_query(pattern, xapian.QueryParser.FLAG_PHRASE)

    enquire.set_query(query)
    # The match decider limits the matches to documents of the hitset
    matches = enquire.get_mset(0, ranked_result_amount, None, MatchDecider(hitset))

    weight = params["weight"]
    result = []
    matched_recs = intbitset()
    for match in matches:
        recid = match.docid
        if recid in hitset:
            result.append((recid, int(match.percent) * weight))
            matched_recs.add(recid)

    return (result, matched_recs)
def xapian_init_databases():
    """
    Opens the Xapian database of every index listed in INDEXES and stores
    it in DATABASES under the index name.
    """
    for index_name in INDEXES:
        index_path = "/".join((XAPIAN_DIR, index_name))
        DATABASES[index_name] = xapian.Database(index_path)
def get_greatest_ranked_records(raw_reclist):
    """
    Returns one (recid, score) tuple per record id; when a record id occurs
    several times in raw_reclist, the greatest of its scores is kept.
    """
    best_scores = {}
    for recid, score in raw_reclist:
        # Skips a duplicate unless it strictly improves the stored score
        if recid in best_scores and not score > best_scores[recid]:
            continue
        best_scores[recid] = score
    return list(best_scores.items())
def word_similarity_xapian(pattern, hitset, params, verbose, field, ranked_result_amount):
    """
    Ranks the records of hitset against the given words and returns them
    sorted by ascending score.
    input:
    pattern - sequence of query words, joined with spaces before parsing
    hitset - record ids found by search_engine; must provide difference()
             (presumably an intbitset - confirm against the caller)
    params - dictionary read for "fields" (per field parameters handed to
             xapian_get_ranked_index), "default_field", "prefix" and "postfix"
    verbose - verbose value; debugging output is collected if it is > 0
    field - field to search (selected in GUI)
    ranked_result_amount - amount of results to be ranked
    output:
    recset - a list of (recid, score) tuples sorted by ascending score;
             records of hitset without a ranking get the score 0
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value
    """
    voutput = ""
    search_units = []

    if pattern:
        xapian_init_databases()
        pattern = " ".join(map(str, pattern))
        # Imported here and not at module level, as in the original code
        from invenio.search_engine import create_basic_search_units
        search_units = create_basic_search_units(None, pattern, field)

    if verbose > 0:
        voutput += "Hitset: %s<br/>" % hitset
        # One-element tuple so that an empty tuple pattern is formatted
        # instead of being taken as the (missing) format arguments.
        voutput += "Pattern: %s<br/>" % (pattern,)
        voutput += "Search units: %s<br/>" % search_units

    all_ranked_results = []
    included_hits = intbitset()
    excluded_hits = intbitset()

    # Own names for the unit values, the parameters are not overwritten
    for (operator, unit_pattern, unit_field, unit_type) in search_units:
        # Field might not exist
        if unit_field not in params["fields"]:
            unit_field = params["default_field"]

        if unit_type == "a":
            # Eliminates leading and trailing %; the trailing one is only
            # removed if present and an empty pattern is left untouched.
            if unit_pattern.startswith("%"):
                unit_pattern = unit_pattern[1:]
                if unit_pattern.endswith("%"):
                    unit_pattern = unit_pattern[:-1]
            unit_pattern = "\"" + unit_pattern + "\""

        (ranked_result_part, matched_recs) = xapian_get_ranked_index(
            unit_field, unit_pattern, params["fields"][unit_field],
            hitset, ranked_result_amount)

        if verbose > 0:
            voutput += "Index %s: %s<br/>" % (unit_field, ranked_result_part)
            voutput += "Index records %s: %s<br/>" % (unit_field, matched_recs)

        # Excludes - results
        if operator == "-":
            excluded_hits = excluded_hits.union(matched_recs)
        # + and | are interpreted as OR
        else:
            included_hits = included_hits.union(matched_recs)
            all_ranked_results.extend(ranked_result_part)

    ranked_result = []

    if hitset:
        # Removes the excluded records
        result_hits = included_hits.difference(excluded_hits)

        # A record matched by an included and by an excluded unit is not
        # part of result_hits and is added below as not ranked; its ranked
        # tuple has to be dropped, otherwise the record is returned twice.
        kept_results = [(recid, score) for (recid, score) in all_ranked_results
                        if recid in result_hits]

        # Avoids duplicate results and normalises scores
        ranked_result = get_greatest_ranked_records(kept_results)
        ranked_result = get_normalized_ranking_scores(ranked_result)

        # Considers not ranked records
        not_ranked = hitset.difference(result_hits)
        if not_ranked:
            ranked_result = [(recid, 0) for recid in not_ranked] + ranked_result

        if verbose > 0:
            voutput += "All matched records: %s<br/>" % result_hits
            voutput += "All ranked records: %s<br/>" % ranked_result
            voutput += "All not ranked records: %s<br/>" % not_ranked

        # Stable ascending sort on the score, same order as the former
        # cmp based sort but not limited to Python 2.
        ranked_result.sort(key=lambda record: record[1])
        return (ranked_result, params["prefix"], params["postfix"], voutput)

    return (ranked_result, "", "", voutput)
def get_normalized_ranking_scores(ranked_result):
    """
    Rescales the scores so that the greatest score becomes 100.
    ranked_result - list of records whose first item is the record id and
                    whose second item is the score
    Returns: a new list [(recid, normalized_score), ...] in the same order,
    the normalized scores being integers. If no score is greater than 0,
    every record gets the score 0.
    """
    max_score = 0
    for res in ranked_result:
        if res[1] > max_score:
            max_score = res[1]

    if not max_score:
        # Nothing to scale against: dividing by max_score would raise a
        # ZeroDivisionError for a non-empty list without positive score.
        return [(res[0], 0) for res in ranked_result]

    return [(res[0], int(100.0 / max_score * res[1])) for res in ranked_result]
Event Timeline
Log In to Comment