Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91849741
websearch_external_collections.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Nov 15, 03:13
Size
20 KB
Mime Type
text/x-python
Expires
Sun, Nov 17, 03:13 (2 d)
Engine
blob
Format
Raw Data
Handle
22333768
Attached To
R3600 invenio-infoscience
websearch_external_collections.py
View Options
# -*- coding: utf-8 -*-
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""External collection 'core' file.
Perform search, database access."""
__revision__
=
"$Id$"
import
cgi
import
sys
from
copy
import
copy
if
sys
.
hexversion
<
0x2040000
:
# pylint: disable=W0622
from
sets
import
Set
as
set
# pylint: enable=W0622
from
invenio.config
import
CFG_SITE_LANG
from
invenio.dbquery
import
run_sql
,
OperationalError
from
invenio.messages
import
gettext_set_language
from
invenio.websearch_external_collections_config
import
CFG_EXTERNAL_COLLECTION_TIMEOUT
from
invenio.websearch_external_collections_searcher
import
external_collections_dictionary
from
invenio.websearch_external_collections_getter
import
HTTPAsyncPageGetter
,
async_download
from
invenio.websearch_external_collections_templates
import
print_results
,
print_timeout
from
invenio.websearch_external_collections_utils
import
get_collection_id
,
get_collection_descendants
,
\
warning
,
get_verbose_print
import
invenio.template
# Global variables
template
=
invenio
.
template
.
load
(
'websearch_external_collections'
)
external_collections_state
=
None
dico_collection_external_searches
=
None
dico_collection_seealso
=
None
#dico_collection_external_searches = {}
#dico_collection_seealso = {}
def
print_external_results_overview
(
req
,
current_collection
,
pattern_list
,
field
,
external_collection
,
verbosity_level
=
0
,
lang
=
CFG_SITE_LANG
):
"""Print the external collection overview box. Return the selected external collections and parsed query"""
from
invenio.search_engine
import
create_basic_search_units
assert
req
vprint
=
get_verbose_print
(
req
,
'External collection (print_external_results_overview): '
,
verbosity_level
)
pattern
=
bind_patterns
(
pattern_list
)
vprint
(
3
,
'pattern = '
+
pattern
)
if
not
pattern
:
return
(
None
,
None
,
None
,
None
)
basic_search_units
=
create_basic_search_units
(
None
,
pattern
,
field
)
vprint
(
3
,
'basic_search_units = '
+
str
(
basic_search_units
))
(
search_engines
,
seealso_engines
)
=
select_external_engines
(
current_collection
,
external_collection
)
vprint
(
3
,
'search_engines = '
+
str
(
search_engines
))
vprint
(
3
,
'seealso_engines = '
+
str
(
seealso_engines
))
search_engines_list
=
external_collection_sort_engine_by_name
(
search_engines
)
vprint
(
3
,
'search_engines_list (sorted) : '
+
str
(
search_engines_list
))
html
=
template
.
external_collection_overview
(
lang
,
search_engines_list
)
req
.
write
(
html
)
return
(
search_engines
,
seealso_engines
,
pattern
,
basic_search_units
)
def
perform_external_collection_search
(
req
,
current_collection
,
pattern_list
,
field
,
external_collection
,
verbosity_level
=
0
,
lang
=
CFG_SITE_LANG
,
selected_external_collections_infos
=
None
):
"""Search external collection and print the seealso box."""
vprint
=
get_verbose_print
(
req
,
'External collection: '
,
verbosity_level
)
if
selected_external_collections_infos
:
(
search_engines
,
seealso_engines
,
pattern
,
basic_search_units
)
=
selected_external_collections_infos
else
:
(
search_engines
,
seealso_engines
,
pattern
,
basic_search_units
)
=
print_external_results_overview
(
req
,
current_collection
,
pattern_list
,
field
,
external_collection
,
verbosity_level
,
lang
)
if
not
pattern
:
return
do_external_search
(
req
,
lang
,
vprint
,
basic_search_units
,
search_engines
)
create_seealso_box
(
req
,
lang
,
vprint
,
basic_search_units
,
seealso_engines
,
pattern
)
vprint
(
3
,
'end'
)
def
bind_patterns
(
pattern_list
):
"""Combine a list of patterns in an unique pattern.
pattern_list[0] should be the standart search pattern,
pattern_list[1:] are advanced search patterns."""
# just in case an empty list is fed to this function
try
:
if
pattern_list
[
0
]:
return
pattern_list
[
0
]
except
IndexError
:
return
None
pattern
=
""
for
pattern_part
in
pattern_list
[
1
:]:
if
pattern_part
:
pattern
+=
" "
+
pattern_part
return
pattern
.
strip
()
# See also box
def
create_seealso_box
(
req
,
lang
,
vprint
,
basic_search_units
=
None
,
seealso_engines
=
None
,
query
=
''
):
"Create the box that proposes links to other useful search engines like Google."
vprint
(
3
,
'Create seealso box'
)
seealso_engines_list
=
external_collection_sort_engine_by_name
(
seealso_engines
)
vprint
(
3
,
'seealso_engines_list = '
+
str
(
seealso_engines_list
))
links
=
build_seealso_links
(
basic_search_units
,
seealso_engines_list
,
req
,
lang
,
query
)
html
=
template
.
external_collection_seealso_box
(
lang
,
links
)
req
.
write
(
html
)
def
build_seealso_links
(
basic_search_units
,
seealso_engines
,
req
,
lang
,
query
):
"""Build the links for the see also box."""
_
=
gettext_set_language
(
lang
)
links
=
[]
for
engine
in
seealso_engines
:
url
=
engine
.
build_search_url
(
basic_search_units
,
req
.
args
,
lang
)
if
url
:
links
.
append
(
'<a class="google" href="
%(url)s
">
%(query)s
%(text_in)s
%(name)s
</a>'
%
\
{
'url'
:
cgi
.
escape
(
url
),
'query'
:
cgi
.
escape
(
query
),
'text_in'
:
_
(
'in'
),
'name'
:
_
(
engine
.
name
)})
return
links
# Selection
def
select_external_engines
(
collection_name
,
selected_external_searches
):
"""Build a tuple of two sets. The first one is the list of engine to use for an external search and the
second one is for the seealso box."""
collection_id
=
get_collection_id
(
collection_name
)
if
not
collection_id
:
return
(
None
,
None
)
if
not
type
(
selected_external_searches
)
is
list
:
selected_external_searches
=
[
selected_external_searches
]
seealso_engines
=
set
()
search_engines
=
set
()
if
dico_collection_seealso
.
has_key
(
collection_id
):
seealso_engines
=
copy
(
dico_collection_seealso
[
collection_id
])
if
dico_collection_external_searches
.
has_key
(
collection_id
):
seealso_engines
=
seealso_engines
.
union
(
dico_collection_external_searches
[
collection_id
])
for
ext_search_name
in
selected_external_searches
:
if
external_collections_dictionary
.
has_key
(
ext_search_name
):
engine
=
external_collections_dictionary
[
ext_search_name
]
if
engine
.
parser
:
search_engines
.
add
(
engine
)
else
:
warning
(
'select_external_engines:
%(ext_search_name)s
unknown.'
%
locals
())
seealso_engines
=
seealso_engines
.
difference
(
search_engines
)
return
(
search_engines
,
seealso_engines
)
# Search
def
do_external_search
(
req
,
lang
,
vprint
,
basic_search_units
,
search_engines
):
"""Make the external search."""
_
=
gettext_set_language
(
lang
)
vprint
(
3
,
'beginning external search'
)
engines_list
=
[]
for
engine
in
search_engines
:
url
=
engine
.
build_search_url
(
basic_search_units
,
req
.
args
,
lang
)
if
url
:
engines_list
.
append
([
url
,
engine
])
pagegetters_list
=
[
HTTPAsyncPageGetter
(
engine
[
0
])
for
engine
in
engines_list
]
def
finished
(
pagegetter
,
data
,
current_time
):
"""Function called, each time the download of a web page finish.
Will parse and print the results of this page."""
print_results
(
req
,
lang
,
pagegetter
,
data
,
current_time
)
finished_list
=
async_download
(
pagegetters_list
,
finished
,
engines_list
,
CFG_EXTERNAL_COLLECTION_TIMEOUT
)
for
(
finished
,
engine
)
in
zip
(
finished_list
,
engines_list
):
if
not
finished
:
url
=
engine
[
0
]
name
=
engine
[
1
]
.
name
print_timeout
(
req
,
lang
,
engine
[
1
],
name
,
url
)
# Database management
def
external_collection_load_states
():
global
external_collections_state
,
dico_collection_external_searches
,
dico_collection_seealso
external_collections_state
=
{}
dico_collection_external_searches
=
{}
dico_collection_seealso
=
{}
query
=
"SELECT collection_externalcollection.id_collection, collection_externalcollection.type, externalcollection.name FROM collection_externalcollection, externalcollection WHERE collection_externalcollection.id_externalcollection = externalcollection.id;"
try
:
results
=
run_sql
(
query
)
except
OperationalError
:
results
=
None
if
results
:
for
result
in
results
:
collection_id
=
int
(
result
[
0
])
search_type
=
int
(
result
[
1
])
engine_name
=
result
[
2
]
if
not
external_collections_dictionary
.
has_key
(
engine_name
):
warning
(
"No search engine : "
+
engine_name
)
continue
engine
=
external_collections_dictionary
[
engine_name
]
if
not
external_collections_state
.
has_key
(
collection_id
):
external_collections_state
[
collection_id
]
=
{}
col_states
=
external_collections_state
[
collection_id
]
col_states
[
engine
]
=
search_type
dictionary
=
None
if
search_type
==
1
:
dictionary
=
dico_collection_seealso
if
search_type
in
[
2
,
3
]:
dictionary
=
dico_collection_external_searches
if
dictionary
is
None
:
continue
if
not
dictionary
.
has_key
(
collection_id
):
dictionary
[
collection_id
]
=
set
()
engine_set
=
dictionary
[
collection_id
]
engine_set
.
add
(
engine
)
def
external_collection_get_state
(
external_collection
,
collection_id
):
external_collection_load_states
()
if
not
external_collections_state
.
has_key
(
collection_id
):
return
0
col_states
=
external_collections_state
[
collection_id
]
if
not
col_states
.
has_key
(
external_collection
):
return
0
return
col_states
[
external_collection
]
def
external_collection_get_update_state_list
(
external_collection
,
collection_id
,
state
,
recurse
=
False
):
changes
=
[]
if
external_collection_get_state
(
external_collection
,
collection_id
)
!=
state
:
changes
=
[
'(
%(collection_id)d
,
%(id_externalcollection)d
,
%(state)d
)'
%
{
'collection_id'
:
collection_id
,
'id_externalcollection'
:
external_collection_getid
(
external_collection
),
'state'
:
state
}]
if
not
recurse
:
return
changes
for
descendant_id
in
get_collection_descendants
(
collection_id
):
changes
+=
external_collection_get_update_state_list
(
external_collection
,
descendant_id
,
state
)
return
changes
def
external_collection_apply_changes
(
changes_list
):
if
not
changes_list
:
return
sql_values
=
", "
.
join
(
changes_list
)
sql
=
'INSERT INTO collection_externalcollection (id_collection, id_externalcollection, type) VALUES '
+
sql_values
+
'ON DUPLICATE KEY UPDATE type=VALUES(type);'
run_sql
(
sql
)
# Misc functions
def
external_collection_sort_engine_by_name
(
engines_set
):
"""Return a list of sorted (by name) search engines."""
engines_list
=
[
engine
for
engine
in
engines_set
]
engines_list
.
sort
(
lambda
x
,
y
:
cmp
(
x
.
name
,
y
.
name
))
return
engines_list
# External search ID
def
external_collection_getid
(
external_collection
):
"""Return the id of an external_collection. Will create a new entry in DB if needed."""
if
external_collection
.
__dict__
.
has_key
(
'id'
):
return
external_collection
.
id
query
=
'SELECT id FROM externalcollection WHERE name="
%(name)s
";'
%
{
'name'
:
external_collection
.
name
}
results
=
run_sql
(
query
)
if
not
results
:
query
=
'INSERT INTO externalcollection (name) VALUES ("
%(name)s
");'
%
{
'name'
:
external_collection
.
name
}
run_sql
(
query
)
return
external_collection_getid
(
external_collection
)
external_collection
.
id
=
results
[
0
][
0
]
return
external_collection
.
id
def
get_external_collection_engine
(
external_collection_name
):
"""Return the external collection engine given its name"""
if
external_collections_dictionary
.
has_key
(
external_collection_name
):
return
external_collections_dictionary
[
external_collection_name
]
else
:
return
None
# Load db infos if it's not already done.
if
external_collections_state
is
None
:
external_collection_load_states
()
# Hosted Collections related functions (the following functions should eventually be regrouped as above)
# These functions could eventually be placed into there own file, ex. websearch_hosted_collections.py
def
calculate_hosted_collections_results
(
req
,
pattern_list
,
field
,
hosted_collections
,
verbosity_level
=
0
,
lang
=
CFG_SITE_LANG
,
timeout
=
CFG_EXTERNAL_COLLECTION_TIMEOUT
):
"""Ruturn a list of the various results for a every hosted collection organized in tuples"""
# normally, the following should be checked before even running this function so the following line could be removed
if
not
hosted_collections
:
return
(
None
,
None
)
vprint
=
get_verbose_print
(
req
,
'Hosted collections: '
,
verbosity_level
)
vprint
(
3
,
'pattern_list = '
+
str
(
pattern_list
)
+
', field = '
+
str
(
field
))
# firstly we calculate the search parameters, i.e. the actual hosted search engines and the basic search units
(
hosted_search_engines
,
basic_search_units
)
=
\
calculate_hosted_collections_search_params
(
req
,
pattern_list
,
field
,
hosted_collections
,
verbosity_level
)
# in case something went wrong with the above calculation just return None
# however, once we run this function no fail should be expected here
# UPDATE : let search go on even there are no basic search units (an empty pattern_list and field)
#if basic_search_units == None or len(hosted_search_engines) == 0: return (None, None)
if
len
(
hosted_search_engines
)
==
0
:
return
(
None
,
None
)
# finally return the list of tuples with the results
return
do_calculate_hosted_collections_results
(
req
,
lang
,
vprint
,
verbosity_level
,
basic_search_units
,
hosted_search_engines
,
timeout
)
vprint
(
3
,
'end'
)
def
calculate_hosted_collections_search_params
(
req
,
pattern_list
,
field
,
hosted_collections
,
verbosity_level
=
0
):
"""Calculate the searching parameters for the selected hosted collections
i.e. the actual hosted search engines and the basic search units"""
from
invenio.search_engine
import
create_basic_search_units
assert
req
vprint
=
get_verbose_print
(
req
,
'Hosted collections (calculate_hosted_collections_search_params): '
,
verbosity_level
)
pattern
=
bind_patterns
(
pattern_list
)
vprint
(
3
,
'pattern = '
+
pattern
)
# if for any strange reason there is no pattern, just return
# UPDATE : let search go on even there is no pattern (an empty pattern_list and field)
#if not pattern: return (None, None)
# calculate the basic search units
basic_search_units
=
create_basic_search_units
(
None
,
pattern
,
field
)
vprint
(
3
,
'basic_search_units = '
+
str
(
basic_search_units
))
# calculate the set of hosted search engines
hosted_search_engines
=
select_hosted_search_engines
(
hosted_collections
)
vprint
(
3
,
'hosted_search_engines = '
+
str
(
hosted_search_engines
))
# no need really to print out a sorted list of the hosted search engines, is there? I'll leave this commented out
#hosted_search_engines_list = external_collection_sort_engine_by_name(hosted_search_engines)
#vprint(3, 'hosted_search_engines_list (sorted) : ' + str(hosted_search_engines_list))
return
(
hosted_search_engines
,
basic_search_units
)
def
select_hosted_search_engines
(
selected_hosted_collections
):
"""Build the set of engines to be used for the hosted collections"""
if
not
type
(
selected_hosted_collections
)
is
list
:
selected_hosted_collections
=
[
selected_hosted_collections
]
hosted_search_engines
=
set
()
for
hosted_collection_name
in
selected_hosted_collections
:
if
external_collections_dictionary
.
has_key
(
hosted_collection_name
):
engine
=
external_collections_dictionary
[
hosted_collection_name
]
# the hosted collection cannot present its results unless it has a parser implemented
if
engine
.
parser
:
hosted_search_engines
.
add
(
engine
)
else
:
warning
(
'select_hosted_search_engines:
%(hosted_collection_name)s
unknown.'
%
locals
())
return
hosted_search_engines
def
do_calculate_hosted_collections_results
(
req
,
lang
,
vprint
,
verbosity_level
,
basic_search_units
,
hosted_search_engines
,
timeout
=
CFG_EXTERNAL_COLLECTION_TIMEOUT
):
"""Actually search the hosted collections and return their results and information in a list of tuples.
One tuple for each hosted collection. Handles timeouts"""
_
=
gettext_set_language
(
lang
)
if
not
vprint
:
vprint
=
get_verbose_print
(
req
,
'Hosted collections (calculate_hosted_collections_search_params): '
,
verbosity_level
)
# defining vprint at this moment probably means we'll just run this one function at this time, therefore the "verbose"
# end hosted search string will not be printed (it is normally printed by the initial calculate function)
# Therefore, either define a flag here to print it by the end of this function or redefine the whole "verbose"
# printing logic of the above functions
vprint
(
3
,
'beginning hosted search'
)
# list to hold the hosted search engines and their respective search urls
engines_list
=
[]
# list to hold the non timed out results
results_list
=
[]
# list to hold all the results
full_results_list
=
[]
# list to hold all the timeouts
timeout_list
=
[]
# in case this is an engine-only list
if
type
(
hosted_search_engines
)
is
set
:
for
engine
in
hosted_search_engines
:
url
=
engine
.
build_search_url
(
basic_search_units
,
req
.
args
,
lang
)
if
url
:
engines_list
.
append
([
url
,
engine
])
# in case we are iterating a pre calculated url+engine list
elif
type
(
hosted_search_engines
)
is
list
:
for
engine
in
hosted_search_engines
:
engines_list
.
append
(
engine
)
# in both the above cases we end up with a [[search url], [engine]] kind of list
# create the list of search urls to be handed to the asynchronous getter
pagegetters_list
=
[
HTTPAsyncPageGetter
(
engine
[
0
])
for
engine
in
engines_list
]
# function to be run on every result
def
finished
(
pagegetter
,
data
,
current_time
):
"""Function called, each time the download of a web page finish.
Will parse and print the results of this page."""
# each pagegetter that didn't timeout is added to this list
results_list
.
append
((
pagegetter
,
data
,
current_time
))
# run the asynchronous getter
finished_list
=
async_download
(
pagegetters_list
,
finished
,
engines_list
,
timeout
)
# create the complete list of tuples, one for each hosted collection, with the results and other information,
# including those that timed out
for
(
finished
,
engine
)
in
zip
(
finished_list
,
engines_list
):
#finished_and_engines_list:
if
finished
:
for
result
in
results_list
:
if
result
[
1
]
==
engine
:
# the engine is fed the results, it will be parsed later, at printing time
engine
[
1
]
.
parser
.
parse_and_get_results
(
result
[
0
]
.
data
,
feedonly
=
True
)
## the list contains:
## * the engine itself: [ search url], [engine]
## * the parsed number of found results
## * the fetching time
full_results_list
.
append
(
(
engine
,
engine
[
1
]
.
parser
.
parse_num_results
(),
result
[
2
])
)
break
elif
not
finished
:
## the list contains:
## * the engine itself: [search url], [engine]
timeout_list
.
append
(
engine
)
return
(
full_results_list
,
timeout_list
)
Event Timeline
Log In to Comment