websearch_external_collections.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Nov 4, 20:14

websearch_external_collections.py
View Options

	# -- coding: utf-8 --

	## This file is part of CDS Invenio.
	## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
	##
	## CDS Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## CDS Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""External collection 'core' file.
	Perform search, database access."""

	__revision__ = "$Id$"

	import cgi
	import sys
	from copy import copy

	if sys.hexversion < 0x2040000:
	# pylint: disable=W0622
	from sets import Set as set
	# pylint: enable=W0622

	from invenio.config import CFG_SITE_LANG
	from invenio.dbquery import run_sql, OperationalError
	from invenio.messages import gettext_set_language

	from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_TIMEOUT
	from invenio.websearch_external_collections_searcher import external_collections_dictionary
	from invenio.websearch_external_collections_getter import HTTPAsyncPageGetter, async_download
	from invenio.websearch_external_collections_templates import print_results, print_timeout
	from invenio.websearch_external_collections_utils import get_collection_id, get_collection_descendants, \
	warning, get_verbose_print

	import invenio.template

	# Global variables
	# Template object for this module, loaded once when the module is imported.
	template = invenio.template.load('websearch_external_collections')
	# The three caches below start as None and are rebuilt as dictionaries by
	# external_collection_load_states():
	#   external_collections_state: collection id -> {engine: state}
	external_collections_state = None
	#   dico_collection_external_searches: collection id -> set of engines whose state is 2 or 3
	dico_collection_external_searches = None
	#   dico_collection_seealso: collection id -> set of engines whose state is 1
	dico_collection_seealso = None

	#dico_collection_external_searches = {}
	#dico_collection_seealso = {}

	def print_external_results_overview(req, current_collection, pattern_list, field,
	        external_collection, verbosity_level=0, lang=CFG_SITE_LANG):
	    """Print the external collection overview box.

	    The search patterns are merged into a single pattern, the engines to
	    use are selected and the overview box listing the (name-sorted) search
	    engines is written to ``req``.

	    Return the tuple (search_engines, seealso_engines, pattern,
	    basic_search_units), or (None, None, None, None) when there is no
	    pattern to search for.
	    """
	    # Imported inside the function rather than at module level
	    # (presumably to avoid a circular import -- TODO confirm).
	    from invenio.search_engine import create_basic_search_units
	    assert req
	    vprint = get_verbose_print(req, 'External collection (print_external_results_overview): ', verbosity_level)

	    pattern = bind_patterns(pattern_list)
	    # bind_patterns() returns None for an empty pattern list; format the
	    # value instead of concatenating so the debug message cannot raise a
	    # TypeError before the guard below gets a chance to run.
	    vprint(3, 'pattern = %s' % (pattern,))

	    if not pattern:
	        return (None, None, None, None)

	    basic_search_units = create_basic_search_units(None, pattern, field)
	    vprint(3, 'basic_search_units = ' + str(basic_search_units))

	    (search_engines, seealso_engines) = select_external_engines(current_collection, external_collection)
	    vprint(3, 'search_engines = ' + str(search_engines))
	    vprint(3, 'seealso_engines = ' + str(seealso_engines))

	    search_engines_list = external_collection_sort_engine_by_name(search_engines)
	    vprint(3, 'search_engines_list (sorted) : ' + str(search_engines_list))
	    html = template.external_collection_overview(lang, search_engines_list)
	    req.write(html)

	    return (search_engines, seealso_engines, pattern, basic_search_units)

	def perform_external_collection_search(req, current_collection, pattern_list, field,
	        external_collection, verbosity_level=0, lang=CFG_SITE_LANG, selected_external_collections_infos=None):
	    """Run the external search and then print the "see also" box.

	    ``selected_external_collections_infos`` may carry the 4-tuple returned
	    by print_external_results_overview(); when it is empty or missing, that
	    function is called here to compute it (which also prints the overview
	    box). Nothing more is done when the resulting pattern is empty.
	    """
	    vprint = get_verbose_print(req, 'External collection: ', verbosity_level)

	    infos = selected_external_collections_infos
	    if not infos:
	        infos = print_external_results_overview(req, current_collection, pattern_list, field,
	                                                external_collection, verbosity_level, lang)
	    (search_engines, seealso_engines, pattern, basic_search_units) = infos

	    if pattern:
	        do_external_search(req, lang, vprint, basic_search_units, search_engines)
	        create_seealso_box(req, lang, vprint, basic_search_units, seealso_engines, pattern)
	        vprint(3, 'end')

	def bind_patterns(pattern_list):
	    """Merge a list of patterns into one single pattern.

	    pattern_list[0] is the standard search pattern and wins whenever it is
	    non-empty; otherwise the non-empty advanced search patterns found in
	    pattern_list[1:] are joined with spaces. None is returned for an empty
	    list.
	    """
	    # guard against an empty list being fed to this function
	    try:
	        simple_pattern = pattern_list[0]
	    except IndexError:
	        return None

	    if simple_pattern:
	        return simple_pattern

	    advanced_parts = [part for part in pattern_list[1:] if part]
	    return " ".join(advanced_parts).strip()

	# See also box
	def create_seealso_box(req, lang, vprint, basic_search_units=None, seealso_engines=None, query=''):
	    """Write the box proposing links to other search engines (e.g. Google).

	    The engines are sorted by name, one link is built per engine and the
	    rendered box is written to ``req``.
	    """
	    vprint(3, 'Create seealso box')
	    sorted_engines = external_collection_sort_engine_by_name(seealso_engines)
	    vprint(3, 'seealso_engines_list = ' + str(sorted_engines))

	    box_links = build_seealso_links(basic_search_units, sorted_engines, req, lang, query)
	    req.write(template.external_collection_seealso_box(lang, box_links))

	def build_seealso_links(basic_search_units, seealso_engines, req, lang, query):
	    """Build the links for the see also box.

	    For every engine that can build a search URL, produce one HTML anchor
	    of the form "<query> in <engine name>". Engines returning an empty URL
	    are skipped. Return the list of HTML strings.
	    """
	    _ = gettext_set_language(lang)
	    link_template = '<a class="google" href="%(url)s">%(query)s %(text_in)s %(name)s</a>'

	    links = []
	    for engine in seealso_engines:
	        url = engine.build_search_url(basic_search_units, req.args, lang)
	        if not url:
	            continue
	        links.append(link_template % {
	            # The URL lands inside a double-quoted attribute, so the
	            # quote character must be escaped as well (quote=True);
	            # otherwise a '"' in the URL would terminate the attribute.
	            'url': cgi.escape(url, True),
	            'query': cgi.escape(query),
	            'text_in': _('in'),
	            'name': _(engine.name)})
	    return links

	# Selection
	def select_external_engines(collection_name, selected_external_searches):
	    """Return the tuple (search_engines, seealso_engines) of engine sets.

	    search_engines holds the selected engines that are known and have a
	    parser; seealso_engines holds every engine configured for the
	    collection minus the ones already in search_engines. (None, None) is
	    returned when the collection id cannot be resolved.
	    """
	    collection_id = get_collection_id(collection_name)
	    if not collection_id:
	        return (None, None)

	    # accept a single value as well as a list of values
	    if type(selected_external_searches) is not list:
	        selected_external_searches = [selected_external_searches]

	    # every engine configured for this collection is a "see also" candidate
	    if collection_id in dico_collection_seealso:
	        seealso_engines = copy(dico_collection_seealso[collection_id])
	    else:
	        seealso_engines = set()
	    if collection_id in dico_collection_external_searches:
	        seealso_engines = seealso_engines.union(dico_collection_external_searches[collection_id])

	    search_engines = set()
	    for ext_search_name in selected_external_searches:
	        if ext_search_name not in external_collections_dictionary:
	            warning('select_external_engines: %(ext_search_name)s unknown.' %
	                    {'ext_search_name': ext_search_name})
	            continue
	        engine = external_collections_dictionary[ext_search_name]
	        # only engines with a parser can be searched
	        if engine.parser:
	            search_engines.add(engine)

	    return (search_engines, seealso_engines.difference(search_engines))

	# Search
	def do_external_search(req, lang, vprint, basic_search_units, search_engines):
	    """Make the external search.

	    Build the search URL of every engine, download all the pages through
	    async_download() and print each result as soon as its download
	    finishes. A timeout notice is printed for every engine whose download
	    is not reported as finished.

	    ``search_engines`` may be None (select_external_engines() returns None
	    for an unknown collection); it is then treated as "no engine".
	    """
	    # NOTE(review): the returned translation function is not used in this
	    # function; the call is kept in case it has side effects -- confirm.
	    _ = gettext_set_language(lang)
	    vprint(3, 'beginning external search')

	    # list of [search url, engine] entries
	    engines_list = []
	    for engine in search_engines or ():
	        url = engine.build_search_url(basic_search_units, req.args, lang)
	        if url:
	            engines_list.append([url, engine])

	    pagegetters_list = [HTTPAsyncPageGetter(engine_entry[0]) for engine_entry in engines_list]

	    def finished(pagegetter, data, current_time):
	        """Function called each time the download of a web page finishes.
	        Will parse and print the results of this page."""
	        print_results(req, lang, pagegetter, data, current_time)

	    finished_list = async_download(pagegetters_list, finished, engines_list, CFG_EXTERNAL_COLLECTION_TIMEOUT)

	    # A distinct loop variable name avoids rebinding the callback above.
	    for (is_finished, (url, engine)) in zip(finished_list, engines_list):
	        if not is_finished:
	            print_timeout(req, lang, engine, engine.name, url)

	# Database management
	def external_collection_load_states():
	    """(Re)build the module-level state caches from the database.

	    Fills three dictionaries, all keyed by collection id:
	    external_collections_state ({engine: state}), dico_collection_seealso
	    (set of engines in state 1) and dico_collection_external_searches (set
	    of engines in state 2 or 3). Rows naming an unknown engine are reported
	    with a warning and skipped. The caches are left empty when the query
	    raises OperationalError or returns nothing.
	    """
	    global external_collections_state, dico_collection_external_searches, dico_collection_seealso

	    external_collections_state = {}
	    dico_collection_external_searches = {}
	    dico_collection_seealso = {}

	    query = "SELECT collection_externalcollection.id_collection, collection_externalcollection.type, externalcollection.name FROM collection_externalcollection, externalcollection WHERE collection_externalcollection.id_externalcollection = externalcollection.id;"
	    try:
	        rows = run_sql(query)
	    except OperationalError:
	        rows = None
	    if not rows:
	        return

	    for row in rows:
	        collection_id = int(row[0])
	        search_type = int(row[1])
	        engine_name = row[2]

	        if engine_name not in external_collections_dictionary:
	            warning("No search engine : " + engine_name)
	            continue
	        engine = external_collections_dictionary[engine_name]

	        # the raw state is recorded for every known engine
	        external_collections_state.setdefault(collection_id, {})[engine] = search_type

	        # only states 1, 2 and 3 place the engine in one of the lookup sets
	        if search_type == 1:
	            target = dico_collection_seealso
	        elif search_type in (2, 3):
	            target = dico_collection_external_searches
	        else:
	            continue
	        target.setdefault(collection_id, set()).add(engine)

	def external_collection_get_state(external_collection, collection_id):
	    """Return the state of an external collection for a given collection.

	    The states are reloaded from the database on every call. 0 is returned
	    when nothing is recorded for the (collection, external collection)
	    pair.
	    """
	    external_collection_load_states()
	    try:
	        return external_collections_state[collection_id][external_collection]
	    except KeyError:
	        return 0

	def external_collection_get_update_state_list(external_collection, collection_id, state, recurse=False):
	    """Return the list of SQL value tuples needed to reach ``state``.

	    Each item is a string "(collection id, external collection id, state)".
	    An item is produced for ``collection_id`` only when its current state
	    differs from ``state``. With ``recurse`` set, the ids returned by
	    get_collection_descendants() are checked the same way.
	    """
	    changes = []

	    if external_collection_get_state(external_collection, collection_id) != state:
	        values = {'collection_id': collection_id,
	                  'id_externalcollection': external_collection_getid(external_collection),
	                  'state': state}
	        changes.append('(%(collection_id)d, %(id_externalcollection)d, %(state)d)' % values)

	    if recurse:
	        for descendant_id in get_collection_descendants(collection_id):
	            changes.extend(
	                external_collection_get_update_state_list(external_collection, descendant_id, state))

	    return changes

	def external_collection_apply_changes(changes_list):
	    """Write a list of state changes to the database in a single statement.

	    ``changes_list`` holds SQL value tuples as strings, in the form built
	    by external_collection_get_update_state_list(). Nothing is executed
	    for an empty list.
	    """
	    if changes_list:
	        statement_parts = [
	            'INSERT INTO collection_externalcollection (id_collection, id_externalcollection, type) VALUES ',
	            ", ".join(changes_list),
	            'ON DUPLICATE KEY UPDATE type=VALUES(type);',
	        ]
	        run_sql("".join(statement_parts))

	# Misc functions
	def external_collection_sort_engine_by_name(engines_set):
	    """Return a list of the search engines sorted by their ``name``.

	    ``engines_set`` is any iterable of objects having a ``name`` attribute.
	    None is accepted and gives an empty list, because
	    select_external_engines() returns None instead of a set when the
	    collection is unknown. Engines sharing the same name keep their
	    iteration order.
	    """
	    if engines_set is None:
	        return []

	    # Decorate-sort-undecorate: sorting (name, position, engine) tuples
	    # needs neither a cmp function nor the ``key`` argument, and the unique
	    # position guarantees that two engines are never compared directly.
	    decorated = [(engine.name, position, engine)
	                 for (position, engine) in enumerate(engines_set)]
	    decorated.sort()
	    return [entry[2] for entry in decorated]

	# External search ID
	def external_collection_getid(external_collection):
	    """Return the id of an external_collection. Will create a new entry in DB if needed.

	    The id is cached on the object as ``external_collection.id`` so the
	    database is queried only once per engine object.

	    Raise RuntimeError when the row still cannot be found after it has
	    been inserted.
	    """
	    if 'id' in external_collection.__dict__:
	        return external_collection.id

	    # The name is passed as a query parameter instead of being pasted into
	    # the SQL text, so names containing quotes cannot break the statement.
	    name_param = (external_collection.name,)
	    select_query = 'SELECT id FROM externalcollection WHERE name=%s;'

	    results = run_sql(select_query, name_param)
	    if not results:
	        run_sql('INSERT INTO externalcollection (name) VALUES (%s);', name_param)
	        results = run_sql(select_query, name_param)
	        if not results:
	            # Fail explicitly rather than recursing without bound.
	            raise RuntimeError('external_collection_getid: cannot create an entry for %s'
	                               % (external_collection.name,))

	    external_collection.id = results[0][0]
	    return external_collection.id

	def get_external_collection_engine(external_collection_name):
	    """Look up an external collection engine by name; None when the name is unknown."""
	    try:
	        return external_collections_dictionary[external_collection_name]
	    except KeyError:
	        return None

	# Load db infos if it's not already done.
	# This statement runs when the module is imported: external_collections_state
	# is still None at that point, so the caches get filled from the database.
	if external_collections_state is None:
	    external_collection_load_states()

	# Hosted collections related functions (the following functions should eventually be regrouped as above).
	# These functions could eventually be placed into their own file, e.g. websearch_hosted_collections.py
	def calculate_hosted_collections_results(req, pattern_list, field, hosted_collections, verbosity_level=0,
	        lang=CFG_SITE_LANG, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
	    """Return the results of every hosted collection, organized in tuples.

	    The return value is the (full_results_list, timeout_list) pair built
	    by do_calculate_hosted_collections_results(), or (None, None) when no
	    hosted collection was given or none of them maps to a usable engine.
	    """
	    # normally this is checked before even running this function, so the
	    # following test could be removed
	    if not hosted_collections:
	        return (None, None)

	    vprint = get_verbose_print(req, 'Hosted collections: ', verbosity_level)
	    vprint(3, 'pattern_list = ' + str(pattern_list) + ', field = ' + str(field))

	    # firstly we calculate the search parameters, i.e. the actual hosted
	    # search engines and the basic search units
	    (hosted_search_engines, basic_search_units) = \
	        calculate_hosted_collections_search_params(req,
	                                                   pattern_list,
	                                                   field,
	                                                   hosted_collections,
	                                                   verbosity_level)

	    # The search goes on even when there are no basic search units (an
	    # empty pattern_list and field); only the absence of engines stops it.
	    if not hosted_search_engines:
	        return (None, None)

	    # finally compute the list of tuples with the results
	    results = do_calculate_hosted_collections_results(req, lang, vprint, verbosity_level,
	                                                      basic_search_units, hosted_search_engines, timeout)

	    # This message used to sit after the return statement and could never
	    # be printed.
	    vprint(3, 'end')

	    return results

	def calculate_hosted_collections_search_params(req,
	                                               pattern_list,
	                                               field,
	                                               hosted_collections,
	                                               verbosity_level=0):
	    """Calculate the searching parameters for the selected hosted collections
	    i.e. the actual hosted search engines and the basic search units.

	    Return the tuple (hosted_search_engines, basic_search_units).
	    """
	    # Imported inside the function rather than at module level
	    # (presumably to avoid a circular import -- TODO confirm).
	    from invenio.search_engine import create_basic_search_units
	    assert req
	    vprint = get_verbose_print(req, 'Hosted collections (calculate_hosted_collections_search_params): ', verbosity_level)

	    pattern = bind_patterns(pattern_list)
	    # The search is allowed to go on without any pattern, and
	    # bind_patterns() returns None for an empty list: format the value
	    # instead of concatenating so this debug message cannot raise.
	    vprint(3, 'pattern = %s' % (pattern,))

	    # calculate the basic search units
	    basic_search_units = create_basic_search_units(None, pattern, field)
	    vprint(3, 'basic_search_units = ' + str(basic_search_units))

	    # calculate the set of hosted search engines
	    hosted_search_engines = select_hosted_search_engines(hosted_collections)
	    vprint(3, 'hosted_search_engines = ' + str(hosted_search_engines))

	    return (hosted_search_engines, basic_search_units)

	def select_hosted_search_engines(selected_hosted_collections):
	    """Return the set of engines to use for the given hosted collections.

	    A single name is accepted as well as a list of names. Unknown names
	    are reported with a warning; known engines without a parser are
	    silently left out.
	    """
	    if type(selected_hosted_collections) is not list:
	        selected_hosted_collections = [selected_hosted_collections]

	    hosted_search_engines = set()
	    for collection_name in selected_hosted_collections:
	        if collection_name not in external_collections_dictionary:
	            warning('select_hosted_search_engines: %(hosted_collection_name)s unknown.' %
	                    {'hosted_collection_name': collection_name})
	            continue
	        engine = external_collections_dictionary[collection_name]
	        # a hosted collection cannot present its results without a parser
	        if engine.parser:
	            hosted_search_engines.add(engine)

	    return hosted_search_engines

	def do_calculate_hosted_collections_results(req, lang, vprint, verbosity_level, basic_search_units, hosted_search_engines,
	        timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
	    """Search the hosted collections and return (full_results_list, timeout_list).

	    ``hosted_search_engines`` is either a set of engines (their search URLs
	    are then built here) or a ready-made list of [search url, engine]
	    entries.

	    full_results_list holds one tuple per finished download:
	    ([search url, engine], parsed number of results, fetching time).
	    timeout_list holds the [search url, engine] entries whose download is
	    not reported as finished.
	    """
	    _ = gettext_set_language(lang)
	    if not vprint:
	        # No printer was handed over, so this function is probably being
	        # run on its own; the verbose "end" message normally printed by the
	        # calling calculate function will then not appear.
	        vprint = get_verbose_print(req, 'Hosted collections (calculate_hosted_collections_search_params): ', verbosity_level)
	    vprint(3, 'beginning hosted search')

	    # [search url, engine] entries handed to the asynchronous getter
	    url_engine_pairs = []
	    if type(hosted_search_engines) is set:
	        # engine-only input: build the search url of every engine
	        for engine in hosted_search_engines:
	            url = engine.build_search_url(basic_search_units, req.args, lang)
	            if url:
	                url_engine_pairs.append([url, engine])
	    elif type(hosted_search_engines) is list:
	        # pre-calculated [search url, engine] entries
	        url_engine_pairs.extend(hosted_search_engines)

	    pagegetters_list = [HTTPAsyncPageGetter(pair[0]) for pair in url_engine_pairs]

	    # (pagegetter, data, time) of every download that did not time out
	    downloaded = []

	    def finished(pagegetter, data, current_time):
	        """Called each time the download of a web page finishes; only
	        records the outcome, the parsing is done further below."""
	        downloaded.append((pagegetter, data, current_time))

	    finished_list = async_download(pagegetters_list, finished, url_engine_pairs, timeout)

	    full_results_list = []
	    timeout_list = []
	    for (is_finished, pair) in zip(finished_list, url_engine_pairs):
	        if not is_finished:
	            timeout_list.append(pair)
	            continue
	        for (pagegetter, data, fetch_time) in downloaded:
	            if data == pair:
	                # the engine is only fed the page here; it is parsed later,
	                # at printing time
	                pair[1].parser.parse_and_get_results(pagegetter.data, feedonly=True)
	                full_results_list.append(
	                    (pair, pair[1].parser.parse_num_results(), fetch_time))
	                break

	    return (full_results_list, timeout_list)

websearch_external_collections.pyNo OneTemporaryActions

File Metadata

websearch_external_collections.pyView Options

Event Timeline

websearch_external_collections.py
No OneTemporary
Actions

websearch_external_collections.py
View Options