bibrank_record_sorter.py.wml
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Sep 15, 12:19

bibrank_record_sorter.py.wml
View Options

	##Ranking of records using different parameters and methods on the fly.

	## This file is part of the CERN Document Server Software (CDSware).
	## Copyright (C) 2002 CERN.
	##
	## The CDSware is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## The CDSware is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with CDSware; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	## read config variables:
	#include "config.wml"
	#include "configbis.wml"
	#include "cdswmllib.wml"

	## start Python:
	<protect>#!</protect><PYTHON>
	<protect># -- coding: utf-8 --</protect>
	<protect>## $Id$</protect>
	<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
	## fill config variables:
	pylibdir = "<LIBDIR>/python"

	try:
	import sys
	import zlib
	import marshal
	import string
	import time
	import math
	import MySQLdb
	import Numeric
	import re
	import ConfigParser
	import traceback
	import copy
	except ImportError, e:
	pass

	try:
	sys.path.append('%s' % pylibdir)
	from cdsware.config import *
	from cdsware.dbquery import run_sql
	from cdsware.bibindex_engine_stemmer import stem_by_lang, lang_available
	from cdsware.bibindex_engine_stopwords import is_stopword_force
	from cdsware.search_engine_config import cfg_max_recID
	except ImportError, e:
	pass

	class HitSet:
	    """Set of records kept as a Numeric vector indexed by recID.

	    A nonzero element at position recID means the record is a member.
	    One array element is spent per record; a packed bit vector could be
	    substituted later to save space."""

	    def __init__(self, init_set=None):
	        # -1 until calculate_nbhits() has been run
	        self._nbhits = -1
	        if not init_set:
	            # nothing usable supplied: start with an all-zero vector
	            self._set = Numeric.zeros(cfg_max_recID+1, Numeric.Int0)
	        else:
	            self._set = init_set

	    def __repr__(self, join=string.join):
	        members = join(map(repr, self._set), ', ')
	        return "%s(%s)" % (self.__class__.__name__, members)

	    def add(self, recID):
	        "Marks one record as member of the set."
	        self._set[recID] = 1

	    def addmany(self, recIDs):
	        "Marks every recID of the given sequence as member of the set."
	        for recID in recIDs:
	            self._set[recID] = 1

	    def addlist(self, arr):
	        "Marks all recIDs held in the array arr as members, in one Numeric call."
	        Numeric.put(self._set, arr, 1)

	    def remove(self, recID):
	        "Takes one record out of the set."
	        self._set[recID] = 0

	    def removemany(self, recIDs):
	        "Takes every recID of the given sequence out of the set."
	        for recID in recIDs:
	            self.remove(recID)

	    def intersect(self, other):
	        "Replaces the content of self by the intersection of self and other."
	        self._set = Numeric.bitwise_and(self._set, other._set)

	    def union(self, other):
	        "Replaces the content of self by the union of self and other."
	        self._set = Numeric.bitwise_or(self._set, other._set)

	    def difference(self, other):
	        "Removes from self every record that is a member of other."
	        for recID in Numeric.nonzero(other._set):
	            self.remove(recID)

	    def contains(self, recID):
	        "Returns the vector element stored for recID (nonzero when it is a member)."
	        return self._set[recID]

	    __contains__ = contains # lets 'recID in hitset' use the same test

	    def __getitem__(self, index):
	        "Returns the index-th member recID, which allows 'for item in set:'."
	        members = Numeric.nonzero(self._set)
	        return members[index]

	    def calculate_nbhits(self):
	        "Stores the number of member records in self._nbhits."
	        as_ints = self._set.copy().astype(Numeric.Int)
	        self._nbhits = Numeric.sum(as_ints)

	    def items(self):
	        "Returns a Numeric array holding all member recIDs."
	        return Numeric.nonzero(self._set)

	    def tolist(self):
	        "Returns a Python list holding all member recIDs."
	        members = Numeric.nonzero(self._set)
	        return members.tolist()

	def compare_on_val(first, second):
	    """cmp()-style comparison of two sequences on their element at index 1,
	    reversed so that the larger value sorts first."""
	    second_val = second[1]
	    first_val = first[1]
	    return cmp(second_val, first_val)
	def serialize_via_numeric_array_dumps(arr):
	    """First serialization step: dump the Numeric array arr with Numeric.dumps."""
	    dumped = Numeric.dumps(arr)
	    return dumped
	def serialize_via_numeric_array_compr(str):
	    """Second serialization step: zlib-compress the dumped data given in str."""
	    compressed = zlib.compress(str)
	    return compressed
	def serialize_via_numeric_array_escape(str):
	    """Third serialization step: pass the data in str through MySQLdb.escape_string."""
	    escaped = MySQLdb.escape_string(str)
	    return escaped
	def serialize_via_numeric_array(arr):
	    """Turn a Numeric array into a compressed, MySQL-escaped string.

	    Chains the three helper steps: dump, compress, escape."""
	    dumped = serialize_via_numeric_array_dumps(arr)
	    compressed = serialize_via_numeric_array_compr(dumped)
	    return serialize_via_numeric_array_escape(compressed)
	def deserialize_via_numeric_array(string):
	    """Rebuild a Numeric array from a zlib-compressed Numeric dump."""
	    dumped = zlib.decompress(string)
	    return Numeric.loads(dumped)
	def serialize_via_marshal(obj):
	    """Turn a Python object into a compressed, MySQL-escaped string.

	    The object is marshalled, zlib-compressed and finally passed through
	    MySQLdb.escape_string."""
	    marshalled = marshal.dumps(obj)
	    compressed = zlib.compress(marshalled)
	    return MySQLdb.escape_string(compressed)
	def deserialize_via_marshal(string):
	    """Rebuild a Python object from a zlib-compressed marshal dump."""
	    marshalled = zlib.decompress(string)
	    return marshal.loads(marshalled)

	def adderrorbox(header='', datalist=[]):
	    """Render an HTML error box: a table with a header row and one body row.

	    header - text shown in the heading cell spanning all columns
	    datalist - list of strings, each placed in its own cell of the single row
	    Returns the HTML as one string."""

	    ncols = len(datalist)
	    if ncols:
	        # share the width evenly between the cells
	        width = str(100 // ncols) + '%'
	    else:
	        width = 1

	    cell_open = '<td style="vertical-align: top; margin-top: 5px; width: %s;">' % (width, )
	    parts = ['<table class="errorbox">',
	             '<thead><tr><th class="errorboxheader" colspan="%s">%s</th></tr></thead>' % (ncols, header),
	             '<tbody>',
	             '<tr>']
	    for cell in datalist:
	        parts.append(cell_open)
	        parts.append(cell)
	        parts.append('</td>')
	    parts.append('</tr>')
	    parts.append('</tbody></table>')
	    return ''.join(parts)

	def check_term(term, col_size, term_rec, max_occ, min_occ, termlength):
	    """Check if the term is valid for use
	    term - the term to check
	    col_size - the number of records in database
	    term_rec - the number of records which contains this term
	    max_occ - max frequency of the term allowed
	    min_occ - min frequency of the term allowed
	    termlength - terms of this length or shorter are rejected
	    Returns "true" when the term may be used, "" otherwise."""

	    if is_stopword_force(term) or len(term) <= termlength:
	        return ""

	    # The frequency limits are only meaningful for a non-empty collection;
	    # with col_size 0 they are skipped instead of dividing by zero.
	    total = float(col_size)
	    if total:
	        frequency = float(term_rec) / total
	        if frequency >= max_occ or frequency <= min_occ:
	            return ""

	    # Purely numeric terms are rejected, including "0" which the previous
	    # truth test on int(term) let through.
	    try:
	        int(term)
	    except (TypeError, ValueError):
	        return "true"
	    return ""

	def create_rnkmethod_cache():
	    """Create cache with vital information for each rank method.

	    For every name found in rnkMETHOD the file <etcdir>/bibrank/<name>.cfg
	    is parsed and the settings needed at ranking time are stored in the
	    module-level dictionary methods[name]: ranking function, output
	    prefix/postfix, translated names per language, and, when configured,
	    the word table with its collection size, stemmer, stopword setting,
	    find_similar limits and the list of methods to combine.
	    The module-level voutput buffer is reset to the empty string.

	    Raises Exception when a configuration file cannot be opened or has no
	    section named after its ranking function."""

	    global methods
	    global voutput
	    bibrank_meths = run_sql("SELECT name from rnkMETHOD")
	    methods = {}
	    voutput = ""

	    for (rank_method_code,) in bibrank_meths:
	        cfg_path = etcdir + "/bibrank/" + rank_method_code + ".cfg"
	        config = ConfigParser.ConfigParser()
	        try:
	            cfg_file = open(cfg_path)
	        except IOError:
	            # report the real cause instead of failing later on a missing section
	            raise Exception("Error in configuration file: %s" % cfg_path)
	        try:
	            config.readfp(cfg_file)
	        finally:
	            cfg_file.close()

	        cfg_function = config.get("rank_method", "function")
	        if not config.has_section(cfg_function):
	            raise Exception("Error in configuration file: %s" % cfg_path)

	        method = {}
	        methods[rank_method_code] = method
	        method["function"] = cfg_function
	        method["prefix"] = config.get(cfg_function, "relevance_number_output_prologue")
	        method["postfix"] = config.get(cfg_function, "relevance_number_output_epilogue")
	        method["chars_alphanumericseparators"] = r"[1234567890\!\"\#\$\%\&\'\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\\|\}\~]"

	        # translated names of the method, keyed by language code
	        i8n_names = run_sql("SELECT ln,value from rnkMETHODNAME,rnkMETHOD where id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name='%s'" % (rank_method_code))
	        for (ln, value) in i8n_names:
	            method[ln] = value

	        if config.has_option(cfg_function, "table"):
	            method["rnkWORD_table"] = config.get(cfg_function, "table")
	            # the table name minus its last character, plus "R", names the per-record table
	            method["col_size"] = run_sql("SELECT count(*) FROM %sR" % method["rnkWORD_table"][:-1])[0][0]

	        if config.has_option(cfg_function, "stemming"):
	            stemming = config.get(cfg_function, "stemming")
	            if stemming:
	                method["stemmer"] = stemming

	        if config.has_option(cfg_function, "stopword"):
	            method["stopwords"] = config.get(cfg_function, "stopword")

	        if config.has_section("find_similar"):
	            method["max_word_occurence"] = float(config.get("find_similar", "max_word_occurence"))
	            method["min_word_occurence"] = float(config.get("find_similar", "min_word_occurence"))
	            method["min_word_length"] = int(config.get("find_similar", "min_word_length"))
	            method["min_nr_words_docs"] = int(config.get("find_similar", "min_nr_words_docs"))
	            method["max_nr_words_upper"] = int(config.get("find_similar", "max_nr_words_upper"))
	            method["max_nr_words_lower"] = int(config.get("find_similar", "max_nr_words_lower"))
	            method["override_default_min_relevance"] = config.get("find_similar", "override_default_min_relevance")
	            method["default_min_relevance"] = int(config.get("find_similar", "default_min_relevance"))

	        if config.has_section("combine_method"):
	            # options method1, method2, ... each hold a comma separated list
	            method["combine_method"] = []
	            i = 1
	            while config.has_option("combine_method", "method%s" % i):
	                method["combine_method"].append(config.get("combine_method", "method%s" % i).split(","))
	                i += 1

	def is_method_valid(colID, rank_method_code):
	    """Tell whether the rank method is enabled for a collection or one of its ancestors.

	    colID - id of the collection to check
	    rank_method_code - name of the method as stored in rnkMETHOD
	    Returns 1 when the method is enabled for the collection itself or for
	    a collection reached by following id_dad links upwards, otherwise 0."""

	    enabled = dict(run_sql("SELECT id_collection, score from collection_rnkMETHOD,rnkMETHOD WHERE id_rnkMETHOD=rnkMETHOD.id AND name='%s'" % rank_method_code))

	    current = int(colID)
	    if current in enabled:
	        return 1

	    # climb towards the root, looking only at the first parent returned
	    while current:
	        parents = run_sql("SELECT id_dad FROM collection_collection WHERE id_son=%s" % current)
	        if not parents:
	            break
	        current = parents[0][0]
	        if current in enabled:
	            return 1
	    return 0

	def get_bibrank_methods(collection, ln=cdslang):
	    """Return (code, name) pairs for the rank methods enabled for the collection.

	    collection - id of the collection the methods must be valid for
	    ln - language of the names; falls back to cdslang, then to the method code
	    The method cache is created first when it does not exist yet."""

	    if 'methods' not in globals():
	        create_rnkmethod_cache()

	    avail_methods = []
	    for rank_method_code in methods.keys():
	        options = methods[rank_method_code]
	        if not (options.has_key("function") and is_method_valid(collection, rank_method_code)):
	            continue
	        # preferred language first, then the site default, then the bare code
	        fallback = options.get(cdslang, rank_method_code)
	        avail_methods.append((rank_method_code, options.get(ln, fallback)))
	    return avail_methods

	def rank_records(rank_method_code, rank_limit_relevance, hitset_global, pattern=[], verbose=0):
	    """Rank the records of a search result with the given method.

	    rank_method_code, e.g. `jif' or `sbr' (word frequency vector model)
	    rank_limit_relevance, e.g. `23' for `nbc' (number of citations) or `0.10' for `vec'
	    hitset_global, search engine hits; it is deep-copied, never modified
	    pattern, search engine query or record ID (a first element starting
	             with "recid:" selects find_similar for word_similarity methods)
	    verbose, verbose level; above 0 the verbose output is also written to stdout
	    output, a tuple of five elements:
	    list of records (None when nothing could be ranked)
	    list of rank values (None when nothing could be ranked)
	    prefix (or a warning text when nothing could be ranked)
	    postfix (or an HTML error box when an exception occurred)
	    verbose_output"""

	    global voutput
	    voutput = ""

	    try:
	        # we are receiving a global hitset; the ranking functions remove
	        # records from the one they are given, so work on a private copy
	        hitset = copy.deepcopy(hitset_global)
	        if 'methods' not in globals():
	            create_rnkmethod_cache()

	        function = methods[rank_method_code]["function"]
	        func_object = globals().get(function)
	        if func_object and pattern and pattern[0][0:6] == "recid:" and function == "word_similarity":
	            result = find_similar(rank_method_code, pattern[0][6:], hitset, rank_limit_relevance, verbose)
	        elif func_object:
	            result = func_object(rank_method_code, pattern, hitset, rank_limit_relevance, verbose)
	        else:
	            # no function of that name in this module: rank on precalculated data
	            result = rank_by_method(rank_method_code, pattern, hitset, rank_limit_relevance, verbose)
	    except Exception:
	        error = sys.exc_info()[1]
	        result = (None, "", adderrorbox("An error occured when trying to rank the search result", ["Unexpected error: %s<br><b>Traceback:</b>%s" % (error, traceback.format_tb(sys.exc_info()[2]))]), voutput)

	    # Only the record list decides whether ranking succeeded: testing the
	    # prefix as well threw away valid results of methods configured with
	    # an empty prefix.
	    if result[0]:
	        results_similar_recIDs = [rec[0] for rec in result[0]]
	        results_similar_relevances = [rec[1] for rec in result[0]]
	        result = (results_similar_recIDs, results_similar_relevances, result[1], result[2], result[3])
	    else:
	        result = (None, None, result[1], result[2], result[3])

	    if verbose > 0:
	        sys.stdout.write(voutput.replace("<br>", "\n") + "\n")

	    return result

	def combine_method(rank_method_code, pattern, hitset, rank_limit_relevance,verbose):
	    """Rank by combining the results of several other rank methods.

	    rank_method_code - the combining method; its cache entry "combine_method"
	    lists (method, percent) pairs
	    pattern, hitset, rank_limit_relevance, verbose - handed on to every sub-method
	    output:
	    reclist - (recID, value) pairs sorted by ascending value, where value sums
	    int(position / number of results * percent) over the sub-methods in
	    which the record has a positive rank value
	    prefix - always "("
	    postfix - always ")"
	    voutput - verbose output
	    Any failure is reported as (None, warning text, "", voutput)."""

	    global voutput
	    result = {}
	    try:
	        for (method, percent) in methods[rank_method_code]["combine_method"]:
	            function = methods[method]["function"]
	            func_object = globals().get(function)
	            if not func_object:
	                func_object = rank_by_method
	            percent = float(int(percent))

	            # Every sub-method gets its own copy of the hitset: ranking
	            # functions remove the records they rank from the hitset they
	            # receive, which would leave later methods with the leftovers.
	            this_result = func_object(method, pattern, copy.deepcopy(hitset), rank_limit_relevance, verbose)[0]

	            nb_results = len(this_result)
	            for i in range(0, nb_results):
	                (recID, value) = this_result[i]
	                if value > 0:
	                    result[recID] = result.get(recID, 0) + int((float(i) / nb_results) * percent)

	        result = result.items()
	        result.sort(lambda x, y: cmp(x[1], y[1]))
	        return (result, "(", ")", voutput)
	    except Exception:
	        # deliberate best effort: a failing sub-method disables the combination
	        return (None, "Warning, method cannot be used for ranking your query.", "", voutput)

	def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
	    """Ranking of records based on predetermined values.
	    input:
	    rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from
	    rnkMETHODDATA
	    lwords - a list of words from the query; words of the form "recid:N" or
	    "recid:N->M" restrict the ranking to those records
	    hitset - a list of hits for the query found by search_engine
	    rank_limit_relevance - not used by this function
	    verbose - verbose value
	    output:
	    reclist - records without ranking data first (value 0), then the ranked records sorted by ascending value: [(23,0), (344,24), (1,34)]
	    prefix - what to show before the rank value
	    postfix - what to show after the rank value
	    voutput - contains extra information, content dependent on verbose value"""

	    global voutput
	    rnkdict = run_sql("SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name='%s'" % rank_method_code)

	    if not rnkdict:
	        return (None, "Warning, Could not load ranking data for method.", "", voutput)

	    # collect the records named explicitly in the query, if any
	    lwords_hitset = None
	    for lword in lwords:
	        if not (lword and lword[:6] == "recid:"):
	            continue
	        if lwords_hitset is None:
	            lwords_hitset = HitSet()
	        lword = lword[6:]
	        if lword.find("->") > -1:
	            bounds = lword.split("->")
	            first = int(bounds[0])
	            last = int(bounds[1])
	            if first >= cfg_max_recID + 1 or last >= cfg_max_recID + 1:
	                return (None, "Warning, The record range given is out of range.", "", voutput)
	            # NOTE(review): the upper bound itself is not included - confirm this is intended
	            for recID in range(first, last):
	                lwords_hitset.add(recID)
	        elif int(lword) < cfg_max_recID + 1:
	            # compare as number: comparing the string itself with an int
	            # is never true, which rejected every single record ID
	            lwords_hitset.add(int(lword))
	        else:
	            return (None, "Warning, The record range given is out of range.", "", voutput)

	    rnkdict = deserialize_via_marshal(rnkdict[0][0])
	    if verbose > 0:
	        voutput += "<br>Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br>" % rank_method_code
	        voutput += "Ranking data loaded, size of structure: %s<br>" % len(rnkdict)
	    lrecIDs = hitset.items()

	    if verbose > 0:
	        voutput += "Number of records to rank: %s<br>" % len(lrecIDs)
	    reclist = []
	    reclist_addend = []

	    if lwords_hitset is None:
	        # rank every record of the search result
	        for recID in lrecIDs:
	            if recID in rnkdict:
	                reclist.append((recID, rnkdict[recID]))
	                del rnkdict[recID]
	            else:
	                reclist_addend.append((recID, 0))
	    else:
	        # rank only the requested records that are also in the search result
	        for recID in lwords_hitset.items():
	            if not hitset.contains(recID):
	                continue
	            if recID in rnkdict:
	                reclist.append((recID, rnkdict[recID]))
	                del rnkdict[recID]
	            else:
	                reclist_addend.append((recID, 0))

	    if verbose > 0:
	        voutput += "Number of records ranked: %s<br>" % len(reclist)
	        voutput += "Number of records not ranked: %s<br>" % len(reclist_addend)

	    reclist.sort(lambda x, y: cmp(x[1], y[1]))
	    return (reclist_addend + reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)

	def find_similar(rank_method_code, recID, hitset, rank_limit_relevance,verbose):
	    """Finding terms to use for calculating similarity. Terms are taken from the recid given, returns a list of recids's and relevance,
	    input:
	    rank_method_code - the code of the method, from the name field in rnkMETHOD
	    recID - records to use for find similar
	    hitset - a list of hits for the query found by search_engine
	    rank_limit_relevance - show only records with a rank value above this;
	    replaced by the configured default when overriding is set to "no"
	    verbose - verbose value
	    output:
	    reclist - a list of records sorted by ascending relevance: [(1,1), (344,24), (23,34)]
	    prefix - what to show before the rank value
	    postfix - what to show after the rank value
	    voutput - contains extra information, content dependent on verbose value
	    When nothing can be ranked, reclist is None and prefix holds a warning."""

	    startCreate = time.time()
	    global voutput

	    if verbose > 0:
	        voutput += "<br>Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br>" % rank_method_code
	    method = methods[rank_method_code]
	    if method["override_default_min_relevance"] == "no":
	        rank_limit_relevance = method["default_min_relevance"]

	    try:
	        recID = int(recID)
	    except Exception:
	        return (None, "Warning: Error in record number, please check that a number is given.", "", voutput)

	    word_table = method["rnkWORD_table"]
	    rec_terms = run_sql("SELECT termlist FROM %sR WHERE id_bibrec=%s" % (word_table[:-1], recID))
	    if not rec_terms:
	        return (None, "Warning: Requested record does not seem to exist.", "", voutput)
	    rec_terms = deserialize_via_marshal(rec_terms[0][0])

	    #Get all documents using terms from the selected documents
	    if len(rec_terms) == 0:
	        return (None, "Warning: Record spesified has no content indexed for use with this method.", "", voutput)
	    # Quote each term with the MySQL escaping rules; the Python repr of the
	    # key list is not valid SQL for terms with quotes or non-ASCII bytes.
	    quoted_terms = ["'%s'" % MySQLdb.escape_string(term) for term in rec_terms.keys()]
	    terms_recs = dict(run_sql("SELECT term, hitlist FROM %s WHERE term IN (%s)" % (word_table, ", ".join(quoted_terms))))

	    #Calculate all term frequencies
	    tf_values = {}
	    for term in rec_terms:
	        tf = rec_terms[term]
	        if len(term) >= method["min_word_length"] and term in terms_recs and tf[1] != 0:
	            tf_values[term] = int((1 + math.log(tf[0])) * tf[1])
	    tf_values = tf_values.items()
	    tf_values.sort(lambda x, y: cmp(y[1], x[1])) # highest weight first

	    lwords = []
	    (recdict, rec_termcount) = ({}, {})
	    few_terms = len(tf_values) <= method["max_nr_words_lower"]

	    for (t, tf) in tf_values: #t=term, tf=term frequency
	        term_recs = deserialize_via_marshal(terms_recs[t])
	        # with few terms every term is used, otherwise only terms whose
	        # document frequency lies within the configured limits
	        use_term = few_terms
	        if not use_term and len(term_recs) >= method["min_nr_words_docs"]:
	            occurrence = float(len(term_recs)) / float(method["col_size"])
	            use_term = method["min_word_occurence"] <= occurrence <= method["max_word_occurence"]
	        if use_term:
	            lwords.append((t, word_table))
	            (recdict, rec_termcount) = calculate_record_relevance_findsimilar((t, round(tf, 4)), term_recs, hitset, recdict, rec_termcount, verbose, "true")
	        if not few_terms and (len(lwords) == method["max_nr_words_upper"] or tf < 0):
	            break

	    if len(recdict) == 0 or len(lwords) == 0:
	        return (None, "Could not find any similar documents, possibly because of error in ranking data.", "", voutput)
	    (reclist, hitset) = sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

	    if verbose > 0:
	        voutput += "<br>Number of terms: %s<br>" % run_sql("SELECT count(id) FROM %s" % word_table)[0][0]
	        voutput += "Number of terms to use for query: %s<br>" % len(lwords)
	        voutput += "Terms: %s<br>" % lwords
	        voutput += "Current number of recIDs: %s<br>" % (method["col_size"])
	        voutput += "Prepare time: %s<br>" % (str(time.time() - startCreate))
	        voutput += "Total time used: %s<br>" % (str(time.time() - startCreate))
	        rank_method_stat(rank_method_code, reclist, lwords)

	    return (reclist, method["prefix"], method["postfix"], voutput)

	def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
	    """Ranking a records containing specified words and returns a sorted list.
	    input:
	    rank_method_code - the code of the method, from the name field in rnkMETHOD
	    lwords - a list of words from the query
	    hitset - a list of hits for the query found by search_engine
	    rank_limit_relevance - show only records with a rank value above this
	    verbose - verbose value
	    output:
	    reclist - unranked records first (value 0), then the ranked records sorted by ascending relevance: [(1,0), (344,24), (23,34)]
	    prefix - what to show before the rank value
	    postfix - what to show after the rank value
	    voutput - contains extra information, content dependent on verbose value
	    When nothing can be ranked, reclist is None and prefix holds a message."""

	    global voutput
	    startCreate = time.time()

	    if verbose > 0:
	        voutput += "<br>Running rank method: %s, using word_frequency function in bibrank_record_sorter<br>" % rank_method_code

	    method = methods[rank_method_code]
	    word_table = method["rnkWORD_table"]
	    # the "stopwords" entry is only cached when the method's config defines
	    # it, so a missing entry means that stopwords are not filtered
	    use_stopwords = method.get("stopwords")

	    lwords_old = lwords
	    lwords = []
	    words_removed = ""
	    #Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
	    for original in lwords_old:
	        term = original.lower()
	        if use_stopwords and is_stopword_force(term):
	            words_removed += "%s " % term
	            continue
	        lwords.append((term, word_table))
	        for part in re.sub(method["chars_alphanumericseparators"], ' ', term).lower().split():
	            if "stemmer" in method: # stem word
	                part = stem_by_lang(part.replace(' ', ''), method["stemmer"])
	            if original != part: #add if stemmed word is different than original word
	                lwords.append((part, word_table))
	    if verbose > 0 and words_removed:
	        voutput += "The following words are very common and were not included in ranking the documents: %s<br>" % words_removed

	    (recdict, rec_termcount) = ({}, {})
	    #For each term, if accepted, get a list of the records using the term
	    #calculate then relevance for each term before sorting the list of records
	    for (term, table) in lwords:
	        term_recs = run_sql("SELECT term, hitlist FROM %s WHERE term='%s'" % (word_table, MySQLdb.escape_string(term)))
	        if term_recs:
	            term_recs = deserialize_via_marshal(term_recs[0][1])
	            # a hitlist without its "Gi" entry cannot be weighted: skip the term
	            if "Gi" in term_recs and check_term(term, method["col_size"], len(term_recs), 1.0, 0.00, 0):
	                (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
	            del term_recs

	    # lwords holds (term, table) pairs, so the empty query is recognised on the term
	    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0][0] == ""):
	        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
	    (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

	    #Put the documents that were not ranked in front of the ranked ones, with value 0
	    if hitset:
	        hitset.calculate_nbhits()
	        lrecIDs = hitset.tolist() #using 2-3mb
	        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist #using 6mb

	    if verbose > 0:
	        voutput += "<br>Current number of recIDs: %s<br>" % (method["col_size"])
	        voutput += "Number of terms: %s<br>" % run_sql("SELECT count(id) FROM %s" % word_table)[0][0]
	        voutput += "Terms: %s<br>" % lwords
	        voutput += "Prepare and pre calculate time: %s<br>" % (str(time.time() - startCreate))
	        voutput += "Total time used: %s<br>" % (str(time.time() - startCreate))
	        rank_method_stat(rank_method_code, reclist, lwords)

	    return (reclist, method["prefix"], method["postfix"], voutput)

	def calculate_record_relevance(term, invidx, hitset, recdict, rec_termcount, verbose, quick=None):
	    """Add the contribution of one query term to the relevance of the records.
	    term - (term, query term factor) the term and its weight in the query
	    invidx - {recid: tf, "Gi": norm value} hitlist of the term; the "Gi" entry is removed from it
	    hitset - only records contained in this hitset are given a relevance
	    recdict - {recid: relevance} of the records ranked so far, updated in place
	    rec_termcount - {recid: count} number of query terms found in each record, updated in place
	    verbose - verbose value
	    quick - when set, a term with negative factor only updates records that are already in recdict
	    Returns (recdict, rec_termcount); both are returned untouched when invidx has no "Gi" entry."""

	    (_word, qtf) = term
	    if "Gi" not in invidx:
	        return (recdict, rec_termcount)
	    Gi = invidx["Gi"][1]
	    del invidx["Gi"]

	    scan_all = (not quick) or qtf >= 0 or (qtf < 0 and len(recdict) == 0)
	    if scan_all:
	        #Only accept records existing in the hitset received from the search engine
	        for j in invidx:
	            if not hitset.contains(j):
	                continue
	            tf = invidx[j]
	            recdict[j] = recdict.get(j, 0) + int(math.log(tf[0] * Gi * tf[1] * qtf))
	            rec_termcount[j] = rec_termcount.get(j, 0) + 1
	    else:
	        #much used term, do not include all records, only use already existing ones
	        for j in recdict:
	            if j not in invidx:
	                continue
	            tf = invidx[j]
	            recdict[j] = recdict.get(j, 0) + int(math.log(tf[0] * Gi * tf[1] * qtf))
	            rec_termcount[j] = rec_termcount.get(j, 0) + 1

	    return (recdict, rec_termcount)

	def calculate_record_relevance_findsimilar(term, invidx, hitset, recdict, rec_termcount, verbose, quick=None):
	    """Add the contribution of one term to the relevance of the records, find-similar weighting.
	    term - (term, query term factor) the term and its weight in the query
	    invidx - {recid: tf, "Gi": norm value} hitlist of the term; the "Gi" entry is removed from it
	    hitset - only records contained in this hitset are given a relevance
	    recdict - {recid: relevance} of the records ranked so far, updated in place
	    rec_termcount - {recid: count} number of query terms found in each record, updated in place
	    verbose - verbose value
	    quick - when set, a term with negative factor only updates records that are already in recdict
	    Returns (recdict, rec_termcount); both are returned untouched when invidx has no "Gi" entry."""

	    (_word, qtf) = term
	    if "Gi" not in invidx:
	        return (recdict, rec_termcount)
	    Gi = invidx["Gi"][1]
	    del invidx["Gi"]

	    scan_all = (not quick) or qtf >= 0 or (qtf < 0 and len(recdict) == 0)
	    if scan_all:
	        #Only accept records existing in the hitset received from the search engine
	        for j in invidx:
	            if not hitset.contains(j):
	                continue
	            tf = invidx[j]
	            recdict[j] = recdict.get(j, 0) + int((1 + math.log(tf[0])) * Gi * tf[1] * qtf)
	            rec_termcount[j] = rec_termcount.get(j, 0) + 1
	    else:
	        #much used term, do not include all records, only use already existing ones
	        for j in recdict:
	            if j not in invidx:
	                continue
	            tf = invidx[j]
	            recdict[j] = recdict[j] + int((1 + math.log(tf[0])) * Gi * tf[1] * qtf)
	            rec_termcount[j] = rec_termcount.get(j, 0) + 1

	    return (recdict, rec_termcount)

	def sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose):
	    """Sorts the dictionary and returns records with a relevance higher than the given value.
	    recdict - {recid: value} unsorted
	    rec_termcount - {recid: count}, not used by this function
	    hitset - every record of recdict is removed from it, so that only the unranked records remain
	    rank_limit_relevance - a value > 0 usually; records scaled below it are dropped
	    verbose - verbose value
	    Returns (reclist, hitset) where reclist holds (recid, relevance) pairs
	    sorted by ascending relevance, the relevance being scaled so that the
	    highest value of recdict corresponds to 100."""

	    global voutput
	    startCreate = time.time()

	    for j in recdict.keys():
	        hitset.remove(j)

	    if not recdict:
	        # nothing was ranked, there is no maximum to scale with
	        return ([], hitset)

	    divideby = max(recdict.values())
	    if divideby == 0:
	        # the best relevance is zero: scaling by it would divide by zero
	        divideby = 1

	    # decorate with (relevance, position) so that a plain sort orders the
	    # records by relevance and keeps the dictionary order between equal ones
	    decorated = []
	    for j in recdict:
	        w = int(recdict[j] * 100 / divideby)
	        if w >= rank_limit_relevance:
	            decorated.append((w, len(decorated), j))
	    decorated.sort()
	    reclist = [(j, w) for (w, position, j) in decorated]

	    if verbose > 0:
	        voutput += "Number of records sorted: %s<br>" % len(reclist)
	        voutput += "Sort time: %s<br>" % (str(time.time() - startCreate))
	    return (reclist, hitset)

	def sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose):
	    """Sorts the dictionary and returns records with a relevance higher than the given value.
	    recdict - {recid: value} unsorted; positive values are replaced in place
	    by the logarithm of value * number of matching terms
	    rec_termcount - {recid: count} number of terms matching in each record
	    hitset - every record of recdict is removed from it, so that only the unranked records remain
	    rank_limit_relevance - a value > 0 usually; records scaled below it are dropped
	    verbose - verbose value
	    Returns (reclist, hitset) where reclist holds (recid, relevance) pairs
	    sorted by ascending relevance, the relevance being scaled so that the
	    highest value corresponds to 100."""

	    global voutput
	    startCreate = time.time()

	    #Weight each record by the number of query terms it contains
	    for j in recdict.keys():
	        hitset.remove(j)
	        if recdict[j] > 0:
	            recdict[j] = math.log((recdict[j] * rec_termcount[j]))

	    if not recdict:
	        # nothing was ranked, there is no maximum to scale with
	        return ([], hitset)

	    divideby = max(recdict.values())
	    if divideby == 0:
	        # the best relevance is zero (log(1) == 0): scaling by it would divide by zero
	        divideby = 1

	    # decorate with (relevance, position) so that a plain sort orders the
	    # records by relevance and keeps the dictionary order between equal ones
	    decorated = []
	    for j in recdict:
	        w = int(recdict[j] * 100 / divideby)
	        if w >= rank_limit_relevance:
	            decorated.append((w, len(decorated), j))
	    decorated.sort()
	    reclist = [(j, w) for (w, position, j) in decorated]

	    if verbose > 0:
	        voutput += "Number of records sorted: %s<br>" % len(reclist)
	        voutput += "Sort time: %s<br>" % (str(time.time() - startCreate))
	    return (reclist, hitset)

	def rank_method_stat(rank_method_code, reclist, lwords):
	    """Shows some statistics about the searchresult.
	    rank_method_code - name field from rnkMETHOD (not used by this function)
	    reclist - a list of sorted and ranked records, best record last
	    lwords - the words in the query, as (term, table) pairs
	    The statistics are appended to the module-level voutput: the last (at
	    most 20) records of reclist with the stored entry of each query term
	    found in them, followed by the number of records per score from 100
	    down to 0."""

	    global voutput
	    nb_shown = min(len(reclist), 20)

	    # Load the hitlist of each term once instead of once per shown record.
	    # The term is escaped as it comes from the query.
	    term_hitlists = []
	    if nb_shown:
	        for (term, table) in lwords:
	            term_recs = run_sql("SELECT hitlist FROM %s WHERE term='%s'" % (table, MySQLdb.escape_string(term)))
	            if term_recs:
	                term_hitlists.append((term, deserialize_via_marshal(term_recs[0][0])))

	    voutput += "<br>Rank statistics:<br>"
	    for i in range(1, nb_shown + 1):
	        (recID, score) = (reclist[-i][0], reclist[-i][1])
	        voutput += "%s,Recid:%s,Score:%s<br>" % (i, recID, score)
	        for (term, hitlist) in term_hitlists:
	            if recID in hitlist:
	                voutput += "%s-%s / " % (term, hitlist[recID])
	        voutput += "<br>"

	    voutput += "<br>Score variation:<br>"
	    count = {}
	    for rec in reclist:
	        count[rec[1]] = count.get(rec[1], 0) + 1
	    i = 100
	    while i >= 0:
	        if i in count:
	            voutput += "%s-%s<br>" % (i, count[i])
	        i -= 1

	# Optional speed-up: let psyco compile the ranking functions when it is installed.
	# Only functions defined in this file are bound; binding an undefined name
	# raised a NameError that was swallowed and skipped all following binds.
	try:
	    import psyco
	    psyco.bind(find_similar)
	    psyco.bind(rank_by_method)
	    psyco.bind(calculate_record_relevance)
	    psyco.bind(calculate_record_relevance_findsimilar)
	    psyco.bind(word_similarity)
	    psyco.bind(sort_record_relevance)
	    psyco.bind(sort_record_relevance_findsimilar)
	    psyco.bind(serialize_via_numeric_array)
	    psyco.bind(serialize_via_marshal)
	    psyco.bind(deserialize_via_numeric_array)
	    psyco.bind(deserialize_via_marshal)
	except StandardError:
	    # psyco is missing or could not bind: run without it
	    pass

bibrank_record_sorter.py.wmlNo OneTemporaryActions

File Metadata

bibrank_record_sorter.py.wmlView Options

Event Timeline

bibrank_record_sorter.py.wml
No OneTemporary
Actions

bibrank_record_sorter.py.wml
View Options