diff --git a/modules/bibrank/lib/bibrank_record_sorter.py b/modules/bibrank/lib/bibrank_record_sorter.py
index 764620e05..c2f4253d0 100644
--- a/modules/bibrank/lib/bibrank_record_sorter.py
+++ b/modules/bibrank/lib/bibrank_record_sorter.py
@@ -1,736 +1,746 @@
 ##Ranking of records using different parameters and methods on the fly.
 
 ## This file is part of the CERN Document Server Software (CDSware).
 ## Copyright (C) 2002 CERN.
 ##
 ## The CDSware is free software; you can redistribute it and/or
 ## modify it under the terms of the GNU General Public License as
 ## published by the Free Software Foundation; either version 2 of the
 ## License, or (at your option) any later version.
 ##
 ## The CDSware is distributed in the hope that it will be useful, but
 ## WITHOUT ANY WARRANTY; without even the implied warranty of
 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ## General Public License for more details.
 ##
 ## You should have received a copy of the GNU General Public License
 ## along with CDSware; if not, write to the Free Software Foundation, Inc.,
 ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 
 ## read config variables:
 #include "config.wml"
 #include "configbis.wml"
 #include "cdswmllib.wml"
 
 ## start Python:
 <protect>#!</protect><PYTHON>
 <protect># -*- coding: utf-8 -*-</protect>
 <protect>## $Id$</protect>
 <protect>## DO NOT EDIT THIS FILE!  IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
 ## fill config variables:
 pylibdir = "<LIBDIR>/python"
 
 try:
     import sys
     import zlib
     import marshal
     import string
     import time
     import math
     import MySQLdb
     import Numeric
     import re
     import ConfigParser
     import traceback
     import copy
 except ImportError, e:
     pass
 
 try:
     sys.path.append('%s' % pylibdir)
     from cdsware.config import *
     from cdsware.dbquery import run_sql
     from cdsware.bibindex_engine_stemmer import stem_by_lang, lang_available
     from cdsware.bibindex_engine_stopwords import is_stopword_force
     from cdsware.search_engine_config import cfg_max_recID
 except ImportError, e:
     pass
 
 class HitSet:
     """Set of records, implemented as a Numeric array indexed by recID.
     A non-zero element at index recID means the record belongs to the set.
     One array element is used per record for speed; "real" bit vectors
     could be used later to save space."""
 
     def __init__(self, init_set=None):
         """Create the set.
         init_set - optional array used directly (not copied) as storage;
                    when omitted an empty set with cfg_max_recID+1 slots
                    is allocated."""
         self._nbhits = -1  # stays -1 until calculate_nbhits() is called
         # Test against None explicitly: an array's truth value is derived
         # from its elements, so a supplied but empty (all-zero) set must
         # not be mistaken for "no set given".
         if init_set is not None:
             self._set = init_set
         else:
             self._set = Numeric.zeros(cfg_max_recID+1, Numeric.Int0)
 
     def __repr__(self, join=string.join):
         return "%s(%s)" % (self.__class__.__name__, join(map(repr, self._set), ', '))
 
     def add(self, recID):
         "Adds a record to the set."
         self._set[recID] = 1
 
     def addmany(self, recIDs):
         "Adds several recIDs to the set."
         for recID in recIDs:
             self._set[recID] = 1
 
     def addlist(self, arr):
         "Adds an array of recIDs to the set."
         Numeric.put(self._set, arr, 1)
 
     def remove(self, recID):
         "Removes a record from the set."
         self._set[recID] = 0
 
     def removemany(self, recIDs):
         "Removes several records from the set."
         for recID in recIDs:
             self.remove(recID)
 
     def intersect(self, other):
         "Does a set intersection with other.  Keep result in self."
         self._set = Numeric.bitwise_and(self._set, other._set)
 
     def union(self, other):
         "Does a set union with other. Keep result in self."
         self._set = Numeric.bitwise_or(self._set, other._set)
 
     def difference(self, other):
         "Does a set difference with other. Keep result in self."
         # Clear every slot that is set in other with one array operation
         # (same call as addlist, writing 0 instead of 1) rather than
         # removing the records one by one in a Python loop.
         Numeric.put(self._set, Numeric.nonzero(other._set), 0)
 
     def contains(self, recID):
         "Checks whether the set contains recID; returns the stored element."
         return self._set[recID]
 
     __contains__ = contains     # Higher performance member-test for python 2.0 and above
 
     def __getitem__(self, index):
         """Support for the 'for item in set:' protocol.
         Returns the index-th recID present in the set; the list of present
         recIDs is recomputed on every call."""
         return Numeric.nonzero(self._set)[index]
 
     def calculate_nbhits(self):
         """Calculates the number of records set in the hitset.
         The result is stored in self._nbhits; nothing is returned."""
         # astype() already yields a new array, so no explicit copy is needed.
         # NOTE(review): the cast to Numeric.Int is presumably there so the
         # sum is not computed in the small element type -- confirm.
         self._nbhits = Numeric.sum(self._set.astype(Numeric.Int))
 
     def items(self):
         "Return an array containing all recID."
         return Numeric.nonzero(self._set)
 
     def tolist(self):
         "Return a Python list containing all recID."
         return Numeric.nonzero(self._set).tolist()
 
 def compare_on_val(first, second):
     """cmp-style comparison on element [1] of two sequences, with the
     arguments swapped so that sorting puts the largest value first."""
     left = second[1]
     right = first[1]
     return cmp(left, right)
 
 def serialize_via_numeric_array_dumps(arr):
     """Step 1 of array serialisation: dump the Numeric array to a string."""
     dumped = Numeric.dumps(arr)
     return dumped
 
 def serialize_via_numeric_array_compr(str):
     """Step 2 of array serialisation: zlib-compress the dumped string."""
     compressed = zlib.compress(str)
     return compressed
 
 def serialize_via_numeric_array_escape(str):
     """Step 3 of array serialisation: escape the string with MySQLdb."""
     escaped = MySQLdb.escape_string(str)
     return escaped
 
 def serialize_via_numeric_array(arr):
     """Serialize Numeric array into a compressed, escaped string."""
     dumped = serialize_via_numeric_array_dumps(arr)
     compressed = serialize_via_numeric_array_compr(dumped)
     return serialize_via_numeric_array_escape(compressed)
 
 def deserialize_via_numeric_array(string):
     """Decompress and deserialize string into a Numeric array."""
     raw = zlib.decompress(string)
     return Numeric.loads(raw)
 
 def serialize_via_marshal(obj):
     """Serialize Python object via marshal into a compressed, escaped string."""
     marshalled = marshal.dumps(obj)
     compressed = zlib.compress(marshalled)
     return MySQLdb.escape_string(compressed)
 
 def deserialize_via_marshal(string):
     """Decompress and deserialize string into a Python object via marshal."""
     raw = zlib.decompress(string)
     return marshal.loads(raw)
 
 def adderrorbox(header='', datalist=None):
     """Render a one-row HTML table of class "errorbox".
     header   - text for the header cell, which spans all columns
     datalist - sequence of strings, one table cell each; the strings are
                inserted verbatim (no escaping is done here), so they may
                carry HTML markup
     Returns the markup as a single string.  All cells get the same
     percentage width, 100 // number of cells."""
 
     # None instead of a shared mutable [] default; omitted means "no cells".
     if datalist is None:
         datalist = []
 
     nb_cells = len(datalist)
     # The width only ever appears inside a cell, so when there are no cells
     # no value is needed and the division by zero is simply avoided.
     if nb_cells:
         width = str(100 // nb_cells) + '%'
     else:
         width = ''
 
     cell_open = '<td style="vertical-align: top; margin-top: 5px; width: %s;">' % (width, )
     parts = ['<table class="errorbox">',
              '<thead><tr><th class="errorboxheader" colspan="%s">%s</th></tr></thead>' % (nb_cells, header),
              '<tbody>',
              '<tr>']
     for data in datalist:
         parts.append(cell_open)
         parts.append(data)
         parts.append('</td>')
     parts.append('</tr>')
     parts.append('</tbody></table>')
     return ''.join(parts)
 
 def check_term(term, col_size, term_rec, max_occ, min_occ, termlength):
     """Check if the term is valid for use
     term - the term to check
     col_size - the number of records in database
     term_rec - the number of records which contains this term
     max_occ - max frequency of the term allowed
     min_occ - min frequency of the term allowed
     termlength - the minimum length of the terms allowed
     Returns "true" when the term may be used and "" when it is rejected
     (stopword, too short, too frequent, too rare, or an integer)."""
 
     try:
         if is_stopword_force(term) or len(term) <= termlength:
             return ""
         # share of the records in the database that contain the term
         occurence = float(term_rec) / float(col_size)
         if occurence >= max_occ or occurence <= min_occ:
             return ""
         # Integer terms are rejected.  Only the success of the conversion
         # counts: testing the converted value instead would let terms such
         # as "0" through, because they convert to a false value.
         int(term)
         return ""
     except StandardError:
         # Best effort, as before: a term that is not an integer (int()
         # fails) or whose frequency cannot be computed (e.g. col_size is 0)
         # is accepted.
         pass
     return "true"
 
 def create_rnkmethod_cache():
     """Create cache with vital information for each rank method."""
 
     # Rebuilds the module-level dict `methods` (one entry per name found in
     # rnkMETHOD) and resets the module-level `voutput` string.
     global methods    
     bibrank_meths = run_sql("SELECT name from rnkMETHOD")
     methods = {}
     global voutput
     voutput = ""
 
     for (rank_method_code,) in bibrank_meths:
         # Each method is configured in <etcdir>/bibrank/<name>.cfg.
         # NOTE(review): errors while opening/parsing the file are swallowed
         # here; `config` may then be empty, incomplete or left over from the
         # previous method, and the lookups below fail or use wrong data --
         # confirm this is intended.
         try:
             file = etcdir + "/bibrank/" + rank_method_code + ".cfg"
             config = ConfigParser.ConfigParser()
             config.readfp(open(file))
         except StandardError, e:
             pass
 
         # The "function" option of [rank_method] is stored as the method's
         # function name and is also the name of the section holding the
         # remaining options read below.
         cfg_function = config.get("rank_method", "function")
         if config.has_section(cfg_function):
             methods[rank_method_code] = {}
             methods[rank_method_code]["function"] = cfg_function
             methods[rank_method_code]["prefix"] = config.get(cfg_function, "relevance_number_output_prologue")
             methods[rank_method_code]["postfix"] = config.get(cfg_function, "relevance_number_output_epilogue")
             methods[rank_method_code]["chars_alphanumericseparators"] = r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]"
         else:
             raise Exception("Error in configuration file: %s" % (etcdir + "/bibrank/" + rank_method_code + ".cfg"))
 
         # Translated method names, stored under their language code.
         i8n_names = run_sql("SELECT ln,value from rnkMETHODNAME,rnkMETHOD where id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name='%s'" % (rank_method_code))
         for (ln, value) in i8n_names:
             methods[rank_method_code][ln] = value
 
         if config.has_option(cfg_function, "table"):
             methods[rank_method_code]["rnkWORD_table"] = config.get(cfg_function, "table")
             # col_size is the row count of the table whose name is the
             # configured table name with its last character replaced by "R".
             methods[rank_method_code]["col_size"] = run_sql("SELECT count(*) FROM %sR" % methods[rank_method_code]["rnkWORD_table"][:-1])[0][0]
 
         # "stemmer" is only set when the option exists and is non-empty.
         if config.has_option(cfg_function, "stemming") and config.get(cfg_function, "stemming"):
             try:
                 methods[rank_method_code]["stemmer"] = config.get(cfg_function, "stemming")
             except Exception,e:
                 pass
 
         # "stopwords" is only set when the option exists.
         if config.has_option(cfg_function, "stopword"):
             methods[rank_method_code]["stopwords"] = config.get(cfg_function, "stopword")
 
         # Thresholds read by find_similar().
         if config.has_section("find_similar"):
             methods[rank_method_code]["max_word_occurence"] = float(config.get("find_similar", "max_word_occurence"))
             methods[rank_method_code]["min_word_occurence"] = float(config.get("find_similar", "min_word_occurence"))
             methods[rank_method_code]["min_word_length"] = int(config.get("find_similar", "min_word_length"))
             methods[rank_method_code]["min_nr_words_docs"] = int(config.get("find_similar", "min_nr_words_docs"))
             methods[rank_method_code]["max_nr_words_upper"] = int(config.get("find_similar", "max_nr_words_upper"))
             methods[rank_method_code]["max_nr_words_lower"] = int(config.get("find_similar", "max_nr_words_lower"))
-            methods[rank_method_code]["override_default_min_relevance"] = config.get("find_similar", "override_default_min_relevance")
             methods[rank_method_code]["default_min_relevance"] = int(config.get("find_similar", "default_min_relevance"))
             
         # Options method1, method2, ... are read until the first missing
         # number; each value is split on "," into a list.
         if config.has_section("combine_method"):
             i = 1
             methods[rank_method_code]["combine_method"] = []
             while config.has_option("combine_method", "method%s" % i):
                 methods[rank_method_code]["combine_method"].append(string.split(config.get("combine_method", "method%s" % i), ","))
                 i += 1
 
 def is_method_valid(colID, rank_method_code):
     """Tell whether a rank method is enabled for a collection.
     Returns 1 when the method is attached to the collection itself or to
     one of the ancestors reached by following id_dad links in
     collection_collection, otherwise 0."""
 
     query = "SELECT id_collection, score from collection_rnkMETHOD,rnkMETHOD WHERE id_rnkMETHOD=rnkMETHOD.id AND name='%s'" % rank_method_code
     enabled_colls = dict(run_sql(query))
 
     current = int(colID)
     if current in enabled_colls:
         return 1
 
     # climb towards the root, one parent lookup per step
     while current:
         parents = run_sql("SELECT id_dad FROM collection_collection WHERE id_son=%s" % current)
         if not parents:
             break
         current = parents[0][0]
         if current in enabled_colls:
             return 1
     return 0
 
 def get_bibrank_methods(collection, ln=cdslang):
     """Return a list of (rank method code, display name) pairs for the rank
     methods that are enabled for the given collection.  The name is taken
     in language ln when available, else in cdslang, else the method code
     itself is used as name."""
 
     # build the method cache on first use
     if 'methods' not in globals():
         create_rnkmethod_cache()
 
     avail_methods = []
     for rank_method_code, options in methods.items():
         if "function" not in options:
             continue
         if not is_method_valid(collection, rank_method_code):
             continue
         name = options.get(ln, options.get(cdslang, rank_method_code))
         avail_methods.append((rank_method_code, name))
     return avail_methods
 
 def rank_records(rank_method_code, rank_limit_relevance, hitset_global, pattern=[], verbose=0):
     """rank_method_code, e.g. `jif' or `sbr' (word frequency vector model)
        rank_limit_relevance, e.g. `23' for `nbc' (number of citations) or `0.10' for `vec'
        hitset, search engine hits;
        pattern, search engine query or record ID (you check the type)
        verbose, verbose level
        output:
        list of records
        list of rank values
        prefix
        postfix
        verbose_output"""
 
     # NOTE(review): pattern=[] is a mutable default; it is only read in this
     # function, but it is passed on to the ranking functions.
     global voutput
     voutput = ""
     configcreated = ""
 
     try:
         hitset = copy.deepcopy(hitset_global) #we are receiving a global hitset
         if not globals().has_key('methods'):
             create_rnkmethod_cache()
 
         # Dispatch on the configured function name, looked up among this
         # module's globals:
         #  - a "recid:" pattern with the word_similarity function goes to
         #    find_similar, with the text after "recid:" as record id;
         #  - any other function found in the module is called directly;
         #  - a name that is not found falls back to rank_by_method.
         function = methods[rank_method_code]["function"]
         func_object = globals().get(function)
         if func_object and pattern and pattern[0][0:6] == "recid:" and function == "word_similarity":
             result = find_similar(rank_method_code, pattern[0][6:], hitset, rank_limit_relevance, verbose)
         elif func_object:
             result = func_object(rank_method_code, pattern, hitset, rank_limit_relevance, verbose)
         else:
             result = rank_by_method(rank_method_code, pattern, hitset, rank_limit_relevance, verbose)
     except Exception, e:
         # Any failure is turned into a 4-tuple with no records and an HTML
         # error box; its first element is None, so the "else" branch below
         # is taken.
         result = (None, "", adderrorbox("An error occured when trying to rank the search result", ["Unexpected error: %s<br><b>Traceback:</b>%s" % (e, traceback.format_tb(sys.exc_info()[2]))]), voutput)
 
     # result is a 4-tuple here; it is widened to the 5-tuple described in
     # the docstring.  The split is skipped (None, None returned) when either
     # the record list or the second element is empty.
-    if result[0] and result[1]:
+    if result[0] and result[1]: #split into two lists for search_engine
         results_similar_recIDs = map(lambda x: x[0], result[0])
         results_similar_relevances = map(lambda x: x[1], result[0])
         result = (results_similar_recIDs, results_similar_relevances, result[1], result[2], "%s" % configcreated + result[3])
     else:
         result = (None, None, result[1], result[2], result[3])
 
     if verbose > 0:
         print string.replace(voutput, "<br>", "\n")
 
     return result
 
 def combine_method(rank_method_code, pattern, hitset, rank_limit_relevance,verbose):
-    """combining several methods"""
+    """combining several methods into one based on methods/percentage in config file"""
 
     # Returns the same 4-tuple shape as the other ranking functions:
     # (list of (recID, value), prefix, postfix, voutput), or
     # (None, warning, "", voutput) when anything goes wrong.
     global voutput
     result = {}
     try:
         # Every entry of the "combine_method" list is unpacked as a
         # (method, percent) pair.
         for (method, percent) in methods[rank_method_code]["combine_method"]:
             function = methods[method]["function"]
             func_object = globals().get(function)
             percent = int(percent)
 
             # Only the record list (element 0) of the sub-result is used.
             if func_object:
                 this_result = func_object(method, pattern, hitset, rank_limit_relevance, verbose)[0]
             else:
                 this_result = rank_by_method(method, pattern, hitset, rank_limit_relevance, verbose)[0]
 
             # A record's contribution is its position in the sub-result
             # (as a fraction of the list length) scaled by the method's
             # percentage; records with a value <= 0 contribute nothing.
             for i in range(0, len(this_result)): 
                 (recID, value) = this_result[i]  
                 if value > 0:
                     result[recID] = result.get(recID, 0) + int((float(i) / len(this_result)) * float(percent))
 
         # ascending order of combined value
         result = result.items()
         result.sort(lambda x, y: cmp(x[1], y[1]))
         return (result, "(", ")", voutput)
     except Exception, e:
         return (None, "Warning, method cannot be used for ranking your query.", "", voutput)
         
         
 def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
     """Ranking of records based on predetermined values.
     input:
     rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from
     rnkMETHODDATA
     lwords - a list of words from the query
     hitset - a list of hits for the query found by search_engine
     rank_limit_relevance - show only records with a rank value above this
     verbose - verbose value
     output:
     reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
     prefix - what to show before the rank value
     postfix - what to show after the rank value
     voutput - contains extra information, content dependent on verbose value"""
 
     # NOTE(review): rank_limit_relevance is not used anywhere in this body.
     global voutput
     rnkdict = run_sql("SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name='%s'" % rank_method_code)
 
     if not rnkdict:
         return (None, "Warning, Could not load ranking data for method.", "", voutput)
 
     # Query words of the form "recid:N" or "recid:A->B" restrict ranking to
     # those records; they are collected in lwords_hitset.
     lwords_hitset = None
-    for j in range(0, len(lwords)):
+    for j in range(0, len(lwords)): #find which docs to search based on ranges..should be done in search_engine...
         if lwords[j] and lwords[j][:6] == "recid:":
             if not lwords_hitset:
                 lwords_hitset = HitSet()
             lword = lwords[j][6:]
             if string.find(lword, "->") > -1:
                 lword = string.split(lword, "->")
                 if int(lword[0]) >= cfg_max_recID + 1 or int(lword[1]) >= cfg_max_recID + 1:        
                     return (None, "Warning, The record range given is out of range.", "", voutput)  
                 # NOTE(review): range() stops before int(lword[1]), so the
                 # upper record of "A->B" is not added -- confirm intended.
                 for i in range(int(lword[0]), int(lword[1])):
                     lwords_hitset.add(int(i))
             # NOTE(review): lword is still a string here while the right-hand
             # side is computed from cfg_max_recID (presumably an integer);
             # the range branch above converts with int() first -- verify this
             # comparison behaves as intended.
             elif lword < cfg_max_recID + 1:
                 lwords_hitset.add(int(lword))
             else:
                 return (None, "Warning, The record range given is out of range.", "", voutput)  
     
     # The stored ranking data is used as a dictionary keyed by recID.
     rnkdict = deserialize_via_marshal(rnkdict[0][0])
     if verbose > 0:
         voutput += "<br>Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br>" % rank_method_code
         voutput += "Ranking data loaded, size of structure: %s<br>" % len(rnkdict)
     lrecIDs = hitset.items()
 
     if verbose > 0:
         voutput += "Number of records to rank: %s<br>" % len(lrecIDs)
     # reclist: records with a stored value; reclist_addend: records without
     # one, given the value 0.
     reclist = []
     reclist_addend = []
 
-    if not lwords_hitset:
+    if not lwords_hitset: #rank all docs, can this be speed up using something else than for loop?
         for recID in lrecIDs:
             if rnkdict.has_key(recID):
                 reclist.append((recID, rnkdict[recID]))
                 del rnkdict[recID]
             else:
                 reclist_addend.append((recID, 0))
-    else:
+    else: #rank docs in hitset, can this be speed up using something else than for loop?
         lwords_lrecIDs = lwords_hitset.items()
         for recID in lwords_lrecIDs:
             if rnkdict.has_key(recID) and hitset.contains(recID):
                 reclist.append((recID, rnkdict[recID]))
                 del rnkdict[recID]
             elif hitset.contains(recID):
                 reclist_addend.append((recID, 0))
         
     if verbose > 0:
         voutput += "Number of records ranked: %s<br>" % len(reclist)
         voutput += "Number of records not ranked: %s<br>" % len(reclist_addend)
 
     # Ranked records are sorted by ascending value; the unranked ones are
     # placed in front of them in the returned list.
     reclist.sort(lambda x, y: cmp(x[1], y[1]))
     return (reclist_addend + reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
 
 def find_similar(rank_method_code, recID, hitset, rank_limit_relevance,verbose):
     """Finding terms to use for calculating similarity. Terms are taken from the recid given, returns a list of recids's and relevance,
     input:
     rank_method_code - the code of the method, from the name field in rnkMETHOD
     recID - records to use for find similar
     hitset - a list of hits for the query found by search_engine
     rank_limit_relevance - show only records with a rank value above this
     verbose - verbose value
     output:
     reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
     prefix - what to show before the rank value
     postfix - what to show after the rank value
     voutput - contains extra information, content dependent on verbose value"""
 
     startCreate = time.time()
     global voutput
 
     if verbose > 0:
         voutput += "<br>Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br>" % rank_method_code
     # The rank_limit_relevance argument is replaced by the method's
     # configured default_min_relevance.
-    if methods[rank_method_code]["override_default_min_relevance"] == "no":
-        rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]
+    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]
 
     try:
         recID = int(recID)
     except Exception,e :
         return (None, "Warning: Error in record number, please check that a number is given.", "", voutput) 
 
     # The record's term list is read from the table named like the word
     # table, with its last character replaced by "R".
     rec_terms = run_sql("SELECT termlist FROM %sR WHERE id_bibrec=%s" % (methods[rank_method_code]["rnkWORD_table"][:-1], recID))
     if not rec_terms:
         return (None, "Warning: Requested record does not seem to exist.", "", voutput) 
     rec_terms = deserialize_via_marshal(rec_terms[0][0])
 
     #Get all documents using terms from the selected documents
     if len(rec_terms) == 0:
         return (None, "Warning: Record spesified has no content indexed for use with this method.", "", voutput)
     else:
         # The IN (...) list is the printed form of the list of terms with
         # the surrounding brackets cut off.
         # NOTE(review): this relies on Python's quoting of the terms being
         # acceptable to the database -- confirm for terms containing quotes.
         terms = "%s" % rec_terms.keys()
         terms_recs = dict(run_sql("SELECT term, hitlist FROM %s WHERE term IN (%s)" % (methods[rank_method_code]["rnkWORD_table"], terms[1:len(terms) - 1])))
 
     tf_values = {}
     #Calculate all term frequencies
     for (term, tf) in rec_terms.iteritems():
 	if len(term) >= methods[rank_method_code]["min_word_length"] and terms_recs.has_key(term) and tf[1] != 0:
-            tf_values[term] =  int((1 + math.log(tf[0])) * tf[1])
+            tf_values[term] =  int((1 + math.log(tf[0])) * tf[1]) #calculate term weigth
     tf_values = tf_values.items()
-    tf_values.sort(lambda x, y: cmp(y[1], x[1])) 
+    tf_values.sort(lambda x, y: cmp(y[1], x[1])) #sort based on weigth
 
     lwords = []
     stime = time.time()
     (recdict, rec_termcount) = ({}, {})
 
     # Terms are visited from highest to lowest weight.  A term is used when
     # the record has at most max_nr_words_lower candidate terms, or when the
     # term occurs in at least min_nr_words_docs entries and its share of
     # col_size lies within [min_word_occurence, max_word_occurence].
     for (t, tf) in tf_values: #t=term, tf=term frequency
         term_recs = deserialize_via_marshal(terms_recs[t])
-        if len(tf_values) <= methods[rank_method_code]["max_nr_words_lower"] or (len(term_recs) >= methods[rank_method_code]["min_nr_words_docs"] and (((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) <=  methods[rank_method_code]["max_word_occurence"]) and ((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) >= methods[rank_method_code]["min_word_occurence"]))):
-             lwords.append((t, methods[rank_method_code]["rnkWORD_table"]))
-             (recdict, rec_termcount) = calculate_record_relevance_findsimilar((t, round(tf, 4)) , term_recs, hitset, recdict, rec_termcount, verbose, "true")
+        if len(tf_values) <= methods[rank_method_code]["max_nr_words_lower"] or (len(term_recs) >= methods[rank_method_code]["min_nr_words_docs"] and (((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) <=  methods[rank_method_code]["max_word_occurence"]) and ((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) >= methods[rank_method_code]["min_word_occurence"]))): #too complicated...something must be done
+             lwords.append((t, methods[rank_method_code]["rnkWORD_table"])) #list of terms used
+             (recdict, rec_termcount) = calculate_record_relevance_findsimilar((t, round(tf, 4)) , term_recs, hitset, recdict, rec_termcount, verbose, "true") #true tells the function to not calculate all unimportant terms
         # With more than max_nr_words_lower candidate terms, stop as soon as
         # max_nr_words_upper terms have been used or a negative weight is
         # reached.
         if len(tf_values) > methods[rank_method_code]["max_nr_words_lower"] and (len(lwords) ==  methods[rank_method_code]["max_nr_words_upper"] or tf < 0):
             break
 
     if len(recdict) == 0 or len(lwords) == 0:
         return (None, "Could not find any similar documents, possibly because of error in ranking data.", "", voutput)
-    else:
+    else: #sort if we got something to sort
         (reclist, hitset) = sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
 
     # NOTE(review): "Prepare time" and "Total time used" below both measure
     # from startCreate, and stime is assigned above but never read.
     if verbose > 0:
         voutput += "<br>Number of terms: %s<br>" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
         voutput += "Number of terms to use for query: %s<br>" % len(lwords)
         voutput += "Terms: %s<br>" % lwords
         voutput += "Current number of recIDs: %s<br>" % (methods[rank_method_code]["col_size"])
         voutput += "Prepare time: %s<br>" % (str(time.time() - startCreate))
         voutput += "Total time used: %s<br>" % (str(time.time() - startCreate))
         rank_method_stat(rank_method_code, reclist, lwords)
 
     # the slice covers the whole list, i.e. all of reclist is returned
     return (reclist[:len(reclist)], methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
 
 def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
     """Ranking a records containing specified words and returns a sorted list.
     input:
     rank_method_code - the code of the method, from the name field in rnkMETHOD
     lwords - a list of words from the query
     hitset - a list of hits for the query found by search_engine
     rank_limit_relevance - show only records with a rank value above this
     verbose - verbose value
     output:
     reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
     prefix - what to show before the rank value
     postfix - what to show after the rank value
     voutput - contains extra information, content dependent on verbose value"""
 
     global voutput
     startCreate = time.time()
 
     if verbose > 0:
         voutput += "<br>Running rank method: %s, using word_frequency function in bibrank_record_sorter<br>" % rank_method_code
 
     # lwords is rebuilt as a list of (term, word table name) tuples.
     lwords_old = lwords
     lwords = []
     #Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
     for i in range(0, len(lwords_old)):
         term = string.lower(lwords_old[i])
         # NOTE(review): the "stopwords" key is only stored in the cache when
         # the config file has a "stopword" option; without it this lookup
         # raises KeyError.  The condition groups as
         # (stopwords != "True") or (stopwords and not a stopword).
         if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword_force(term):
             lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
             # Separator characters and digits are replaced by blanks and the
             # term is split into its parts.
             terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], ' ', term)))  
             for term in terms: 
                 if methods[rank_method_code].has_key("stemmer"): # stem word
                     term = stem_by_lang(string.replace(term, ' ', ''), methods[rank_method_code]["stemmer"])
                 if lwords_old[i] != term: #add if stemmed word is different than original word
 	            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
 
     # NOTE(review): lrecIDs_remove is initialised here but not used below.
     (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
     #For each term, if accepted, get a list of the records using the term
     #calculate then relevance for each term before sorting the list of records
     for (term, table) in lwords:
 	term_recs = run_sql("SELECT term, hitlist FROM %s WHERE term='%s'" % (methods[rank_method_code]["rnkWORD_table"], MySQLdb.escape_string(term)))
-        if term_recs:
+        if term_recs: #if term exists in database, use for ranking
 	    term_recs = deserialize_via_marshal(term_recs[0][1])
             # NOTE(review): term_recs["Gi"] is indexed here before
             # calculate_record_relevance gets the chance to handle a missing
             # "Gi" entry, so a hitlist without it raises KeyError.
-            if check_term(term, methods[rank_method_code]["col_size"], len(term_recs), 1.0, 0.00, 0):
-                (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])) , term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
+            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])) , term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
             del term_recs
 
     # NOTE(review): the entries of lwords are tuples, so lwords[0] == ""
     # cannot be true as written -- confirm what was meant.
     if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
         return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
-    else: 
+    else: #sort if we got something to sort
         (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
 
     #Add any documents not ranked to the end of the list
     # (they get the value 0 and are put in front of reclist)
     if hitset:
         hitset.calculate_nbhits()
         lrecIDs = hitset.tolist()                                #using 2-3mb 
         reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist      #using 6mb
 
     if verbose > 0:
         voutput += "<br>Current number of recIDs: %s<br>" % (methods[rank_method_code]["col_size"])
         voutput += "Number of terms: %s<br>" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
         voutput += "Terms: %s<br>" % lwords
         voutput += "Prepare and pre calculate time: %s<br>" % (str(time.time() - startCreate))
         voutput += "Total time used: %s<br>" % (str(time.time() - startCreate))
         rank_method_stat(rank_method_code, reclist, lwords)
 
     return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
 
 def calculate_record_relevance(term, invidx, hitset, recdict, rec_termcount, verbose, quick=None):
     """Calculating the relevance of the documents based on the input, calculates only one word
     term - (term, query term factor) the term and its importance in the overall search
     invidx - {recid: tf, Gi: norm value} The Gi value is used as a idf value
     hitset - a hitset with records that are allowed to be ranked
     recdict - contains currently ranked records, is returned with new values
     rec_termcount - {recid: count} the number of terms in this record that matches the query
     verbose - verbose value
     quick - if quick=yes only terms with a positive qtf is used, to limit the number of records to sort"""
 
     
     # Returns the (recdict, rec_termcount) pair; both dictionaries are
     # updated in place.  The "Gi" entry is deleted from the caller's invidx
     # so that the loops below only see record entries.
     (t, qtf) = term
-    if invidx.has_key("Gi"):
+    if invidx.has_key("Gi"):#Gi = weigth for this term, created by bibrank_word_indexer
         Gi = invidx["Gi"][1]
         del invidx["Gi"]
-    else:
+    else: #if not existing, bibrank should be run with -R
         return (recdict, rec_termcount)
 
     # Full pass over every record of the term, unless quick is set and the
     # term has a negative qtf while records have already been collected.
     # NOTE(review): in the patched version any error while computing a rank
     # value (e.g. math.log of a non-positive product) ends the function
     # early, leaving the remaining records of this term unprocessed.
     if not quick or (qtf >= 0 or (qtf < 0 and len(recdict) == 0)):
         #Only accept records existing in the hitset received from the search engine
         for (j, tf) in invidx.iteritems():
-            if hitset.contains(j):
-                recdict[j] = recdict.get(j, 0) + int(math.log(tf[0] * Gi * tf[1] * qtf))
-                rec_termcount[j] = rec_termcount.get(j, 0) + 1
-        #Multiply with the number of terms of the total number of terms in the query existing in the records 
+            if hitset.contains(j):#only include docs found by search_engine based on query
+                try: #calculates rank value
+                    recdict[j] = recdict.get(j, 0) + int(math.log(tf[0] * Gi * tf[1] * qtf))
+                except:
+                    return (recdict, rec_termcount)
+                rec_termcount[j] = rec_termcount.get(j, 0) + 1 #number of terms from query in document
     elif quick: #much used term, do not include all records, only use already existing ones
-        for (j, tf) in recdict.iteritems():
+        for (j, tf) in recdict.iteritems(): #i.e: if doc contains important term, also count unimportant
             if invidx.has_key(j):
                 tf = invidx[j]
                 recdict[j] = recdict.get(j, 0) + int(math.log(tf[0] * Gi * tf[1] * qtf))
-                rec_termcount[j] = rec_termcount.get(j, 0) + 1
-
+                rec_termcount[j] = rec_termcount.get(j, 0) + 1 #number of terms from query in document
+    
     return (recdict, rec_termcount)
 
 def calculate_record_relevance_findsimilar(term, invidx, hitset, recdict, rec_termcount, verbose, quick=None):
     """Calculating the relevance of the documents based on the input, calculates only one word
     term - (term, query term factor) the term and its importance in the overall search
     invidx - {recid: tf, Gi: norm value} The Gi value is used as a idf value
     hitset - a hitset with records that are allowed to be ranked
     recdict - contains currently ranked records, is returned with new values
     rec_termcount - {recid: count} the number of terms in this record that matches the query
     verbose - verbose value
     quick - if quick=yes only terms with a positive qtf is used, to limit the number of records to sort"""
 
     
     (t, qtf) = term
-    if invidx.has_key("Gi"):
+    if invidx.has_key("Gi"): #Gi = weight for this term, created by bibrank_word_indexer
         Gi = invidx["Gi"][1]
         del invidx["Gi"]
-    else:
+    else: #no Gi entry: nothing is ranked for this term, bibrank should be run with -R
         return (recdict, rec_termcount)
 
     if not quick or (qtf >= 0 or (qtf < 0 and len(recdict) == 0)):
         #Only accept records existing in the hitset received from the search engine
         for (j, tf) in invidx.iteritems():
-            if hitset.contains(j):
+            if hitset.contains(j): #only include docs found by search_engine based on query
+                #rank value: (1 + log(tf[0])) * Gi * tf[1] * qtf, truncated to int and added to the record's score
                 recdict[j] = recdict.get(j, 0) + int((1 + math.log(tf[0])) * Gi * tf[1] * qtf)
-                rec_termcount[j] = rec_termcount.get(j, 0) + 1
-        #Multiply with the number of terms of the total number of terms in the query existing in the records 
+                rec_termcount[j] = rec_termcount.get(j, 0) + 1 #number of terms from query in document
     elif quick: #much used term, do not include all records, only use already existing ones
-        for (j, tf) in recdict.iteritems():
-            if invidx.has_key(j):
+        for (j, tf) in recdict.iteritems(): #i.e. only docs already ranked by an earlier term get this term's contribution
+            if invidx.has_key(j): 
                 tf = invidx[j]
                 recdict[j] = recdict[j] + int((1 + math.log(tf[0])) * Gi * tf[1] * qtf)
-                rec_termcount[j] = rec_termcount.get(j, 0) + 1
+                rec_termcount[j] = rec_termcount.get(j, 0) + 1 #number of terms from query in document
 
     return (recdict, rec_termcount)
 
 def sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose):
     """Sorts the dictionary and returns records with a relevance higher than the given value.
     recdict - {recid: value} unsorted
     rank_limit_relevance - a value > 0 usually
     verbose - verbose value"""
 
     startCreate = time.time()
     global voutput
-
     reclist = []
+
+    #remove all ranked documents from hitset, so the returned hitset holds only unranked ones
     hitset.removemany(recdict.keys()) 
+
+    #scale each score relative to the highest one; NOTE: fails if recdict is empty or the highest score is 0
     divideby = max(recdict.values())
     for (j, w) in recdict.iteritems():
         w = int(w * 100 / divideby)
 	if w >= rank_limit_relevance:
             reclist.append((j, w))
+    
+    #sort ascending by score, so the highest scored records end up last
     reclist.sort(lambda x, y: cmp(x[1], y[1]))
 
     if verbose > 0:
         voutput += "Number of records sorted: %s<br>" % len(reclist)
         voutput += "Sort time: %s<br>" % (str(time.time() - startCreate))
     return (reclist, hitset)
 
 def sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose):
     """Sorts the dictionary and returns records with a relevance higher than the given value.
     recdict - {recid: value} unsorted
     rank_limit_relevance - a value > 0 usually
     verbose - verbose value"""
 
     startCreate = time.time()
     global voutput
     reclist = []
+
     #Multiply with the number of terms of the total number of terms in the query existing in the records
     for j in recdict.keys():
-        hitset.remove(j)
-        if recdict[j] > 0:
+        if recdict[j] > 0 and rec_termcount[j] > 1: #keep a score only for records with a positive value matching more than one term
             recdict[j] = math.log((recdict[j] * rec_termcount[j]))
- 
+        else: #every other record is scored 0
+            recdict[j] = 0
+
+    hitset.removemany(recdict.keys()) #drop every scored record from hitset, the rest is returned as is
+    #scale each score relative to the highest one; NOTE: divides by zero if no record kept a score above 0
     divideby = max(recdict.values())
     for (j, w) in recdict.iteritems():
         w = int(w * 100 / divideby)
 	if w >= rank_limit_relevance:
             reclist.append((j, w))
+
+    #sort ascending by score, so the highest scored records end up last
     reclist.sort(lambda x, y: cmp(x[1], y[1]))
 
     if verbose > 0:
         voutput += "Number of records sorted: %s<br>" % len(reclist)
         voutput += "Sort time: %s<br>" % (str(time.time() - startCreate))
     return (reclist, hitset)
 
 def rank_method_stat(rank_method_code, reclist, lwords):
     """Shows some statistics about the searchresult.
     rank_method_code - name field from rnkMETHOD
     reclist - a list of sorted and ranked records
     lwords - the words in the query"""
 
     global voutput
     if len(reclist) > 20:
 	j = 20
     else:
 	j = len(reclist)
 
     voutput += "<br>Rank statistics:<br>"
     for i in range(1, j + 1):
    	voutput += "%s,Recid:%s,Score:%s<br>" % (i,reclist[len(reclist) - i][0],reclist[len(reclist) - i][1])
         for (term, table) in lwords:
 	    term_recs = run_sql("SELECT hitlist FROM %s WHERE term='%s'" % (table, term))
             if term_recs:
                 term_recs = deserialize_via_marshal(term_recs[0][0])
                 if term_recs.has_key(reclist[len(reclist) - i][0]):
                     voutput += "%s-%s / " % (term, term_recs[reclist[len(reclist) - i][0]])
         voutput += "<br>"
 
     voutput += "<br>Score variation:<br>"
     count = {}
     for i in range(0, len(reclist)):
         count[reclist[i][1]] = count.get(reclist[i][1], 0) + 1
     i = 100
     while i >= 0:
         if count.has_key(i):
             voutput += "%s-%s<br>" % (i, count[i])
         i -= 1
 
 try:
     import psyco
     psyco.bind(find_similar) 
     psyco.bind(rank_by_method)
     psyco.bind(calculate_record_relevance)
     psyco.bind(post_calculate_record_relevance)
     psyco.bind(word_similarity)
     psyco.bind(sort_record_relevance)
     psyco.bind(serialize_via_numeric_array)
     psyco.bind(serialize_via_marshal)
     psyco.bind(deserialize_via_numeric_array)
     psyco.bind(deserialize_via_marshal)
 except StandardError, e:
     pass
 
diff --git a/modules/bibrank/lib/bibrank_record_sorter.py.wml b/modules/bibrank/lib/bibrank_record_sorter.py.wml
index 764620e05..c2f4253d0 100644
--- a/modules/bibrank/lib/bibrank_record_sorter.py.wml
+++ b/modules/bibrank/lib/bibrank_record_sorter.py.wml
@@ -1,736 +1,746 @@
 ##Ranking of records using different parameters and methods on the fly.
 
 ## This file is part of the CERN Document Server Software (CDSware).
 ## Copyright (C) 2002 CERN.
 ##
 ## The CDSware is free software; you can redistribute it and/or
 ## modify it under the terms of the GNU General Public License as
 ## published by the Free Software Foundation; either version 2 of the
 ## License, or (at your option) any later version.
 ##
 ## The CDSware is distributed in the hope that it will be useful, but
 ## WITHOUT ANY WARRANTY; without even the implied warranty of
 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ## General Public License for more details.
 ##
 ## You should have received a copy of the GNU General Public License
 ## along with CDSware; if not, write to the Free Software Foundation, Inc.,
 ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 
 ## read config variables:
 #include "config.wml"
 #include "configbis.wml"
 #include "cdswmllib.wml"
 
 ## start Python:
 <protect>#!</protect><PYTHON>
 <protect># -*- coding: utf-8 -*-</protect>
 <protect>## $Id$</protect>
 <protect>## DO NOT EDIT THIS FILE!  IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
 ## fill config variables:
 pylibdir = "<LIBDIR>/python"
 
 try:
     import sys
     import zlib
     import marshal
     import string
     import time
     import math
     import MySQLdb
     import Numeric
     import re
     import ConfigParser
     import traceback
     import copy
 except ImportError, e:
     pass
 
 try:
     sys.path.append('%s' % pylibdir)
     from cdsware.config import *
     from cdsware.dbquery import run_sql
     from cdsware.bibindex_engine_stemmer import stem_by_lang, lang_available
     from cdsware.bibindex_engine_stopwords import is_stopword_force
     from cdsware.search_engine_config import cfg_max_recID
 except ImportError, e:
     pass
 
 class HitSet:
     """Class describing set of records, implemented as bit vectors of recIDs.
     Using Numeric arrays for speed (1 value = 8 bits), can use later "real"
     bit vectors to save space."""
 
     def __init__(self, init_set=None):
         self._nbhits = -1
         if init_set:
             self._set = init_set
         else:
             self._set = Numeric.zeros(cfg_max_recID+1, Numeric.Int0)
 
     def __repr__(self, join=string.join):
         return "%s(%s)" % (self.__class__.__name__, join(map(repr, self._set), ', '))
 
     def add(self, recID):
         "Adds a record to the set."
         self._set[recID] = 1
 
     def addmany(self, recIDs):
         "Adds several recIDs to the set."
         for recID in recIDs: self._set[recID] = 1
 
     def addlist(self, arr):
         "Adds an array of recIDs to the set."
         Numeric.put(self._set, arr, 1)
 
     def remove(self, recID):
         "Removes a record from the set."
         self._set[recID] = 0
 
     def removemany(self, recIDs):
         "Removes several records from the set."
         for recID in recIDs:
             self.remove(recID)
 
     def intersect(self, other):
         "Does a set intersection with other.  Keep result in self."
         self._set = Numeric.bitwise_and(self._set, other._set)
 
     def union(self, other):
         "Does a set union with other. Keep result in self."
         self._set = Numeric.bitwise_or(self._set, other._set)
 
     def difference(self, other):
         "Does a set difference with other. Keep result in self."
         #self._set = Numeric.bitwise_not(self._set, other._set)
         for recID in Numeric.nonzero(other._set):
             self.remove(recID)
 
     def contains(self, recID):
         "Checks whether the set contains recID."
         return self._set[recID]
 
     __contains__ = contains     # Higher performance member-test for python 2.0 and above
 
     def __getitem__(self, index):
         "Support for the 'for item in set:' protocol."
         return Numeric.nonzero(self._set)[index]
         
     def calculate_nbhits(self):
         "Calculates the number of records set in the hitset."
         self._nbhits = Numeric.sum(self._set.copy().astype(Numeric.Int))
 
     def items(self):
         "Return an array containing all recID."
         return Numeric.nonzero(self._set)
 
     def tolist(self):
         "Return an array containing all recID."
         return Numeric.nonzero(self._set).tolist()
 
 def compare_on_val(first, second):
     return cmp(second[1], first[1])
 def serialize_via_numeric_array_dumps(arr):
     return Numeric.dumps(arr)
 def serialize_via_numeric_array_compr(str):
     return zlib.compress(str)
 def serialize_via_numeric_array_escape(str):
     return MySQLdb.escape_string(str)
 def serialize_via_numeric_array(arr):
     """Serialize Numeric array into a compressed string."""
     return serialize_via_numeric_array_escape(serialize_via_numeric_array_compr(serialize_via_numeric_array_dumps(arr)))
 def deserialize_via_numeric_array(string):
     """Decompress and deserialize string into a Numeric array."""
     return Numeric.loads(zlib.decompress(string))
 def serialize_via_marshal(obj):
     """Serialize Python object via marshal into a compressed string."""
     return MySQLdb.escape_string(zlib.compress(marshal.dumps(obj)))
 def deserialize_via_marshal(string):
     """Decompress and deserialize string into a Python object via marshal."""
     return marshal.loads(zlib.decompress(string))
 
 def adderrorbox(header='', datalist=[]):
     """used to create table around main data on a page, row based"""
 
     try: 
         perc = str(100 // len(datalist)) + '%'
     except ZeroDivisionError: 
         perc = 1
 
     output  = '<table class="errorbox">'
     output += '<thead><tr><th class="errorboxheader" colspan="%s">%s</th></tr></thead>' % (len(datalist), header)
     output += '<tbody>'
     for row in [datalist]:
         output += '<tr>'
         for data in row:
             output += '<td style="vertical-align: top; margin-top: 5px; width: %s;">' % (perc, )
             output += data
             output += '</td>'
         output += '</tr>'
     output += '</tbody></table>'
     return output
 
 def check_term(term, col_size, term_rec, max_occ, min_occ, termlength):
     """Check if the term is valid for use
     term - the term to check
     col_size - the number of records in database
     term_rec - the number of records which contains this term
     max_occ - max frequency of the term allowed
     min_occ - min frequence of the term allowed
     termlength - the minimum length of the terms allowed"""
 
     try:
         if is_stopword_force(term) or (len(term) <= termlength) or ((float(term_rec) / float(col_size)) >= max_occ) or ((float(term_rec) / float(col_size)) <= min_occ):
 	    return ""
         if int(term):
             return ""
     except StandardError, e:
 	pass
     return "true"
 
 def create_rnkmethod_cache():
     """Create cache with vital information for each rank method."""
 
     global methods    
     bibrank_meths = run_sql("SELECT name from rnkMETHOD")
     methods = {}
     global voutput
     voutput = ""
 
     for (rank_method_code,) in bibrank_meths:
         try:
             file = etcdir + "/bibrank/" + rank_method_code + ".cfg"
             config = ConfigParser.ConfigParser()
             config.readfp(open(file))
         except StandardError, e:
             pass
 
         cfg_function = config.get("rank_method", "function")
         if config.has_section(cfg_function):
             methods[rank_method_code] = {}
             methods[rank_method_code]["function"] = cfg_function
             methods[rank_method_code]["prefix"] = config.get(cfg_function, "relevance_number_output_prologue")
             methods[rank_method_code]["postfix"] = config.get(cfg_function, "relevance_number_output_epilogue")
             methods[rank_method_code]["chars_alphanumericseparators"] = r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]"
         else:
             raise Exception("Error in configuration file: %s" % (etcdir + "/bibrank/" + rank_method_code + ".cfg"))
 
         i8n_names = run_sql("SELECT ln,value from rnkMETHODNAME,rnkMETHOD where id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name='%s'" % (rank_method_code))
         for (ln, value) in i8n_names:
             methods[rank_method_code][ln] = value
 
         if config.has_option(cfg_function, "table"):
             methods[rank_method_code]["rnkWORD_table"] = config.get(cfg_function, "table")
             methods[rank_method_code]["col_size"] = run_sql("SELECT count(*) FROM %sR" % methods[rank_method_code]["rnkWORD_table"][:-1])[0][0]
 
         if config.has_option(cfg_function, "stemming") and config.get(cfg_function, "stemming"):
             try:
                 methods[rank_method_code]["stemmer"] = config.get(cfg_function, "stemming")
             except Exception,e:
                 pass
 
         if config.has_option(cfg_function, "stopword"):
             methods[rank_method_code]["stopwords"] = config.get(cfg_function, "stopword")
 
         if config.has_section("find_similar"):
             methods[rank_method_code]["max_word_occurence"] = float(config.get("find_similar", "max_word_occurence"))
             methods[rank_method_code]["min_word_occurence"] = float(config.get("find_similar", "min_word_occurence"))
             methods[rank_method_code]["min_word_length"] = int(config.get("find_similar", "min_word_length"))
             methods[rank_method_code]["min_nr_words_docs"] = int(config.get("find_similar", "min_nr_words_docs"))
             methods[rank_method_code]["max_nr_words_upper"] = int(config.get("find_similar", "max_nr_words_upper"))
             methods[rank_method_code]["max_nr_words_lower"] = int(config.get("find_similar", "max_nr_words_lower"))
-            methods[rank_method_code]["override_default_min_relevance"] = config.get("find_similar", "override_default_min_relevance")
             methods[rank_method_code]["default_min_relevance"] = int(config.get("find_similar", "default_min_relevance"))
             
         if config.has_section("combine_method"):
             i = 1
             methods[rank_method_code]["combine_method"] = []
             while config.has_option("combine_method", "method%s" % i):
                 methods[rank_method_code]["combine_method"].append(string.split(config.get("combine_method", "method%s" % i), ","))
                 i += 1
 
 def is_method_valid(colID, rank_method_code):
     """Checks if a method is valid for the collection given"""
     
     enabled_colls = dict(run_sql("SELECT id_collection, score from collection_rnkMETHOD,rnkMETHOD WHERE id_rnkMETHOD=rnkMETHOD.id AND name='%s'" % rank_method_code))
 
     colID = int(colID)
     if enabled_colls.has_key(colID):
         return 1
     else:
         while colID:
             colID = run_sql("SELECT id_dad FROM collection_collection WHERE id_son=%s" % colID)
             if colID and enabled_colls.has_key(colID[0][0]):
                 return 1
             elif colID:
                 colID = colID[0][0]
     return 0
 
 def get_bibrank_methods(collection, ln=cdslang):
     """Returns a list of rank methods and the name om them in the language defined by the ln parameter, if collection is given, only methods enabled for that collection is returned."""
 
     if not globals().has_key('methods'):
         create_rnkmethod_cache()
 
     avail_methods = []
     for (rank_method_code, options) in methods.iteritems():
         if options.has_key("function") and is_method_valid(collection, rank_method_code):
             if options.has_key(ln):
                 avail_methods.append((rank_method_code, options[ln]))
             elif options.has_key(cdslang):
                 avail_methods.append((rank_method_code, options[cdslang]))
             else:
                 avail_methods.append((rank_method_code, rank_method_code))              
     return avail_methods
 
 def rank_records(rank_method_code, rank_limit_relevance, hitset_global, pattern=[], verbose=0):
     """rank_method_code, e.g. `jif' or `sbr' (word frequency vector model)                    
        rank_limit_relevance, e.g. `23' for `nbc' (number of citations) or `0.10' for `vec'                   
        hitset, search engine hits;                   
        pattern, search engine query or record ID (you check the type)                   
        verbose, verbose level
        output:
        list of records
        list of rank values
        prefix
        postfix
        verbose_output"""
 
     global voutput
     voutput = ""
     configcreated = ""
 
     try:
         hitset = copy.deepcopy(hitset_global) #we are receiving a global hitset
         if not globals().has_key('methods'):
             create_rnkmethod_cache()
 
         function = methods[rank_method_code]["function"]
         func_object = globals().get(function)
         if func_object and pattern and pattern[0][0:6] == "recid:" and function == "word_similarity":
             result = find_similar(rank_method_code, pattern[0][6:], hitset, rank_limit_relevance, verbose)
         elif func_object:
             result = func_object(rank_method_code, pattern, hitset, rank_limit_relevance, verbose)
         else:
             result = rank_by_method(rank_method_code, pattern, hitset, rank_limit_relevance, verbose)
     except Exception, e:
         result = (None, "", adderrorbox("An error occured when trying to rank the search result", ["Unexpected error: %s<br><b>Traceback:</b>%s" % (e, traceback.format_tb(sys.exc_info()[2]))]), voutput)
 
-    if result[0] and result[1]:
+    if result[0] and result[1]: #split the (recID, relevance) pairs into two parallel lists for search_engine
         results_similar_recIDs = map(lambda x: x[0], result[0])
         results_similar_relevances = map(lambda x: x[1], result[0])
         result = (results_similar_recIDs, results_similar_relevances, result[1], result[2], "%s" % configcreated + result[3])
     else:
         result = (None, None, result[1], result[2], result[3])
 
     if verbose > 0:
         print string.replace(voutput, "<br>", "\n")
 
     return result
 
 def combine_method(rank_method_code, pattern, hitset, rank_limit_relevance,verbose):
-    """combining several methods"""
+    """Combines several rank methods into one: a record's position in each method's result list is weighted by that method's percentage from the config file"""
 
     global voutput
     result = {}
     try:
         for (method, percent) in methods[rank_method_code]["combine_method"]:
             function = methods[method]["function"]
             func_object = globals().get(function)
             percent = int(percent)
 
             if func_object:
                 this_result = func_object(method, pattern, hitset, rank_limit_relevance, verbose)[0]
             else:
                 this_result = rank_by_method(method, pattern, hitset, rank_limit_relevance, verbose)[0]
 
             for i in range(0, len(this_result)): 
                 (recID, value) = this_result[i]  
                 if value > 0:
                     result[recID] = result.get(recID, 0) + int((float(i) / len(this_result)) * float(percent))
 
         result = result.items()
         result.sort(lambda x, y: cmp(x[1], y[1]))
         return (result, "(", ")", voutput)
     except Exception, e:
         return (None, "Warning, method cannot be used for ranking your query.", "", voutput)
         
 def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
     """Ranking of records based on predetermined values.
     input:
     rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from 
     rnkMETHODDATA
     lwords - a list of words from the query
     hitset - a list of hits for the query found by search_engine
     rank_limit_relevance - show only records with a rank value above this
     verbose - verbose value
     output:
     reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]] 
     prefix - what to show before the rank value 
     postfix - what to show after the rank value
     voutput - contains extra information, content dependent on verbose value"""
 
     global voutput
     rnkdict = run_sql("SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name='%s'" % rank_method_code)
 
     if not rnkdict:
         return (None, "Warning, Could not load ranking data for method.", "", voutput)
 
     lwords_hitset = None
-    for j in range(0, len(lwords)):
+    for j in range(0, len(lwords)): #collect recids given as "recid:N" or "recid:N->M" (M excluded) into lwords_hitset..should be done in search_engine...
         if lwords[j] and lwords[j][:6] == "recid:":
             if not lwords_hitset:
                 lwords_hitset = HitSet()
             lword = lwords[j][6:]
             if string.find(lword, "->") > -1:
                 lword = string.split(lword, "->")
                 if int(lword[0]) >= cfg_max_recID + 1 or int(lword[1]) >= cfg_max_recID + 1:        
                     return (None, "Warning, The record range given is out of range.", "", voutput)  
                 for i in range(int(lword[0]), int(lword[1])):
                     lwords_hitset.add(int(i))
             elif lword < cfg_max_recID + 1:
                 lwords_hitset.add(int(lword))
             else:
                 return (None, "Warning, The record range given is out of range.", "", voutput)  
     
     rnkdict = deserialize_via_marshal(rnkdict[0][0])
     if verbose > 0:
         voutput += "<br>Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br>" % rank_method_code
         voutput += "Ranking data loaded, size of structure: %s<br>" % len(rnkdict)
     lrecIDs = hitset.items()
 
     if verbose > 0:
         voutput += "Number of records to rank: %s<br>" % len(lrecIDs)
     reclist = []
     reclist_addend = []
 
-    if not lwords_hitset:
+    if not lwords_hitset: #no recid restriction given: rank every doc in hitset, can this be sped up using something else than a for loop?
         for recID in lrecIDs:
             if rnkdict.has_key(recID):
                 reclist.append((recID, rnkdict[recID]))
                 del rnkdict[recID]
             else:
                 reclist_addend.append((recID, 0))
-    else:
+    else: #rank only docs that are both in the recid restriction and in hitset, can this be sped up using something else than a for loop?
         lwords_lrecIDs = lwords_hitset.items()
         for recID in lwords_lrecIDs:
             if rnkdict.has_key(recID) and hitset.contains(recID):
                 reclist.append((recID, rnkdict[recID]))
                 del rnkdict[recID]
             elif hitset.contains(recID):
                 reclist_addend.append((recID, 0))
         
     if verbose > 0:
         voutput += "Number of records ranked: %s<br>" % len(reclist)
         voutput += "Number of records not ranked: %s<br>" % len(reclist_addend)
 
     reclist.sort(lambda x, y: cmp(x[1], y[1]))
     return (reclist_addend + reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
 
 def find_similar(rank_method_code, recID, hitset, rank_limit_relevance,verbose):
     """Finding terms to use for calculating similarity. Terms are taken from the recid given, returns a list of recids's and relevance,
     input:
     rank_method_code - the code of the method, from the name field in rnkMETHOD
     recID - records to use for find similar
     hitset - a list of hits for the query found by search_engine
     rank_limit_relevance - show only records with a rank value above this
     verbose - verbose value
     output:
     reclist - a list of sorted records: [[23,34], [344,24], [1,01]] 
     prefix - what to show before the rank value 
     postfix - what to show after the rank value
     voutput - contains extra information, content dependent on verbose value"""
 
     startCreate = time.time()
     global voutput
 
     if verbose > 0:
         voutput += "<br>Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br>" % rank_method_code
-    if methods[rank_method_code]["override_default_min_relevance"] == "no":
-        rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]
+    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"] #always use the configured minimum, the value passed in is overridden
 
     try:
         recID = int(recID)
     except Exception,e :
         return (None, "Warning: Error in record number, please check that a number is given.", "", voutput) 
 
     rec_terms = run_sql("SELECT termlist FROM %sR WHERE id_bibrec=%s" % (methods[rank_method_code]["rnkWORD_table"][:-1], recID))
     if not rec_terms:
         return (None, "Warning: Requested record does not seem to exist.", "", voutput) 
     rec_terms = deserialize_via_marshal(rec_terms[0][0])
 
     #Get all documents using terms from the selected documents
     if len(rec_terms) == 0:
         return (None, "Warning: Record spesified has no content indexed for use with this method.", "", voutput)
     else:
         terms = "%s" % rec_terms.keys()
         terms_recs = dict(run_sql("SELECT term, hitlist FROM %s WHERE term IN (%s)" % (methods[rank_method_code]["rnkWORD_table"], terms[1:len(terms) - 1])))
 
     tf_values = {}
     #Calculate all term frequencies
     for (term, tf) in rec_terms.iteritems():
 	if len(term) >= methods[rank_method_code]["min_word_length"] and terms_recs.has_key(term) and tf[1] != 0:
-            tf_values[term] =  int((1 + math.log(tf[0])) * tf[1])
+            tf_values[term] =  int((1 + math.log(tf[0])) * tf[1]) #calculate term weight
     tf_values = tf_values.items()
-    tf_values.sort(lambda x, y: cmp(y[1], x[1])) 
+    tf_values.sort(lambda x, y: cmp(y[1], x[1])) #sort based on weight, highest first
 
     lwords = []
     stime = time.time()
     (recdict, rec_termcount) = ({}, {})
 
     for (t, tf) in tf_values: #t=term, tf=term frequency
         term_recs = deserialize_via_marshal(terms_recs[t])
-        if len(tf_values) <= methods[rank_method_code]["max_nr_words_lower"] or (len(term_recs) >= methods[rank_method_code]["min_nr_words_docs"] and (((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) <=  methods[rank_method_code]["max_word_occurence"]) and ((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) >= methods[rank_method_code]["min_word_occurence"]))):
-             lwords.append((t, methods[rank_method_code]["rnkWORD_table"]))
-             (recdict, rec_termcount) = calculate_record_relevance_findsimilar((t, round(tf, 4)) , term_recs, hitset, recdict, rec_termcount, verbose, "true")
+        if len(tf_values) <= methods[rank_method_code]["max_nr_words_lower"] or (len(term_recs) >= methods[rank_method_code]["min_nr_words_docs"] and (((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) <=  methods[rank_method_code]["max_word_occurence"]) and ((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) >= methods[rank_method_code]["min_word_occurence"]))): #use the term if there are few terms overall or its share of records is within the configured bounds; too complicated...something must be done
+             lwords.append((t, methods[rank_method_code]["rnkWORD_table"])) #list of (term, table) pairs used
+             (recdict, rec_termcount) = calculate_record_relevance_findsimilar((t, round(tf, 4)) , term_recs, hitset, recdict, rec_termcount, verbose, "true") #quick="true": a term with negative weight only adds to already ranked records
         if len(tf_values) > methods[rank_method_code]["max_nr_words_lower"] and (len(lwords) ==  methods[rank_method_code]["max_nr_words_upper"] or tf < 0):
             break
 
     if len(recdict) == 0 or len(lwords) == 0:
         return (None, "Could not find any similar documents, possibly because of error in ranking data.", "", voutput)
-    else:
+    else: #sort if we got something to sort
         (reclist, hitset) = sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
 
     if verbose > 0:
         voutput += "<br>Number of terms: %s<br>" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
         voutput += "Number of terms to use for query: %s<br>" % len(lwords)
         voutput += "Terms: %s<br>" % lwords
         voutput += "Current number of recIDs: %s<br>" % (methods[rank_method_code]["col_size"])
         voutput += "Prepare time: %s<br>" % (str(time.time() - startCreate))
         voutput += "Total time used: %s<br>" % (str(time.time() - startCreate))
         rank_method_stat(rank_method_code, reclist, lwords)
 
     return (reclist[:len(reclist)], methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
 
 def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
     """Ranking a records containing specified words and returns a sorted list.
     input:
     rank_method_code - the code of the method, from the name field in rnkMETHOD
     lwords - a list of words from the query
     hitset - a list of hits for the query found by search_engine
     rank_limit_relevance - show only records with a rank value above this
     verbose - verbose value
     output:
     reclist - a list of sorted records: [[23,34], [344,24], [1,01]] 
     prefix - what to show before the rank value 
     postfix - what to show after the rank value
     voutput - contains extra information, content dependent on verbose value"""
 
     global voutput
     startCreate = time.time()
 
     if verbose > 0:
         voutput += "<br>Running rank method: %s, using word_frequency function in bibrank_record_sorter<br>" % rank_method_code
 
     lwords_old = lwords
     lwords = []
     #Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
     for i in range(0, len(lwords_old)):
         term = string.lower(lwords_old[i])
         if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword_force(term):
             lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
             terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], ' ', term)))  
             for term in terms: 
                 if methods[rank_method_code].has_key("stemmer"): # stem word
                     term = stem_by_lang(string.replace(term, ' ', ''), methods[rank_method_code]["stemmer"])
                 if lwords_old[i] != term: #add if stemmed word is different than original word
 	            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
 
     (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
     #For each term, if accepted, get a list of the records using the term
     #calculate then relevance for each term before sorting the list of records
     for (term, table) in lwords:
 	term_recs = run_sql("SELECT term, hitlist FROM %s WHERE term='%s'" % (methods[rank_method_code]["rnkWORD_table"], MySQLdb.escape_string(term)))
-        if term_recs:
+        if term_recs: #if term exists in database, use for ranking
 	    term_recs = deserialize_via_marshal(term_recs[0][1])
-            if check_term(term, methods[rank_method_code]["col_size"], len(term_recs), 1.0, 0.00, 0):
-                (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])) , term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
+            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])) , term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
             del term_recs
 
     if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
         return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
-    else: 
+    else: #sort if we got something to sort
         (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
 
     #Add any documents not ranked to the end of the list
     if hitset:
         hitset.calculate_nbhits()
         lrecIDs = hitset.tolist()                                #using 2-3mb 
         reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist      #using 6mb
 
     if verbose > 0:
         voutput += "<br>Current number of recIDs: %s<br>" % (methods[rank_method_code]["col_size"])
         voutput += "Number of terms: %s<br>" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
         voutput += "Terms: %s<br>" % lwords
         voutput += "Prepare and pre calculate time: %s<br>" % (str(time.time() - startCreate))
         voutput += "Total time used: %s<br>" % (str(time.time() - startCreate))
         rank_method_stat(rank_method_code, reclist, lwords)
 
     return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
 
 def calculate_record_relevance(term, invidx, hitset, recdict, rec_termcount, verbose, quick=None):
     """Calculating the relevance of the documents based on the input, calculates only one word
     term - (term, query term factor) the term and its importance in the overall search
     invidx - {recid: tf, Gi: norm value} The Gi value is used as a idf value
     hitset - a hitset with records that are allowed to be ranked
     recdict - contains currently ranked records, is returned with new values
     rec_termcount - {recid: count} the number of terms in this record that matches the query
     verbose - verbose value
     quick - if quick=yes only terms with a positive qtf is used, to limit the number of records to sort"""
 
     
     (t, qtf) = term
-    if invidx.has_key("Gi"):
+    if invidx.has_key("Gi"):#Gi = weight for this term, created by bibrank_word_indexer
         Gi = invidx["Gi"][1]
         del invidx["Gi"]
-    else:
+    else: #if not existing, bibrank should be run with -R
         return (recdict, rec_termcount)
 
     if not quick or (qtf >= 0 or (qtf < 0 and len(recdict) == 0)):
         #Only accept records existing in the hitset received from the search engine
         for (j, tf) in invidx.iteritems():
-            if hitset.contains(j):
-                recdict[j] = recdict.get(j, 0) + int(math.log(tf[0] * Gi * tf[1] * qtf))
-                rec_termcount[j] = rec_termcount.get(j, 0) + 1
-        #Multiply with the number of terms of the total number of terms in the query existing in the records 
+            if hitset.contains(j):#only include docs found by search_engine based on query
+                try: #calculates rank value
+                    recdict[j] = recdict.get(j, 0) + int(math.log(tf[0] * Gi * tf[1] * qtf))
+                except:
+                    return (recdict, rec_termcount)
+                rec_termcount[j] = rec_termcount.get(j, 0) + 1 #number of terms from query in document
     elif quick: #much used term, do not include all records, only use already existing ones
-        for (j, tf) in recdict.iteritems():
+        for (j, tf) in recdict.iteritems(): #i.e: if doc contains important term, also count unimportant
             if invidx.has_key(j):
                 tf = invidx[j]
                 recdict[j] = recdict.get(j, 0) + int(math.log(tf[0] * Gi * tf[1] * qtf))
-                rec_termcount[j] = rec_termcount.get(j, 0) + 1
-
+                rec_termcount[j] = rec_termcount.get(j, 0) + 1 #number of terms from query in document
+    
     return (recdict, rec_termcount)
 
 def calculate_record_relevance_findsimilar(term, invidx, hitset, recdict, rec_termcount, verbose, quick=None):
     """Calculating the relevance of the documents based on the input, calculates only one word
     term - (term, query term factor) the term and its importance in the overall search
     invidx - {recid: tf, Gi: norm value} The Gi value is used as a idf value
     hitset - a hitset with records that are allowed to be ranked
     recdict - contains currently ranked records, is returned with new values
     rec_termcount - {recid: count} the number of terms in this record that matches the query
     verbose - verbose value
     quick - if quick=yes only terms with a positive qtf is used, to limit the number of records to sort"""
 
     
     (t, qtf) = term
-    if invidx.has_key("Gi"):
+    if invidx.has_key("Gi"): #Gi = weight for this term, created by bibrank_word_indexer
         Gi = invidx["Gi"][1]
         del invidx["Gi"]
-    else:
+    else: #if not existing, bibrank should be run with -R
         return (recdict, rec_termcount)
 
     if not quick or (qtf >= 0 or (qtf < 0 and len(recdict) == 0)):
         #Only accept records existing in the hitset received from the search engine
         for (j, tf) in invidx.iteritems():
-            if hitset.contains(j):
+            if hitset.contains(j): #only include docs found by search_engine based on query
+                #calculate rank value
                 recdict[j] = recdict.get(j, 0) + int((1 + math.log(tf[0])) * Gi * tf[1] * qtf)
-                rec_termcount[j] = rec_termcount.get(j, 0) + 1
-        #Multiply with the number of terms of the total number of terms in the query existing in the records 
+                rec_termcount[j] = rec_termcount.get(j, 0) + 1 #number of terms from query in document
     elif quick: #much used term, do not include all records, only use already existing ones
-        for (j, tf) in recdict.iteritems():
-            if invidx.has_key(j):
+        for (j, tf) in recdict.iteritems(): #i.e: if doc contains important term, also count unimportant
+            if invidx.has_key(j): 
                 tf = invidx[j]
                 recdict[j] = recdict[j] + int((1 + math.log(tf[0])) * Gi * tf[1] * qtf)
-                rec_termcount[j] = rec_termcount.get(j, 0) + 1
+                rec_termcount[j] = rec_termcount.get(j, 0) + 1 #number of terms from query in document
 
     return (recdict, rec_termcount)
 
 def sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose):
     """Sorts the dictionary and returns records with a relevance higher than the given value.
     recdict - {recid: value} unsorted
     rank_limit_relevance - a value > 0 usually
     verbose - verbose value"""
 
     startCreate = time.time()
     global voutput
-
     reclist = []
+
+    #remove all ranked documents so that unranked can be added to the end
     hitset.removemany(recdict.keys()) 
+
+    #gives each record a score between 0-100
     divideby = max(recdict.values())
     for (j, w) in recdict.iteritems():
         w = int(w * 100 / divideby)
 	if w >= rank_limit_relevance:
             reclist.append((j, w))
+    
+    #sort scores
     reclist.sort(lambda x, y: cmp(x[1], y[1]))
 
     if verbose > 0:
         voutput += "Number of records sorted: %s<br>" % len(reclist)
         voutput += "Sort time: %s<br>" % (str(time.time() - startCreate))
     return (reclist, hitset)
 
 def sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose):
     """Sorts the dictionary and returns records with a relevance higher than the given value.
     recdict - {recid: value} unsorted
     rank_limit_relevance - a value > 0 usually
     verbose - verbose value"""
 
     startCreate = time.time()
     global voutput
     reclist = []
+
     #Multiply with the number of terms of the total number of terms in the query existing in the records
     for j in recdict.keys():
-        hitset.remove(j)
-        if recdict[j] > 0:
+        if recdict[j] > 0 and rec_termcount[j] > 1:
             recdict[j] = math.log((recdict[j] * rec_termcount[j]))
- 
+        else:
+            recdict[j] = 0
+
+    hitset.removemany(recdict.keys())
+    #gives each record a score between 0-100
     divideby = max(recdict.values())
     for (j, w) in recdict.iteritems():
         w = int(w * 100 / divideby)
 	if w >= rank_limit_relevance:
             reclist.append((j, w))
+
+    #sort scores
     reclist.sort(lambda x, y: cmp(x[1], y[1]))
 
     if verbose > 0:
         voutput += "Number of records sorted: %s<br>" % len(reclist)
         voutput += "Sort time: %s<br>" % (str(time.time() - startCreate))
     return (reclist, hitset)
 
 def rank_method_stat(rank_method_code, reclist, lwords):
     """Shows some statistics about the searchresult.
     rank_method_code - name field from rnkMETHOD
     reclist - a list of sorted and ranked records
     lwords - the words in the query"""
 
     global voutput
     if len(reclist) > 20:
 	j = 20
     else:
 	j = len(reclist)
 
     voutput += "<br>Rank statistics:<br>"
     for i in range(1, j + 1):
    	voutput += "%s,Recid:%s,Score:%s<br>" % (i,reclist[len(reclist) - i][0],reclist[len(reclist) - i][1])
         for (term, table) in lwords:
 	    term_recs = run_sql("SELECT hitlist FROM %s WHERE term='%s'" % (table, term))
             if term_recs:
                 term_recs = deserialize_via_marshal(term_recs[0][0])
                 if term_recs.has_key(reclist[len(reclist) - i][0]):
                     voutput += "%s-%s / " % (term, term_recs[reclist[len(reclist) - i][0]])
         voutput += "<br>"
 
     voutput += "<br>Score variation:<br>"
     count = {}
     for i in range(0, len(reclist)):
         count[reclist[i][1]] = count.get(reclist[i][1], 0) + 1
     i = 100
     while i >= 0:
         if count.has_key(i):
             voutput += "%s-%s<br>" % (i, count[i])
         i -= 1
 
 try:
     import psyco
     psyco.bind(find_similar) 
     psyco.bind(rank_by_method)
     psyco.bind(calculate_record_relevance)
     psyco.bind(post_calculate_record_relevance)
     psyco.bind(word_similarity)
     psyco.bind(sort_record_relevance)
     psyco.bind(serialize_via_numeric_array)
     psyco.bind(serialize_via_marshal)
     psyco.bind(deserialize_via_numeric_array)
     psyco.bind(deserialize_via_marshal)
 except StandardError, e:
     pass
 
diff --git a/modules/bibrank/lib/bibrank_word_indexer.py b/modules/bibrank/lib/bibrank_word_indexer.py
index 89705309f..2100fd4d9 100644
--- a/modules/bibrank/lib/bibrank_word_indexer.py
+++ b/modules/bibrank/lib/bibrank_word_indexer.py
@@ -1,1472 +1,1472 @@
  # $Id$
 ## BibRank word frequency indexer utility.
 
 ## This file is part of the CERN Document Server Software (CDSware).
 ## Copyright (C) 2002 CERN.
 ##
 ## The CDSware is free software; you can redistribute it and/or
 ## modify it under the terms of the GNU General Public License as
 ## published by the Free Software Foundation; either version 2 of the
 ## License, or (at your option) any later version.
 ##
 ## The CDSware is distributed in the hope that it will be useful, but
 ## WITHOUT ANY WARRANTY; without even the implied warranty of
 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ## General Public License for more details.  
 ##
 ## You should have received a copy of the GNU General Public License
 ## along with CDSware; if not, write to the Free Software Foundation, Inc.,
 ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 
 ## read config variables:
 #include "config.wml"
 #include "configbis.wml"
 #include "cdswmllib.wml"
 
 ## start Python:
 <protect>#!</protect><PYTHON>
 <protect># -*- coding: utf-8 -*-</protect>
 <protect>## $Id$</protect>
 <protect>## DO NOT EDIT THIS FILE!  IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
 
 __version__ = "<: print generate_pretty_version_string('$Id$'); :>"
 
 ## fill config variables:
 pylibdir = "<LIBDIR>/python"
 
 ## programs used to convert fulltext files to text:
 conv_programs = {#"ps": ["<PSTOTEXT>","<PSTOASCII>"],  # switched off at the moment, since PDF is faster
                  #"ps.gz": ["<PSTOTEXT>","<PSTOASCII>"],
                  "pdf": ["<PDFTOTEXT>","<PSTOTEXT>","<PSTOASCII>"],
                  "doc": ["<ANTIWORD>","<CATDOC>","<WVTEXT>"],
                  "ppt": ["<PPTHTML>"],
                  "xls": ["<XLHTML>"]
                  }
 ## helper programs used if the above programs convert only to html or other intermediate file formats:
 conv_programs_helpers =  {"html": "<HTMLTOTEXT>",
                           "gz": "<GZIP>" }
 
 ## okay, rest of the Python code goes below
 #######
 
 ## import interesting modules:
 try:
     from zlib import compress,decompress
     from string import split,translate,lower,upper
     import marshal
     import getopt
     import getpass
     import string
     import os
     import sre
     import sys
     import time
     import MySQLdb
     import Numeric
     import urllib
     import signal
     import tempfile
     import unicodedata
     import traceback
     import cStringIO
     import math
     import re
     import ConfigParser
 except ImportError, e:
     import sys
 
 try:
     sys.path.append('%s' % pylibdir)
     from cdsware.config import *
     from cdsware.search_engine_config import cfg_max_recID
     from cdsware.search_engine import perform_request_search, strip_accents, HitSet
     from cdsware.dbquery import run_sql
     from cdsware.bibindex_engine_stemmer import stem_by_lang, lang_available
     from cdsware.bibindex_engine_stopwords import is_stopword_force
 except ImportError, e:
     import sys
 
 ## safety parameters concerning MySQL thread-multiplication problem:
 cfg_check_mysql_threads = 0 # to check or not to check the problem? 
 cfg_max_mysql_threads = 50 # how many threads (connections) we consider as still safe
 cfg_mysql_thread_timeout = 20 # we'll kill threads that were sleeping for more than X seconds
 
 ## override urllib's default password-asking behaviour:
 class MyFancyURLopener(urllib.FancyURLopener):
     def prompt_user_passwd(self, host, realm):
         # supply some dummy credentials by default
         return ("mysuperuser", "mysuperpass")
     def http_error_401(self, url, fp, errcode, errmsg, headers):
         # do not bother with protected pages
         raise IOError, (999, 'unauthorized access')  
         return None
     
 #urllib._urlopener = MyFancyURLopener()
 
 ## precompile some often-used regexp for speed reasons:
 re_subfields = sre.compile('\$\$\w');
 
 nb_char_in_line = 50  # for verbose pretty printing
 chunksize = 1000 # default size of chunks that the records will be treated by
 wordTables = []
 base_process_size = 4500 # process base size
 
 ## Dictionary merging functions
 def dict_union(list1, list2):
     "Returns union of the two dictionaries."
     union_dict = {}
 
     for (e, count) in list1.iteritems():
         union_dict[e] = count
     for (e, count) in list2.iteritems():
         if not union_dict.has_key(e):
             union_dict[e] = count
 	else:
 	    union_dict[e] = (union_dict[e][0] + count[0], count[1])
 
     #for (e, count) in list2.iteritems():
     #    list1[e] = (list1.get(e, (0, 0))[0] + count[0], count[1])
 
     #return list1
     return union_dict
 
 ## safety function for killing slow MySQL threads:
 def kill_sleepy_mysql_threads(max_threads=cfg_max_mysql_threads, thread_timeout=cfg_mysql_thread_timeout):
     """Check the number of MySQL threads and if there are more than
        MAX_THREADS of them, lill all threads that are in a sleeping
        state for more than THREAD_TIMEOUT seconds.  (This is useful
        for working around the the max_connection problem that appears
        during indexation in some not-yet-understood cases.)  If some
        threads are to be killed, write info into the log file.      
     """
     res = run_sql("SHOW FULL PROCESSLIST")
     if len(res) > max_threads:
         for row in res:
             r_id,r_user,r_host,r_db,r_command,r_time,r_state,r_info = row
             if r_command == "Sleep" and int(r_time) > thread_timeout:
                 run_sql("KILL %s", (r_id,))
                 if options["verbose"] >= 1:                
                     write_message("WARNING: too many MySQL threads, killing thread %s" % r_id)
     return
 
 # tagToFunctions mapping. It offers an indirection level necesary for
 # indexing fulltext. The default is get_words_from_phrase
 tagToWordsFunctions = {}
 
 def get_words_from_phrase(phrase, weight, lang="",
                           chars_punctuation=r"[\.\,\:\;\?\!\"]",
                           chars_alphanumericseparators=r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]",
                           split=string.split):
     "Returns list of words from phrase 'phrase'."
     words = {} 
-    #print phrase
     phrase = strip_accents(phrase) 
     phrase = lower(phrase) 
     #Getting rid of strange characters
     phrase = re.sub("&eacute;", 'e', phrase) 
     phrase = re.sub("&egrave;", 'e', phrase) 
     phrase = re.sub("&agrave;", 'a', phrase)
     phrase = re.sub("&nbsp;", ' ', phrase)
     phrase = re.sub("&laquo;", ' ', phrase)
     phrase = re.sub("&raquo;", ' ', phrase)
     phrase = re.sub("&ecirc;", ' ', phrase)
     phrase = re.sub("&amp;", ' ', phrase)
-    
     if string.find(phrase, "</") > -1:
         #Most likely html, remove html code
         phrase = re.sub("(?s)<[^>]*>|&#?\w+;", ' ', phrase)
+    #removes http links
+    phrase = re.sub("(?s)http://[^( )]*", '', phrase)
     phrase = re.sub(chars_punctuation, ' ', phrase)
-    phrase = lower(phrase)
+
     #By doing this like below, characters standing alone, like c a b is not added to the inedx, but when they are together with characters like c++ or c$ they are added.
     for word in split(phrase):    
         if options["remove_stopword"] == "True" and not is_stopword_force(word) and check_term(word, 0):
             if lang and lang !="none" and options["use_stemming"]:
                 word = stem_by_lang(word, lang)
             if not words.has_key(word):
                 words[word] = (0,0)
             words[word] = (words[word][0] + weight, 0)
         elif options["remove_stopword"] == "True" and not is_stopword_force(word):  
             phrase = re.sub(chars_alphanumericseparators, ' ', word) 
             for word_ in split(phrase):   
                 if lang and lang !="none" and options["use_stemming"]:
                     word_ = stem_by_lang(word_, lang)
                 if word_:
                     if not words.has_key(word_):
                         words[word_] = (0,0)
                     words[word_] = (words[word_][0] + weight, 0)
     return words
 
 def split_ranges(parse_string):
     recIDs = []
     ranges = string.split(parse_string, ",")
     for range in ranges:
         tmp_recIDs = string.split(range, "-")
         
         if len(tmp_recIDs)==1:
             recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[0])])
         else:
             if int(tmp_recIDs[0]) > int(tmp_recIDs[1]): # sanity check
                 tmp = tmp_recIDs[0]
                 tmp_recIDs[0] = tmp_recIDs[1]
                 tmp_recIDs[1] = tmp
             recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[1])])
     return recIDs
 
 def get_date_range(var):
     "Returns the two dates contained as a low,high tuple"
     limits = string.split(var, ",")
     if len(limits)==1:
         low = get_date(limits[0])
         return low,None
     if len(limits)==2:
         low = get_date(limits[0])
         high = get_date(limits[1])
         return low,high
 
 def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
     """Returns a date string according to the format string.
        It can handle normal date strings and shifts with respect
        to now."""
     date = time.time()
     shift_re=sre.compile("([-\+]{0,1})([\d]+)([dhms])")
     factors = {"d":24*3600, "h":3600, "m":60, "s":1}
     m = shift_re.match(var)
     if m:
         sign = m.groups()[0] == "-" and -1 or 1
         factor = factors[m.groups()[2]]
         value = float(m.groups()[1])
         date = time.localtime(date + sign * factor * value)
         date = time.strftime(format_string, date)
     else:
         date = time.strptime(var, format_string)
         date = time.strftime(format_string, date)        
     return date
 
 def create_range_list(res):
     """Creates a range list from a recID select query result contained
     in res. The result is expected to have ascending numerical order."""
     if not res:
         return []
     row = res[0]
     if not row:
         return []
     else:
         range_list = [[row[0],row[0]]]
     for row in res[1:]:
         id = row[0]
         if id == range_list[-1][1] + 1:
             range_list[-1][1] = id
         else:
             range_list.append([id,id])
     return range_list
         
 def beautify_range_list(range_list):
     """Returns a non overlapping, maximal range list"""
     ret_list = []
     for new in range_list:
         found = 0
         for old in ret_list:
             if new[0] <= old[0] <= new[1] + 1 or new[0] - 1 <= old[1] <= new[1]:
                 old[0] = min(old[0], new[0])
                 old[1] = max(old[1], new[1])
                 found = 1
                 break
 
         if not found:
             ret_list.append(new)
                 
     return ret_list
 
 def serialize_via_numeric_array_dumps(arr):
     return Numeric.dumps(arr)
 
 def serialize_via_numeric_array_compr(str):
     return compress(str)
 
 def serialize_via_numeric_array(arr):
     """Serialize Numeric array into a compressed string."""
     return serialize_via_numeric_array_compr(serialize_via_numeric_array_dumps(arr))
 
 def deserialize_via_numeric_array(string):
     """Decompress and deserialize string into a Numeric array."""
     return Numeric.loads(decompress(string))
 
 def serialize_via_marshal(obj):
     """Serialize Python object via marshal into a compressed string."""
     return MySQLdb.escape_string(compress(marshal.dumps(obj)))
 
 def deserialize_via_marshal(string):
     """Decompress and deserialize string into a Python object via marshal."""
     return marshal.loads(decompress(string))
 
 class WordTable:
     "A class to hold the words table."
 
     def __init__(self, tablename, fields_to_index, separators="[^\s]"):
         "Creates words table instance."
         self.tablename = tablename
         self.recIDs_in_mem = []
         self.fields_to_index = fields_to_index
         self.separators = separators
         self.value = {}
 
     def get_field(self, recID, tag):
         """Returns list of values of the MARC-21 'tag' fields for the
            record 'recID'."""
 
         out = []
         bibXXx = "bib" + tag[0] + tag[1] + "x"
         bibrec_bibXXx = "bibrec_" + bibXXx
         query = """SELECT value FROM %s AS b, %s AS bb
                 WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id
                 AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID, tag);
         res = run_sql(query)
         for row in res:
             out.append(row[0])
         return out
 
     def clean(self):
         "Cleans the words table."
         self.value={}
 
     def put_into_db(self, mode="normal", split=string.split):
         """Updates the current words table in the corresponding MySQL's
            rnkWORD table.  Mode 'normal' means normal execution,
            mode 'emergency' means words index reverting to old state.
            """
         if options["verbose"]:
             write_message("%s %s wordtable flush started" % (self.tablename,mode))
             write_message('...updating %d words into %sR started' % \
                 (len(self.value), self.tablename[:-1]))
         task_update_progress("%s flushed %d/%d words" % (self.tablename, 0, len(self.value)))
             
         self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem)
 
         if mode == "normal":
             for group in self.recIDs_in_mem:
                 query = """UPDATE %sR SET type='TEMPORARY' WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='CURRENT'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
 
         nb_words_total = len(self.value)
         nb_words_report = int(nb_words_total/10)
         nb_words_done = 0
         for word in self.value.keys():
 	    self.put_word_into_db(word, self.value[word])
             nb_words_done += 1
             if nb_words_report!=0 and ((nb_words_done % nb_words_report) == 0):
                 if options["verbose"]:
                     write_message('......processed %d/%d words' % (nb_words_done, nb_words_total))
                 task_update_progress("%s flushed %d/%d words" % (self.tablename, nb_words_done, nb_words_total))
         if options["verbose"] >= 9:
             write_message('...updating %d words into %s ended' % \
                 (nb_words_total, self.tablename))
                 
         #if options["verbose"]:
         #    write_message('...updating reverse table %sR started' % self.tablename[:-1])
         if mode == "normal":
             for group in self.recIDs_in_mem:
                 query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
                 query = """DELETE FROM %sR WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
             if options["verbose"] >= 9:
                 write_message('End of updating wordTable into %s' % self.tablename)
         elif mode == "emergency":
 	    write_message("emergency")
             for group in self.recIDs_in_mem:
                 query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
                 query = """DELETE FROM %sR WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
             if options["verbose"] >= 9:
                 write_message('End of emergency flushing wordTable into %s' % self.tablename)
         #if options["verbose"]:
         #    write_message('...updating reverse table %sR ended' % self.tablename[:-1])
 
         self.clean()
         self.recIDs_in_mem = []
         if options["verbose"]:
             write_message("%s %s wordtable flush ended" % (self.tablename, mode))
         task_update_progress("%s flush ended" % (self.tablename))
 
     def load_old_recIDs(self,word):
         """Load existing hitlist for the word from the database index files."""
         query = "SELECT hitlist FROM %s WHERE term=%%s" % self.tablename
         res = run_sql(query, (word,))
         if res:
             return deserialize_via_marshal(res[0][0])
         else:
             return None
     
     def merge_with_old_recIDs(self,word,recIDs, set):
         """Merge the system numbers stored in memory (hash of recIDs with value[0] > 0 or -1
         according to whether to add/delete them) with those stored in the database index
         and received in set universe of recIDs for the given word.
 
         Return 0 in case no change was done to SET, return 1 in case SET was changed.
         """
 
         set_changed_p = 0
         for recID,sign in recIDs.iteritems():
             if sign[0] == -1 and set.has_key(recID):
                 # delete recID if existent in set and if marked as to be deleted
                 del set[recID]
                 set_changed_p = 1
             elif sign[0] > -1 and not set.has_key(recID):
                 # add recID if not existent in set and if marked as to be added
                 set[recID] = sign
                 set_changed_p = 1
 	    elif sign[0] > -1 and sign[0] != set[recID][0]:
                 set[recID] = sign
                 set_changed_p = 1
 
         return set_changed_p
 
     def put_word_into_db(self, word, recIDs, split=string.split):
         """Store the hitlist of a single word in the forward index table
         and drop the word from the in-memory table afterwards.
 
         word   - the term to flush
         recIDs - in-memory record data for the word, handed over to
                  merge_with_old_recIDs when the word is already stored
         split  - not used by this method
         """
         hitlist = self.load_old_recIDs(word)
         if not hitlist:
             # nothing stored for this word yet: its in-memory data becomes the hitlist
             if options["verbose"] >= 9:
                 write_message("......... inserting hitlist for ``%s''" % word)
             hitlist = self.value[word]
             if len(hitlist) > 0:
                 options["modified_words"][word] = 1
                 run_sql("INSERT INTO %s (term, hitlist) VALUES ('%s', '%s')" % (self.tablename, MySQLdb.escape_string(word), serialize_via_marshal(hitlist)))
         else:
             # the word is already stored: fold the in-memory records into it
             options["modified_words"][word] = 1
             changed_p = self.merge_with_old_recIDs(word, recIDs, hitlist)
             if changed_p == 0:
                 if options["verbose"] >= 9:
                     write_message("......... unchanged hitlist for ``%s''" % word)
             else:
                 if options["verbose"] >= 9:
                     write_message("......... updating hitlist for ``%s''" % word)
                 run_sql("UPDATE %s SET hitlist='%s' WHERE term='%s'" % (self.tablename, serialize_via_marshal(hitlist), MySQLdb.escape_string(word)))
         if not hitlist:
             # an empty hitlist is never kept in the table
             run_sql("DELETE from %s WHERE term=%%s" % self.tablename,
                     (word,))
 
         del self.value[word]
             
     def display(self):
         """Write every word held in memory, in sorted order, together
         with its stored value.  Nothing is written unless
         options["verbose"] is set."""
         words = self.value.keys()
         words.sort()
         for word in words:
             if not options["verbose"]:
                 continue
             write_message("%s: %s" % (word, self.value[word]))
 
     def count(self):
         """Return how many words are currently held in self.value."""
         words_in_memory = self.value
         return len(words_in_memory)
 
     def info(self):
         """Report the number of words in the table (only when
         options["verbose"] is set)."""
         if not options["verbose"]:
             return
         write_message("The words table contains %d words." % self.count())
 
     def lookup_words(self, word=""):
         """Report whether a word is present in the in-memory table.
 
         word - the word to look up; when empty, it is read from standard
                input, and the method returns silently if the input is
                closed or interrupted.
 
         The outcome is written only when options["verbose"] is set.
         """
         if not word:
             # ask once for the word
             try:
                 word = raw_input("Enter word: ")
             except (EOFError, KeyboardInterrupt):
                 return
 
         if not self.value.has_key(word):
             if options["verbose"]:
                 write_message("The word '%s' does not exist in the word file."\
                               % word)
             return
 
         if options["verbose"]:
             write_message("The word '%s' is found %d times." \
             % (word, len(self.value[word])))
 
     def update_last_updated(self, rank_method_code, starting_time=None):
         """Update the last_updated column of the rnkMETHOD table for the
         given rank method.
 
         rank_method_code - value matched against rnkMETHOD.name
         starting_time    - time to store; the starting time of the run is
                            used so that, if the task was interrupted,
                            the records will be reindexed next time.
                            When None, nothing is done.
 
         Returns the result of run_sql, or None when starting_time is None.
         """
         if starting_time is None:
             return None
         if options["verbose"] >= 9:
             # the time must be formatted into the message: a second
             # positional argument of write_message is the output stream
             write_message("updating last_updated to %s..." % starting_time)
         return run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s",
                        (starting_time, rank_method_code,))
 
     def add_recIDs(self, recIDs):
         """Fetches records which id in the recIDs range list and adds
         them to the wordTable.  The recIDs range list is of the form:
         [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]].
 
         Every range is processed in chunks.  A chunk never exceeds the
         module-level chunksize nor the number of records still allowed
         before the next flush (options["flush"]).  For each chunk the
         reverse table is checked with chk_recID_range, the words already
         indexed for the chunk are marked via del_recID_range and the
         records are indexed again with add_recID_range.  The in-memory
         words are written with put_into_db whenever options["flush"]
         records have been handled, and once more at the end if something
         is still pending.
 
         If chk_recID_range raises StandardError, the task status is set
         to "ERROR" and the process exits with status 1.
         """
         global chunksize
         flush_count = 0
         records_done = 0
         records_to_go = 0
 
         # total number of record ids covered by the ranges (for progress reports)
         for range in recIDs:
             records_to_go = records_to_go + range[1] - range[0] + 1
             
         time_started = time.time() # will measure profile time
         for range in recIDs:
             i_low = range[0]
             chunksize_count = 0
             while i_low <= range[1]:
                 # calculate chunk group of recIDs and treat it:
                 i_high = min(i_low+options["flush"]-flush_count-1,range[1])
                 i_high = min(i_low+chunksize-chunksize_count-1, i_high)
                 try:
                     self.chk_recID_range(i_low, i_high)
                 except StandardError, e:
                     write_message("Exception caught: %s" % e, sys.stderr)
                     if options["verbose"] >= 9:        
                         traceback.print_tb(sys.exc_info()[2])
                     task_update_status("ERROR")
                     task_sig_stop_commands()
                     sys.exit(1)
                 if options["verbose"]:
                     write_message("%s adding records #%d-#%d started" % \
                         (self.tablename, i_low, i_high))
                 if cfg_check_mysql_threads:
                     kill_sleepy_mysql_threads()
                 task_update_progress("%s adding recs %d-%d" % (self.tablename, i_low, i_high))
                 # first mark the words currently indexed for the chunk, then index it again
 		self.del_recID_range(i_low, i_high)
                 just_processed = self.add_recID_range(i_low, i_high)
                 flush_count = flush_count + i_high - i_low + 1
                 chunksize_count = chunksize_count + i_high - i_low + 1
                 # only records for which words were found count as done
                 records_done = records_done + just_processed
                 if options["verbose"]:
                     write_message("%s adding records #%d-#%d ended  " % \
                         (self.tablename, i_low, i_high))
                 if chunksize_count >= chunksize:
                     chunksize_count = 0
                 # flush if necessary:
                 if flush_count >= options["flush"]:
                     self.put_into_db()
                     self.clean()
                     if options["verbose"]:
                         write_message("%s backing up" % (self.tablename))
                     flush_count = 0
                     self.log_progress(time_started,records_done,records_to_go)
                 # iterate:
                 i_low = i_high + 1
         # write whatever is still held in memory after the last chunk
         if flush_count > 0:
             self.put_into_db()
             self.log_progress(time_started,records_done,records_to_go)
 
     def add_date(self, date=""):
         """Index all records modified within a period of time.
 
         date - a (from, until) pair of modification dates; 'until' may
                be empty.  When date itself is empty, the last_updated
                value of the rank method options["current_run"] is read
                from rnkMETHOD and used as 'from' ("0000-00-00" if it
                is not set).
 
         Returns the list of recID ranges found (it is passed to
         add_recIDs when not empty), or None when the rank method has no
         row in rnkMETHOD.
         """
         # If date is not set, then retrieve it from the database.
         # Reindex all formats newer than the modification date
         if not date:
             write_message("Using the last update time for the rank method")
             query = """SELECT last_updated FROM rnkMETHOD WHERE name='%s'
             """ % options["current_run"]
             res = run_sql(query)
 
             if not res:
                 return
             if not res[0][0]:
                 date = ("0000-00-00",'')
             else:
                 date = (res[0][0],'')
 
         query = """SELECT b.id FROM bibrec AS b WHERE b.modification_date >=
         '%s'""" % date[0]
         # every appended clause starts with a blank so that it cannot
         # run into the preceding quoted value
         if date[1]:
             query += " and b.modification_date <= '%s'" % date[1]
         query += " ORDER BY b.id ASC"
         res = run_sql(query)
 
         recID_ranges = create_range_list(res)
         if not recID_ranges:
             if options["verbose"]:
                 write_message( "No new records added. %s is up to date" % self.tablename)
         else:
             self.add_recIDs(recID_ranges)
         return recID_ranges
 
         
     def add_recID_range(self, recID1, recID2):
         """Index the records whose id lies between recID1 and recID2.
 
         For every (tag, weight, lang) entry of self.fields_to_index the
         values of that tag are read from the bibXXx tables and turned
         into words, either by the function registered for the tag in
         tagToWordsFunctions or by get_words_from_phrase.  Only records
         contained in options["validset"] are considered.  Records for
         which "DELETED" is found in field 980__c get an empty word list.
 
         The word lists are written to the reverse table as rows of type
         'FUTURE', rows of type 'CURRENT' are inserted as well (a
         MySQLdb.DatabaseError raised by that second insert is ignored),
         and finally every word is put into the in-memory table.
 
         Returns the number of records for which words were found
         (0 if there were none).
         """
         empty_list_string = serialize_via_marshal([])
         wlist = {}
         # NOTE(review): 'normalize' is never used in this method
         normalize = {}
 
         self.recIDs_in_mem.append([recID1,recID2])
         # secondly fetch all needed tags:
         
         for (tag, weight, lang) in self.fields_to_index:
 	    if tag in tagToWordsFunctions.keys():
                 get_words_function = tagToWordsFunctions[ tag ]
 	    else: get_words_function = get_words_from_phrase
             # the table name is derived from the first two characters of the tag
             bibXXx = "bib" + tag[0] + tag[1] + "x"
             bibrec_bibXXx = "bibrec_" + bibXXx
             query = """SELECT bb.id_bibrec,b.value FROM %s AS b, %s AS bb
                     WHERE bb.id_bibrec BETWEEN %d AND %d
                     AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID1, recID2, tag)    
             res = run_sql(query)
             # NOTE(review): nb_total_to_read and verbose_idx are assigned but not used below
 	    nb_total_to_read = len(res)
             verbose_idx = 0     # for verbose pretty printing       
             for row in res:
 		recID, phrase = row 
                 if options["validset"].contains(recID):
                     if not wlist.has_key(recID): wlist[recID] = {}
                     new_words = get_words_function(phrase, weight, lang) # ,self.separators
                     wlist[recID] = dict_union(new_words,wlist[recID])
 
         # were there some words for these recIDs found?
         if len(wlist) == 0: return 0
         recIDs = wlist.keys()
         for recID in recIDs:
             # was this record marked as deleted?
             if "DELETED" in self.get_field(recID, "980__c"):
                 wlist[recID] = {}
                 if options["verbose"] >= 9:
                     write_message("... record %d was declared deleted, removing its word list" % recID)
             if options["verbose"] >= 9:
                 write_message("... record %d, termlist: %s" % (recID, wlist[recID]))
         # first insert: one multi-row statement storing the new termlists as 'FUTURE' rows
         query_factory = cStringIO.StringIO()
         qwrite = query_factory.write
         qwrite( "INSERT INTO %sR (id_bibrec,termlist,type) VALUES" % self.tablename[:-1])
         qwrite( "('" )
         qwrite( str(recIDs[0]) )
         qwrite( "','" )
         qwrite( serialize_via_marshal(wlist[recIDs[0]]) )
         qwrite( "','FUTURE')" )
         for recID in recIDs[1:]:
             qwrite(",('")
             qwrite(str(recID))
             qwrite("','")
             qwrite(serialize_via_marshal(wlist[recID]))
             qwrite("','FUTURE')")
 
         query = query_factory.getvalue()
         query_factory.close()
         run_sql(query)
 
         # second insert: 'CURRENT' rows for the same records
         # NOTE(review): the first record gets its real termlist here while
         # all the others get an empty list -- confirm this is intended
         query_factory = cStringIO.StringIO()
         qwrite = query_factory.write
         qwrite("INSERT INTO %sR (id_bibrec,termlist,type) VALUES" % self.tablename[:-1])
         qwrite("('")
         qwrite(str(recIDs[0]))
         qwrite("','")
         qwrite(serialize_via_marshal(wlist[recIDs[0]]))
         qwrite("','CURRENT')")
         for recID in recIDs[1:]:
             qwrite( ",('" )
             qwrite( str(recID) )
             qwrite( "','" )
             qwrite( empty_list_string )
             qwrite( "','CURRENT')" )
         query = query_factory.getvalue()
         query_factory.close()
 
         # a database error raised by this insert is deliberately ignored
         # (presumably because 'CURRENT' rows may exist already -- TODO confirm)
         try:
             run_sql(query)
         except MySQLdb.DatabaseError:
             pass
         
         # finally register every word of every record in memory
         put = self.put
         for recID in recIDs:
             for (w, count) in wlist[recID].iteritems():
                 put(recID, w, count)
         
         return len(recIDs)
                 
     def log_progress(self, start, done, todo):
         """Report the indexing speed reached so far and an estimate of
         the remaining run time (only when options["verbose"] is set).
 
         start - time (as given by time.time()) at which the work began
         done  - number of records processed so far
         todo  - total number of records to process
         """
         elapsed = time.time() - start
         # nothing sensible can be computed for these values
         if elapsed == 0:
             return
         if done > todo:
             return
 
         recs_per_min = done/(elapsed/60.0)
         if options["verbose"]:
             write_message("%d records took %.1f seconds to complete.(%1.f recs/min)"\
                 % (done, elapsed, recs_per_min))
 
         if not recs_per_min:
             return
         if options["verbose"]:
             write_message("Estimated runtime: %.1f minutes" % \
                 ((todo-done)/recs_per_min))
 
     def put(self, recID, word, sign):
         """Record in the in-memory table that 'word' occurs in record
         'recID' with the given sign, replacing any sign stored before
         for that pair.
 
         word is lowercased and cut to its first 50 characters.  sign is
         stored as is; callers in this class pass tuples such as (-1, 0)
         as well as values taken from term lists.  A failure is reported
         with write_message instead of being raised.
         """
         try:
             word = lower(word[:50])
             self.value.setdefault(word, {})[recID] = sign
         except Exception:
             # sign is not necessarily a number, hence %s and not %d
             write_message("Error: Cannot put word %s with sign %s for recID %s." % (word, sign, recID))
 
 
     def del_recIDs(self, recIDs):
         """Marks the words of the records which id is in the recIDs
         range list as deleted and writes the result to the database.
         The recIDs range list is of the form:
         [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]].
         """
         for recID_range in recIDs:
             self.del_recID_range(recID_range[0], recID_range[1])
         self.put_into_db()
         
     def del_recID_range(self, low, high):
         """Mark, in the in-memory table, every word stored in the reverse
         table for the records with id between low and high with the
         sign (-1, 0).  The range is also appended to self.recIDs_in_mem."""
         noisy_p = options["verbose"] > 2
         if noisy_p:
             write_message("%s fetching existing words for records #%d-#%d started" % \
                 (self.tablename, low, high))
         self.recIDs_in_mem.append([low,high])
         query = """SELECT id_bibrec,termlist FROM %sR as bb WHERE bb.id_bibrec
         BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high)
         for row in run_sql(query):
             stored_words = deserialize_via_marshal(row[1])
             for stored_word in stored_words:
                 self.put(row[0], stored_word, (-1, 0))
         if options["verbose"] > 2:
             write_message("%s fetching existing words for records #%d-#%d ended" % \
                 (self.tablename, low, high))
 
     def report_on_table_consistency(self):
         """Count the words of the forward table and the records of the
         reverse table (e.g. rnkWORD01R), and look for records having
         rows whose type is not 'CURRENT'.
         Writes a small report and returns the number of such records.
         """
         # number of words:
         nb_words = 0
         res = run_sql("""SELECT COUNT(*) FROM %s""" % (self.tablename), None, 1)
         if res:
             nb_words = res[0][0]
 
         # number of records:
         nb_records = 0
         res = run_sql("""SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR""" % (self.tablename[:-1]), None, 1)
         if res:
             nb_records = res[0][0]
 
         if options["verbose"]:
             write_message("%s contains %d words from %d records" % (self.tablename, nb_words, nb_records))
 
         # records with rows of a type other than CURRENT; a huge number
         # is assumed when the query gives no answer at all
         nb_bad_records = 999999999
         res = run_sql("""SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR WHERE type <> 'CURRENT'""" % (self.tablename[:-1]))
         if res:
             nb_bad_records = res[0][0]
 
         if nb_bad_records:
             write_message("EMERGENCY: %s needs to repair %d of %d records" % \
                 (self.tablename, nb_bad_records, nb_records))
         elif options["verbose"]:
             write_message("%s is in consistent state" % (self.tablename))
 
         return nb_bad_records
 
     def repair(self):
         """Repair the whole table.
 
         Looks up all records of the reverse table that have rows whose
         type is not 'CURRENT', groups their ids into ranges with
         create_range_list and runs fix_recID_range on them chunk by
         chunk (chunks are bounded by the module-level chunksize and by
         options["flush"]).  The in-memory words are written with
         put_into_db("emergency") at every flush and once more at the end.
         Returns without doing anything if no such record is found.
 
         If fix_recID_range raises StandardError, the task status is set
         to "ERROR" and the process exits with status 1.
         """
         # find possible bad states in reverse tables:
         query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR WHERE type <> 'CURRENT'""" % (self.tablename[:-1])
         res = run_sql(query, None, 1)
         if res:
             nb_bad_records = res[0][0]
         else: 
             nb_bad_records = 0
       
         # find number of records:
         # NOTE(review): nb_records is computed but not used below
         query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR""" % (self.tablename[:-1])
         res = run_sql(query)
         if res:
             nb_records = res[0][0]
         else:
             nb_records = 0
 
         if nb_bad_records == 0:
             return
         query = """SELECT id_bibrec FROM %sR WHERE type <> 'CURRENT' ORDER BY id_bibrec""" \
                 % (self.tablename[:-1])
         res = run_sql(query)
         recIDs = create_range_list(res)
 
         flush_count = 0
         records_done = 0
         records_to_go = 0
 
         # total number of record ids covered by the ranges (for progress reports)
         for range in recIDs:
             records_to_go = records_to_go + range[1] - range[0] + 1
 
         time_started = time.time() # will measure profile time
         for range in recIDs:
             i_low = range[0]
             chunksize_count = 0
             while i_low <= range[1]:
                 # calculate chunk group of recIDs and treat it:
                 i_high = min(i_low+options["flush"]-flush_count-1,range[1])
                 i_high = min(i_low+chunksize-chunksize_count-1, i_high)
                 try:
                     self.fix_recID_range(i_low, i_high)
                 except StandardError, e:
                     write_message("Exception caught: %s" % e, sys.stderr)
                     if options["verbose"] >= 9:        
                         traceback.print_tb(sys.exc_info()[2])
                     task_update_status("ERROR")
                     task_sig_stop_commands()
                     sys.exit(1)
 
                 flush_count = flush_count + i_high - i_low + 1
                 chunksize_count = chunksize_count + i_high - i_low + 1
                 records_done = records_done + i_high - i_low + 1
                 if chunksize_count >= chunksize:
                     chunksize_count = 0
                 # flush if necessary:
                 if flush_count >= options["flush"]:
                     self.put_into_db("emergency")
                     self.clean()
                     flush_count = 0
                     self.log_progress(time_started,records_done,records_to_go)
                 # iterate:
                 i_low = i_high + 1
         # write whatever is still held in memory after the last chunk
         if flush_count > 0:
             self.put_into_db("emergency")
             self.log_progress(time_started,records_done,records_to_go)
         write_message("%s inconsistencies repaired." % self.tablename)
 
     def chk_recID_range(self, low, high):
         """Make sure that the reverse table has no row of a type other
         than 'CURRENT' for the records with id between low and high.
         Raises StandardError (after writing EMERGENCY messages) if it
         has; returns nothing otherwise."""
         query = """SELECT COUNT(*) FROM %sR WHERE type <> 'CURRENT'
         AND id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high)
         nb_bad_rows = run_sql(query, None, 1)[0][0]
 
         if not nb_bad_rows==0:
             ## inconsistency detected!
             write_message("EMERGENCY: %s inconsistencies detected..." % self.tablename)
             write_message("""EMERGENCY: Errors found. You should check consistency of the %s - %sR tables.\nRunning 'bibindex --repair' is recommended.""" \
                 % (self.tablename, self.tablename[:-1]))
             raise StandardError
 
         # okay, words table is consistent
         if options["verbose"]:
             write_message("%s for %d-%d is in consistent state"%(self.tablename,low,high))
 
     def fix_recID_range(self, low, high):
         """Try to fix reverse index database consistency (e.g. table rnkWORD01R) in the low,high doc-id range.
 
         Possible states for a recID follow:
         CUR TMP FUT: very bad things have happened: warn!
         CUR TMP    : very bad things have happened: warn!
         CUR     FUT: delete FUT (crash before flushing)
         CUR        : database is ok
             TMP FUT: add TMP to memory and del FUT from memory
                      flush (revert to old state)
             TMP    : very bad things have happened: warn!
                 FUT: very bad things have happened: warn!
 
         Raises StandardError if at least one record of the range could
         not be repaired.
         """
 
         # collect, for every record of the range, the types of its rows
         state = {}
         query = "SELECT id_bibrec,type FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d'"\
                 % (self.tablename[:-1], low, high)
         res = run_sql(query)
         for row in res:
             if not state.has_key(row[0]):
                 state[row[0]]=[]
             state[row[0]].append(row[1])
 
         ok = 1 # will hold info on whether we will be able to repair
         for recID in state.keys():
             if not 'TEMPORARY' in state[recID]:
                 if 'FUTURE' in state[recID]:
                     if 'CURRENT' not in state[recID]:
                         write_message("EMERGENCY: Record %d is in inconsistent state. Can't repair it" % recID)
                         ok = 0
                     else:
                         write_message("EMERGENCY: Inconsistency in record %d detected" % recID)
                         # NOTE(review): this statement has no condition on 'type',
                         # so it removes all rows of the record, not only the
                         # FUTURE one mentioned in the docstring -- confirm
                         query = """DELETE FROM %sR
                         WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID)
                         run_sql(query)
                         write_message("EMERGENCY: Inconsistency in record %d repaired." % recID)
             else:
                 if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]:
                     self.recIDs_in_mem.append([recID,recID])
                     # Get the words file
                     query = """SELECT type,termlist FROM %sR
                     WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID)
                     if options["verbose"] >= 9:
                         write_message(query)
                     res = run_sql(query)
                     for row in res:
                         wlist = deserialize_via_marshal(row[1])
                         if options["verbose"] >= 9:
                             write_message("Words are %s " % wlist)
                         # NOTE(review): 'sign' is computed here but never used;
                         # the put() below stores wlist[word] for TEMPORARY and
                         # FUTURE rows alike -- confirm this is intended
                         if row[0] == 'TEMPORARY':
                             sign = 1
                         else:
                             sign = -1
                         for word in wlist:
                             self.put(recID, word, wlist[word])
 
                 else:
                     write_message("EMERGENCY: %s for %d is in inconsistent state. Couldn't repair it." % (self.tablename, recID))
                     ok = 0
 
         if not ok:
             write_message("""EMERGENCY: Unrepairable errors found. You should check consistency
                 of the %s - %sR tables. Deleting affected records is
                 recommended.""" % (self.tablename, self.tablename[:-1]))
             raise StandardError
                        
 def word_index(row, run):
     """Run the indexing task.  The row argument is the BibSched task
     queue row, containing if, arguments, etc.
     Return 1 in case of success and 0 in case of failure.
     """
 
     ## import optional modules:
     try:
         import psyco
         psyco.bind(get_words_from_phrase)
         psyco.bind(WordTable.merge_with_old_recIDs)
         psyco.bind(serialize_via_numeric_array)
         psyco.bind(serialize_via_marshal)
         psyco.bind(deserialize_via_numeric_array)
         psyco.bind(deserialize_via_marshal)
         psyco.bind(update_rnkWORD)
         psyco.bind(check_rnkWORD)
     except StandardError,e:
         print "Warning: Psyco", e
         pass
 
     global options, task_id, wordTables, languages 
       
     # read from SQL row:
     task_id = row[0]
     task_proc = row[1]
     options = marshal.loads(row[6])
 
     # install signal handlers
     signal.signal(signal.SIGUSR1, task_sig_sleep)
     signal.signal(signal.SIGTERM, task_sig_stop)
     signal.signal(signal.SIGABRT, task_sig_suicide)
     signal.signal(signal.SIGCONT, task_sig_wakeup)
     signal.signal(signal.SIGINT, task_sig_unknown)
     ## go ahead and treat each table:
 
     options["run"] = []
     options["run"].append(run)
     for rank_method_code in options["run"]:
         method_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         write_message("Running rank method: %s" % getName(rank_method_code))
         try:
             file = etcdir + "/bibrank/" + rank_method_code + ".cfg"
             config = ConfigParser.ConfigParser()
             config.readfp(open(file))
         except StandardError, e:
             write_message("Cannot find configurationfile: %s" % file, sys.stderr)
             raise StandardError
 
         options["current_run"] = rank_method_code
         options["modified_words"] = {}
         options["table"] = config.get(config.get("rank_method", "function"), "table")
         options["use_stemming"] = config.get(config.get("rank_method","function"),"stemming")
         options["remove_stopword"] = config.get(config.get("rank_method","function"),"stopword")
         tags = get_tags(config) #get the tags to include
         options["validset"] = get_valid_range(rank_method_code) #get the records from the collections the method is enabled for
         function = config.get("rank_method","function")
         wordTable = WordTable(options["table"], tags)
         wordTable.report_on_table_consistency()
         try:
             if options["cmd"] == "del":
                 if options["id"]:
                     wordTable.del_recIDs(options["id"])
                 elif options["collection"]:
                     l_of_colls = string.split(options["collection"], ",")
                     recIDs = perform_request_search(c=l_of_colls)
                     recIDs_range = []
                     for recID in recIDs:
                         recIDs_range.append([recID,recID])
                     wordTable.del_recIDs(recIDs_range)
                 else:
                     write_message("Missing IDs of records to delete from index %s.", wordTable.tablename,
                                   sys.stderr)
                     raise StandardError
             elif options["cmd"] == "add":
                 if options["id"]:
                     wordTable.add_recIDs(options["id"])
                 elif options["collection"]:
                     l_of_colls = string.split(options["collection"], ",")
                     recIDs = perform_request_search(c=l_of_colls)
                     recIDs_range = []
                     for recID in recIDs:
                         recIDs_range.append([recID,recID])
                     wordTable.add_recIDs(recIDs_range)
                 elif options["last_updated"]:
                     wordTable.add_date("")
                     wordTable.update_last_updated(rank_method_code, method_starting_time)
                 else:
                     wordTable.add_recIDs([[0,cfg_max_recID]])
                     #wordTable.add_date(options["modified"])
                     # only update last_updated if run via automatic mode:
             elif options["cmd"] == "repair":
                 wordTable.repair()
                 check_rnkWORD(options["table"])
             elif options["cmd"] == "check":
                 check_rnkWORD(options["table"])
                 options["modified_words"] = {}
             elif options["cmd"] == "stat":
                 rank_method_code_statistics(options["table"])
             else:
                 write_message("Invalid command found processing %s" % \
                      wordTable.tablename, sys.stderr)
                 raise StandardError
             update_rnkWORD(options["table"], options["modified_words"])
         except StandardError, e:
             write_message("Exception caught: %s" % e, sys.stderr)
             if options["verbose"] >= 9:        
                 traceback.print_tb(sys.exc_info()[2])
             sys.exit(1)
         wordTable.report_on_table_consistency()
     # We are done. State it in the database, close and quit
 
     return 1
        
 def get_tags(config):
     """Read the options tag1, tag2, ... of the rank method's function
     section from the configuration and return them as a list.
 
     Every option is a comma separated list; its second element is
     converted to an integer and its third element is stripped of
     surrounding whitespace (the third element is treated as a language
     when checking the stemmer).  Warnings are written if stemming is
     configured but the stemmer does not work or does not know the
     language.
     """
     tags = []
     function = config.get("rank_method","function")
     warning_shown = 0
 
     i = 1
     while config.has_option(function,"tag%s"% i):
         tag = string.split(config.get(function, "tag%s" % i), ",")
         tag[1] = int(string.strip(tag[1]))
         tag[2] = string.strip(tag[2])
 
         #check if stemmer for language is available
         if config.get(function,"stemming") and stem_by_lang("information", "en") != "inform":
             # warn only once that the stemmer is missing
             if warning_shown == 0:
                 write_message("Warning: PyStemmer not found. Please read INSTALL.")
                 warning_shown = 1
         elif tag[2] and tag[2] != "none" and config.get(function,"stemming") and not lang_available(tag[2]):
             write_message("Warning: Language '%s' not available in PyStemmer." % tag[2])
         tags.append(tag)
         i += 1
 
     return tags
 
 def get_valid_range(rank_method_code):
     """Return the set of records that are valid for the rank method.
 
     The set currently covers every record id from 0 to cfg_max_recID,
     whatever the rank method is: the former restriction to the records
     of the collections the method is enabled for (collection names
     looked up through collection_rnkMETHOD and rnkMETHOD, records found
     with perform_request_search and added to the set) is not applied.
     """
     every_recID = Numeric.ones(cfg_max_recID+1, Numeric.Int0)
     return HitSet(every_recID)
 
 def write_message(msg, stream=sys.stdout):
     """Write msg, preceded by the current local time, to the stream and
     flush it.  The stream has to be sys.stdout or sys.stderr; for any
     other stream a complaint is written to sys.stderr instead."""
     if not (stream == sys.stdout or stream == sys.stderr):
         sys.stderr.write("Unknown stream %s.  [must be sys.stdout or sys.stderr]\n" % stream)
         return
     timestamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
     stream.write(timestamp)
     stream.write("%s\n" % msg)
     stream.flush()
 
 def check_term(term, termlength):
     """Tell whether a term may be used.
 
     Returns False if the term is not longer than termlength, if it
     contains a digit or one of the listed punctuation characters, or if
     it is a non-zero number once '-', '.' and ',' are removed.
     Returns True otherwise, also when one of the checks fails with a
     StandardError.
     """
     try:
         if len(term) <= termlength:
             return False
         forbidden = re.compile(r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]")
         if forbidden.search(term):
             return False
         for separator in ("-", ".", ","):
             term = str.replace(term, separator, "")
         if int(term):
             return False
     except StandardError:
         pass
     return True
 
 def check_rnkWORD(table):
     """Checks for any problems in rnkWORD tables.
 
     table - name of the forward index table; the name of the reverse
             table is derived from it (last character replaced by 'R')
 
     A term is reported when its hitlist in the forward table has no
     "Gi" entry or a "Gi" entry whose second element is 0, and when the
     second element of its value in a termlist of the reverse table is 0.
     The reported terms are stored in options["modified_words"].
     """
     errors = {}
     termslist = run_sql("SELECT term FROM %s" % table)
     N = run_sql("select max(id_bibrec) from %sR" % table[:-1])[0][0]
     if N is None:
         # empty reverse table: nothing to scan
         N = 0
     write_message("Checking integrity of rank values in %s" % table)
     terms = map(lambda x: x[0], termslist)
 
     # forward table, 5000 terms per query
     i = 0
     while i < len(terms):
         # terms are escaped: they are arbitrary strings and may contain quotes
         quoted_terms = ["'%s'" % MySQLdb.escape_string(term) for term in terms[i:i+5000]]
         terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term in (%s)" % (table, ",".join(quoted_terms)))
         for (t, hitlist) in terms_docs:
             term_docs = deserialize_via_marshal(hitlist)
             if not term_docs.has_key("Gi") or term_docs["Gi"][1] == 0:
                 write_message("ERROR: Missing value for term: %s (%s) in %s: %s" % (t, repr(t), table, len(term_docs)))
                 errors[t] = 1
         i += 5000
 
     # reverse table, windows of 5000 record ids
     write_message("Checking integrity of rank values in %sR" % table[:-1])
     i = 0
     while i < N:
         docs_terms = run_sql("SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec>=%s and id_bibrec<=%s" % (table[:-1], i, i+5000))
         for (j, termlist) in docs_terms:
             termlist = deserialize_via_marshal(termlist)
             for (t, tf) in termlist.iteritems():
                 # every term is reported only once
                 if tf[1] == 0 and not errors.has_key(t):
                     errors[t] = 1
                     write_message("ERROR: Gi missing for record %s and term: %s (%s) in %s" % (j,t,repr(t), table))
         i += 5000
 
     if len(errors) == 0:
         write_message("No direct errors found, but nonconsistent data may exist.")
     else:
         write_message("%s errors found during integrity check, repair and rebalancing recommended." % len(errors))
     options["modified_words"] = errors
 
 def rank_method_code_statistics(table):
     """Shows some statistics about this rank method.
 
     table - name of the forward index table to read
 
     Writes up to 20 lines.  Each line shows a hitlist size together
     with the number of terms having that size, the term at that
     position from the top of the "Gi" ordering and the term at that
     position from its bottom.  Fewer lines are written when fewer
     entries are available.
     """
 
     maxID = run_sql("select max(id) from %s" % table)
     # max(id) is NULL for an empty table
     maxID = maxID[0][0] or 0
     terms = {}
     Gi = {}
 
     write_message("Showing statistics of terms in index:")
     write_message("Important: For the 'Least used terms', the number of terms is shown first, and the number of occurences second.")
     write_message("Least used terms---Most important terms---Least important terms")
     i = 0
     # '<=' so that the window holding the row with id == maxID is read
     # as well (each window covers the ids i .. i+9999)
     while i <= maxID:
         terms_docs=run_sql("SELECT term, hitlist FROM %s WHERE id>= %s and id < %s" % (table, i, i + 10000))
         for (t, hitlist) in terms_docs:
             term_docs=deserialize_via_marshal(hitlist)
             terms[len(term_docs)] = terms.get(len(term_docs), 0) + 1
             if term_docs.has_key("Gi"):
                 Gi[t] = term_docs["Gi"]
         i=i + 10000
     terms=terms.items()
     terms.sort(lambda x, y: cmp(y[1], x[1]))
     Gi=Gi.items()
     Gi.sort(lambda x, y: cmp(y[1], x[1]))
     # never index past the end of either list
     for i in range(0, min(20, len(terms), len(Gi))):
         write_message("%s/%s---%s---%s" % (terms[i][0],terms[i][1], Gi[i][0],Gi[len(Gi) - i - 1][0]))
 
 def update_rnkWORD(table, terms):
     """Updates rnkWORDF and rnkWORDR with Gi and Nj values.

     For each term in rnkWORDF, a Gi value for the term is added. And for each
     term in each document, the Nj value for that document is added. In
     rnkWORDR, the Gi value for each term in each document is added. For
     description on how things are computed, look in the hacking docs.

     The work is done in five phases, each processing 5000 keys per query:
       1. collect the records (Nj keys) that contain the modified terms
       2. collect every term (Gi keys) occurring in those records
       3. compute Gi per term (options["quick"] == "no") or reuse the stored
          value where one exists (any other value of options["quick"])
       4. compute Nj per record and write the Gi values into the reverse index
       5. write Nj per record and Gi per term into the forward index

     table - name of forward index to update
     terms - modified terms (a dictionary; only its keys are used)

     Returns "" when `terms` is empty, otherwise nothing."""
 
     stime = time.time()
     Gi = {}
     Nj = {}
     # N = number of rows in the reverse index (table name with the trailing letter replaced by R)
     N = run_sql("select count(id_bibrec) from %sR" % table[:-1])[0][0]
  
     write_message("Beginning post-processing of %s terms" % len(terms))
     if len(terms) == 0:
         write_message("No terms to process, ending...")
         return ""
 
     #Locating all documents related to the modified/new/deleted terms, if fast update, 
     #only take into account new/modified occurrences
     write_message("Phase 1: Finding records containing modified terms")      
     terms = terms.keys()
     i = 0 
 
     while i < len(terms):
         terms_docs = get_from_forward_index(terms, i, (i+5000), table)
         for (t, hitlist) in terms_docs: 
             term_docs = deserialize_via_marshal(hitlist)
             # "Gi" is a bookkeeping entry stored next to the record ids, not a record
             if term_docs.has_key("Gi"):
                 del term_docs["Gi"]
 	    for (j, tf) in term_docs.iteritems():
                 # in quick mode only entries whose second value is still 0 are selected
                 if (options["quick"] == "yes" and tf[1] == 0) or options["quick"] == "no":
                     Nj[j] = 0
         write_message("Phase 1: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms)))
         i += 5000
     write_message("Phase 1: Finished finding records containing modified terms")
     
     #Find all terms in the records found in last phase
     write_message("Phase 2: Finding all terms in affected records")
     records = Nj.keys()   
     i = 0
     while i < len(records):
         docs_terms = get_from_reverse_index(records, i, (i + 5000), table)
         for (j, termlist) in docs_terms:
             doc_terms = deserialize_via_marshal(termlist)
             for (t, tf) in doc_terms.iteritems(): 
                  Gi[t] = 0
         write_message("Phase 2: ......processed %s/%s records " % ((i+5000>len(records) and len(records) or (i+5000)), len(records)))
     	i += 5000
     write_message("Phase 2: Finished finding all terms in affected records")
 
     # Gi was only used as a set of term names so far; restart it as term -> value
     terms = Gi.keys()
     Gi = {}
     i = 0
     if options["quick"] == "no":
         #Calculating Fi and Gi value for each term
         write_message("Phase 3: Calculating importance of all affected terms")
         while i < len(terms):
             terms_docs = get_from_forward_index(terms, i, (i+5000), table)
             for (t, hitlist) in terms_docs:
                 term_docs = deserialize_via_marshal(hitlist)
                 if term_docs.has_key("Gi"):
                     del term_docs["Gi"]
                 # Fi = sum of the first tuple value over all records of the term
                 Fi = 0
                 Gi[t] = 1
 	        for (j, tf) in term_docs.iteritems():
                     Fi += tf[0]
                 for (j, tf) in term_docs.iteritems():
                     # tf[0] == Fi would contribute log(1) == 0, so it is skipped
                     if tf[0] != Fi:
                         Gi[t] = Gi[t] + ((float(tf[0]) / Fi) * math.log(float(tf[0]) / Fi) / math.log(2)) / math.log(N) 
             write_message("Phase 3: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms)))
             i += 5000
         write_message("Phase 3: Finished calculating importance of all affected terms")
     else:
         #Using existing Gi value instead of calculating a new one. Missing some accuracy.
         write_message("Phase 3: Getting approximate importance of all affected terms")
         while i < len(terms):
             terms_docs = get_from_forward_index(terms, i, (i+5000), table)
             for (t, hitlist) in terms_docs:
                 term_docs = deserialize_via_marshal(hitlist)
                 if term_docs.has_key("Gi"):
                     # NOTE(review): the value read here is the integer written in phase 5
                     # (int(floor(Gi*100)) + 1), while the other branches produce an unscaled
                     # float; phases 4 and 5 multiply Gi[t] by 100 again -- confirm intended.
                     Gi[t] = term_docs["Gi"][1]
                 elif len(term_docs) == 1:
                     Gi[t] = 1
                 else:
                     Fi = 0
                     Gi[t] = 1
 	            for (j, tf) in term_docs.iteritems():
                         Fi += tf[0]
                     for (j, tf) in term_docs.iteritems():
                         if tf[0] != Fi:
                             Gi[t] = Gi[t] + ((float(tf[0]) / Fi) * math.log(float(tf[0]) / Fi) / math.log(2)) / math.log(N) 
             write_message("Phase 3: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms)))
             i += 5000
         write_message("Phase 3: Finished getting approximate importance of all affected terms")
 
     write_message("Phase 4: Calculating normalization value for all affected records and updating %sR" % table[:-1])
     records = Nj.keys()
     i = 0
     while i < len(records):
         #Calculating the normalization value for each document, and adding the Gi value to each term in each document.
         docs_terms = get_from_reverse_index(records, i, (i + 5000), table)
         for (j, termlist) in docs_terms:
             doc_terms = deserialize_via_marshal(termlist)           
             for (t, tf) in doc_terms.iteritems():
                 if Gi.has_key(t):
                     Nj[j] = Nj.get(j, 0) + math.pow(Gi[t] * (1 + math.log(tf[0])), 2)
                     # stored form of Gi: scaled by 100, floored, +1 when non-negative
                     Git = int(math.floor(Gi[t]*100))
                     if Git >= 0:
                         Git += 1
                     doc_terms[t] = (tf[0], Git)
                 else:  
                     # no freshly computed Gi for this term: fall back to the value already stored in the record
                     Nj[j] = Nj.get(j, 0) + math.pow(tf[1] * (1 + math.log(tf[0])), 2)
             # Nj becomes 1/sqrt(sum of squares), stored as int(Nj*100), +1 when non-negative
             Nj[j] = 1.0 / math.sqrt(Nj[j])
             Nj[j] = int(Nj[j] * 100)
             if Nj[j] >= 0:
                 Nj[j] += 1
             run_sql("UPDATE %sR SET termlist='%s' WHERE id_bibrec=%s" % (table[:-1], serialize_via_marshal(doc_terms), j))  
         write_message("Phase 4: ......processed %s/%s records" % ((i+5000>len(records) and len(records) or (i+5000)), len(records)))
 	i += 5000
     write_message("Phase 4: Finished calculating normalization value for all affected records and updating %sR" % table[:-1])
     write_message("Phase 5: Updating %s with new normalization values" % table)
     i = 0
     terms = Gi.keys()
     while i < len(terms):
         #Adding the Gi value to each term, and adding the normalization value to each term in each document.
         terms_docs = get_from_forward_index(terms, i, (i+5000), table)
         for (t, hitlist) in terms_docs: 
             term_docs = deserialize_via_marshal(hitlist)
             if term_docs.has_key("Gi"):
                 del term_docs["Gi"]
 	    for (j, tf) in term_docs.iteritems():
                 # only existing keys are reassigned here, so the dict size does not change while iterating
                 if Nj.has_key(j):
                     term_docs[j] = (tf[0], Nj[j])
             Git = int(math.floor(Gi[t]*100))
             if Git >= 0:
                 Git += 1
             term_docs["Gi"] = (0, Git)
             run_sql("UPDATE %s SET hitlist='%s' WHERE term='%s'" % (table, serialize_via_marshal(term_docs), MySQLdb.escape_string(t)))
         write_message("Phase 5: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms)))
         i += 5000
     write_message("Phase 5:  Finished updating %s with new normalization values" % table)
     write_message("Time used for post-processing: %.1fmin" % ((time.time() - stime) / 60))
     write_message("Finished post-processing") 
     
 
 def get_from_forward_index(terms, start, stop, table):
     """Fetches (term, hitlist) rows of the forward index for a slice of terms.

     terms - list of terms
     start - index of the first term to look up
     stop - index one past the last term to look up (clipped to len(terms))
     table - name of the forward index table

     Returns the rows delivered by run_sql(), or an empty tuple when the
     slice contains no terms."""
     current_terms = terms[start:stop]
     if not current_terms:
         # "IN ()" is not valid SQL, so do not query at all
         return ()
     # Escape every term: a term containing a quote or backslash would
     # otherwise break (or alter) the statement
     quoted_terms = ["'%s'" % MySQLdb.escape_string(t) for t in current_terms]
     terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term IN (%s)" % (table, ",".join(quoted_terms)))
     return terms_docs
 
 def get_from_reverse_index(records, start, stop, table):
     """Fetches (id_bibrec, termlist) rows of the reverse index for a slice of records.

     records - list of record ids
     start - index of the first record id to look up
     stop - index one past the last record id to look up
     table - name of the forward index table; the reverse table name is
             derived by replacing its last character with 'R'

     Returns the rows delivered by run_sql(), or an empty tuple when the
     slice contains no record ids."""
     current_recs = records[start:stop]
     if not current_recs:
         # "IN ()" is not valid SQL, so do not query at all
         return ()
     # Format each id on its own: formatting the whole list uses repr(), which
     # appends an 'L' suffix to long integers and yields invalid SQL
     current_recs = ",".join([str(int(r)) for r in current_recs])
     docs_terms = run_sql("SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec IN (%s)" % (table[:-1], current_recs))
     return docs_terms
 
 def test_word_separators(phrase="hep-th/0101001"):
     """Tests word separating policy on various input."""
     print "%s:" % phrase
     gwfp = get_words_from_phrase(phrase)
     for (word, count) in gwfp.iteritems():
         print "\t-> %s - %s" % (word, count)
 
 def task_sig_sleep(sig, frame):
     """Handler for the BibSched 'sleep' signal: reports SLEEPING and then
     suspends the process until another signal arrives."""
     debug = options["verbose"] >= 9
     if debug:
         write_message("got signal %d" % sig)
     write_message("sleeping...")
     task_update_status("SLEEPING")
     # wait for wake-up signal
     signal.pause()
 
 def task_sig_wakeup(sig, frame):
     """Handler for the BibSched 'wakeup' signal: logs the event and sets
     the task status to CONTINUING."""
     verbose_level = options["verbose"]
     if verbose_level >= 9:
         write_message("got signal %d" % sig)
     write_message("continuing...")
     task_update_status("CONTINUING")
 
 def task_sig_stop(sig, frame):
     """Signal handler for the 'stop' signal sent by BibSched."""
     if options["verbose"] >= 9:
         write_message("got signal %d" % sig)
     write_message("stopping...")
     task_update_status("STOPPING")
     errcode = 0
     try:
         task_sig_stop_commands()
         write_message("stopped")
         task_update_status("STOPPED")
     except StandardError, err:
         write_message("Error during stopping! %e" % err)
         task_update_status("STOPPINGFAILED")
         errcode = 1
     sys.exit(errcode)
 
 def task_sig_stop_commands():
     """Flush every table in wordTables to the database before the task quits.
     Called from the task_sig_stop() handler.
     """
     write_message("stopping commands started")
     for word_table in wordTables:
         word_table.put_into_db()
     write_message("stopping commands ended")
     
 def task_sig_suicide(sig, frame):
     """Handler for the BibSched 'suicide' signal: reports SUICIDING, then
     SUICIDED, and exits the process with code 0."""
     debug = options["verbose"] >= 9
     if debug:
         write_message("got signal %d" % sig)
     write_message("suiciding myself now...")
     task_update_status("SUICIDING")
     write_message("suicided")
     task_update_status("SUICIDED")
     sys.exit(0)
 
 def task_sig_unknown(sig, frame):
     """Handler for any other signal sent by the shell or a user: the signal
     is logged and otherwise ignored."""
     verbose_level = options["verbose"]
     if verbose_level >= 9:
         write_message("got signal %d" % sig)
     # do nothing for other signals
     write_message("unknown signal %d ignored" % sig)
 
 def task_update_progress(msg):
     """Store `msg` as the progress of the current task (row task_id of the
     schTASK table) and return the run_sql() result."""
     global task_id, options
     debug = options["verbose"] >= 9
     if debug:
         write_message("Updating task progress to %s." % msg)
     query = "UPDATE schTASK SET progress=%s where id=%s"
     return run_sql(query, (msg, task_id))
 
 def task_update_status(val):
     """Store `val` as the status of the current task (row task_id of the
     schTASK table) and return the run_sql() result."""
     global task_id, options
     debug = options["verbose"] >= 9
     if debug:
         write_message("Updating task status to %s." % val)
     query = "UPDATE schTASK SET status=%s where id=%s"
     return run_sql(query, (val, task_id))
 
 def getName(methname, ln=cdslang, type='ln'):
     """Returns the name of the rank method, either in default language or given language.
     methname = short name of the method
     ln - the language to get the name in
     type - which name "type" to get.

     Falls back to the name in the default language (cdslang) and finally to
     `methname` itself when no name is stored.  Raises Exception when the
     method is unknown or a query fails."""
 
     try:
         # Values are bound as parameters instead of being pasted into the
         # statement, so quotes in the arguments cannot alter the query
         rnkid = run_sql("SELECT id FROM rnkMETHOD where name=%s", (methname,))
         if not rnkid:
             raise Exception
         rnkid = rnkid[0][0]
         query = "SELECT value FROM rnkMETHODNAME WHERE type=%s and ln=%s and id_rnkMETHOD=%s"
         res = run_sql(query, (type, ln, rnkid))
         if not res:
             # no name in the requested language: try the default language
             res = run_sql(query, (type, cdslang, rnkid))
         if not res:
             return methname
         return res[0][0]
     except Exception:
         write_message("Cannot run rank method, either given code for method is wrong, or it has not been added using the webinterface.")
         raise Exception
 
 def word_similarity(row, run):
     """Delegate to word_index() with the same arguments and return its result."""
     outcome = word_index(row, run)
     return outcome
diff --git a/modules/bibrank/lib/bibrank_word_indexer.py.wml b/modules/bibrank/lib/bibrank_word_indexer.py.wml
index 89705309f..2100fd4d9 100644
--- a/modules/bibrank/lib/bibrank_word_indexer.py.wml
+++ b/modules/bibrank/lib/bibrank_word_indexer.py.wml
@@ -1,1472 +1,1472 @@
  # $Id$
 ## BibRank word frequency indexer utility.
 
 ## This file is part of the CERN Document Server Software (CDSware).
 ## Copyright (C) 2002 CERN.
 ##
 ## The CDSware is free software; you can redistribute it and/or
 ## modify it under the terms of the GNU General Public License as
 ## published by the Free Software Foundation; either version 2 of the
 ## License, or (at your option) any later version.
 ##
 ## The CDSware is distributed in the hope that it will be useful, but
 ## WITHOUT ANY WARRANTY; without even the implied warranty of
 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ## General Public License for more details.  
 ##
 ## You should have received a copy of the GNU General Public License
 ## along with CDSware; if not, write to the Free Software Foundation, Inc.,
 ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 
 ## read config variables:
 #include "config.wml"
 #include "configbis.wml"
 #include "cdswmllib.wml"
 
 ## start Python:
 <protect>#!</protect><PYTHON>
 <protect># -*- coding: utf-8 -*-</protect>
 <protect>## $Id$</protect>
 <protect>## DO NOT EDIT THIS FILE!  IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
 
 __version__ = "<: print generate_pretty_version_string('$Id$'); :>"
 
 ## fill config variables:
 ## NOTE(review): values written as <NAME> look like WML placeholders that are
 ## substituted when the .py file is generated -- confirm against the WML build.
 pylibdir = "<LIBDIR>/python"
 
 ## programs used to convert fulltext files to text:
 ## (maps a file extension to a list of candidate converter programs)
 conv_programs = {#"ps": ["<PSTOTEXT>","<PSTOASCII>"],  # switched off at the moment, since PDF is faster
                  #"ps.gz": ["<PSTOTEXT>","<PSTOASCII>"],
                  "pdf": ["<PDFTOTEXT>","<PSTOTEXT>","<PSTOASCII>"],
                  "doc": ["<ANTIWORD>","<CATDOC>","<WVTEXT>"],
                  "ppt": ["<PPTHTML>"],
                  "xls": ["<XLHTML>"]
                  }
 ## helper programs used if the above programs convert only to html or other intermediate file formats:
 ## (maps an intermediate format to the single program that handles it)
 conv_programs_helpers =  {"html": "<HTMLTOTEXT>",
                           "gz": "<GZIP>" }
 
 ## okay, rest of the Python code goes below
 #######
 
 ## import interesting modules:
 try:
     from zlib import compress,decompress
     from string import split,translate,lower,upper
     import marshal
     import getopt
     import getpass
     import string
     import os
     import sre
     import sys
     import time
     import MySQLdb
     import Numeric
     import urllib
     import signal
     import tempfile
     import unicodedata
     import traceback
     import cStringIO
     import math
     import re
     import ConfigParser
 except ImportError, e:
     import sys
 
 try:
     sys.path.append('%s' % pylibdir)
     from cdsware.config import *
     from cdsware.search_engine_config import cfg_max_recID
     from cdsware.search_engine import perform_request_search, strip_accents, HitSet
     from cdsware.dbquery import run_sql
     from cdsware.bibindex_engine_stemmer import stem_by_lang, lang_available
     from cdsware.bibindex_engine_stopwords import is_stopword_force
 except ImportError, e:
     import sys
 
 ## safety parameters concerning MySQL thread-multiplication problem:
 ## (cfg_max_mysql_threads and cfg_mysql_thread_timeout are the default
 ## arguments of kill_sleepy_mysql_threads() below)
 cfg_check_mysql_threads = 0 # to check or not to check the problem? 
 cfg_max_mysql_threads = 50 # how many threads (connections) we consider as still safe
 cfg_mysql_thread_timeout = 20 # we'll kill threads that were sleeping for more than X seconds
 
 ## override urllib's default password-asking behaviour:
 class MyFancyURLopener(urllib.FancyURLopener):
     def prompt_user_passwd(self, host, realm):
         # supply some dummy credentials by default
         return ("mysuperuser", "mysuperpass")
     def http_error_401(self, url, fp, errcode, errmsg, headers):
         # do not bother with protected pages
         raise IOError, (999, 'unauthorized access')  
         return None
     
 #urllib._urlopener = MyFancyURLopener()
 
 ## precompile some often-used regexp for speed reasons:
 ## matches '$$' followed by one word character
 ## (presumably MARC subfield markers such as $$a -- TODO confirm)
 re_subfields = sre.compile('\$\$\w');
 
 nb_char_in_line = 50  # for verbose pretty printing
 chunksize = 1000 # default size of chunks that the records will be treated by
 wordTables = [] # presumably the active WordTable instances -- TODO confirm where it is filled
 base_process_size = 4500 # process base size
 
 ## Dictionary merging functions
 def dict_union(list1, list2):
     """Returns union of the two dictionaries.

     Values are tuples.  For a key present in both inputs the first tuple
     items are added and the second item is taken from list2.  Neither
     input dictionary is modified."""
     merged = dict(list1)
     for (key, value) in list2.items():
         if key in merged:
             merged[key] = (merged[key][0] + value[0], value[1])
         else:
             merged[key] = value
     return merged
 
 ## safety function for killing slow MySQL threads:
 def kill_sleepy_mysql_threads(max_threads=cfg_max_mysql_threads, thread_timeout=cfg_mysql_thread_timeout):
     """Check the number of MySQL threads and, if there are more than
        MAX_THREADS of them, kill all threads that have been in a sleeping
        state for more than THREAD_TIMEOUT seconds.  (This is useful
        for working around the max_connection problem that appears
        during indexation in some not-yet-understood cases.)  If some
        threads are to be killed, write info into the log file.
     """
     processlist = run_sql("SHOW FULL PROCESSLIST")
     if len(processlist) <= max_threads:
         return
     for (r_id, r_user, r_host, r_db, r_command, r_time, r_state, r_info) in processlist:
         if r_command != "Sleep" or int(r_time) <= thread_timeout:
             continue
         run_sql("KILL %s", (r_id,))
         if options["verbose"] >= 1:
             write_message("WARNING: too many MySQL threads, killing thread %s" % r_id)
     return
 
 # tagToFunctions mapping. It offers an indirection level necessary for
 # indexing fulltext. The default is get_words_from_phrase
 tagToWordsFunctions = {}
 
 def get_words_from_phrase(phrase, weight, lang="",
                           chars_punctuation=r"[\.\,\:\;\?\!\"]",
                           chars_alphanumericseparators=r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]",
                           split=string.split):
     """Returns the words found in phrase 'phrase' as a dictionary.

     phrase - text to split into words
     weight - amount added to a word's score for each occurrence
     lang - language passed to stem_by_lang(); stemming is skipped when it is
            empty or "none", or when options["use_stemming"] is false
     chars_punctuation - regexp of characters replaced by a space before splitting
     chars_alphanumericseparators - regexp of characters replaced by a space
            inside words that do not pass check_term(word, 0)
     split - function used to split the phrase into words

     Returns a dictionary word -> (accumulated weight, 0).

     NOTE(review): words are only collected when options["remove_stopword"]
     equals "True"; for any other value both branches of the loop are skipped
     and the result is empty -- confirm this is intended."""
     words = {} 
-    #print phrase
     phrase = strip_accents(phrase) 
     phrase = lower(phrase) 
     #Getting rid of strange characters
     phrase = re.sub("&eacute;", 'e', phrase) 
     phrase = re.sub("&egrave;", 'e', phrase) 
     phrase = re.sub("&agrave;", 'a', phrase)
     phrase = re.sub("&nbsp;", ' ', phrase)
     phrase = re.sub("&laquo;", ' ', phrase)
     phrase = re.sub("&raquo;", ' ', phrase)
     phrase = re.sub("&ecirc;", ' ', phrase)
     phrase = re.sub("&amp;", ' ', phrase)
-    
     if string.find(phrase, "</") > -1:
         #Most likely html, remove html code
         phrase = re.sub("(?s)<[^>]*>|&#?\w+;", ' ', phrase)
+    #removes http links
+    phrase = re.sub("(?s)http://[^( )]*", '', phrase)
     phrase = re.sub(chars_punctuation, ' ', phrase)
-    phrase = lower(phrase)
+
     #By doing this like below, characters standing alone, like c a b are not added to the index, but when they are together with characters like c++ or c$ they are added.
     for word in split(phrase):    
         if options["remove_stopword"] == "True" and not is_stopword_force(word) and check_term(word, 0):
             if lang and lang !="none" and options["use_stemming"]:
                 word = stem_by_lang(word, lang)
             if not words.has_key(word):
                 words[word] = (0,0)
             words[word] = (words[word][0] + weight, 0)
         elif options["remove_stopword"] == "True" and not is_stopword_force(word):  
             # the word did not pass check_term(): break it up at digits and
             # separator characters and index the pieces instead
             # (rebinding 'phrase' here is safe: the outer split() result already exists)
             phrase = re.sub(chars_alphanumericseparators, ' ', word) 
             for word_ in split(phrase):   
                 if lang and lang !="none" and options["use_stemming"]:
                     word_ = stem_by_lang(word_, lang)
                 if word_:
                     if not words.has_key(word_):
                         words[word_] = (0,0)
                     words[word_] = (words[word_][0] + weight, 0)
     return words
 
 def split_ranges(parse_string):
     """Parse a string such as "1-5,8,20-10" into a list of [low, high] pairs.

     Items are separated by commas; a single number N gives [N, N] and a
     reversed pair is put into ascending order."""
     recIDs = []
     for chunk in parse_string.split(","):
         bounds = chunk.split("-")
         if len(bounds) == 1:
             recIDs.append([int(bounds[0]), int(bounds[0])])
             continue
         low = int(bounds[0])
         high = int(bounds[1])
         if low > high: # sanity check
             low, high = high, low
         recIDs.append([low, high])
     return recIDs
 
 def get_date_range(var):
     """Returns the dates contained in the comma separated string 'var'
     as a (low, high) tuple; high is None when only one date is given.
     Returns None when 'var' holds more than two items."""
     limits = var.split(",")
     how_many = len(limits)
     if how_many == 1:
         return get_date(limits[0]), None
     if how_many == 2:
         low = get_date(limits[0])
         high = get_date(limits[1])
         return low, high
     return None
 
 def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
     """Returns a date string according to the format string.
        It can handle normal date strings and shifts with respect
        to now.

        var - either a date written in 'format_string', or a shift made of an
              optional sign, a number and a unit: d (days), h (hours),
              m (minutes) or s (seconds), e.g. "-2d" or "+30m"
        format_string - time.strftime()/time.strptime() format of the result

        Raises ValueError (from time.strptime) when 'var' is neither a shift
        nor a date in the given format."""
     factors = {"d":24*3600, "h":3600, "m":60, "s":1}
     # 're' replaces the deprecated 'sre' alias; the pattern is unchanged
     m = re.match(r"([-\+]{0,1})([\d]+)([dhms])", var)
     if m:
         (sign_text, amount_text, unit) = m.groups()
         if sign_text == "-":
             sign = -1
         else:
             sign = 1
         shifted = time.time() + sign * factors[unit] * float(amount_text)
         date = time.strftime(format_string, time.localtime(shifted))
     else:
         # parse and re-format, which validates the input
         date = time.strptime(var, format_string)
         date = time.strftime(format_string, date)
     return date
 
 def create_range_list(res):
     """Builds a list of [first, last] ranges from the rows of a recID
     select query.  The first column of the rows is expected to be in
     ascending numerical order; consecutive ids end up in one range."""
     if not res or not res[0]:
         return []
     first_id = res[0][0]
     ranges = [[first_id, first_id]]
     for entry in res[1:]:
         rec = entry[0]
         tail = ranges[-1]
         if rec == tail[1] + 1:
             tail[1] = rec
         else:
             ranges.append([rec, rec])
     return ranges
         
 def beautify_range_list(range_list):
     """Returns a non overlapping, maximal range list.

     range_list - list of [low, high] pairs in any order

     Overlapping and directly adjacent ranges (high + 1 == next low) are
     merged.  The result is sorted by lower bound and consists of new lists;
     the input is not modified."""
     # Work on sorted copies: one pass then suffices, and ranges contained in
     # or bridging earlier ones are merged as well
     ordered = [[bounds[0], bounds[1]] for bounds in range_list]
     ordered.sort()
     ret_list = []
     for (low, high) in ordered:
         if ret_list and low <= ret_list[-1][1] + 1:
             if high > ret_list[-1][1]:
                 ret_list[-1][1] = high
         else:
             ret_list.append([low, high])
     return ret_list
 
 def serialize_via_numeric_array_dumps(arr):
     """Dump the array `arr` into a string with Numeric.dumps()."""
     dumped = Numeric.dumps(arr)
     return dumped
 
 def serialize_via_numeric_array_compr(str):
     """Compress the string `str` with zlib's compress()."""
     packed = compress(str)
     return packed
 
 def serialize_via_numeric_array(arr):
     """Turn a Numeric array into a compressed string: dump it first, then
     compress the dump."""
     dumped = serialize_via_numeric_array_dumps(arr)
     return serialize_via_numeric_array_compr(dumped)
 
 def deserialize_via_numeric_array(string):
     """Rebuild a Numeric array from a compressed string: decompress it
     first, then load the result with Numeric.loads()."""
     dumped = decompress(string)
     return Numeric.loads(dumped)
 
 def serialize_via_marshal(obj):
     """Marshal the Python object `obj`, compress the result and escape it
     with MySQLdb.escape_string() so it can be placed in an SQL statement."""
     marshalled = marshal.dumps(obj)
     packed = compress(marshalled)
     return MySQLdb.escape_string(packed)
 
 def deserialize_via_marshal(string):
     """Rebuild a Python object from a compressed marshal string: decompress
     first, then unmarshal."""
     marshalled = decompress(string)
     return marshal.loads(marshalled)
 
 class WordTable:
     "A class to hold the words table."
 
     def __init__(self, tablename, fields_to_index, separators="[^\s]"):
         "Creates words table instance."
         self.tablename = tablename
         self.recIDs_in_mem = []
         self.fields_to_index = fields_to_index
         self.separators = separators
         self.value = {}
 
     def get_field(self, recID, tag):
         """Returns list of values of the MARC-21 'tag' fields for the
            record 'recID'.

            The bibXXx tables to read are chosen from the first two
            characters of 'tag'; 'tag' itself is matched with SQL LIKE."""
 
         bibXXx = "bib" + tag[0] + tag[1] + "x"
         bibrec_bibXXx = "bibrec_" + bibXXx
         # Table names cannot be bound as parameters, so only they are
         # interpolated; recID and tag are passed to run_sql() as parameters
         query = """SELECT value FROM %s AS b, %s AS bb
                 WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id
                 AND tag LIKE %%s""" % (bibXXx, bibrec_bibXXx)
         res = run_sql(query, (recID, tag))
         return [row[0] for row in res]
 
     def clean(self):
         "Cleans the words table."
         self.value={}
 
     def put_into_db(self, mode="normal", split=string.split):
         """Updates the current words table in the corresponding MySQL's
            rnkWORD table.  Mode 'normal' means normal execution,
            mode 'emergency' means words index reverting to old state.

            Steps, in order:
              1. (normal only) mark the CURRENT reverse-index rows of the
                 records held in memory as TEMPORARY
              2. flush every word in self.value via put_word_into_db()
              3. normal: promote FUTURE rows to CURRENT and delete the
                 TEMPORARY ones; emergency: restore TEMPORARY rows to
                 CURRENT and delete the FUTURE ones
              4. empty self.value and self.recIDs_in_mem

            The 'split' argument is not used in this method.
            """
         if options["verbose"]:
             write_message("%s %s wordtable flush started" % (self.tablename,mode))
             write_message('...updating %d words into %sR started' % \
                 (len(self.value), self.tablename[:-1]))
         task_update_progress("%s flushed %d/%d words" % (self.tablename, 0, len(self.value)))
             
         self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem)
 
         if mode == "normal":
             for group in self.recIDs_in_mem:
                 query = """UPDATE %sR SET type='TEMPORARY' WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='CURRENT'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
 
         nb_words_total = len(self.value)
         # report progress roughly every tenth of the words (0 disables reporting)
         nb_words_report = int(nb_words_total/10)
         nb_words_done = 0
         # keys() is evaluated once here; put_word_into_db() deletes entries from self.value
         for word in self.value.keys():
 	    self.put_word_into_db(word, self.value[word])
             nb_words_done += 1
             if nb_words_report!=0 and ((nb_words_done % nb_words_report) == 0):
                 if options["verbose"]:
                     write_message('......processed %d/%d words' % (nb_words_done, nb_words_total))
                 task_update_progress("%s flushed %d/%d words" % (self.tablename, nb_words_done, nb_words_total))
         if options["verbose"] >= 9:
             write_message('...updating %d words into %s ended' % \
                 (nb_words_total, self.tablename))
                 
         #if options["verbose"]:
         #    write_message('...updating reverse table %sR started' % self.tablename[:-1])
         if mode == "normal":
             for group in self.recIDs_in_mem:
                 query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
                 query = """DELETE FROM %sR WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
             if options["verbose"] >= 9:
                 write_message('End of updating wordTable into %s' % self.tablename)
         elif mode == "emergency":
 	    write_message("emergency")
             for group in self.recIDs_in_mem:
                 query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
                 query = """DELETE FROM %sR WHERE id_bibrec
                 BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \
                 (self.tablename[:-1], group[0], group[1])
                 if options["verbose"] >= 9:
                     write_message(query)
                 run_sql(query)
             if options["verbose"] >= 9:
                 write_message('End of emergency flushing wordTable into %s' % self.tablename)
         #if options["verbose"]:
         #    write_message('...updating reverse table %sR ended' % self.tablename[:-1])
 
         self.clean()
         self.recIDs_in_mem = []
         if options["verbose"]:
             write_message("%s %s wordtable flush ended" % (self.tablename, mode))
         task_update_progress("%s flush ended" % (self.tablename))
 
     def load_old_recIDs(self,word):
         """Fetch the hitlist stored for the word in the index table and
         return it deserialized, or None when the word has no row."""
         rows = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % self.tablename, (word,))
         if not rows:
             return None
         return deserialize_via_marshal(rows[0][0])
     
     def merge_with_old_recIDs(self,word,recIDs, set):
         """Merge the system numbers stored in memory (hash of recIDs with value[0] > 0 or -1
         according to whether to add/delete them) with those stored in the database index
         and received in set universe of recIDs for the given word.
 
         Return 0 in case no change was done to SET, return 1 in case SET was changed.
         """
 
         set_changed_p = 0
         for recID,sign in recIDs.iteritems():
             if sign[0] == -1 and set.has_key(recID):
                 # delete recID if existent in set and if marked as to be deleted
                 del set[recID]
                 set_changed_p = 1
             elif sign[0] > -1 and not set.has_key(recID):
                 # add recID if not existent in set and if marked as to be added
                 set[recID] = sign
                 set_changed_p = 1
 	    elif sign[0] > -1 and sign[0] != set[recID][0]:
                 set[recID] = sign
                 set_changed_p = 1
 
         return set_changed_p
 
     def put_word_into_db(self, word, recIDs, split=string.split):
         """Flush a single word to the database and delete it from memory.

         word - the term to flush
         recIDs - in-memory changes for the word, merged into a hitlist
                  already stored in the database
         split - not used in this method

         A word already in the database is updated only if the merge changed
         its hitlist; a new word is inserted from self.value[word]; a word
         whose resulting hitlist is empty is deleted from the table.  The
         word is recorded in options["modified_words"] when it existed before
         or was newly inserted."""
         set = self.load_old_recIDs(word)
 	#write_message("%s %s" % (word, self.value[word]))
         if set: # merge the word recIDs found in memory:
             options["modified_words"][word] = 1
             if self.merge_with_old_recIDs(word, recIDs, set) == 0:
                 # nothing to update:
                 if options["verbose"] >= 9:
                     write_message("......... unchanged hitlist for ``%s''" % word)
                 pass
             else:
                 # yes there were some new words:
                 if options["verbose"] >= 9:
                     write_message("......... updating hitlist for ``%s''" % word)
 		run_sql("UPDATE %s SET hitlist='%s' WHERE term='%s'" % (self.tablename, serialize_via_marshal(set), MySQLdb.escape_string(word)))
         else: # the word is new, will create new set:
             if options["verbose"] >= 9:
                 write_message("......... inserting hitlist for ``%s''" % word)
             # NOTE(review): this branch reads self.value[word] rather than the recIDs
             # argument; the caller visible here passes the same object -- confirm.
 	    set = self.value[word]
 	    if len(set) > 0:   
                 #new word, add to list
                 options["modified_words"][word] = 1
 	        run_sql("INSERT INTO %s (term, hitlist) VALUES ('%s', '%s')" % (self.tablename, MySQLdb.escape_string(word), serialize_via_marshal(set)))       
         # also reached after an UPDATE when the merge removed every record
         if not set: # never store empty words
             run_sql("DELETE from %s WHERE term=%%s" % self.tablename,
                     (word,))
  
         del self.value[word]
             
     def display(self):
         """Write every word held in memory together with its hitlist, in
         sorted word order.  Nothing is written unless options["verbose"]
         is set."""
         words = self.value.keys()
         words.sort()
         for word in words:
             if not options["verbose"]:
                 continue
             write_message("%s: %s" % (word, self.value[word]))
 
     def count(self):
         """Return how many distinct words are currently held in memory."""
         nb_words = len(self.value)
         return nb_words
 
     def info(self):
         """Report the number of words held in memory (only when
         options["verbose"] is set)."""
         if not options["verbose"]:
             return
         write_message("The words table contains %d words." % self.count())
 
     def lookup_words(self, word=""):
         """Report whether word is held in memory and, if so, in how many
         records it occurs.

         When no word is given it is asked for interactively; the lookup is
         abandoned silently on EOF or keyboard interrupt.  The report is only
         written when options["verbose"] is set.
         """
         if not word:
             try:
                 word = raw_input("Enter word: ")
             except (EOFError, KeyboardInterrupt):
                 return
 
         known = self.value.has_key(word)
         if not options["verbose"]:
             return
         if known:
             write_message("The word '%s' is found %d times." \
             % (word, len(self.value[word])))
         else:
             write_message("The word '%s' does not exist in the word file."\
                           % word)
 
     def update_last_updated(self, rank_method_code, starting_time=None):
         """Update the last_updated column of rnkMETHOD for the given method.

         rank_method_code -- value matched against rnkMETHOD.name.
         starting_time -- time the run started.  The start time (not the end
             time) is stored so that records modified while an interrupted
             task was running are reindexed next time.

         Returns None without touching the database when starting_time is
         None, otherwise the result of run_sql().
         """
         if starting_time is None:
             return None
         if options["verbose"] >= 9:
             # the timestamp must be formatted into the message: passing it as
             # a second argument would be taken as the output stream
             write_message("updating last_updated to %s..." % starting_time)
         return run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s",
                        (starting_time, rank_method_code,))
 
     def add_recIDs(self, recIDs):
         """Fetches records which id in the recIDs range list and adds
         them to the wordTable.  The recIDs range list is of the form:
         [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]].

         Each range is walked in chunks.  A chunk never crosses the end of
         its range, never exceeds what is left before the next flush
         (options["flush"]) and never exceeds what is left of the current
         chunksize window (global chunksize).  For every chunk the reverse
         table is checked with chk_recID_range(); on StandardError the task
         status is set to ERROR and the process exits with status 1.  The
         chunk's existing words are then marked for deletion
         (del_recID_range) and the new words added (add_recID_range).  Memory
         is flushed to the database every options["flush"] records and once
         more at the end if anything is pending.
         """
         global chunksize
         flush_count = 0      # records handled since the last flush
         records_done = 0     # records for which add_recID_range found words
         records_to_go = 0    # total number of recIDs covered by all ranges
 
         for range in recIDs:
             records_to_go = records_to_go + range[1] - range[0] + 1
             
         time_started = time.time() # will measure profile time
         for range in recIDs:
             i_low = range[0]
             chunksize_count = 0
             while i_low <= range[1]:
                 # calculate chunk group of recIDs and treat it:
                 i_high = min(i_low+options["flush"]-flush_count-1,range[1])
                 i_high = min(i_low+chunksize-chunksize_count-1, i_high)
                 try:
                     self.chk_recID_range(i_low, i_high)
                 except StandardError, e:
                     # reverse table is inconsistent for this chunk: abort the task
                     write_message("Exception caught: %s" % e, sys.stderr)
                     if options["verbose"] >= 9:        
                         traceback.print_tb(sys.exc_info()[2])
                     task_update_status("ERROR")
                     task_sig_stop_commands()
                     sys.exit(1)
                 if options["verbose"]:
                     write_message("%s adding records #%d-#%d started" % \
                         (self.tablename, i_low, i_high))
                 if cfg_check_mysql_threads:
                     kill_sleepy_mysql_threads()
                 task_update_progress("%s adding recs %d-%d" % (self.tablename, i_low, i_high))
                 # first mark the words currently stored for the chunk as deleted,
                 # then put the freshly extracted words on top of them
 		self.del_recID_range(i_low, i_high)
                 just_processed = self.add_recID_range(i_low, i_high)
                 flush_count = flush_count + i_high - i_low + 1
                 chunksize_count = chunksize_count + i_high - i_low + 1
                 records_done = records_done + just_processed
                 if options["verbose"]:
                     write_message("%s adding records #%d-#%d ended  " % \
                         (self.tablename, i_low, i_high))
                 if chunksize_count >= chunksize:
                     chunksize_count = 0
                 # flush if necessary:
                 if flush_count >= options["flush"]:
                     self.put_into_db()
                     self.clean()
                     if options["verbose"]:
                         write_message("%s backing up" % (self.tablename))
                     flush_count = 0
                     self.log_progress(time_started,records_done,records_to_go)
                 # iterate:
                 i_low = i_high + 1
         # flush whatever is left over from the last, incomplete batch
         if flush_count > 0:
             self.put_into_db()
             self.log_progress(time_started,records_done,records_to_go)
 
     def add_date(self, date=""):
         """Index all records modified within a date interval.

         date -- (from, to) pair of modification dates; "to" may be empty for
             an open interval.  When date is not set, the interval starts at
             the last_updated time stored in rnkMETHOD for
             options["current_run"] ("0000-00-00" if that is empty).

         Returns the list of recID ranges that were handed to add_recIDs()
         (possibly empty), or None when the rank method is not found in
         rnkMETHOD.
         """
         if not date:
             write_message("Using the last update time for the rank method")
             res = run_sql("SELECT last_updated FROM rnkMETHOD WHERE name=%s",
                           (options["current_run"],))
 
             if not res:
                 return
             if not res[0][0]:
                 date = ("0000-00-00",'')
             else:
                 date = (res[0][0],'')
 
         # values are bound as parameters and every clause starts with a
         # space, so the statement does not depend on quoting or on how the
         # pieces happen to be glued together
         query = "SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s"
         params = [date[0]]
         if date[1]:
             query += " AND b.modification_date <= %s"
             params.append(date[1])
         query += " ORDER BY b.id ASC"
         res = run_sql(query, tuple(params))
 
         recID_ranges = create_range_list(res)
         if not recID_ranges:
             if options["verbose"]:
                 write_message( "No new records added. %s is up to date" % self.tablename)
         else:
             self.add_recIDs(recID_ranges)
         return recID_ranges
 
         
     def add_recID_range(self, recID1, recID2):
         """Extract the words of all records with ids between recID1 and
         recID2 (inclusive) and put them into memory.

         For every (tag, weight, lang) in self.fields_to_index the field
         values are read from the matching bibXXx tables and split into words
         by the function registered for the tag in tagToWordsFunctions
         (get_words_from_phrase otherwise).  Records not contained in
         options["validset"] are ignored; records flagged DELETED in 980__c
         get an empty word list.  The word lists are written to the reverse
         table as FUTURE rows, CURRENT rows are inserted as well, and every
         word is finally passed to self.put().

         Returns the number of records for which field values were found
         (0 if there were none).
         """
         empty_list_string = serialize_via_marshal([])
         wlist = {}       # recID -> {word: value} collected over all tags
         normalize = {}   # NOTE(review): never used in this method
 
         self.recIDs_in_mem.append([recID1,recID2])
         # secondly fetch all needed tags:
         
         for (tag, weight, lang) in self.fields_to_index:
 	    if tag in tagToWordsFunctions.keys():
                 get_words_function = tagToWordsFunctions[ tag ]
 	    else: get_words_function = get_words_from_phrase
             # the first two characters of the tag select the bibXXx table pair
             bibXXx = "bib" + tag[0] + tag[1] + "x"
             bibrec_bibXXx = "bibrec_" + bibXXx
             query = """SELECT bb.id_bibrec,b.value FROM %s AS b, %s AS bb
                     WHERE bb.id_bibrec BETWEEN %d AND %d
                     AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID1, recID2, tag)    
             res = run_sql(query)
 	    nb_total_to_read = len(res)   # NOTE(review): never used
             verbose_idx = 0     # for verbose pretty printing       
             for row in res:
 		recID, phrase = row 
                 if options["validset"].contains(recID):
                     if not wlist.has_key(recID): wlist[recID] = {}
                     new_words = get_words_function(phrase, weight, lang) # ,self.separators
                     wlist[recID] = dict_union(new_words,wlist[recID])
 
         # were there some words for these recIDs found?
         if len(wlist) == 0: return 0
         recIDs = wlist.keys()
         for recID in recIDs:
             # was this record marked as deleted?
             if "DELETED" in self.get_field(recID, "980__c"):
                 wlist[recID] = {}
                 if options["verbose"] >= 9:
                     write_message("... record %d was declared deleted, removing its word list" % recID)
             if options["verbose"] >= 9:
                 write_message("... record %d, termlist: %s" % (recID, wlist[recID]))
         # one multi-row INSERT storing the new word lists as FUTURE rows
         query_factory = cStringIO.StringIO()
         qwrite = query_factory.write
         qwrite( "INSERT INTO %sR (id_bibrec,termlist,type) VALUES" % self.tablename[:-1])
         qwrite( "('" )
         qwrite( str(recIDs[0]) )
         qwrite( "','" )
         qwrite( serialize_via_marshal(wlist[recIDs[0]]) )
         qwrite( "','FUTURE')" )
         for recID in recIDs[1:]:
             qwrite(",('")
             qwrite(str(recID))
             qwrite("','")
             qwrite(serialize_via_marshal(wlist[recID]))
             qwrite("','FUTURE')")
 
         query = query_factory.getvalue()
         query_factory.close()
         run_sql(query)
 
         # second multi-row INSERT creating CURRENT rows.
         # NOTE(review): the first record gets its real word list here while
         # all the others get the empty list -- confirm this asymmetry is
         # intended.
         query_factory = cStringIO.StringIO()
         qwrite = query_factory.write
         qwrite("INSERT INTO %sR (id_bibrec,termlist,type) VALUES" % self.tablename[:-1])
         qwrite("('")
         qwrite(str(recIDs[0]))
         qwrite("','")
         qwrite(serialize_via_marshal(wlist[recIDs[0]]))
         qwrite("','CURRENT')")
         for recID in recIDs[1:]:
             qwrite( ",('" )
             qwrite( str(recID) )
             qwrite( "','" )
             qwrite( empty_list_string )
             qwrite( "','CURRENT')" )
         query = query_factory.getvalue()
         query_factory.close()
 
         # a database error on this insert is deliberately ignored
         # (presumably because CURRENT rows may exist already -- TODO confirm)
         try:
             run_sql(query)
         except MySQLdb.DatabaseError:
             pass
         
         # finally register every word of every record in memory
         put = self.put
         for recID in recIDs:
             for (w, count) in wlist[recID].iteritems():
                 put(recID, w, count)
         
         return len(recIDs)
                 
     def log_progress(self, start, done, todo):
         """Report indexing speed and the estimated remaining runtime.

         start -- time the run started, as returned by time.time()
         done -- number of records processed so far
         todo -- total number of records to process

         Nothing is reported when no time has elapsed or when done exceeds
         todo; messages are only written when options["verbose"] is set.
         """
         elapsed = time.time() - start
         # consistency check
         if not (elapsed != 0 and done <= todo):
             return
 
         recs_per_min = done/(elapsed/60.0)
         verbose = options["verbose"]
         if verbose:
             write_message("%d records took %.1f seconds to complete.(%1.f recs/min)"\
                 % (done, elapsed, recs_per_min))
 
         # an estimate is only possible once at least one record is done
         if recs_per_min and verbose:
             write_message("Estimated runtime: %.1f minutes" % \
                 ((todo-done)/recs_per_min))
 
     def put(self, recID, word, sign):
         """Register in memory that word occurs in (or is to be removed from)
         record recID.

         recID -- record identifier
         word -- the term; it is truncated to 50 characters and lowercased
         sign -- value stored for the (word, recID) pair; callers in this
             file pass tuples such as (-1, 0) to mark a deletion

         Errors are reported through write_message() and otherwise ignored.
         """
         try:
             word = lower(word[:50])
             if self.value.has_key(word):
                 # the word 'word' exist already: update sign
                 self.value[word][recID] = sign
             else:
                 self.value[word] = {recID: sign}
         except StandardError:
             # sign is formatted with %s: it is usually a tuple, for which %d
             # would raise from inside this handler and hide the real error
             write_message("Error: Cannot put word %s with sign %s for recID %s." % (word, sign, recID))
 
 
     def del_recIDs(self, recIDs):
         """Mark the words of all records in the recIDs range list as deleted
         and flush the result to the database.  The recIDs range list is of
         the form:
         [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]].
         """
         for recID_range in recIDs:
             self.del_recID_range(recID_range[0], recID_range[1])
         self.put_into_db()
         
     def del_recID_range(self, low, high):
         """Mark in memory, as deleted, every word stored in the reverse
         index table for the records with ids between low and high
         (inclusive).  Each such word is put with the value (-1, 0)."""
         chatty = options["verbose"] > 2
         if chatty:
             write_message("%s fetching existing words for records #%d-#%d started" % \
                 (self.tablename, low, high))
         self.recIDs_in_mem.append([low,high])
         query = """SELECT id_bibrec,termlist FROM %sR as bb WHERE bb.id_bibrec
         BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high)
         for (recID, termlist) in run_sql(query):
             for word in deserialize_via_marshal(termlist):
                 self.put(recID, word, (-1, 0))
         if chatty:
             write_message("%s fetching existing words for records #%d-#%d ended" % \
                 (self.tablename, low, high))
 
     def report_on_table_consistency(self):
         """Report the size and the state of the word index tables.

         Counts the words in the forward table and the records in the
         reverse table (e.g. rnkWORD01R), then counts the records having
         reverse rows whose type is not 'CURRENT'.

         Returns the number of such bad records (999999999 when the count
         could not be obtained).
         """
         reverse_name = self.tablename[:-1]
 
         # find number of words:
         nb_words = 0
         res = run_sql("""SELECT COUNT(*) FROM %s""" % (self.tablename), None, 1)
         if res:
             nb_words = res[0][0]
 
         # find number of records:
         nb_records = 0
         res = run_sql("""SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR""" % (reverse_name), None, 1)
         if res:
             nb_records = res[0][0]
 
         # report stats:
         if options["verbose"]:
             write_message("%s contains %d words from %d records" % (self.tablename, nb_words, nb_records))
 
         # find possible bad states in reverse tables:
         nb_bad_records = 999999999
         res = run_sql("""SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR WHERE type <> 'CURRENT'""" % (reverse_name))
         if res:
             nb_bad_records = res[0][0]
 
         if not nb_bad_records:
             if options["verbose"]:
                 write_message("%s is in consistent state" % (self.tablename))
         else:
             write_message("EMERGENCY: %s needs to repair %d of %d records" % \
                 (self.tablename, nb_bad_records, nb_records))
 
         return nb_bad_records
 
     def repair(self):
         """Repair the whole table.

         Looks up all records having reverse-table rows whose type is not
         'CURRENT', groups their ids into ranges and runs fix_recID_range()
         on them chunk by chunk.  Chunks are bounded by options["flush"] and
         by the module-level chunksize.  Memory is flushed with
         put_into_db("emergency") every options["flush"] records and once
         more at the end.  If fix_recID_range() raises StandardError the task
         status is set to ERROR and the process exits with status 1.
         Returns None; nothing is done when there are no bad records.
         """
         # find possible bad states in reverse tables:
         query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR WHERE type <> 'CURRENT'""" % (self.tablename[:-1])
         res = run_sql(query, None, 1)
         if res:
             nb_bad_records = res[0][0]
         else: 
             nb_bad_records = 0
       
         # find number of records:
         # NOTE(review): nb_records is not used further down in this method
         query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR""" % (self.tablename[:-1])
         res = run_sql(query)
         if res:
             nb_records = res[0][0]
         else:
             nb_records = 0
 
         if nb_bad_records == 0:
             return
         query = """SELECT id_bibrec FROM %sR WHERE type <> 'CURRENT' ORDER BY id_bibrec""" \
                 % (self.tablename[:-1])
         res = run_sql(query)
         recIDs = create_range_list(res)
 
         flush_count = 0      # records handled since the last flush
         records_done = 0     # records handled so far
         records_to_go = 0    # total number of recIDs covered by all ranges
 
         for range in recIDs:
             records_to_go = records_to_go + range[1] - range[0] + 1
 
         time_started = time.time() # will measure profile time
         for range in recIDs:
             i_low = range[0]
             chunksize_count = 0
             while i_low <= range[1]:
                 # calculate chunk group of recIDs and treat it:
                 i_high = min(i_low+options["flush"]-flush_count-1,range[1])
                 i_high = min(i_low+chunksize-chunksize_count-1, i_high)
                 try:
                     self.fix_recID_range(i_low, i_high)
                 except StandardError, e:
                     # the chunk could not be repaired: abort the task
                     write_message("Exception caught: %s" % e, sys.stderr)
                     if options["verbose"] >= 9:        
                         traceback.print_tb(sys.exc_info()[2])
                     task_update_status("ERROR")
                     task_sig_stop_commands()
                     sys.exit(1)
 
                 flush_count = flush_count + i_high - i_low + 1
                 chunksize_count = chunksize_count + i_high - i_low + 1
                 records_done = records_done + i_high - i_low + 1
                 if chunksize_count >= chunksize:
                     chunksize_count = 0
                 # flush if necessary:
                 if flush_count >= options["flush"]:
                     self.put_into_db("emergency")
                     self.clean()
                     flush_count = 0
                     self.log_progress(time_started,records_done,records_to_go)
                 # iterate:
                 i_low = i_high + 1
         # flush whatever is left over from the last, incomplete batch
         if flush_count > 0:
             self.put_into_db("emergency")
             self.log_progress(time_started,records_done,records_to_go)
         write_message("%s inconsistencies repaired." % self.tablename)
 
     def chk_recID_range(self, low, high):
         """Verify that the reverse index table holds only 'CURRENT' rows for
         the records with ids between low and high (inclusive).

         Returns None when the range is consistent; otherwise writes
         emergency messages and raises StandardError.
         """
         ## check db
         query = """SELECT COUNT(*) FROM %sR WHERE type <> 'CURRENT'
         AND id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high)
         nb_bad_rows = run_sql(query, None, 1)[0][0]
         if nb_bad_rows != 0:
             ## inconsistency detected!
             write_message("EMERGENCY: %s inconsistencies detected..." % self.tablename)
             write_message("""EMERGENCY: Errors found. You should check consistency of the %s - %sR tables.\nRunning 'bibindex --repair' is recommended.""" \
                 % (self.tablename, self.tablename[:-1]))
             raise StandardError
 
         # okay, words table is consistent
         if options["verbose"]:
             write_message("%s for %d-%d is in consistent state"%(self.tablename,low,high))
         return
 
     def fix_recID_range(self, low, high):
         """Try to fix reverse index database consistency (e.g. table rnkWORD01R) in the low,high doc-id range.

         Possible states for a recID follow:
         CUR TMP FUT: very bad things have happened: warn!
         CUR TMP    : very bad things have happened: warn!
         CUR     FUT: delete FUT (crash before flushing)
         CUR        : database is ok
             TMP FUT: add TMP to memory and del FUT from memory
                      flush (revert to old state)
             TMP    : very bad things have happened: warn!
                 FUT: very bad things have happened: warn!

         Raises StandardError when at least one record could not be repaired.

         NOTE(review): for "CUR FUT" the DELETE below has no type condition,
         so it removes every reverse row of the record, not only the FUTURE
         one -- confirm this is intended.
         """
 
         # recID -> list of row types ('CURRENT', 'TEMPORARY', 'FUTURE') found
         state = {}
         query = "SELECT id_bibrec,type FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d'"\
                 % (self.tablename[:-1], low, high)
         res = run_sql(query)
         for row in res:
             if not state.has_key(row[0]):
                 state[row[0]]=[]
             state[row[0]].append(row[1])
 
         ok = 1 # will hold info on whether we will be able to repair
         for recID in state.keys():
             if not 'TEMPORARY' in state[recID]:
                 if 'FUTURE' in state[recID]:
                     if 'CURRENT' not in state[recID]:
                         # FUT only
                         write_message("EMERGENCY: Record %d is in inconsistent state. Can't repair it" % recID)
                         ok = 0
                     else:
                         # CUR FUT
                         write_message("EMERGENCY: Inconsistency in record %d detected" % recID)
                         query = """DELETE FROM %sR
                         WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID)
                         run_sql(query)
                         write_message("EMERGENCY: Inconsistency in record %d repaired." % recID)
             else:
                 if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]:
                     # TMP FUT: reload the stored word lists into memory
                     self.recIDs_in_mem.append([recID,recID])
                     # Get the words file
                     query = """SELECT type,termlist FROM %sR
                     WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID)
                     if options["verbose"] >= 9:
                         write_message(query)
                     res = run_sql(query)
                     for row in res:
                         wlist = deserialize_via_marshal(row[1])
                         if options["verbose"] >= 9:
                             write_message("Words are %s " % wlist)
                         # NOTE(review): sign is computed but never used; the
                         # put() below stores wlist[word] for TEMPORARY and
                         # FUTURE rows alike, which does not obviously match
                         # "add TMP, del FUT" from the docstring -- confirm.
                         if row[0] == 'TEMPORARY':
                             sign = 1
                         else:
                             sign = -1
                         for word in wlist:
                             self.put(recID, word, wlist[word])
 
                 else:
                     # TMP together with CUR, or TMP alone
                     write_message("EMERGENCY: %s for %d is in inconsistent state. Couldn't repair it." % (self.tablename, recID))
                     ok = 0
 
         if not ok:
             write_message("""EMERGENCY: Unrepairable errors found. You should check consistency
                 of the %s - %sR tables. Deleting affected records is
                 recommended.""" % (self.tablename, self.tablename[:-1]))
             raise StandardError
                        
 def word_index(row, run):
     """Run the indexing task.  The row argument is the BibSched task
     queue row, containing if, arguments, etc.
     Return 1 in case of success and 0 in case of failure.
     """
 
     ## import optional modules:
     try:
         import psyco
         psyco.bind(get_words_from_phrase)
         psyco.bind(WordTable.merge_with_old_recIDs)
         psyco.bind(serialize_via_numeric_array)
         psyco.bind(serialize_via_marshal)
         psyco.bind(deserialize_via_numeric_array)
         psyco.bind(deserialize_via_marshal)
         psyco.bind(update_rnkWORD)
         psyco.bind(check_rnkWORD)
     except StandardError,e:
         print "Warning: Psyco", e
         pass
 
     global options, task_id, wordTables, languages 
       
     # read from SQL row:
     task_id = row[0]
     task_proc = row[1]
     options = marshal.loads(row[6])
 
     # install signal handlers
     signal.signal(signal.SIGUSR1, task_sig_sleep)
     signal.signal(signal.SIGTERM, task_sig_stop)
     signal.signal(signal.SIGABRT, task_sig_suicide)
     signal.signal(signal.SIGCONT, task_sig_wakeup)
     signal.signal(signal.SIGINT, task_sig_unknown)
     ## go ahead and treat each table:
 
     options["run"] = []
     options["run"].append(run)
     for rank_method_code in options["run"]:
         method_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         write_message("Running rank method: %s" % getName(rank_method_code))
         try:
             file = etcdir + "/bibrank/" + rank_method_code + ".cfg"
             config = ConfigParser.ConfigParser()
             config.readfp(open(file))
         except StandardError, e:
             write_message("Cannot find configurationfile: %s" % file, sys.stderr)
             raise StandardError
 
         options["current_run"] = rank_method_code
         options["modified_words"] = {}
         options["table"] = config.get(config.get("rank_method", "function"), "table")
         options["use_stemming"] = config.get(config.get("rank_method","function"),"stemming")
         options["remove_stopword"] = config.get(config.get("rank_method","function"),"stopword")
         tags = get_tags(config) #get the tags to include
         options["validset"] = get_valid_range(rank_method_code) #get the records from the collections the method is enabled for
         function = config.get("rank_method","function")
         wordTable = WordTable(options["table"], tags)
         wordTable.report_on_table_consistency()
         try:
             if options["cmd"] == "del":
                 if options["id"]:
                     wordTable.del_recIDs(options["id"])
                 elif options["collection"]:
                     l_of_colls = string.split(options["collection"], ",")
                     recIDs = perform_request_search(c=l_of_colls)
                     recIDs_range = []
                     for recID in recIDs:
                         recIDs_range.append([recID,recID])
                     wordTable.del_recIDs(recIDs_range)
                 else:
                     write_message("Missing IDs of records to delete from index %s.", wordTable.tablename,
                                   sys.stderr)
                     raise StandardError
             elif options["cmd"] == "add":
                 if options["id"]:
                     wordTable.add_recIDs(options["id"])
                 elif options["collection"]:
                     l_of_colls = string.split(options["collection"], ",")
                     recIDs = perform_request_search(c=l_of_colls)
                     recIDs_range = []
                     for recID in recIDs:
                         recIDs_range.append([recID,recID])
                     wordTable.add_recIDs(recIDs_range)
                 elif options["last_updated"]:
                     wordTable.add_date("")
                     wordTable.update_last_updated(rank_method_code, method_starting_time)
                 else:
                     wordTable.add_recIDs([[0,cfg_max_recID]])
                     #wordTable.add_date(options["modified"])
                     # only update last_updated if run via automatic mode:
             elif options["cmd"] == "repair":
                 wordTable.repair()
                 check_rnkWORD(options["table"])
             elif options["cmd"] == "check":
                 check_rnkWORD(options["table"])
                 options["modified_words"] = {}
             elif options["cmd"] == "stat":
                 rank_method_code_statistics(options["table"])
             else:
                 write_message("Invalid command found processing %s" % \
                      wordTable.tablename, sys.stderr)
                 raise StandardError
             update_rnkWORD(options["table"], options["modified_words"])
         except StandardError, e:
             write_message("Exception caught: %s" % e, sys.stderr)
             if options["verbose"] >= 9:        
                 traceback.print_tb(sys.exc_info()[2])
             sys.exit(1)
         wordTable.report_on_table_consistency()
     # We are done. State it in the database, close and quit
 
     return 1
        
 def get_tags(config):
     """Read the tags to index from the rank method configuration.

     The options tag1, tag2, ... of the section named by the "function"
     option of [rank_method] are read until the first missing number.  Each
     value is a comma separated list; the result holds one list per tag in
     which element 1 is converted to int and element 2 (the language) is
     stripped.  Warnings are written when stemming is configured but the
     stemmer does not work or does not know the language.
     """
     section = config.get("rank_method","function")
     tags = []
     stemmer_warning_shown = 0
 
     number = 1
     option = "tag%s" % number
     while config.has_option(section, option):
         tag = config.get(section, option).split(",")
         tag[1] = int(tag[1].strip())
         tag[2] = tag[2].strip()
         language = tag[2]
 
         #check if stemmer for language is available
         if config.get(section,"stemming") and stem_by_lang("information", "en") != "inform":
             # the stemmer itself is unusable: say so once only
             if stemmer_warning_shown == 0:
                 write_message("Warning: PyStemmer not found. Please read INSTALL.")
                 stemmer_warning_shown = 1
         elif language and language != "none" and config.get(section,"stemming") and not lang_available(language):
             write_message("Warning: Language '%s' not available in PyStemmer." % language)
 
         tags.append(tag)
         number += 1
         option = "tag%s" % number
 
     return tags
 
 def get_valid_range(rank_method_code):
     """Return the set of records that are valid for this rank method.

     Every recID from 0 to cfg_max_recID is currently reported as valid and
     rank_method_code is not consulted.  Restricting the set to the
     collections the method is enabled for (collection_rnkMETHOD joined
     with rnkMETHOD, then perform_request_search on those collections) is
     switched off.
     """
     all_recIDs = Numeric.ones(cfg_max_recID+1, Numeric.Int0)
     return HitSet(all_recIDs)
 
 def write_message(msg, stream=sys.stdout):
     """Write msg, prefixed with the current local time, to stream and flush
     it.  Only sys.stdout and sys.stderr are accepted; for any other stream
     a complaint is written to sys.stderr and msg is dropped."""
     if not (stream == sys.stdout or stream == sys.stderr):
         sys.stderr.write("Unknown stream %s.  [must be sys.stdout or sys.stderr]\n" % stream)
         return
     timestamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
     stream.write(timestamp)
     stream.write("%s\n" % msg)
     stream.flush()
 
 def check_term(term, termlength):
     """Decide whether term may be used as an index term.

     Returns False when term is not longer than termlength, when it contains
     a digit or one of the listed punctuation characters, or when it is a
     non-zero number once "-", "." and "," are removed.  Returns True
     otherwise, including when one of the checks raises StandardError.
     """
     try:
         if not len(term) > termlength:
             return False
         forbidden = re.compile(r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]")
         if forbidden.search(term):
             return False
         for separator in ("-", ".", ","):
             term = str.replace(term, separator, "")
         if int(term):
             return False
     except StandardError:
         # a failing check does not disqualify the term
         pass
     return True
 
 def check_rnkWORD(table):
     """Check the rnkWORD tables for missing rank values.

     table -- name of the forward index table; the reverse table name is
         derived by replacing its last character with "R".

     A forward hitlist is faulty when it has no "Gi" entry or when the
     second element of that entry is 0.  A reverse termlist entry is faulty
     when its second element is 0.  Each faulty term is reported once, and
     the dictionary of faulty terms is stored in options["modified_words"].
     """
     errors = {}
     termslist = run_sql("SELECT term FROM %s" % table)
     N = run_sql("select max(id_bibrec) from %sR" % table[:-1])[0][0]
     if N is None:
         # empty reverse table: nothing to walk through below
         N = 0
     write_message("Checking integrity of rank values in %s" % table) 
     terms = map(lambda x: x[0], termslist)
 
     i = 0
     while i < len(terms):
         # terms are escaped before being quoted, so a term containing a
         # quote or a backslash cannot break the statement
         quoted = ["'%s'" % MySQLdb.escape_string(term) for term in terms[i:i+5000]]
         terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term in (%s)" % (table, ",".join(quoted)))
         for (t, hitlist) in terms_docs: 
             term_docs = deserialize_via_marshal(hitlist)
             if not term_docs.has_key("Gi") or term_docs["Gi"][1] == 0:
                 write_message("ERROR: Missing value for term: %s (%s) in %s: %s" % (t, repr(t), table, len(term_docs)))
                 errors[t] = 1
         i += 5000
     write_message("Checking integrity of rank values in %sR" % table[:-1]) 
     i = 0
     while i < N:
         docs_terms = run_sql("SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec>=%s and id_bibrec<=%s" % (table[:-1], i, i+5000))
         for (j, termlist) in docs_terms:
             termlist = deserialize_via_marshal(termlist)
             for (t, tf) in termlist.iteritems():
                 if tf[1] == 0 and not errors.has_key(t):
                     # the forward hitlist is not fetched again here: its
                     # result was never used, and indexing it failed for
                     # terms missing from the forward table
                     errors[t] = 1
                     write_message("ERROR: Gi missing for record %s and term: %s (%s) in %s" % (j,t,repr(t), table))
         i += 5000
 
     if len(errors) == 0:
         write_message("No direct errors found, but nonconsistent data may exist.")
     else:
         write_message("%s errors found during integrity check, repair and rebalancing recommended." % len(errors))
     options["modified_words"] = errors
 
 def rank_method_code_statistics(table):
     """Show some statistics about the terms stored in the forward index.

     table -- name of the forward index table.

     Writes up to 20 lines, each showing a hitlist size together with the
     number of terms having that size, the term with the next highest "Gi"
     value and the term with the next lowest "Gi" value.
     """
 
     maxID = run_sql("select max(id) from %s" % table)
     maxID = maxID[0][0]
     if maxID is None:
         # empty table: skip the scan below
         maxID = -1
     terms = {}
     Gi = {}
 
     write_message("Showing statistics of terms in index:")
     write_message("Important: For the 'Least used terms', the number of terms is shown first, and the number of occurences second.") 
     write_message("Least used terms---Most important terms---Least important terms")
     i = 0
     # "<=" so that the row with id == maxID is still read when maxID is an
     # exact multiple of the step
     while i <= maxID:
         terms_docs=run_sql("SELECT term, hitlist FROM %s WHERE id>= %s and id < %s" % (table, i, i + 10000))
         for (t, hitlist) in terms_docs:
             term_docs=deserialize_via_marshal(hitlist)
             terms[len(term_docs)] = terms.get(len(term_docs), 0) + 1
             if term_docs.has_key("Gi"):
                 Gi[t] = term_docs["Gi"]
         i=i + 10000
     terms=terms.items()
     terms.sort(lambda x, y: cmp(y[1], x[1]))
     Gi=Gi.items()
     Gi.sort(lambda x, y: cmp(y[1], x[1]))
     # small indexes may hold fewer than 20 entries: show what is there
     for i in range(0, min(20, len(terms), len(Gi))):
         write_message("%s/%s---%s---%s" % (terms[i][0],terms[i][1], Gi[i][0],Gi[len(Gi) - i - 1][0]))
 
 def _term_importance(term_docs, N):
     """Returns the Gi value of one term.
     term_docs - hitlist of the term, {record: (term frequency, ...)}, without any "Gi" entry
     N - number of records in the reverse index
     The result is 1 plus the sum, over the records whose frequency differs
     from the total frequency Fi, of (tf/Fi) * log2(tf/Fi) / ln(N)."""
     Fi = 0
     for (j, tf) in term_docs.iteritems():
         Fi += tf[0]
     importance = 1
     for (j, tf) in term_docs.iteritems():
         # A term found in a single record has tf == Fi and keeps the value 1,
         # which also avoids evaluating math.log(N) in that case.
         if tf[0] != Fi:
             importance = importance + ((float(tf[0]) / Fi) * math.log(float(tf[0]) / Fi) / math.log(2)) / math.log(N)
     return importance
 
 def _stored_weight(value):
     """Returns the integer under which a Gi value is written to the index:
     floor(value * 100), shifted up by one when it is not negative, so that
     this encoding never produces 0."""
     scaled = int(math.floor(value * 100))
     if scaled >= 0:
         scaled += 1
     return scaled
 
 def update_rnkWORD(table, terms):
     """Updates rnkWORDF and rnkWORDR with Gi and Nj values. For each term in rnkWORDF, a Gi value for the term is added. And for each term in each document, the Nj value for that document is added. In rnkWORDR, the Gi value for each term in each document is added. For description on how things are computed, look in the hacking docs.
     The work is done in five phases, each handling 5000 terms or records per query:
     1 - find the records containing the modified terms
     2 - find all terms of these records
     3 - compute (or, when options["quick"] is not "no", reuse) the Gi value of these terms
     4 - compute the Nj value of each record and store the Gi values in the reverse index
     5 - store the Gi and Nj values in the forward index
     table - name of forward index to update
     terms - modified terms (a dictionary, only its keys are used)
     Returns "" when there are no terms to process, nothing otherwise."""
     stime = time.time()
     write_message("Beginning post-processing of %s terms" % len(terms))
     if len(terms) == 0:
         write_message("No terms to process, ending...")
         return ""
     step = 5000
     quick = options["quick"]
     Gi = {}
     Nj = {}
     N = run_sql("select count(id_bibrec) from %sR" % table[:-1])[0][0]
 
     #Locating all documents related to the modified/new/deleted terms, if fast update,
     #only take into account new/modified occurences
     write_message("Phase 1: Finding records containing modified terms")
     terms = terms.keys()
     i = 0
     while i < len(terms):
         for (t, hitlist) in get_from_forward_index(terms, i, i + step, table):
             term_docs = deserialize_via_marshal(hitlist)
             if "Gi" in term_docs:
                 del term_docs["Gi"]
             for (j, tf) in term_docs.iteritems():
                 # In quick mode only occurrences without a normalization value count.
                 if quick == "no" or (quick == "yes" and tf[1] == 0):
                     Nj[j] = 0
         write_message("Phase 1: ......processed %s/%s terms" % (min(i + step, len(terms)), len(terms)))
         i += step
     write_message("Phase 1: Finished finding records containing modified terms")
 
     #Find all terms in the records found in last phase
     write_message("Phase 2: Finding all terms in affected records")
     records = Nj.keys()
     i = 0
     while i < len(records):
         for (j, termlist) in get_from_reverse_index(records, i, i + step, table):
             doc_terms = deserialize_via_marshal(termlist)
             for t in doc_terms:
                 Gi[t] = 0
         write_message("Phase 2: ......processed %s/%s records " % (min(i + step, len(records)), len(records)))
         i += step
     write_message("Phase 2: Finished finding all terms in affected records")
 
     # Gi was only used as a set of terms so far; from here on it maps term -> Gi value.
     terms = Gi.keys()
     Gi = {}
     i = 0
     if quick == "no":
         #Calculating Fi and Gi value for each term
         write_message("Phase 3: Calculating importance of all affected terms")
         while i < len(terms):
             for (t, hitlist) in get_from_forward_index(terms, i, i + step, table):
                 term_docs = deserialize_via_marshal(hitlist)
                 if "Gi" in term_docs:
                     del term_docs["Gi"]
                 Gi[t] = _term_importance(term_docs, N)
             write_message("Phase 3: ......processed %s/%s terms" % (min(i + step, len(terms)), len(terms)))
             i += step
         write_message("Phase 3: Finished calculating importance of all affected terms")
     else:
         #Using existing Gi value instead of calculating a new one. Missing some accurancy.
         write_message("Phase 3: Getting approximate importance of all affected terms")
         while i < len(terms):
             for (t, hitlist) in get_from_forward_index(terms, i, i + step, table):
                 term_docs = deserialize_via_marshal(hitlist)
                 if "Gi" in term_docs:
                     # NOTE(review): this is the integer written by Phase 5
                     # (scaled by 100 and shifted), while the other branches
                     # produce the unscaled value; Phases 4 and 5 scale it
                     # again. Confirm that this is intended.
                     Gi[t] = term_docs["Gi"][1]
                 elif len(term_docs) == 1:
                     Gi[t] = 1
                 else:
                     Gi[t] = _term_importance(term_docs, N)
             write_message("Phase 3: ......processed %s/%s terms" % (min(i + step, len(terms)), len(terms)))
             i += step
         write_message("Phase 3: Finished getting approximate importance of all affected terms")
 
     write_message("Phase 4: Calculating normalization value for all affected records and updating %sR" % table[:-1])
     records = Nj.keys()
     i = 0
     while i < len(records):
         #Calculating the normalization value for each document, and adding the Gi value to each term in each document.
         for (j, termlist) in get_from_reverse_index(records, i, i + step, table):
             doc_terms = deserialize_via_marshal(termlist)
             for (t, tf) in doc_terms.iteritems():
                 if t in Gi:
                     Nj[j] = Nj.get(j, 0) + math.pow(Gi[t] * (1 + math.log(tf[0])), 2)
                     # Only the value of an existing key changes, so this is safe while iterating.
                     doc_terms[t] = (tf[0], _stored_weight(Gi[t]))
                 else:
                     # No fresh Gi for this term: fall back on the value already stored with it.
                     Nj[j] = Nj.get(j, 0) + math.pow(tf[1] * (1 + math.log(tf[0])), 2)
             # NOTE(review): raises ZeroDivisionError when the sum is 0, e.g. for an empty termlist.
             Nj[j] = 1.0 / math.sqrt(Nj[j])
             Nj[j] = int(Nj[j] * 100)
             if Nj[j] >= 0:
                 Nj[j] += 1
             run_sql("UPDATE %sR SET termlist='%s' WHERE id_bibrec=%s" % (table[:-1], serialize_via_marshal(doc_terms), j))
         write_message("Phase 4: ......processed %s/%s records" % (min(i + step, len(records)), len(records)))
         i += step
     write_message("Phase 4: Finished calculating normalization value for all affected records and updating %sR" % table[:-1])
 
     write_message("Phase 5: Updating %s with new normalization values" % table)
     i = 0
     terms = Gi.keys()
     while i < len(terms):
         #Adding the Gi value to each term, and adding the normalization value to each term in each document.
         for (t, hitlist) in get_from_forward_index(terms, i, i + step, table):
             term_docs = deserialize_via_marshal(hitlist)
             if "Gi" in term_docs:
                 del term_docs["Gi"]
             for (j, tf) in term_docs.iteritems():
                 if j in Nj:
                     term_docs[j] = (tf[0], Nj[j])
             term_docs["Gi"] = (0, _stored_weight(Gi[t]))
             run_sql("UPDATE %s SET hitlist='%s' WHERE term='%s'" % (table, serialize_via_marshal(term_docs), MySQLdb.escape_string(t)))
         write_message("Phase 5: ......processed %s/%s terms" % (min(i + step, len(terms)), len(terms)))
         i += step
     write_message("Phase 5:  Finished updating %s with new normalization values" % table)
     write_message("Time used for post-processing: %.1fmin" % ((time.time() - stime) / 60))
     write_message("Finished post-processing")
     
 
 def get_from_forward_index(terms, start, stop, table):
     """Fetches the hitlists of terms[start:stop] from the forward index.
     terms - list of terms
     start - position of the first term to fetch
     stop - position just after the last term to fetch, may exceed len(terms)
     table - name of the forward index
     Returns the (term, hitlist) rows given by run_sql(), or an empty tuple
     when the requested slice holds no term."""
     wanted = terms[start:stop]
     if not wanted:
         # "IN ()" is not valid SQL and nothing could match anyway.
         return ()
     # The terms are handed over as query parameters, so that quotes or
     # backslashes inside a term cannot alter the statement.
     placeholders = ",".join(["%s"] * len(wanted))
     query = "SELECT term, hitlist FROM %s WHERE term IN (%s)" % (table, placeholders)
     return run_sql(query, tuple(wanted))
 
 def get_from_reverse_index(records, start, stop, table):
     """Fetches the termlists of records[start:stop] from the reverse index.
     records - list of record ids
     start - position of the first record to fetch
     stop - position just after the last record to fetch, may exceed len(records)
     table - name of the forward index; the reverse index name is derived
             from it by replacing its last character with "R"
     Returns the (id_bibrec, termlist) rows given by run_sql(), or an empty
     tuple when the requested slice holds no record."""
     wanted = records[start:stop]
     if not wanted:
         # "IN ()" is not valid SQL and nothing could match anyway.
         return ()
     # Passing the ids as query parameters avoids relying on the textual
     # form of a Python list, which breaks on long integers ("1L").
     placeholders = ",".join(["%s"] * len(wanted))
     query = "SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec IN (%s)" % (table[:-1], placeholders)
     return run_sql(query, tuple(wanted))
 
 def test_word_separators(phrase="hep-th/0101001"):
     """Prints the phrase, then one line per entry of the dictionary that
     get_words_from_phrase() returns for it (key and value)."""
     print("%s:" % phrase)
     # Each item is a (word, count) pair, which fills both placeholders.
     for word_and_count in get_words_from_phrase(phrase).iteritems():
         print("\t-> %s - %s" % word_and_count)
 
 def task_sig_sleep(sig, frame):
     """Handler of the 'sleep' signal sent by BibSched: records the
     SLEEPING status, then suspends the process with signal.pause().
     sig - number of the received signal
     frame - current stack frame (unused)"""
     debugging = options["verbose"] >= 9
     if debugging:
         write_message("got signal %d" % sig)
     write_message("sleeping...")
     task_update_status("SLEEPING")
     # Returns only once another signal, e.g. the wake-up one, has been handled.
     signal.pause()
 
 def task_sig_wakeup(sig, frame):
     """Handler of the 'wakeup' signal sent by BibSched: records the
     CONTINUING status.
     sig - number of the received signal
     frame - current stack frame (unused)"""
     debugging = options["verbose"] >= 9
     if debugging:
         write_message("got signal %d" % sig)
     write_message("continuing...")
     task_update_status("CONTINUING")
 
 def task_sig_stop(sig, frame):
     """Signal handler for the 'stop' signal sent by BibSched."""
     if options["verbose"] >= 9:
         write_message("got signal %d" % sig)
     write_message("stopping...")
     task_update_status("STOPPING")
     errcode = 0
     try:
         task_sig_stop_commands()
         write_message("stopped")
         task_update_status("STOPPED")
     except StandardError, err:
         write_message("Error during stopping! %e" % err)
         task_update_status("STOPPINGFAILED")
         errcode = 1
     sys.exit(errcode)
 
 def task_sig_stop_commands():
     """Runs what has to be done before the task quits: calls
     put_into_db() on every entry of wordTables.
     Used by the task_sig_stop() handler."""
     write_message("stopping commands started")
     for word_table in wordTables:
         word_table.put_into_db()
     write_message("stopping commands ended")
     
 def task_sig_suicide(sig, frame):
     """Handler of the 'suicide' signal sent by BibSched: records the
     SUICIDING then SUICIDED status and leaves the process with exit code 0.
     sig - number of the received signal
     frame - current stack frame (unused)"""
     debugging = options["verbose"] >= 9
     if debugging:
         write_message("got signal %d" % sig)
     write_message("suiciding myself now...")
     task_update_status("SUICIDING")
     write_message("suicided")
     task_update_status("SUICIDED")
     sys.exit(0)
 
 def task_sig_unknown(sig, frame):
     """Handler of any other signal sent by the shell or the user: the
     signal is reported and otherwise ignored.
     sig - number of the received signal
     frame - current stack frame (unused)"""
     debugging = options["verbose"] >= 9
     if debugging:
         write_message("got signal %d" % sig)
     # Nothing else is done for such signals.
     write_message("unknown signal %d ignored" % sig)
 
 def task_update_progress(msg):
     """Stores msg as the progress of the current task (task_id) in the
     BibSched task table and returns the result of run_sql()."""
     # task_id and options are module globals that are only read here.
     if options["verbose"] >= 9:
         write_message("Updating task progress to %s." % msg)
     query = "UPDATE schTASK SET progress=%s where id=%s"
     return run_sql(query, (msg, task_id))
 
 def task_update_status(val):
     """Stores val as the status of the current task (task_id) in the
     BibSched task table and returns the result of run_sql()."""
     # task_id and options are module globals that are only read here.
     if options["verbose"] >= 9:
         write_message("Updating task status to %s." % val)
     query = "UPDATE schTASK SET status=%s where id=%s"
     return run_sql(query, (val, task_id))
 
 def getName(methname, ln=cdslang, type='ln'):
     """Returns the name of the rank method, either in default language or given language.
     methname = short name of the method
     ln - the language to get the name in
     type - which name "type" to get.
     The name is looked up for ln first, then for the default language
     (cdslang); methname itself is returned when neither has a name.
     Raises Exception, after reporting through write_message(), when the
     method is unknown or when any of the lookups fails."""
     try:
         # Values are passed as query parameters instead of being pasted
         # into the statement, as methname and ln may come from user input.
         rnkid = run_sql("SELECT id FROM rnkMETHOD where name=%s", (methname,))
         if not rnkid:
             raise Exception
         rnkid = rnkid[0][0]
         query = "SELECT value FROM rnkMETHODNAME where type=%s and ln=%s and id_rnkMETHOD=%s"
         res = run_sql(query, (type, ln, rnkid))
         if not res:
             # No name in the requested language: try the default one.
             res = run_sql(query, (type, cdslang, rnkid))
         if not res:
             return methname
         return res[0][0]
     except Exception:
         write_message("Cannot run rank method, either given code for method is wrong, or it has not been added using the webinterface.")
         raise Exception
 
 def word_similarity(row, run):
     """Hands both arguments over to word_index() and returns its result."""
     outcome = word_index(row, run)
     return outcome