Page MenuHomec4science

bibrank.in
No OneTemporary

File Metadata

Created
Tue, Jun 4, 10:43

bibrank.in

##Ranking of records using different parameters and methods.
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
#include "cdswmllib.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect># -*- coding: utf-8 -*-</protect>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
"""
BibRank ranking daemon.
Usage: %s [options]
Examples:
%s --id=0-30000,30001-860000 --run=jif --verbose=9
%s --modified='2002-10-27 13:57:26' --run=jif
%s --rebalance --collection=Articles --run=jif
Ranking options:
-c, --collection=c1,c2 Collections to include in this set
-i, --id=idr1,idr2 Record ranges to include in this set
-m, --modified=[from] Update records modified after date
-k, --check=value Check if set needs rebalancing, (if the top
star is higher than given percentage 0-1.0)
-S, --stat Show statistics
-w, --run=rm1,rm2 Runs each rank method in the order given
-r, --rebalance Rebalance, do full update
Scheduling options:
-u, --user=USER user name to store task, password needed
-s, --sleeptime=SLEEP time after which to repeat tasks (no)
e.g.: 1s, 30m, 24h, 7d
-t, --time=TIME moment for the task to be active (now)
e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
General options:
-h, --help print this help and exit
-V, --version print version and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
"""
__version__ = "<: print generate_pretty_version_string('$Id$'); :>"
## fill config variables:
pylibdir = "<LIBDIR>/python"
try:
from marshal import loads,dumps
from zlib import compress,decompress
from string import split,translate,lower,upper
import getopt
import getpass
import string
import os
import sre
import sys
import time
import MySQLdb
import Numeric
import urllib
import signal
import tempfile
import unicodedata
import traceback
import cStringIO
import urllib
import re
import copy
import sys
import types
import gc
import ConfigParser
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
try:
sys.path.append('%s' % pylibdir)
from cdsware.config import *
from cdsware.search_engine_config import cfg_max_recID
from cdsware.search_engine import perform_request_search, strip_accents
from cdsware.search_engine import HitSet
from cdsware.dbquery import run_sql
from cdsware.access_control_engine import acc_authorize_action
from bisect import bisect
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
# Optional speed-up: try to hand the hot functions to the psyco compiler.
# NOTE(review): every bind() argument except serialize_via_numeric_array is
# written as a *call* (e.g. sort()) instead of a function object, several of
# the names (Authorimpact, Merge, Citationimpact, Accessimpact) are not
# defined in the visible part of this file, and none of the functions exist
# yet at this point of module execution.  Whatever exception that raises is
# swallowed by the bare except below, so this block looks like it never
# binds anything -- confirm, and move it after the definitions if psyco is
# actually wanted.
try:
    import psyco
    psyco.bind(sort())
    psyco.bind(single_tag_rank())
    psyco.bind(serialize_via_numeric_array)
    psyco.bind(Authorimpact())
    psyco.bind(Merge())
    psyco.bind(Citationimpact())
    psyco.bind(Accessimpact())
except:
    pass
# Options of the task being run; task_run() replaces it with the options
# unmarshalled from the BibSched row.
opts_dict = {}
# BibSched task id; stays -1 until task_run() sets it from the task row.
task_id = -1
# Read by task_run() as cfg_stars when a rank method is set up.
number_of_stars = 5
def citationimpact(methname, starsets, config):
    """Calculate a rankset based on the number of citations each document has.

    methname -- key of the starset to update inside `starsets`
    starsets -- dictionary of StarSet objects indexed by method name
    config   -- parsed configuration; the "citationimpact" section must
                provide citation_tag, curr_tag and old_tag

    When opts_dict["modified"] is empty every citation field is read
    (rebalance); otherwise only records modified after that date (or after
    the stored last-updated date when the value is "last_updated") are read.
    """
    startCreate = time.time()
    starset = starsets[methname]
    tempdoc = {}
    if opts_dict["verbose"] >= 1:
        write_message("Running: %s." % starset.getName())
    citation_tag = config.get("citationimpact", "citation_tag")
    curr_repnr_tag = config.get("citationimpact", "curr_tag")
    old_repnr_tag = config.get("citationimpact", "old_tag")
    # Table names and tags come from the configuration file and are
    # interpolated; record ids and field values are passed as parameters.
    citation_query = "SELECT id_bibrec,value FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id" % (citation_tag[0:2], citation_tag[0:2], citation_tag)
    curr_query = "SELECT id_bibrec FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id AND value=%%s" % (curr_repnr_tag[0:2], curr_repnr_tag[0:2], curr_repnr_tag)
    old_query = "SELECT id_bibrec FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id AND value=%%s" % (old_repnr_tag[0:2], old_repnr_tag[0:2], old_repnr_tag)
    if not opts_dict["modified"]:
        if opts_dict["verbose"] >= 9:
            write_message("Rebalancing")
        starset.setLastUpdated(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        citations = run_sql(citation_query)
    else:
        fromDB(starset)
        if opts_dict["modified"] == "last_updated":
            opts_dict["modified"] = starset.getLastUpdated()
        if opts_dict["verbose"] >= 9:
            write_message("Updating records modified after: %s" % opts_dict["modified"])
        starset.setLastUpdated(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        # The parameter must be a 1-tuple; a bare string would be split
        # into characters when converted to a tuple.
        mod_data = run_sql("SELECT id FROM bibrec WHERE modification_date >=%s", (opts_dict["modified"],))
        # Previously unbound in this branch, which raised NameError.
        citations = []
        for row in mod_data:
            # row is a 1-tuple; only the record id belongs in the query.
            citation = run_sql(citation_query + " and id_bibrec=%s", (row[0],))
            for recid, value in citation:
                citations.append((recid, value))
    for key, value in citations:
        # NOTE(review): both lookups are executed but their results are
        # never stored, so tempdoc stays empty and no citation count is
        # produced.  Together with the disabled intoDB() call below, this
        # method looks unfinished -- confirm the intended tallying.
        data = run_sql(curr_query, (value,))
        data = run_sql(old_query, (value,))
    if not opts_dict["modified"]:
        starset.setUnsorted(tempdoc)
        sort(starset)
    else:
        merge_two_sets(tempdoc, starset)
    #intoDB(starset)
    showtime((time.time() - startCreate))
def single_tag_rank(starset, config):
    """Connect the values of one tag with the scores of a knowledge base file.

    starset -- not used here; kept for interface compatibility
    config  -- parsed configuration; the "single_tag_rank" section must
               provide kb_src, tag, check_mandatory_tags and enable_modified

    The knowledge base file holds one "value---score" entry per line; lines
    starting with "#" are comments.  Returns a dictionary mapping record id
    to a (value, score) tuple, where score is -1.0 for values that are not
    in the knowledge base.
    """
    if opts_dict["verbose"] >= 9:
        write_message("Loading knowledgebase file")
    kb_data = {}
    records = []
    kb_file = open(config.get("single_tag_rank", "kb_src"), 'r')
    try:
        data = kb_file.readlines()
    finally:
        # The handle used to be left open.
        kb_file.close()
    for line in data:
        line = string.strip(line)
        # The old test compared a two character slice with "#", so comment
        # lines were never skipped; blank and separator-less lines crashed.
        if not line or line[0] == "#":
            continue
        fields = string.split(line, "---")
        if len(fields) < 2:
            continue
        kb_data[string.strip(fields[0])] = fields[1]
    tag = config.get("single_tag_rank", "tag")
    # Splitting an empty option yields [''], which would be treated as one
    # mandatory tag; drop blank entries so "no mandatory tags" works.
    tags = []
    for mandatory in string.split(config.get("single_tag_rank", "check_mandatory_tags"), ","):
        mandatory = string.strip(mandatory)
        if mandatory:
            tags.append(mandatory)
    if not opts_dict["modified"] or config.get("single_tag_rank", "enable_modified") == "no":
        records = run_sql("SELECT id_bibrec,value FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id" % (tag[0:2], tag[0:2], tag))
        if tags:
            valid = HitSet(Numeric.ones(cfg_max_recID + 1))
            for key in tags:
                newset = HitSet()
                # Each mandatory tag lives in the table named after its own
                # two-digit prefix, not the prefix of the ranking tag.
                newset.addlist(run_sql("SELECT id_bibrec FROM bib%sx,bibrec_bib%sx WHERE id_bibxxx=id AND tag='%s'" % (key[0:2], key[0:2], key)))
                valid.intersect(newset)
            records = filter(lambda x: valid.contains(x[0]), records)
    else:
        mod_data = run_sql("SELECT id FROM bibrec WHERE modification_date >= %s", (opts_dict["modified"],))
        for row in mod_data:
            found = 1
            for key in tags:
                if not run_sql("SELECT id_bibrec FROM bib%sx,bibrec_bib%sx WHERE id_bibxxx=id AND tag='%s' AND id_bibrec='%s'" % (key[0:2], key[0:2], key, row[0])):
                    found = 0
                    break
            if found:
                record = run_sql("SELECT id_bibrec,value FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id AND id_bibrec='%s'" % (tag[0:2], tag[0:2], tag, row[0]))
                for id_bibrec, value in record:
                    records.append((id_bibrec, value))
    documents = {}
    for key, value in records:
        if kb_data.has_key(value):
            score = float(kb_data[value])
            # The old condition referenced an undefined name (journals) and
            # raised NameError for any record with a second known value;
            # keep the highest score instead.
            if not documents.has_key(key) or score > float(documents[key][1]):
                documents[key] = (value, score)
        else:
            # NOTE(review): as before, an unknown value overwrites whatever
            # was stored for the record -- confirm that this is intended.
            documents[key] = (value, - 1.0)
    return documents
def accessimpact(methname, starsets, config):
"""Generating rankset based on number of downloads per document"""
startCreate = time.time()
starset = starsets[methname]
opts_dict["dbhost"] = config.get("accessimpact", "dbhost")
opts_dict["dbname"] = config.get("accessimpact", "dbname")
opts_dict["dbuser"] = config.get("accessimpact", "dbuser")
opts_dict["dbpass"] = config.get("accessimpact", "dbpass")
if opts_dict["verbose"] >= 1:
write_message("Running: %s." % starset.getName())
starset.setLastUpdated(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
sysno_tag = config.get("accessimpact", "sysnr_tag")
curr_repnr_tag = config.get("accessimpact", "curr_tag")
old_repnr_tag = config.get("accessimpact", "old_tag")
if not opts_dict["modified"]:
if opts_dict["verbose"] >= 9:
write_message("Rebalancing")
imprec = run_sql2("SELECT imprecno,base,bsysno,bref FROM imprec")
impacc = dict(run_sql2("SELECT imprecno,SUM(nbaccess) FROM impacc group BY imprecno"))
cdssysno = run_sql("SELECT value,id_bibrec FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id" % (sysno_tag[0:2], sysno_tag[0:2], sysno_tag))
else:
fromDB(starset)
impacc = {}
if opts_dict["modified"] == "last_updated":
opts_dict["modified"] = starset.getLastUpdated()
if opts_dict["verbose"] >= 9:
write_message("Updating records modified after: %s" % opts_dict["modified"])
pre_impacc = dict(run_sql2("SELECT distinct imprecno,'' FROM impacc WHERE sdate >=%s", (opts_dict["modified"],)))
imprec = []
cdssysno = []
for key in pre_impacc.keys():
test_impacc = run_sql2("SELECT imprecno,SUM(nbaccess) FROM impacc WHERE imprecno=%s GROUP BY imprecno", (key,))
impacc[test_impacc[0][0]] = test_impacc[0][1]
data = run_sql2("SELECT imprecno,base,bsysno,bref FROM imprec WHERE imprecno=%s", (key,))
if data:
data2 = run_sql2("SELECT imprecno FROM imprec WHERE bsysn=%s", (data[0][2],))
for key2 in data2:
imprec.append((key2, data[0][1], data[0][2], data[0][3]))
sysno = '0' * (9 - len(str(data[0][2]))) + str(data[0][2]) + data[0][1][0:3]
data = run_sql("SELECT value,id_bibrec FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id AND value=%s" % sysno_tag[0:2], sysno_tag[0:2], sysno_tag, sysno)
for key2,value in data:
cdssysno.append((key2, value))
tempdict = {}
for value,key in cdssysno:
if not tempdict.has_key(value):
tempdict[value] = [key]
else:
tempdict[value] = tempdict[value] + [key]
tempdoc = {}
count = 0
notcount = 0
for key, base, bsysno, value in imprec:
if impacc.has_key(key):
sysno = '0' * (9 - len(str(bsysno))) + str(bsysno) + base[0:3]
data = ()
if tempdict.has_key(sysno):
data = tempdict[sysno]
else:
data = run_sql("SELECT id_bibrec FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id AND value=%s" % (curr_repnr_tag[0:2], curr_repnr_tag[0:2], curr_repnr_tag, value))
if len(data) == 0:
data = run_sql("SELECT id_bibrec FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id AND value=%s" % (old_repnr_tag[0:2], old_repnr_tag[0:2], old_repnr_tag, value))
if len(data) != 0:
count = count + int(impacc[key])
for key2 in range(0, len(data)):
if type(data[key2]) is tuple:
key3 = data[key2][0]
else:
key3 = data[key2]
if tempdoc.has_key(key3):
tempdoc[key3] = ("", tempdoc[key3][1] + float(impacc[key]))
else:
tempdoc[key3] = ("", float(impacc[key]))
else:
notcount = notcount + int(impacc[key])
if opts_dict["verbose"] >= 9:
try:
write_message("Percentage match: %s%%,(%s/%s)" % (round((float(count) / float(count+notcount)) * 100, 3), notcount, count))
except:
print count, notcount
if not opts_dict["modified"]:
starset.setUnsorted(tempdoc)
sort(starset)
else:
merge_two_sets(tempdoc,starset)
intoDB(starset)
showtime((time.time() - startCreate))
def single_tag_rank_method(methname, starsets, config):
    """Build or update the rankset of a single-tag rank method.

    With opts_dict["modified"] empty the set is rebuilt from scratch and
    sorted; otherwise the stored set is loaded and the freshly computed
    scores are merged into it.  The result is written back with intoDB().
    """
    started = time.time()
    starset = starsets[methname]
    if opts_dict["verbose"] >= 1:
        write_message("Running: %s." % starset.getName())
    if not opts_dict["modified"]:
        # Full rebuild.
        starset.setLastUpdated(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        scores = single_tag_rank(starset, config)
        starset.setUnsorted(scores)
        sort(starset)
    else:
        # Incremental update on top of the stored set.
        fromDB(starset)
        if opts_dict["modified"] == "last_updated":
            opts_dict["modified"] = starset.getLastUpdated()
        if opts_dict["verbose"] >= 9:
            write_message("Updating records modified after: %s" % opts_dict["modified"])
        starset.setLastUpdated(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        scores = single_tag_rank(starset, config)
        merge_two_sets(scores, starset)
    intoDB(starset)
    elapsed = time.time() - started
    showtime(elapsed)
def merge_two_sets(tempdoc, starset):
    """Merge freshly computed scores into an existing starset.

    tempdoc -- dictionary mapping record id to a (value, score) tuple
    starset -- StarSet whose stars and thresholds are already loaded

    Each record is placed in the highest star category whose threshold its
    score reaches, and removed from the category it was in before.
    """
    if opts_dict["verbose"] >= 9:
        write_message("Merging old and new rankset")
    stars = starset.getStars()
    threshold = starset.getThreshold()
    top = starset.getSize()
    for key in tempdoc.keys():
        score = tempdoc[key][1]
        target = top
        # Stop at category 0: a score below every threshold used to walk
        # past the lowest category and raise KeyError.
        while target > 0 and score < threshold[target]:
            target = target - 1
        # Find the category that currently holds the record, if any.
        current = None
        for category in stars.keys():
            if stars[category].contains(key):
                current = category
                break
        if current is None:
            stars[target].add(key)
        elif current != target:
            stars[current].remove(key)
            stars[target].add(key)
    for category in stars.keys():
        stars[category].calculate_nbhits()
def authorimpact(methname, starsets, config):
    """Calculate the rank value a document has based on its authors.

    methname -- key of the starset to update inside `starsets`
    starsets -- dictionary of StarSet objects indexed by method name
    config   -- parsed configuration; the "authorimpact" section must
                provide primary_tag and secondary_tag (single_tag_rank()
                additionally reads its own section)

    Every author gets the sum of the single-tag scores of his documents and
    the number of his documents; a document is then rated with the summed
    author scores divided by the summed author document counts.
    """
    startCreate = time.time()
    starset = starsets[methname]
    if opts_dict["verbose"] >= 1:
        write_message("Running: %s" % starset.getName())
    if opts_dict["modified"]:
        # Resolve the "last_updated" keyword before anything queries with
        # it; it used to reach SQL as a literal date in single_tag_rank()
        # and in the bibrec query below.
        fromDB(starset)
        if opts_dict["modified"] == "last_updated":
            opts_dict["modified"] = starset.getLastUpdated()
    tempdoc = single_tag_rank(starset, config)
    Auth1 = {}
    documents2 = {}
    authors = {}
    upd_authors = []
    sql_src = []
    # Bound up front so that the "del data" below cannot fail when no
    # record was modified.
    data = ()
    p_author_tag = config.get("authorimpact", "primary_tag")
    s_author_tag = config.get("authorimpact", "secondary_tag")
    sql_src.append("SELECT id_bibrec,value FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id" % (p_author_tag[0:2], p_author_tag[0:2], p_author_tag))
    sql_src.append("SELECT id_bibrec,value FROM bib%sx,bibrec_bib%sx WHERE tag='%s' AND id_bibxxx=id" % (s_author_tag[0:2], s_author_tag[0:2], s_author_tag))
    if not opts_dict["modified"]:
        increment = 50000
        if opts_dict["verbose"] >= 9:
            write_message("Rebalancing")
        starset.setLastUpdated(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        for key in sql_src:
            # Read the author fields in slices of `increment` record ids.
            ij = -increment
            while ij <= (cfg_max_recID):
                ij = ij + increment
                data = run_sql(key + " AND id_bibrec>%i AND id_bibrec<=%i" % (ij, (ij + increment)))
                authorimpact_modified(data, Auth1)
    else:
        if opts_dict["verbose"] >= 9:
            write_message("Updating records modified after: %s" % opts_dict["modified"])
        starset.setLastUpdated(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        mod_data = run_sql("select id from bibrec where modification_date >= %s", (opts_dict["modified"],))
        for key in sql_src:
            for row in mod_data:
                data = run_sql(key + " AND id_bibrec=%s" % row[0])
                authorimpact_modified(data, Auth1)
                for key2, value in data:
                    upd_authors.append((key2, value))
        # Pull in every other document of the authors that were touched.
        for key in Auth1.keys():
            for key2 in sql_src:
                data = run_sql(key2 + " AND value=%s", (key,))
                authorimpact_modified(data, Auth1)
    del data
    # Flatten {author: [record ids]} into (record id, author) pairs.
    Auth = []
    for key in Auth1.keys():
        for recid in Auth1[key]:
            Auth.append((recid, key))
    del Auth1
    factor = 0.0
    for key, value in Auth:
        if tempdoc.has_key(key) and tempdoc[key][1] > 0.0:
            factor = tempdoc[key][1]
        else:
            factor = 0.0
        if not authors.has_key(value):
            authors[value] = (factor, 1)
        else:
            authors[value] = (authors[value][0] + factor, authors[value][1] + 1)
    if opts_dict["modified"]:
        # Only the modified records get a new rating.
        Auth = upd_authors
    tempdoc = {}
    for key, value in Auth:
        if documents2.has_key(key) and authors[value][0] > 0.0:
            documents2[key] = (documents2[key][0] + authors[value][0], documents2[key][1] + authors[value][1])
        elif authors[value][0] > 0.0:
            documents2[key] = authors[value]
    del Auth
    for key in documents2.keys():
        tempdoc[key] = ("", float(documents2[key][0]) / float(documents2[key][1]))
    if opts_dict["verbose"] >= 9:
        write_message("Number of distinct authors: %s" % len(authors))
    if not opts_dict["modified"]:
        for key in tempdoc.keys():
            if len(tempdoc[key][0]) != 0:
                tempdoc[key] = ("", -1.0)
        starset.setUnsorted(tempdoc)
        sort(starset)
    else:
        merge_two_sets(tempdoc,starset)
    intoDB(starset)
    showtime((time.time() - startCreate))
def authorimpact_modified(data, Auth):
    """Add (record id, author) rows to the author dictionary.

    data -- sequence of (record id, author) pairs
    Auth -- dictionary mapping author to the list of his record ids; it is
            updated in place, each record id being listed once per author
    """
    for key, value in data:
        records = Auth.setdefault(value, [])
        if key not in records:
            records.append(key)
def merge(methname, starsets, config):
    """Combine the starsets of all other methods into the one named methname.

    The weights of the other sets are scaled so that they add up to 1.0.
    For every combination of star categories the member sets are
    intersected and the result is added to the category given by the
    rounded weighted sum of the combined category numbers.  The merged set
    is written with intoDB().
    """
    started = time.time()
    target = starsets[methname]
    if opts_dict["verbose"] >= 1:
        write_message("Running: %s" % target.getName())
    target.setLastUpdated(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    threshold = {}
    finalset = {}
    digits = ''
    for nr in range(0, target.getSize() + 1):
        finalset[nr] = HitSet()
        digits = digits + "%s" % nr
    target.setWeigth(1.0)
    weight_total = 0.0
    convert = {}
    size = -1
    for key in starsets:
        if key == methname:
            continue
        weight_total = weight_total + starsets[key].getWeigth()
        # Number the source sets in iteration order.
        convert[len(convert)] = key
        top_category = len(starsets[key].getStars()) - 1
        if size > -1 and size != top_category:
            write_message("The sets have different sizes, process cancelled")
            sys.exit()
        size = top_category
    scale = 1.0 / weight_total
    for key in starsets:
        if key != methname:
            starsets[key].setWeigth(starsets[key].getWeigth() * scale)
    for perm in Permutation(digits, len(starsets) - 1):
        first = starsets[convert[0]]
        tempset = copy.copy(first.getStar(int(perm[0])))
        place = float(perm[0]) * float(first.getWeigth())
        for i in range(1, len(perm)):
            other = starsets[convert[i]]
            tempset.intersect(other.getStar(int(perm[i])))
            tempset.calculate_nbhits()
            place = place + float(perm[i]) * float(other.getWeigth())
        finalset[int(round(place))].union(tempset)
    for i in range(0, target.getSize() + 1):
        finalset[i].calculate_nbhits()
        threshold[i] = 0
    target.setStars(finalset)
    target.setThreshold(threshold)
    intoDB(target)
    showtime(time.time() - started)
def showtime(timeused):
    """Report the elapsed seconds of a method at verbose level 9 or above."""
    if opts_dict["verbose"] < 9:
        return
    write_message("Time used: %d second(s)." % timeused)
def stats(starset):
    """Print the statistics of a starset.

    Writes the name, top star size and weight of the set, followed by the
    threshold and number of hits of each star category and the total number
    of hits.  Raises StandardError when the statistics cannot be produced.
    """
    try:
        total = 0
        write_message("Statistics: %s , Top Star size: %s%% , Overall Importance: %s%%" % (starset.getName(), round(float(starset.getTopStar()) * 100,2), round(float(starset.getWeigth())*100, 2)))
        for nr in range(0, starset.getSize() + 1):
            star = starset.getStar(nr)
            write_message("%s star(s): Range >= \t%s\t%s" % (nr, round(starset.getThreshold()[nr],3), star._nbhits))
            total = total + star._nbhits
        write_message("Total: %s" % total)
    except StandardError:
        # sys.stderr used to be called here, which raised TypeError before
        # the error message could be written.
        write_message("Error showing statistics: %s" % starset.getName(), sys.stderr)
        raise StandardError
def check(starset):
    """Report whether the top star of a starset has grown too large.

    The share of each star category is measured against cfg_max_recID + 1
    minus the hits of category 0.  A "Rebalance" message is written when
    the share of the top category reaches opts_dict["check"].  Raises
    StandardError when the check itself fails.
    """
    try:
        size = cfg_max_recID + 1 - starset.getStar(0)._nbhits
        top = starset.getSize()
        if opts_dict["verbose"] >= 9:
            for nr in range(1, top + 1):
                share = float(starset.getStar(nr)._nbhits) / float(size)
                write_message("%s---%f" % (nr, share))
        top_share = float(starset.getStar(top)._nbhits) / float(size)
        if top_share >= float(opts_dict["check"]):
            write_message("Rebalance: %s" % starset.getName())
    except StandardError:
        write_message("Error checking: %s" % starset.getName(), sys.stderr)
        raise StandardError
def sort(starset):
    """Sort the unsorted scores of a starset into star categories.

    Records outside opts_dict["validset"] (when one is given) are dropped.
    Records with a score of zero or less go to category 1.  The top
    category receives the records whose score reaches the value found at
    the top-star percentage of the ranked list; the remaining records are
    spread over the categories below it in equally wide score ranges.
    Category 0 holds every record id that is in no other category.
    """
    tempdoc = starset.getUnsorted()
    if opts_dict["validset"]:
        # Removing records from set according to collection/id range given
        for key in tempdoc.keys():
            if not opts_dict["validset"].contains(key):
                del tempdoc[key]
    top = starset.getSize()
    if not top > 0:
        if opts_dict["verbose"] >= 9:
            write_message("Nothing to sort.")
        return
    if opts_dict["verbose"] >= 9:
        write_message("Sorting %s into sets." % starset.getName())
    sets = {}
    threshold = {}
    for nr in range(0, top + 1):
        sets[nr] = HitSet()
        threshold[nr] = 0
    for key in tempdoc.keys():
        # Deletes keys with negative value
        if tempdoc[key][1] <= 0:
            sets[1].add(key)
            del tempdoc[key]
    # (record id, score) pairs, highest score first.
    ranked = [(key, tempdoc[key][1]) for key in tempdoc.keys()]
    ranked.sort(compare_on_val)
    if starset.getTopStar() < 1.0 and starset.getTopStar() > 0.0:
        try:
            cut = int(round((len(tempdoc) * starset.getTopStar()))) + 1
            slicevalue = ranked[cut][1]
        except StandardError:
            slicevalue = 0.0
    elif starset.getTopStar() >= 1.0:
        slicevalue = 0.0
    else:
        slicevalue = 999999
    threshold[top] = slicevalue
    chosen = [item for item in ranked if item[1] >= slicevalue]
    if len(chosen) > 0:
        ubound = float(chosen[-1][1])
        sets[top].addlist([item[0] for item in chosen])
    ranked = ranked[len(chosen):len(tempdoc)]
    if len(ranked) > 0:
        if len(chosen) == 0:
            # No record reached the top category; ubound used to be
            # unbound here, so take the best remaining score as the upper
            # bound of the ranges.
            ubound = float(ranked[0][1])
        i = top
        lbound = float(ranked[-1][1])
        while i > 2:
            i = i - 1
            mbound = float(lbound + ((ubound - lbound) / (top - 2)) * (i - 2))
            threshold[i] = mbound
            chosen = [item for item in ranked if item[1] >= mbound]
            sets[i].addlist([item[0] for item in chosen])
            ranked = ranked[len(chosen):len(ranked)]
        if len(ranked) != 0:
            # Pass record ids like the other addlist() calls do; the
            # (id, score) pairs themselves used to be passed here.
            sets[1].addlist([item[0] for item in ranked])
    threshold[1] = -1.0
    threshold[0] = -9.9
    sets[0] = HitSet(Numeric.ones(cfg_max_recID + 1))
    for i in range(1, top + 1):
        sets[i].calculate_nbhits()
        sets[0].difference(sets[i])
    sets[0].calculate_nbhits()
    starset.setThreshold(threshold)
    starset.setStars(sets)
def compare_on_val(first, second):
    """Comparison function ordering pairs by their second item, highest first."""
    left = first[1]
    right = second[1]
    if right > left:
        return 1
    if right < left:
        return -1
    return 0
class StarSet:
    """A starset, one for each rank method.

    Holds the star categories (sets), the lower score bound of each
    category (threshold), the scores that have not been sorted yet
    (unsorted) and the settings of the rank method.
    """
    # Class-level defaults.  The containers are replaced by per-instance
    # objects in __init__; they used to be filled in place there, which
    # made every StarSet share the same sets and thresholds.
    sets = {}
    unsorted = []
    weigth = 0.0
    top_star = 0.0
    name = ""
    short_name = ""
    run_function = ""
    threshold = {}
    size = 0
    last_updated = ""

    def __init__(self, name, s_name, r_func, weigth, sz, top_star):
        """Create a starset with sz + 1 empty star categories (0..sz)."""
        self.name = name
        self.weigth = weigth
        self.top_star = top_star
        self.unsorted = {}
        self.short_name = s_name
        self.run_function = r_func
        self.size = sz
        self.sets = {}
        self.threshold = {}
        for i in range(0, self.size + 1):
            self.sets[i] = HitSet()
            self.threshold[i] = 0.0

    def getStar(self, value):
        """Return the set of star category `value`."""
        return self.sets[value]

    def setStars(self, value):
        """Replace all star categories by the dictionary `value`."""
        self.sets = value

    def getStars(self):
        """Return the dictionary of star categories."""
        return self.sets

    def getWeigth(self):
        """Return the weight of this set."""
        return self.weigth

    def setWeigth(self, value):
        """Set the weight of this set."""
        self.weigth = value

    def getSize(self):
        """Return the number of the highest star category."""
        return self.size

    def setThreshold(self, value):
        """Replace the thresholds by the dictionary `value`."""
        self.threshold = value

    def getThreshold(self):
        """Return the dictionary of thresholds."""
        return self.threshold

    def getShortName(self):
        """Return the short name of the rank method."""
        return self.short_name

    def getRunFunction(self):
        """Return the run function given at creation."""
        return self.run_function

    def getName(self):
        """Return the name of the rank method."""
        return self.name

    def setTopStar(self, value):
        """Set the top star value."""
        self.top_star = value

    def getTopStar(self):
        """Return the top star value."""
        return self.top_star

    def getLastUpdated(self):
        """Return the last-updated date."""
        return self.last_updated

    def setLastUpdated(self, date):
        """Set the last-updated date."""
        self.last_updated = date

    def setUnsorted(self, value):
        """Store unsorted scores.

        The first dictionary given is kept as it is; later ones are merged
        into it, overwriting the entries they have in common.
        """
        if len(self.unsorted) == 0:
            self.unsorted = value
        else:
            for key in value.keys():
                self.unsorted[key] = value[key]

    def getUnsorted(self):
        """Return the dictionary of unsorted scores."""
        return self.unsorted
class Permutation:
    """Sequence of all arrangements of `length` items taken from `values`.

    Items may repeat, so the sequence has len(values) ** length entries.
    Entry n lists the digits of n written in base len(values), least
    significant digit first, each digit replaced by the matching value.
    """

    def __init__(self, values, length):
        self.values = values
        self.length = length

    def __len__(self):
        return len(self.values) ** self.length

    def __getitem__(self, n):
        """Return arrangement number n as a list; IndexError past the end."""
        if n >= len(self):
            raise IndexError
        res = []
        lv = len(self.values)
        vals = self.values
        for ndx in range(self.length):
            res.append(vals[n % lv])
            # Floor division, spelled out so that the result stays an
            # integer whatever division mode is in effect.
            n = n // lv
        return res
def serialize_via_numeric_array_dumps(arr):
    """First serialisation step: dump arr with Numeric.dumps()."""
    dumped = Numeric.dumps(arr)
    return dumped
def serialize_via_numeric_array_compr(str):
    """Second serialisation step: compress the dumped string with zlib."""
    compressed = compress(str)
    return compressed
def serialize_via_numeric_array_escape(str):
    """Third serialisation step: escape the string with MySQLdb.escape_string()."""
    escaped = MySQLdb.escape_string(str)
    return escaped
def serialize_via_numeric_array(arr):
    """Dump, compress and escape arr; returns the resulting string."""
    dumped = serialize_via_numeric_array_dumps(arr)
    compressed = serialize_via_numeric_array_compr(dumped)
    return serialize_via_numeric_array_escape(compressed)
def deserialize_via_numeric_array(string):
    """Decompress string with zlib and load the result with Numeric.loads()."""
    dumped = decompress(string)
    return Numeric.loads(dumped)
def _db_login(relogin = 0):
    """Return the database connection kept in the global DB_CONN.

    With relogin set, a new connection is opened with the dbhost, dbname,
    dbuser and dbpass entries of opts_dict.  Otherwise the existing
    connection is returned, or one is opened with the module-level dbhost,
    dbname, dbuser and dbpass settings when none exists yet.
    """
    global DB_CONN
    if not relogin:
        try:
            return DB_CONN
        except NameError:
            # First use: no connection has been stored yet.
            DB_CONN = MySQLdb.connect(host=dbhost, db=dbname, user=dbuser, passwd=dbpass)
            return DB_CONN
    DB_CONN = MySQLdb.connect(host=opts_dict["dbhost"], db=opts_dict["dbname"], user=opts_dict["dbuser"], passwd=opts_dict["dbpass"])
    return DB_CONN
def run_sql2(sql, param=None, n=0, with_desc=0):
    """Run sql on the connection of _db_login() and return the result.

    param     -- optional sequence of query parameters
    n         -- when non-zero, fetch at most n rows
    with_desc -- when set, return (rows, cursor description)

    SELECT, SHOW, DESC and DESCRIBE statements return the fetched rows;
    INSERT returns the insert id; anything else returns what execute()
    returned.  If the first attempt fails, the statement is retried once
    on a fresh connection.
    """
    db = _db_login()
    if param:
        param = tuple(param)
    try:
        cur = db.cursor()
        rc = cur.execute(sql, param)
    except:
        db = _db_login(relogin = 1)
        cur = db.cursor()
        rc = cur.execute(sql, param)
    statement = string.upper(string.split(sql)[0])
    if statement not in ("SELECT", "SHOW", "DESC", "DESCRIBE"):
        if statement == "INSERT":
            rc = cur.insert_id()
        return rc
    if n:
        recset = cur.fetchmany(n)
    else:
        recset = cur.fetchall()
    if with_desc:
        return recset, cur.description
    return recset
def intoDB(starset):
    """Write a starset to the database.

    The rnkSET rows of the method are replaced by the current star
    categories, and the thresholds and last-updated date are stored in
    rnkMETHOD.  On any StandardError the task status is set to ERROR and
    the process exits with status 1.
    """
    try:
        # The method name is passed as a parameter instead of being pasted
        # into the statement.
        method = run_sql("SELECT id from rnkMETHOD where name=%s", (starset.getShortName(),))
        if len(method) == 0:
            write_message("Could not find method in database")
            raise StandardError
        method_id = method[0][0]
        if opts_dict["verbose"] >= 9:
            write_message("DELETE FROM rnkSET WHERE id_rnkMETHOD=%s" % method_id)
            write_message("UPDATE rnkMETHOD SET star_category_ranges WHERE id=%s" % method_id)
        sets = starset.getStars()
        run_sql("DELETE FROM rnkSET WHERE id_rnkMETHOD=%s" % method_id)
        # serialize_via_numeric_array() already escapes its result, so the
        # serialised values are interpolated as they are.
        run_sql("UPDATE rnkMETHOD SET star_category_ranges='%s' WHERE id=%s" % (serialize_via_numeric_array(starset.getThreshold()), method_id))
        for key2 in sets:
            run_sql("INSERT INTO rnkSET(id_rnkMETHOD,star_category,hitset) VALUES(%d,%d,'%s')" % (method_id, key2, serialize_via_numeric_array(sets[key2])))
            if opts_dict["verbose"] >= 9:
                write_message("INSERT INTO rnkSET(id_rnkMETHOD,star_category) VALUES(%s,%s)" % (method_id, key2))
        run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE id=%s", (starset.getLastUpdated(), method_id))
    except StandardError:
        write_message("Could not insert rankdata into database, please check that this rank method has been added to the database.", sys.stderr)
        task_update_status("ERROR")
        task_sig_stop_commands()
        sys.exit(1)
def fromDB(starset):
    """Load the stored stars, thresholds and date of a starset.

    Exits after a "Rebalance" message when the number of stored star
    categories differs from starset.getSize() + 1.  On any StandardError
    the task status is set to ERROR and the process exits with status 1.
    """
    try:
        name = starset.getShortName()
        # The name is passed as a query parameter; the quoted form is only
        # built for the verbose log, which reads as before.
        shown_name = "'%s'" % name
        query = "SELECT star_category_ranges,last_updated FROM rnkMETHOD WHERE name=%s"
        if opts_dict["verbose"] >= 9:
            write_message(query % shown_name)
        res = run_sql(query, (name,))
        threshold = deserialize_via_numeric_array(res[0][0])
        date = res[0][1]
        query = "SELECT star_category,hitset FROM rnkMETHOD,rnkSET WHERE id=id_rnkMETHOD AND name=%s"
        if opts_dict["verbose"] >= 9:
            write_message(query % shown_name)
        res = run_sql(query, (name,))
        testset = {}
        if len(res) != starset.getSize() + 1:
            write_message("Rebalance: %s" % starset.getName())
            sys.exit()
        for row in res:
            testset[row[0]] = deserialize_via_numeric_array(row[1])
        starset.setStars(testset)
        starset.setLastUpdated(date)
        starset.setThreshold(threshold)
    except StandardError:
        write_message("Rebalance: %s" % starset.getName(), sys.stderr)
        task_update_status("ERROR")
        task_sig_stop_commands()
        sys.exit(1)
def write_message(msg, stream = sys.stdout):
    """Write a timestamped message to sys.stdout or sys.stderr and flush it.

    Any other stream is rejected with a note on sys.stderr.
    """
    if not (stream == sys.stdout or stream == sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    stamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(stamp)
    stream.write("%s\n" % msg)
    stream.flush()
    return
def authenticate(user, header="BibRank Task Submission", action="runbibrank"):
print header
print "=" * len(header)
if user == "":
print>> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print>> sys.stdout, "\rUsername: ", user
res = run_sql("select id,password from user where email=%s", (user,), 1)
if not res:
print "Sorry, %s does not exist." % user
sys.exit(1)
else:
(uid_db, password_db) = res[0]
if password_db:
password_entered = getpass.getpass()
if password_db == password_entered:
pass
else:
print "Sorry, wrong credentials for %s." % user
sys.exit(1)
if not acc_authorize_action(uid_db, action):
print "Sorry, %s has no right to %s." % (user, action)
sys.exit(1)
return user
def usage(code, msg=''):
    """Write an optional error and the usage text to stderr, then exit with code."""
    if msg:
        sys.stderr.write("Error: %s.\n" % msg)
    program = sys.argv[0]
    help_text = """ Usage: %s [options]
Examples:
%s --id=0-30000,30001-860000 --run=jif --verbose=9
%s --modified='2002-10-27 13:57:26' --run=jif
%s --rebalance --collection=Articles --run=jif
Ranking options:
-c, --collection=c1,c2 Collections to include in this set
-i, --id=idr1,idr2 Record ranges to include in this set
-m, --modified=[from] Update records modified after date
-k, --check=value Check if set needs rebalancing, (if the top
star is higher than given percentage 0-1.0)
-S, --stat Show statistics
-w, --run=rm1,rm2 Runs each rank method in the order given
-r, --rebalance Rebalance
Scheduling options:
-u, --user=USER user name to store task, password needed
-s, --sleeptime=SLEEP time after which to repeat tasks (no)
e.g.: 1s, 30m, 24h, 7d
-t, --time=TIME moment for the task to be active (now)
e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
General options:
-h, --help print this help and exit
-V, --version print version and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
""" % (program, program, program, program)
    print >> sys.stderr, help_text
    sys.exit(code)
def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Return a date string according to the format string.

    var is either a date written in format_string or a shift with respect
    to now: an optional sign, a number and one of the units d, h, m or s
    (e.g. "+15s", "5m", "-1d").  Only the start of var is examined when
    looking for a shift.  Raises ValueError when var is neither a shift nor
    a date in the given format.
    """
    shift_re = re.compile(r"([-\+]{0,1})([\d]+)([dhms])")
    factors = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    m = shift_re.match(var)
    if m:
        sign_text, amount, unit = m.groups()
        if sign_text == "-":
            sign = -1
        else:
            sign = 1
        date = time.localtime(time.time() + sign * factors[unit] * float(amount))
    else:
        # Parsing and formatting again validates and normalises the date.
        date = time.strptime(var, format_string)
    return time.strftime(format_string, date)
def task_sig_sleep(sig, frame):
    """Handle the 'sleep' signal: mark the task SLEEPING and wait for a signal."""
    notices = []
    if opts_dict["verbose"] >= 9:
        notices.append("got signal %d" % sig)
    notices.append("sleeping...")
    for text in notices:
        write_message(text)
    task_update_status("SLEEPING")
    # Block here until the next signal arrives.
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle the 'wakeup' signal: mark the task CONTINUING."""
    notices = []
    if opts_dict["verbose"] >= 9:
        notices.append("got signal %d" % sig)
    notices.append("continuing...")
    for text in notices:
        write_message(text)
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
"""Signal handler for the 'stop' signal sent by BibSched."""
if opts_dict["verbose"]>= 9:
write_message("got signal %d" % sig)
write_message("stopping...")
task_update_status("STOPPING")
errcode = 0
try:
task_sig_stop_commands()
write_message("stopped")
task_update_status("STOPPED")
except StandardError, err:
write_message("Error during stopping! %e" % err)
task_update_status("STOPPINGFAILED")
errcode = 1
sys.exit(errcode)
def task_sig_stop_commands():
    """Run the commands needed to stop the task before quitting.

    Used by the task_sig_stop() handler; at present it only reports that
    the stopping commands started and ended.
    """
    for text in ("stopping commands started", "stopping commands ended"):
        write_message(text)
def task_sig_suicide(sig, frame):
    """Handle the 'suicide' signal: record the status changes and exit with 0."""
    if opts_dict["verbose"] >= 9:
        write_message("got signal %d" % sig)
    steps = (("suiciding myself now...", "SUICIDING"), ("suicided", "SUICIDED"))
    for text, status in steps:
        write_message(text)
        task_update_status(status)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handle any other signal: report it and carry on."""
    notices = []
    if opts_dict["verbose"] >= 9:
        notices.append("got signal %d" % sig)
    # Nothing else is done for these signals.
    notices.append("unknown signal %d ignored" % sig)
    for text in notices:
        write_message(text)
def task_update_progress(msg):
    """Store msg as the progress of the current task in schTASK."""
    escaped = MySQLdb.escape_string(msg)
    query = "UPDATE schTASK SET progress='%s' where id=%d" % (escaped, task_id)
    if opts_dict["verbose"] >= 9:
        write_message(query)
    run_sql(query)
def task_update_status(val):
    """Store val as the status of the current task in schTASK."""
    escaped = MySQLdb.escape_string(val)
    query = "UPDATE schTASK SET status='%s' where id=%d" % (escaped, task_id)
    if opts_dict["verbose"] >= 9:
        write_message(query)
    run_sql(query)
def split_ranges(parse_string):
    """Parse a comma separated list of record ids and id ranges.

    "5" stands for the single id 5 and "3-9" for the range 3 to 9; a range
    given in descending order is turned around.  Returns a list of
    [low, high] lists.  Raises ValueError when a bound is not an integer.
    """
    recIDs = []
    for chunk in parse_string.split(","):
        bounds = chunk.split("-")
        low = int(bounds[0])
        if len(bounds) == 1:
            high = low
        else:
            high = int(bounds[1])
            if low > high:  # sanity check
                low, high = high, low
        recIDs.append([low, high])
    return recIDs
def task_run(row):
"""Run the indexing task. The row argument is the BibSched task
queue row, containing if, arguments, etc.
Return 1 in case of success and 0 in case of failure.
"""
startCreate = time.time()
global opts_dict, task_id
task_id = row[0]
task_proc = row[1]
opts_dict = loads(row[6])
task_status = row[7]
if task_proc != "bibrank":
write_message("-The task #%d does not seem to be a BibRank task." % task_id, sys.stderr)
return 0
if task_status != "WAITING":
write_message("The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr)
return 0
if opts_dict["verbose"]:
write_message("Task #%d started." % task_id)
task_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
task_update_status("RUNNING")
signal.signal(signal.SIGUSR1, task_sig_sleep)
signal.signal(signal.SIGTERM, task_sig_stop)
signal.signal(signal.SIGABRT, task_sig_suicide)
signal.signal(signal.SIGCONT, task_sig_wakeup)
signal.signal(signal.SIGINT, task_sig_unknown)
sets = {}
try:
if opts_dict["collection"]:
if opts_dict["verbose"] >=9:
write_message("Getting records from collections")
l_of_colls = string.split(opts_dict["collection"], ",")
recIDs = perform_request_search(c=l_of_colls)
opts_dict["validset"] = HitSet()
opts_dict["validset"].addlist(recIDs)
elif opts_dict["id"]:
opts_dict["validset"] = HitSet()
for i in range(0,len(opts_dict["id"])):
for j in range(opts_dict["id"][i][0],opts_dict["id"][i][1]):
opts_dict["validset"].add(j)
for key in opts_dict["run"]:
#file = "/home/trondaks/w/cdsware/modules/bibrank/etc/" + key + ".cfg"
file = etcdir + "/bibrank/" + key + ".cfg"
if opts_dict["verbose"] >=9:
write_message("Getting configuration from file: %s" % file)
config = ConfigParser.ConfigParser()
try:
config.readfp(open(file))
except StandardError, e:
write_message("Cannot find configurationfile: %s" % file, sys.stderr)
raise StandardError
cfg_stars = number_of_stars
cfg_star = config.getfloat("rank_method", "top_star_percentage")
cfg_short = key
cfg_name = config.get("rank_method", "name")
cfg_function = config.get("rank_method", "function")
cfg_importance = config.getfloat("rank_method", "overall_importance")
sets[key] = StarSet(cfg_name, cfg_short, cfg_function, cfg_importance, cfg_stars, cfg_star)
if opts_dict["verbose"] >= 9:
print ""
func_object = globals().get(cfg_function)
func_object(key,sets,config)
if opts_dict["stat"]:
stats(sets[key])
if opts_dict["check"]:
check(sets[key])
except StandardError, e:
write_message("\nException caught: %s" % e, sys.stderr)
traceback.print_tb(sys.exc_info()[2])
task_update_status("ERROR")
task_sig_stop_commands()
sys.exit(1)
print ""
task_update_status("DONE")
if opts_dict["verbose"]:
write_message("Task #%d finished." % task_id)
showtime((time.time() - startCreate))
return 1
def command_line():
global opts_dict
long_flags = ["stat", "rebalance", "id=", "collection=", "check=", "modified=", "update", "run=", "user=", "sleeptime=", "time=", "help", "version", "verbose="]
short_flags = "Si:c:k:Urm:w:u:s:t:hVv:"
format_string = "%Y-%m-%d %H:%M:%S"
sleeptime = ""
try:
opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags)
except getopt.GetoptError, err:
write_message(err, sys.stderr)
usage(1)
if args:
usage(1)
opts_dict = {"validset":"", "collection":[], "id":[], "check": "", "stat":"", "modified":"last_updated", "run":['jif'], "verbose":1}
sched_time = time.strftime(format_string)
user = ""
try:
for opt in opts:
if opt == ("-h","") or opt == ("--help",""):
usage(1)
elif opt == ("-V","") or opt == ("--version",""):
print __version__
sys.exit(1)
elif opt[0] in ["--verbose", "-v"]:
opts_dict["verbose"] = int(opt[1])
elif opt[0] in ["--run", "-w"]:
opts_dict["run"] = []
run = split(opt[1],",")
for key in range(0,len(run)):
opts_dict["run"].append(run[key])
elif opt[0] in [ "-u", "--user"]:
user = opt[1]
elif opt[0] in [ "-k", "--check"]:
opts_dict["check"]= opt[1]
elif opt[0] in [ "-S", "--stat"]:
opts_dict["stat"] = "true"
elif opt[0] in [ "-i", "--id" ]:
opts_dict["id"] = split_ranges(opt[1])
elif opt[0] in [ "-c", "--collection" ]:
opts_dict["collection"] = opt[1]
elif opt[0] in [ "-r", "--rebalance"]:
opts_dict["modified"] = ""
elif opt[0] in [ "-m", "--modified" ]:
opts_dict["modified"] = get_datetime(opt[1]) #2002-10-27 13:57:26
elif opt[0] in [ "-s", "--sleeptime" ]:
get_datetime(opt[1]) # see if it is a valid shift
sleeptime=opt[1]
elif opt[0] in [ "-t", "--time" ]:
sched_time = get_datetime(opt[1])
else:
usage(1)
except StandardError, e:
write_message(e, sys.stderr)
sys.exit(1)
#user = authenticate(user)
if opts_dict["verbose"]>=9:
write_message("Storing task options %s" % opts_dict)
query = """INSERT INTO schTASK (proc,user,runtime,sleeptime,arguments,status) VALUES ('bibrank','%s',CAST('%s' AS DATE),'%s','%s','WAITING')""" % (MySQLdb.escape_string(user),MySQLdb.escape_string(sched_time), sleeptime, MySQLdb.escape_string(dumps(opts_dict)))
new_task_id = run_sql(query)
print "Task #%d was successfully scheduled for execution." % new_task_id
return
def main():
if len(sys.argv) == 2:
try:
id = int(sys.argv[1])
except StandardError, err:
command_line()
sys.exit()
res = run_sql("SELECT * FROM schTASK WHERE id='%d'" % (id), None, 1)
if not res:
write_message("Selected task not found.", sys.stderr)
sys.exit(1)
try:
if not task_run(res[0]):
write_message("Error occurred. Exiting.", sys.stderr)
except StandardError, e:
write_message("Unexpected error occurred: %s." % e, sys.stderr)
write_message("Traceback is:")
traceback.print_tb(sys.exc_info()[2])
write_message("Exiting.")
task_update_status("ERROR")
else:
command_line()
# Script entry point: call main() only when executed directly.
if __name__ == "__main__":
    main()

Event Timeline