Page MenuHomec4science

bibrank.wml
No OneTemporary

File Metadata

Created
Tue, Nov 5, 16:00

bibrank.wml

##Ranking of records using different parameters and methods.
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002, 2003, 2004, 2005 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
#include "cdswmllib.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect># -*- coding: utf-8 -*-</protect>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
"""
BibRank ranking daemon.
Usage: %s [options]
Ranking examples:
%s -wjif -a --id=0-30000,30001-860000 --verbose=9
%s -wjif -d --modified='2002-10-27 13:57:26'
%s -wwrd --rebalance --collection=Articles
%s -wwrd -a -i 234-250,293,300-500 -u admin@cdsware
Ranking options:
-w, --run=r1[,r2] runs each rank method in the order given
-c, --collection=c1[,c2] select according to collection
-i, --id=low[-high] select according to doc recID
-m, --modified=from[,to] select according to modification date
-l, --lastupdate select according to last update
-a, --add add or update words for selected records
-d, --del delete words for selected records
-S, --stat show statistics for a method
-R, --recalculate recalculate weight data, used by word frequency method
should be used if ca 1% of the documents have been changed
since last time -R was used
Repairing options:
-k, --check check consistency for all records in the table(s)
check if update of ranking data is necessary
-r, --repair try to repair all records in the table(s)
Scheduling options:
-u, --user=USER user name to store task, password needed
-s, --sleeptime=SLEEP time after which to repeat tasks (no)
e.g.: 1s, 30m, 24h, 7d
-t, --time=TIME moment for the task to be active (now)
e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
General options:
-h, --help print this help and exit
-V, --version print version and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
"""
## Version string; the quoted value is an embedded WML/ePerl expression
## (presumably expanded when this file is generated -- confirm).
__version__ = "<: print generate_pretty_version_string('$Id$'); :>"
## fill config variables:
## pylibdir is appended to sys.path before the cdsware imports below; the
## LIBDIR part is a template placeholder (presumably substituted at build
## time -- confirm).
pylibdir = "<LIBDIR>/python"
try:
from marshal import loads,dumps
from zlib import compress,decompress
from string import split,translate,lower,upper
import getopt
import getpass
import string
import os
import sre
import sys
import time
import MySQLdb
import urllib
import signal
import tempfile
import traceback
import cStringIO
import re
import copy
import types
import ConfigParser
import cdsware.search_engine
except ImportError, e:
import sys
try:
sys.path.append('%s' % pylibdir)
from cdsware.dbquery import run_sql
from cdsware.bibrank_tag_based_indexer import *
from cdsware.bibrank_word_indexer import *
from cdsware.access_control_engine import acc_authorize_action
from cdsware.search_engine import perform_request_search
except ImportError, e:
import sys
## Module-level state shared by the functions below.
task_id = -1 # the task id; task_run() overwrites it with row[0] of the BibSched row
nb_char_in_line = 50 # for verbose pretty printing
chunksize = 1000 # default size of chunks that the records will be treated by
base_process_size = 4500 # process base size; command_line() rejects a maxmem below this plus 1000 (kB per its error message)
bibrank_options = {} # will hold task options; filled by command_line() or task_run()
def serialize_via_numeric_array_dumps(arr):
    """Return the result of Numeric.dumps() applied to arr."""
    dumped = Numeric.dumps(arr)
    return dumped
def serialize_via_numeric_array_compr(str):
    """Return str compressed with zlib's compress()."""
    packed = compress(str)
    return packed
def serialize_via_numeric_array_escape(str):
    """Return str passed through MySQLdb.escape_string()."""
    escaped = MySQLdb.escape_string(str)
    return escaped
def serialize_via_numeric_array(arr):
    """Turn a Numeric array into a compressed, SQL-escaped string.

    The three steps (dump, compress, escape) are delegated, in that order,
    to the serialize_via_numeric_array_* helpers.
    """
    dumped = serialize_via_numeric_array_dumps(arr)
    packed = serialize_via_numeric_array_compr(dumped)
    return serialize_via_numeric_array_escape(packed)
def deserialize_via_numeric_array(string):
    """Rebuild a Numeric array from a zlib-compressed Numeric dump.

    No unescaping is performed; the argument is decompressed as given.
    """
    raw = decompress(string)
    return Numeric.loads(raw)
def serialize_via_marshal(obj):
    """Marshal obj, compress the result and escape it with MySQLdb.escape_string()."""
    marshalled = dumps(obj)
    packed = compress(marshalled)
    return MySQLdb.escape_string(packed)
def deserialize_via_marshal(string):
    """Rebuild a Python object from a zlib-compressed marshal dump.

    No unescaping is performed; the argument is decompressed as given.
    """
    raw = decompress(string)
    return loads(raw)
def authenticate(user, header="BibRank Task Submission", action="runbibrank"):
print header
print "=" * len(header)
if user == "":
print>> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print>> sys.stdout, "\rUsername: ", user
res = run_sql("select id,password from user where email=%s", (user,), 1)
if not res:
print "Sorry, %s does not exist." % user
sys.exit(1)
else:
(uid_db, password_db) = res[0]
if password_db:
password_entered = getpass.getpass()
if password_db == password_entered:
pass
else:
print "Sorry, wrong credentials for %s." % user
sys.exit(1)
(auth_code, auth_message) = acc_authorize_action(uid_db, action)
if auth_code != 0:
print auth_message
sys.exit(1)
return user
def usage(code, msg=''):
"""Print an optional error message and the usage text, then exit.

code -- exit status handed to sys.exit()
msg  -- when non-empty, written to stderr first as "Error: msg."

Both the error line and the usage text go to stderr.  The usage text
has five program-name slots, all filled with sys.argv[0].
"""
if msg:
sys.stderr.write("Error: %s.\n" % msg)
<protect>
print >> sys.stderr, \
""" Usage: %s [options]
Ranking examples:
%s -wjif -a --id=0-30000,30001-860000 --verbose=9
%s -wjif -d --modified='2002-10-27 13:57:26'
%s -wjif --rebalance --collection=Articles
%s -wsbr -a -i 234-250,293,300-500 -u admin@cdsware
Ranking options:
-w, --run=r1[,r2] runs each rank method in the order given
-c, --collection=c1[,c2] select according to collection
-i, --id=low[-high] select according to doc recID
-m, --modified=from[,to] select according to modification date
-l, --lastupdate select according to last update
-a, --add add or update words for selected records
-d, --del delete words for selected records
-S, --stat show statistics for a method
-R, --recalculate recalculate weigth data, used by word frequency method
should be used if ca 1%% of the document has been changed
since last time -R was used
Repairing options:
-k, --check check consistency for all records in the table(s)
check if update of ranking data is necessary
-r, --repair try to repair all records in the table(s)
Scheduling options:
-u, --user=USER user name to store task, password needed
-s, --sleeptime=SLEEP time after which to repeat tasks (no)
e.g.: 1s, 30m, 24h, 7d
-t, --time=TIME moment for the task to be active (now)
e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
General options:
-h, --help print this help and exit
-V, --version print version and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
""" % ((sys.argv[0],) * 5)
</protect>
sys.exit(code)
def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Return var as a date string laid out according to format_string.

    var is either a shift relative to the current time -- an optional
    sign, digits and one unit letter out of d, h, m, s (for instance
    "+15s", "5m" or "-1d") -- or an absolute date written in
    format_string.  Only the beginning of var is tested against the shift
    pattern, so anything following the unit letter is ignored.

    Raises ValueError (from time.strptime) when var is neither a shift
    nor a date in format_string.
    """
    seconds_per_unit = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    # use re rather than the deprecated sre alias; the raw string keeps
    # the backslash escapes in the pattern literal
    shift = re.match(r"([-\+]{0,1})([\d]+)([dhms])", var)
    if shift is None:
        parsed = time.strptime(var, format_string)
    else:
        sign, amount, unit = shift.groups()
        offset = float(amount) * seconds_per_unit[unit]
        if sign == "-":
            offset = -offset
        parsed = time.localtime(time.time() + offset)
    return time.strftime(format_string, parsed)
def task_sig_sleep(sig, frame):
    """Handle the 'sleep' signal sent by BibSched.

    Logs the event, marks the task SLEEPING and then blocks in
    signal.pause() until another signal is delivered.
    """
    debugging = bibrank_options["verbose"] >= 9
    if debugging:
        write_message("got signal %d" % sig)
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # wait here for the wake-up signal
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle the 'wakeup' signal sent by BibSched.

    Logs the event and marks the task CONTINUING.
    """
    debugging = bibrank_options["verbose"] >= 9
    if debugging:
        write_message("got signal %d" % sig)
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop_commands():
    """Run the commands needed to stop the task before quitting.

    Meant for a task_sig_stop() handler.  At present it only logs a start
    and an end marker; no actual command sits between them.
    """
    for note in ("stopping commands started", "stopping commands ended"):
        write_message(note)
def task_sig_suicide(sig, frame):
    """Handle the 'suicide' signal sent by BibSched.

    Logs the event, moves the task through SUICIDING to SUICIDED and
    exits the process with status 0.
    """
    debugging = bibrank_options["verbose"] >= 9
    if debugging:
        write_message("got signal %d" % sig)
    steps = (("suiciding myself now...", "SUICIDING"),
             ("suicided", "SUICIDED"))
    for note, state in steps:
        write_message(note)
        task_update_status(state)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handle any other signal sent by the shell or the user.

    The signal is logged and otherwise ignored.
    """
    debugging = bibrank_options["verbose"] >= 9
    if debugging:
        write_message("got signal %d" % sig)
    # nothing else is done for these signals
    write_message("unknown signal %d ignored" % sig)
def task_update_progress(msg):
    """Store msg as the progress text of the current task in schTASK.

    The row is selected through the module-level task_id.  The values are
    handed to run_sql() as parameters, like the other queries in this
    file, instead of being escaped and pasted into the statement by hand.
    At verbose level 9 or above the statement and its parameters are
    logged first.
    """
    query = "UPDATE schTASK SET progress=%s where id=%s"
    params = (msg, task_id)
    if bibrank_options["verbose"] >= 9:
        write_message("%s %s" % (query, params))
    run_sql(query, params)
    return
def task_update_status(val):
    """Store val as the status of the current task in schTASK.

    The row is selected through the module-level task_id.  The values are
    handed to run_sql() as parameters, like the other queries in this
    file, instead of being escaped and pasted into the statement by hand.
    At verbose level 9 or above the statement and its parameters are
    logged first.
    """
    query = "UPDATE schTASK SET status=%s where id=%s"
    params = (val, task_id)
    if bibrank_options["verbose"] >= 9:
        write_message("%s %s" % (query, params))
    run_sql(query, params)
    return
def split_ranges(parse_string):
    """Parse a record ID selection into a list of [low, high] pairs.

    parse_string is a comma-separated list whose items are either a
    single ID ("293") or a range ("234-250").  A single ID yields a pair
    with both ends equal; a range written backwards is swapped so that
    low is never above high.  Only the first two dash-separated parts of
    an item are used.

    Raises ValueError (from int) when a part is not an integer.
    """
    recIDs = []
    for item in parse_string.split(","):
        bounds = item.split("-")
        low = int(bounds[0])
        if len(bounds) == 1:
            high = low
        else:
            high = int(bounds[1])
            if low > high:  # sanity check
                low, high = high, low
        recIDs.append([low, high])
    return recIDs
def get_date_range(var):
    """Return the dates contained in var as a (low, high) tuple.

    var is "from" or "from,to"; each part is normalised by
    get_datetime().  With a single part, high is None.

    Raises ValueError when var holds more than two comma-separated parts
    (this used to fall through and return None silently), or when
    get_datetime() rejects a part.
    """
    limits = var.split(",")
    if len(limits) > 2:
        raise ValueError("Expected 'from' or 'from,to' but got: %s" % var)
    low = get_datetime(limits[0])
    if len(limits) == 1:
        return low, None
    return low, get_datetime(limits[1])
def command_line():
"""Storing the task together with the parameters given."""
global bibrank_options
long_flags = ["lastupdate","add","del","repair","maxmem", "flush","stat", "rebalance", "id=", "collection=", "check", "modified=", "update", "run=", "user=", "sleeptime=", "time=", "help", "version", "verbose="]
short_flags = "ladSi:m:c:kUrRM:f:w:u:s:t:hVv:"
format_string = "%Y-%m-%d %H:%M:%S"
sleeptime = ""
try:
opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags)
except getopt.GetoptError, err:
write_message(err, sys.stderr)
usage(1)
if args:
usage(1)
bibrank_options = {"quick":"yes","cmd":"add","flush":100000,"validset":"", "collection":[], "id":[], "check": "", "stat":"", "modified":"", "last_updated":"last_updated","run":"", "verbose":1}
res = run_sql("SELECT name from rnkMETHOD")
bibrank_options["run"] = []
for (name,) in res:
bibrank_options["run"].append(name)
sched_time = time.strftime(format_string)
user = ""
try:
for opt in opts:
if opt == ("-h","") or opt == ("--help",""):
usage(1)
elif opt == ("-V","") or opt == ("--version",""):
print __version__
sys.exit(1)
elif opt[0] in ["--verbose", "-v"]:
bibrank_options["verbose"] = int(opt[1])
elif opt == ("-a","") or opt == ("--add",""):
bibrank_options["cmd"] = "add"
if ("-x","") in opts or ("--del","") in opts:
usage(1)
elif opt[0] in ["--run", "-w"]:
bibrank_options["run"] = []
run = split(opt[1],",")
for key in range(0,len(run)):
bibrank_options["run"].append(run[key])
elif opt == ("-r","") or opt == ("--repair",""):
bibrank_options["cmd"] = "repair"
elif opt == ("-d","") or opt == ("--del",""):
bibrank_options["cmd"]="del"
elif opt[0] in [ "-u", "--user"]:
user = opt[1]
elif opt[0] in [ "-k", "--check"]:
bibrank_options["cmd"]= "check"
elif opt[0] in [ "-S", "--stat"]:
bibrank_options["cmd"] = "stat"
elif opt[0] in [ "-i", "--id" ]:
bibrank_options["id"] = bibrank_options["id"] + split_ranges(opt[1])
bibrank_options["last_updated"] = ""
elif opt[0] in [ "-c", "--collection" ]:
bibrank_options["collection"] = opt[1]
elif opt[0] in [ "-R", "--rebalance"]:
bibrank_options["quick"] = "no"
elif opt[0] in [ "-f", "--flush"]:
bibrank_options["flush"]=int(opt[1])
elif opt[0] in [ "-M", "--maxmem"]:
bibrank_options["maxmem"]=int(opt[1])
if bibrank_options["maxmem"] < base_process_size + 1000:
raise StandardError, "Memory usage should be higher than %d kB" % (base_process_size + 1000)
elif opt[0] in [ "-m", "--modified" ]:
bibrank_options["modified"] = get_date_range(opt[1]) #2002-10-27 13:57:26
bibrank_options["last_updated"] = ""
elif opt[0] in [ "-l", "--lastupdate" ]:
bibrank_options["last_updated"] = "last_updated"
elif opt[0] in [ "-s", "--sleeptime" ]:
get_datetime(opt[1]) # see if it is a valid shift
sleeptime=opt[1]
elif opt[0] in [ "-t", "--time" ]:
sched_time = get_datetime(opt[1])
else:
usage(1)
except StandardError, e:
write_message(e, sys.stderr)
sys.exit(1)
user = authenticate(user)
if bibrank_options["verbose"]>=9:
write_message("Storing task options %s" % bibrank_options)
new_task_id = run_sql("""INSERT INTO schTASK (proc,user,runtime,sleeptime,arguments,status) VALUES ('bibrank',%s,%s,%s,%s,'WAITING')""", (user, sched_time, sleeptime, dumps(bibrank_options)))
print "Task #%d was successfully scheduled for execution." % new_task_id
return
def task_run(row):
"""Run the indexing task. The row argument is the BibSched task
queue row, containing if, arguments, etc.
Return 1 in case of success and 0 in case of failure.
"""
global task_id, bibrank_options
task_id = row[0]
task_proc = row[1]
bibrank_options = loads(row[6])
task_status = row[7]
# install signal handlers
signal.signal(signal.SIGUSR1, task_sig_sleep)
signal.signal(signal.SIGTERM, task_sig_stop)
signal.signal(signal.SIGABRT, task_sig_suicide)
signal.signal(signal.SIGCONT, task_sig_wakeup)
signal.signal(signal.SIGINT, task_sig_unknown)
if task_proc != "bibrank":
write_message("-The task #%d does not seem to be a BibRank task." % task_id, sys.stderr)
return 0
if task_status != "WAITING":
write_message("The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr)
return 0
if bibrank_options["verbose"]:
write_message("Task #%d started." % task_id)
task_update_status("RUNNING")
try:
bibrank_options = marshal.loads(row[6])
for key in bibrank_options["run"]:
write_message("")
file = etcdir + "/bibrank/" + key + ".cfg"
if bibrank_options["verbose"] >= 9:
write_message("Getting configuration from file: %s" % file)
config = ConfigParser.ConfigParser()
try:
config.readfp(open(file))
except StandardError, e:
write_message("Cannot find configurationfile: %s. The rankmethod may also not be registered using the BibRank Admin Interface." % file, sys.stderr)
raise StandardError
#Using the function variable to call the function related to the rank method
cfg_function = config.get("rank_method", "function")
func_object = globals().get(cfg_function)
if func_object:
func_object(row, key)
else:
write_message("Cannot run method '%s', no function to call" % key)
except StandardError, e:
write_message("\nException caught: %s" % e, sys.stderr)
traceback.print_tb(sys.exc_info()[2])
task_update_status("ERROR")
sys.exit(1)
task_update_status("DONE")
if bibrank_options["verbose"]:
write_message("Task #%d finished." % task_id)
return 1
def main():
if len(sys.argv) == 2:
try:
id = int(sys.argv[1])
except StandardError, err:
command_line()
sys.exit()
res = run_sql("SELECT * FROM schTASK WHERE id='%d'" % (id), None, 1)
if not res:
write_message("Selected task not found.", sys.stderr)
sys.exit(1)
try:
if not task_run(res[0]):
write_message("Error occurred. Exiting.", sys.stderr)
except StandardError, e:
write_message("Unexpected error occurred: %s." % e, sys.stderr)
write_message("Traceback is:")
traceback.print_tb(sys.exc_info()[2])
write_message("Exiting.")
task_update_status("ERROR")
else:
command_line()
if __name__ == "__main__":
main()

Event Timeline