diff --git a/modules/bibformat/bin/bibreformat.in b/modules/bibformat/bin/bibreformat.in
index 1fa299fe5..88c87f7a5 100644
--- a/modules/bibformat/bin/bibreformat.in
+++ b/modules/bibformat/bin/bibreformat.in
@@ -1,756 +1,757 @@
#!@PYTHON@
## -*- mode: python; coding: utf-8; -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Call BibFormat engine and create HTML brief (and other) formats for bibliographic records.
Upload formats via BibUpload."""
__revision__ = "$Id$"
## import interesting modules:
try:
import sys
from invenio.dbquery import run_sql, escape_string
from invenio.config import *
from invenio.search_engine import perform_request_search
from invenio.search_engine import print_record, encode_for_xml
from invenio.access_control_engine import acc_authorize_action
from invenio.bibformat import format_record
from invenio.bibformat_utils import encode_for_xml
from invenio.bibformat_config import CFG_BIBFORMAT_USE_OLD_BIBFORMAT
from invenio.bibrecord import create_records
import getopt
import getpass
import marshal
import signal
import string
import sys
import os
import re
import time
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
## Module-level state shared between task_submit() (CLI mode) and
## task_run() (BibSched daemon mode).  task_run() mutates several of
## these via "global" declarations.
options = {} # global variable to hold task options
sql_queries = [] # holds SQL queries to be executed
cds_query = {} # holds CDS query parameters (fields, collection, pattern)
process_format = 0 # flag, process records without created format
process = 1 # flag, process records (unless count only)
fmt = "hb" # default format to be processed
sleeptime = "" # default sleeptime
format_string = "%Y-%m-%d %H:%M:%S" # date/time format
sched_time = time.strftime(format_string) # scheduled execution time in the date/time format
### run the bibreformat task bibsched scheduled
###
def bibreformat_task(sql, sql_queries, cds_query, process_format):
global process, fmt
t1 = os.times()[4]
### Query the database
###
if process_format:
print "Querying database for records with missing format ..."
without_format = without_fmt(sql)
recIDs = []
if cds_query['field'] != "" or \
cds_query['collection'] != "" or \
cds_query['pattern'] != "":
print "Querying database for records with old format (CDS query)..."
res = perform_request_search(req=None, of='id', c=cds_query['collection'], p=cds_query['pattern'], f=cds_query['field'])
for item in res:
recIDs.append(item)
for sql_query in sql_queries:
print "Querying database for records with old format (SQL query) ..."
res = run_sql(sql_query)
for item in res:
recIDs.append(item[0])
### list of corresponding record IDs was retrieved
### bibformat the records selected
if process_format:
print "Records to be processed: %d" % (len(recIDs)+len(without_format))
print "Out of it records without created format: %d" % len(without_format)
else:
print "Records to be processed: %d" % (len(recIDs))
### Initialize main loop
total_rec = 0 # Total number of records
xml_content = '' # hold the contents
tbibformat = 0 # time taken up by external call
tbibupload = 0 # time taken up by external call
### Iterate over all records prepared in lists I (option)
if process:
if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this when migration from php to python bibformat is done
iterate_over_old(recIDs, weburl, fmt)
else:
iterate_over_new(recIDs, weburl, fmt)
### Iterate over all records prepared in list II (no_format)
if process_format and process:
if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this when migration from php to python bibformat is done
iterate_over_old(without_format, weburl, fmt)
else:
iterate_over_new(without_format, weburl, fmt)
### Final statistics
t2 = os.times()[4]
elapsed = t2 - t1
message = "total records processed: %d" % total_rec
print message
message = "total processing time: %2f sec" % elapsed
print message
message = "Time spent on external call (os.system):"
print message
message = " bibformat: %2f sec" % tbibformat
print message
message = " bibupload: %2f sec" % tbibupload
print message
### Result set operations
###
def lhdiff(l1, l2):
    """Return the elements of l1 that do not occur in l2 (list difference).

    Order and duplicates of l1 are preserved.  Membership is tested via an
    intermediate set for O(len(l1) + len(l2)) behaviour.  (Replaces the
    original dict + has_key() idiom, which is also gone in Python 3.)
    """
    seen = set(l2)
    return [e for e in l1 if e not in seen]
### Result set operations
###
def ldiff(l1, l2):
    """Return l1 - l2: the elements of l1 not present in l2.

    Linear scan of l2 per element (quadratic); kept for API compatibility,
    see lhdiff() for the hashed variant.
    """
    return [item for item in l1 if item not in l2]
### Identify recIDs of records with missing format
###
def without_fmt(sql):
    """Return record IDs that exist in 'xm' but lack the target format.

    Runs the two prepared queries from the *sql* dict ('q1' = all records
    with an 'xm' master format, 'q2' = records already having the target
    format) and returns q1 minus q2 via lhdiff().
    """
    global fmt
    ids_with_xm = [row[0] for row in run_sql(sql['q1'])]
    ids_with_fmt = [row[0] for row in run_sql(sql['q2'])]
    return lhdiff(ids_with_xm, ids_with_fmt)
### Bibreformat all selected records (using new python bibformat)
### (see iterate_over_old further down)
def iterate_over_new(list, weburl, fmt):
"""Reformat every record ID in *list* with the new (Python) BibFormat.

Formats records in memory via format_record(), wraps them in an XML
envelope for bibupload, and ships them to bibupload in batches.

NOTE(review): the prologue/epilogue templates below look truncated --
they interpolate recID and fmt into bare text with no surrounding XML
tags, which bibupload is said to understand; the markup appears to have
been lost (formatting damage).  Confirm against the upstream file.
"""
# batch-state counters
n_it_rec = 0 # Number of records for current iteration
n_it_max = 10000 # Number of max records in one iteration
n_rec = 0 # Number of formatted records
total_rec = len(list) # Total number of records
formatted_records = '' # (string-)List of formatted record of an iteration
for recID in list:
n_rec +=1
n_it_rec += 1
message = "Processing record %d with format %s (New BibFormat)" % (recID, fmt)
print message
t11 = os.times()[4]
message = "START bibformat external call"
print message
### bibformat external call
###
formatted_record = format_record(recID, fmt, on_the_fly=True)
# Encapsulate record in xml tags that bibupload understands
prologue = '''
%s
%s
''' % (recID, fmt)
epilogue = '''
'''
t22 = os.times()[4]
message = "END bibformat external call (time elapsed:%2f)" % (t22-t11)
print message
formatted_records += prologue + encode_for_xml(formatted_record) + epilogue
# every n_max record, upload all formatted records.
# also upload if recID is last one
if n_it_rec > n_it_max or n_rec == total_rec:
#Save formatted records to disk for bibupload
finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S'))
filehandle = open(finalfilename,"w")
filehandle.write(formatted_records)
filehandle.close()
### bibupload external call
###
t11 = os.times()[4]
message = "START bibupload external call"
print message
command = "%s/bibupload -f %s" % (bindir, finalfilename)
os.system(command)
t22 = os.times()[4]
message = "END bibupload external call (time elapsed:%2f)" % (t22-t11)
print message
#Reset iteration state
# NOTE(review): only the counter is reset here -- formatted_records is
# NOT cleared, so every batch after the first re-uploads all previous
# batches' records.  (xml_content below is never read in this function;
# it looks like a leftover from iterate_over_old.)
n_it_rec = 0
xml_content = ''
def iterate_over_old(list, weburl, fmt):
"""Reformat every record ID in *list* with the old (PHP) BibFormat.

Fetches each record's 'xm' master format via print_record() (retrying up
to 10 times with 10 s sleeps on empty output), accumulates the XML, and
every n_max records pipes it through the external bibformat binary and
uploads the result with bibupload.  The final partial chunk is processed
after the loop with the same (duplicated) external-call sequence.
"""
n_rec = 0
n_max = 10000
total_rec = 0 # Total number of records
xml_content = '' # hold the contents
tbibformat = 0 # time taken up by external call
tbibupload = 0 # time taken up by external call
for record in list:
n_rec = n_rec + 1
total_rec = total_rec + 1
message = "Processing record: %d" % (record)
print message
# query is only used in the failure message below
query = "id=%d&of=xm" % (record)
count = 0
contents = print_record(record, 'xm')
# retry empty responses up to 10 times, sleeping 10 s between attempts
while (contents == "") and (count < 10):
contents = print_record(record, 'xm')
count = count + 1
time.sleep(10)
if count == 10:
# NOTE(review): exits with status 0 although this is a failure path
sys.stderr.write("Failed to download %s from %s after 10 attempts... terminating" % (query, weburl))
sys.exit(0)
xml_content = xml_content + contents
if xml_content:
# flush a full chunk through bibformat + bibupload
if n_rec >= n_max:
finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S'))
filename = "%s/bibreformat.xml" % tmpdir
filehandle = open(filename ,"w")
filehandle.write(xml_content)
filehandle.close()
### bibformat external call
###
t11 = os.times()[4]
message = "START bibformat external call"
print message
command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir)
os.system(command)
t22 = os.times()[4]
message = "END bibformat external call (time elapsed:%2f)" % (t22-t11)
print message
tbibformat = tbibformat + (t22 - t11)
### bibupload external call
###
t11 = os.times()[4]
message = "START bibupload external call"
print message
command = "%s/bibupload -f %s" % (bindir,finalfilename)
os.system(command)
t22 = os.times()[4]
message = "END bibupload external call (time elapsed:%2f)" % (t22-t11)
print message
tbibupload = tbibupload + (t22- t11)
# reset chunk state for the next batch
n_rec = 0
xml_content = ''
### Process the last re-formated chunk
###
if n_rec > 0:
print "Processing last record set (%d)" % n_rec
finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S'))
filename = "%s/bibreformat.xml" % tmpdir
filehandle = open(filename ,"w")
filehandle.write(xml_content)
filehandle.close()
### bibformat external call
###
t11 = os.times()[4]
message = "START bibformat external call"
print message
command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir)
os.system(command)
t22 = os.times()[4]
message = "END bibformat external call (time elapsed:%2f)" % (t22-t11)
print message
tbibformat = tbibformat + (t22 - t11)
### bibupload external call
###
t11 = os.times()[4]
message = "START bibupload external call"
print message
command = "%s/bibupload -f %s" % (bindir,finalfilename)
os.system(command)
t22 = os.times()[4]
message = "END bibupload external call (time elapsed:%2f)" % (t22-t11)
print message
tbibupload = tbibupload + (t22- t11)
return
### Bibshed compatibility procedures
###
def get_date(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Return a date string formatted per *format_string*.

    *var* is either an absolute date string in the same format (parsed
    and re-emitted), or a relative shift with respect to now such as
    "+1d", "-2h", "30m" or "45s".

    Raises ValueError if *var* is neither a valid shift nor a date
    matching *format_string*.
    """
    date = time.time()
    shift_re = re.compile(r"([-\+]{0,1})([\d]+)([dhms])")
    factors = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    m = shift_re.match(var)
    if m:
        sign = m.groups()[0] == "-" and -1 or 1
        factor = factors[m.groups()[2]]
        value = float(m.groups()[1])
        # (removed stray debug "print value" left in the original)
        date = time.localtime(date + sign * factor * value)
        date = time.strftime(format_string, date)
    else:
        date = time.strptime(var, format_string)
        date = time.strftime(format_string, date)
    return date
def write_message(msg, stream=sys.stdout):
    """Print *msg* with a timestamp prefix and flush the stream.

    *stream* must be sys.stdout or sys.stderr; anything else is reported
    on stderr and the message is dropped.  Useful for debugging stuff.
    """
    if stream not in (sys.stdout, sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(timestamp)
    stream.write("%s\n" % msg)
    stream.flush()
def task_sig_sleep(sig, frame):
"""Signal handler for the 'sleep' signal sent by BibSched.

Reports the SLEEPING status, then blocks in signal.pause() until the
wake-up signal (SIGCONT, handled by task_sig_wakeup) arrives.
"""
if options["verbose"] >= 9:
write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame))
write_message("sleeping...")
task_update_status("SLEEPING")
signal.pause() # wait for wake-up signal
def task_sig_wakeup(sig, frame):
    """Handle BibSched's 'wakeup' signal: resume after a sleep."""
    verbose = options["verbose"]
    if verbose >= 9:
        debug_msg = "task_sig_wakeup(), got signal %s frame %s" % (sig, frame)
        write_message(debug_msg)
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
    """Handle BibSched's 'stop' signal: flush, close and exit cleanly."""
    if options["verbose"] >= 9:
        write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame))
    write_message("stopping...")
    task_update_status("STOPPING")
    # announce each shutdown phase, pausing to let pending work settle
    for note, pause in (("flushing cache or whatever...", 3),
                        ("closing tables or whatever...", 1)):
        write_message(note)
        time.sleep(pause)
    write_message("stopped")
    task_update_status("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Handle BibSched's 'suicide' signal: abandon the task immediately."""
    if options["verbose"] >= 9:
        write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame))
    for note, status in (("suiciding myself now...", "SUICIDING"),
                         ("suicided", "SUICIDED")):
        write_message(note)
        task_update_status(status)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handle any signal without a dedicated handler: log it and carry on."""
    # do nothing for unknown signals beyond logging them:
    notice = "unknown signal %d (frame %s) ignored" % (sig, frame)
    write_message(notice)
def authenticate(user, header="BibReformat Task Submission", action="runbibformat"):
"""Authenticate the user against the user database.
Check for its password, if it exists.
Check for action access rights.
Return user name upon authorization success,
do system exit upon authorization failure.
"""
print header
print "=" * len(header)
if user == "":
print >> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print >> sys.stdout, "\rUsername:", user
## first check user pw:
- res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
+ res = run_sql("select id,password from user where email=%s", (user,), 1) + \
+ run_sql("select id,password from user where nickname=%s", (user,), 1)
if not res:
print "Sorry, %s does not exist." % user
sys.exit(1)
else:
(uid_db, password_db) = res[0]
if password_db:
password_entered = getpass.getpass()
if password_db == password_entered:
pass
else:
print "Sorry, wrong credentials for %s." % user
sys.exit(1)
## secondly check authorization for the action:
(auth_code, auth_message) = acc_authorize_action(uid_db, action)
if auth_code != 0:
print auth_message
sys.exit(1)
return user
def task_submit():
    """Submit this task to the BibSched task queue (CLI entry path).

    Authenticates the invoking user, inserts a WAITING 'bibreformat' row
    into schTASK, then stores the task id back into the serialized
    options.  Returns the new task id.
    """
    # BUGFIX: the original declared "global ... sleep_time", a name that
    # does not exist; the module-level variable is "sleeptime".
    global options, sched_time, sleeptime
    ## sanity check: remove eventual "task" option:
    if options.has_key("task"):
        del options["task"]
    ## authenticate user:
    user = authenticate(options.get("user", ""))
    ## submit task:
    task_id = run_sql("""INSERT INTO schTASK (id,proc,user,status,arguments,sleeptime,runtime) VALUES (NULL,'bibreformat',%s,'WAITING',%s,%s,%s)""",
                      (user, marshal.dumps(options), sleeptime, escape_string(sched_time)))
    ## update task number:
    options["task"] = task_id
    run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id))
    write_message("Task #%d submitted." % task_id)
    return task_id
def task_update_progress(msg):
    """Store *msg* in the progress column of this task's schTASK row."""
    global options
    query = "UPDATE schTASK SET progress=%s where id=%s"
    return run_sql(query, (msg, options["task"]))
def task_update_status(val):
    """Store *val* in the status column of this task's schTASK row."""
    global options
    query = "UPDATE schTASK SET status=%s where id=%s"
    return run_sql(query, (val, options["task"]))
def task_read_status(task_id):
    """Return the status string of task *task_id* from the schTASK table.

    Returns 'UNKNOWN' when no such task row exists.
    """
    res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1)
    try:
        out = res[0][0]
    except (IndexError, TypeError):
        # narrowed from a bare "except:": only a missing row / unexpected
        # result shape should map to UNKNOWN, not e.g. KeyboardInterrupt
        out = 'UNKNOWN'
    return out
def task_get_options(id):
    """Return the options dict stored for BibReformat task *id* in schTASK.

    Exits the process with an error message when the task row is missing
    or its arguments column cannot be unmarshalled.
    (Parameter name 'id' shadows the builtin but is kept for interface
    compatibility.)
    """
    out = {}
    res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibreformat'", (id,))
    try:
        out = marshal.loads(res[0][0])
    except (IndexError, TypeError, ValueError, EOFError):
        # narrowed from a bare "except:"; these cover a missing row and
        # every failure mode marshal.loads documents
        write_message("Error: BibReformat task %d does not seem to exist." % id)
        sys.exit(1)
    return out
def task_run(task_id, process_format):
"""Run the task by fetching its arguments from the BibSched queue.

This is what BibSched invokes via daemon call: loads the stored options,
builds the SQL queries for the requested format, installs the BibSched
signal handlers, and delegates to bibreformat_task().

NOTE(review): the process_format parameter is overwritten below when the
"without" option is set, so the caller-supplied value only acts as a
default.
"""
global options, process, fmt, sched_time
options = task_get_options(task_id) # get options from BibSched task table
## check task id:
if not options.has_key("task"):
write_message("Error: The task #%d does not seem to be a BibReformat task." % task_id)
return
## initialize parameters
if options.has_key("format"):
fmt = options["format"]
else:
fmt = "hb"
# NOTE(review): fmt is interpolated into SQL with %; fmt comes from the
# stored task options, but escaping it would be safer -- confirm.
sql = {
"all" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt,
"last": "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt,
"q1" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'",
"q2" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt
}
# translate stored options into the module-level query state
if options.has_key("all"):
sql_queries.append(sql['all'])
if options.has_key("without"):
process_format = 1
if options.has_key("noprocess"):
process = 0
if options.has_key("last"):
sql_queries.append(sql['last'])
if options.has_key("collection"):
cds_query['collection'] = options['collection']
else:
cds_query['collection'] = ""
if options.has_key("field"):
cds_query['field'] = options['field']
else:
cds_query['field'] = ""
if options.has_key("pattern"):
cds_query['pattern'] = options['pattern']
else:
cds_query['pattern'] = ""
### sql commands to be executed during the script run
###
## check task status:
task_status = task_read_status(task_id)
if task_status != "WAITING":
write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status))
return
## update task status:
task_update_status("RUNNING")
## initialize signal handler:
signal.signal(signal.SIGUSR1, task_sig_sleep)
signal.signal(signal.SIGTERM, task_sig_stop)
signal.signal(signal.SIGABRT, task_sig_suicide)
signal.signal(signal.SIGCONT, task_sig_wakeup)
signal.signal(signal.SIGINT, task_sig_unknown)
## run the task:
bibreformat_task(sql, sql_queries, cds_query, process_format)
## we are done:
task_update_status("DONE")
return
def usage(exitcode=1, msg=""):
    """Write the command-line help text to stderr and exit with *exitcode*.

    When *msg* is non-empty it is reported as an error first.
    """
    if msg:
        sys.stderr.write("Error: %s.\n" % msg)
    help_lines = [
        "Usage: %s [options]\n" % sys.argv[0],
        " -u, --user=USER \t\t User name to submit the task as, password needed.\n",
        " -h, --help \t\t Print this help.\n",
        " -V, --version \t\t Print version information.\n",
        " -v, --verbose=LEVEL \t\t Verbose level (0=min,1=normal,9=max).\n",
        " -s, --sleeptime=SLEEP\t\t Time after which to repeat tasks (no)\n",
        " -t, --time=DATE \t\t Moment for the task to be active (now).\n",
        " -a, --all \t\t All records\n",
        " -c, --collection \t\t Select records by collection\n",
        " -f, --field \t\t Select records by field.\n",
        " -p, --pattern \t\t Select records by pattern.\n",
        " -o, --format \t\t Specify output format to be (re-)created. (default HB)\n",
        " -n, --noprocess \t\t Count records to be processed only (no processing done)\n",
        "\n",
        " Example: bibreformat -n Show how many records are to be bibreformated.",
        "\n",
    ]
    for line in help_lines:
        sys.stderr.write(line)
    sys.exit(exitcode)
def main():
"""Main function that analyzes command line input and calls whatever is appropriate.

When invoked with a single numeric argument it runs that BibSched task;
otherwise it parses submission options and queues a new task.
Useful for learning on how to write BibSched tasks."""
global options, sched_time, sleeptime
## parse command line:
if len(sys.argv) == 2 and sys.argv[1].isdigit():
## A - run the task
task_id = int(sys.argv[1])
process_format = 0
task_run(task_id, process_format)
else:
## B - submit the task
process_format = 0
options = {} # will hold command-line options
options["verbose"] = 1
# NOTE(review): short options 'w' and 'l' and long options --without /
# --last are accepted by getopt but never handled in the chain below,
# so they are silently ignored.  ('l' also appears twice in the short
# option string.)
try:
opts, args = getopt.getopt(sys.argv[1:], "hVv:u:ac:f:s:p:lo:nt:wl", ["help", "version", "verbose=","user=","all","collection=","field=","sleeptime=","pattern=","format=","noprocess","time=","without","last"])
except getopt.GetoptError, err:
usage(1, err)
clp = 0 # default parameters flag
try:
for opt in opts:
if opt[0] in ["-h", "--help"]:
usage(0)
elif opt[0] in ["-V", "--version"]:
print __revision__
sys.exit(0)
elif opt[0] in [ "-u", "--user"]:
options["user"] = opt[1]
elif opt[0] in ["-v", "--verbose"]:
options["verbose"] = int(opt[1])
elif opt[0] in ["-a", "--all"]:
options["all"] = 1
options["without"] = 1
clp = 1
elif opt[0] in ["-c", "--collection"]:
options["collection"]=opt[1]
clp = 1
elif opt[0] in ["-n", "--noprocess"]:
options["noprocess"] = 1
elif opt[0] in ["-f", "--field"]:
options["field"] = opt[1]
clp = 1
elif opt[0] in ["-p","--pattern"]:
options["pattern"] = opt[1]
clp = 1
elif opt[0] in ["-o","--format"]:
options["format"] = opt[1]
elif opt[0] in ["-s", "--sleeptime" ]:
get_date(opt[1]) # see if it is a valid shift
sleeptime = opt[1]
elif opt[0] in [ "-t", "--time" ]:
sched_time = get_date(opt[1])
if clp == 0: # default
options["without"] = 1
options["last"] = 1
except StandardError, e:
# NOTE(review): this passes the exception object as the *exitcode*
# argument of usage(); presumably usage(1, e) was intended -- confirm.
usage(e)
task_submit()
return
### okay, here we go:
# standard script entry point: dispatch to main() only when executed directly
if __name__ == '__main__':
main()
diff --git a/modules/bibharvest/bin/oaiarchive.in b/modules/bibharvest/bin/oaiarchive.in
index 4f8a40107..6303b779e 100644
--- a/modules/bibharvest/bin/oaiarchive.in
+++ b/modules/bibharvest/bin/oaiarchive.in
@@ -1,370 +1,371 @@
#!@PYTHON@
## -*- mode: python; coding: utf-8; -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
OAI repository archive and management tool
Usage: oaiarchive [options]
Options:
-o --oaiset= Specify setSpec
-h --help Print this help
-V --version Print version information and exit
-a --add Add records to OAI repository
-d --delete Remove records from OAI repository
-r --report Print OAI repository status
-i --info Give info about OAI set (default)
-p --upload Upload records
-n --no-process Do not upload records (default)
Examples:
Expose set -setname- via OAI gateway
oaiarchive --oaiset='setname' --add --upload
oaiarchive -apo 'setname'
Remove records defined by set -setname- from OAI repository
oaiarchive --oaiset='setname' --delete --upload
oaiarchive -dpo 'setname'
Expose entire repository via OAI gateway
oaiarchive --oaiset=global --add --upload
oaiarchive -apo global
Print OAI repository status
oaiarchive -r
"""
__revision__ = "$Id$"
try:
import sys
from invenio.oaiarchive_engine import oaiarchive_task
from invenio.oaiarchive_engine import printInfo
from invenio.dbquery import run_sql, escape_string
from invenio.config import *
from invenio.search_engine import perform_request_search
from invenio.search_engine import print_record
from invenio.access_control_engine import acc_authorize_action
import getopt
import getpass
import string
import marshal
import signal
import time
import re
except ImportError, e:
print "Error: %s" % e
sys.exit(1)
## Module-level state shared between task_submit() and task_run().
options = {} # global variable to hold task options
sleeptime = "" # default sleeptime
sched_time = time.strftime("%Y-%m-%d %H:%M:%S") # scheduled execution time in the date/time format
### Bibshed compatibility procedures
###
def get_date(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Return a date string formatted per *format_string*.

    *var* is either an absolute date string in the same format, or a
    relative shift with respect to now such as "+1d", "-2h" or "30m".
    """
    shift_pattern = re.compile("([-\+]{0,1})([\d]+)([dhms])")
    seconds_per_unit = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    now = time.time()
    match = shift_pattern.match(var)
    if match is None:
        # absolute date: validate by round-tripping through strptime
        parsed = time.strptime(var, format_string)
        return time.strftime(format_string, parsed)
    sign_str, amount_str, unit = match.groups()
    if sign_str == "-":
        sign = -1
    else:
        sign = 1
    offset = sign * seconds_per_unit[unit] * float(amount_str)
    shifted = time.localtime(now + offset)
    return time.strftime(format_string, shifted)
def write_message(msg, stream=sys.stdout):
    """Write *msg* with a timestamp prefix and flush the stream.

    *stream* must be sys.stdout or sys.stderr; anything else is reported
    on stderr and the message is dropped.
    """
    if stream not in (sys.stdout, sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    prefix = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(prefix)
    stream.write("%s\n" % msg)
    stream.flush()
def task_sig_sleep(sig, frame):
"""Signal handler for the 'sleep' signal sent by BibSched.

Reports the SLEEPING status, then blocks in signal.pause() until the
wake-up signal (SIGCONT, handled by task_sig_wakeup) arrives.
"""
if options["verbose"] >= 9:
write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame))
write_message("sleeping...")
task_update_status("SLEEPING")
signal.pause() # wait for wake-up signal
def task_sig_wakeup(sig, frame):
    """Handle BibSched's 'wakeup' signal: resume after a sleep."""
    verbose = options["verbose"]
    if verbose >= 9:
        debug_msg = "task_sig_wakeup(), got signal %s frame %s" % (sig, frame)
        write_message(debug_msg)
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
    """Handle BibSched's 'stop' signal: flush, close and exit cleanly."""
    if options["verbose"] >= 9:
        write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame))
    write_message("stopping...")
    task_update_status("STOPPING")
    # announce each shutdown phase, pausing to let pending work settle
    for note, pause in (("flushing cache or whatever...", 3),
                        ("closing tables or whatever...", 1)):
        write_message(note)
        time.sleep(pause)
    write_message("stopped")
    task_update_status("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Handle BibSched's 'suicide' signal: abandon the task immediately."""
    if options["verbose"] >= 9:
        write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame))
    for note, status in (("suiciding myself now...", "SUICIDING"),
                         ("suicided", "SUICIDED")):
        write_message(note)
        task_update_status(status)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handle any signal without a dedicated handler: log it and carry on."""
    # do nothing for unknown signals beyond logging them:
    notice = "unknown signal %d (frame %s) ignored" % (sig, frame)
    write_message(notice)
def authenticate(user, header="OAI Archive Task Submission", action="runoaiarchive"):
"""Authenticate the user against the user database.
Check for its password, if it exists.
Check for action access rights.
Return user name upon authorization success,
do system exit upon authorization failure.
"""
print header
print "=" * len(header)
if user == "":
print >> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print >> sys.stdout, "\rUsername:", user
## first check user pw:
- res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
+ res = run_sql("select id,password from user where email=%s", (user,), 1) + \
+ run_sql("select id,password from user where nickname=%s", (user,), 1)
if not res:
print "Sorry, %s does not exist." % user
sys.exit(1)
else:
(uid_db, password_db) = res[0]
if password_db:
password_entered = getpass.getpass()
if password_db == password_entered:
pass
else:
print "Sorry, wrong credentials for %s." % user
sys.exit(1)
## secondly check authorization for the action:
(auth_code, auth_message) = acc_authorize_action(uid_db, action)
if auth_code != 0:
print auth_message
sys.exit(1)
return user
def task_submit():
    """Submit this task to the BibSched task queue (CLI entry path).

    Authenticates the invoking user, inserts a WAITING 'oaiarchive' row
    into schTASK, then stores the task id back into the serialized
    options.  Returns the new task id.
    """
    # BUGFIX: the original declared "global ... sleep_time", a name that
    # does not exist; the module-level variable is "sleeptime".
    global options, sched_time, sleeptime
    ## sanity check: remove eventual "task" option:
    if options.has_key("task"):
        del options["task"]
    ## authenticate user:
    user = authenticate(options.get("user", ""))
    ## submit task:
    task_id = run_sql("""INSERT INTO schTASK (id,proc,user,status,arguments,sleeptime,runtime) VALUES (NULL,'oaiarchive',%s,'WAITING',%s,%s,%s)""",
                      (user, marshal.dumps(options), sleeptime, escape_string(sched_time)))
    ## update task number:
    options["task"] = task_id
    run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id))
    write_message("Task #%d submitted." % task_id)
    return task_id
def task_update_progress(msg):
    """Store *msg* in the progress column of this task's schTASK row."""
    global options
    query = "UPDATE schTASK SET progress=%s where id=%s"
    return run_sql(query, (msg, options["task"]))
def task_update_status(val):
    """Store *val* in the status column of this task's schTASK row."""
    global options
    query = "UPDATE schTASK SET status=%s where id=%s"
    return run_sql(query, (val, options["task"]))
def task_read_status(task_id):
    """Return the status string of task *task_id* from the schTASK table.

    Returns 'UNKNOWN' when no such task row exists.
    """
    res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1)
    try:
        out = res[0][0]
    except (IndexError, TypeError):
        # narrowed from a bare "except:": only a missing row / unexpected
        # result shape should map to UNKNOWN, not e.g. KeyboardInterrupt
        out = 'UNKNOWN'
    return out
def task_get_options(id):
    """Return the options dict stored for OAIarchive task *id* in schTASK.

    Exits the process with an error message when the task row is missing
    or its arguments column cannot be unmarshalled.
    (Parameter name 'id' shadows the builtin but is kept for interface
    compatibility.)
    """
    out = {}
    res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='oaiarchive'", (id,))
    try:
        out = marshal.loads(res[0][0])
    except (IndexError, TypeError, ValueError, EOFError):
        # narrowed from a bare "except:"; these cover a missing row and
        # every failure mode marshal.loads documents
        write_message("Error: OAIarchive task %d does not seem to exist." % id)
        sys.exit(1)
    return out
def task_run(task_id):
"""Run the OAI archive task whose arguments are stored in schTASK.

Loads the stored options, installs the BibSched signal handlers and
delegates to oaiarchive_task().

NOTE(review): the 'if options.has_key("option"):' test below appears to
have lost its body to formatting damage -- it is unclear from this copy
whether the statements that follow were meant to be guarded by it.
Confirm against the upstream file.
"""
global options
options = task_get_options(task_id) # get options from BibSched task table
## check task id:
if not options.has_key("task"):
write_message("Error: The task #%d does not seem to be a OAI archive task." % task_id)
return
## initialize parameters
if options.has_key("option"):
### sql commands to be executed during the script run
###
## check task status:
task_status = task_read_status(task_id)
if task_status != "WAITING":
write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status))
return
## update task status:
task_update_status("RUNNING")
## initialize signal handler:
signal.signal(signal.SIGUSR1, task_sig_sleep)
signal.signal(signal.SIGTERM, task_sig_stop)
signal.signal(signal.SIGABRT, task_sig_suicide)
signal.signal(signal.SIGCONT, task_sig_wakeup)
signal.signal(signal.SIGINT, task_sig_unknown)
## run the task:
oaiarchive_task(options)
## we are done:
task_update_status("DONE")
return
#########################
def main():
"""Main function that analyzes command line input and calls whatever is appropriate.

When invoked with a single numeric argument it runs that BibSched task;
otherwise it parses submission options and queues a new task.
Useful for learning on how to write BibSched tasks."""
global options, sched_time, sleeptime
## parse command line:
if len(sys.argv) == 2 and sys.argv[1].isdigit():
## A - run the task
task_id = int(sys.argv[1])
task_run(task_id)
else:
## B - submit the task
options = {} # will hold command-line options
options["verbose"] = 1
try:
opts, args = getopt.getopt(sys.argv[1:], "hVv:u:s:t:ado:pirn", ["help", "version", "verbose=","user=","sleeptime=","time=","add","delete","oaiset=","upload","info","report","no-process"])
except getopt.GetoptError:
printInfo()
sys.exit(1)
## set defaults
options["upload"] = 0
options["mode"] = 0
options["oaiset"] = ""
options["nice"] = 0
try:
for opt in opts:
if opt[0] in ["-h", "--help"]:
printInfo()
sys.exit(0)
elif opt[0] in ["-V", "--version"]:
print __revision__
sys.exit(0)
elif opt[0] in [ "-u", "--user"]:
options["user"] = opt[1]
elif opt[0] in ["-v", "--verbose"]:
options["verbose"] = int(opt[1])
elif opt[0] in ["-s", "--sleeptime" ]:
get_date(opt[1]) # see if it is a valid shift
sleeptime = opt[1]
elif opt[0] in [ "-t", "--time" ]:
sched_time = get_date(opt[1])
# NOTE(review): "-n" is claimed here by the "--nice" branch, but the
# usage text documents -n as --no-process; since this elif comes
# first, -n always sets "nice" and the --no-process branch below is
# reachable only via the long option.  "--nice" itself is not in the
# getopt long-option list, so it can never match here.  Confirm the
# intended mapping.
elif opt[0] in ["-n", "--nice"]:
options["nice"] = opt[1]
elif opt[0] in ["-o", "--oaiset"]:
options["oaiset"] = opt[1]
elif opt[0] in ["-a", "--add"]:
options["mode"] = 1
elif opt[0] in ["-d", "--delete"]:
options["mode"] = 2
elif opt[0] in ["-p", "--upload"]:
options["upload"] = 1
elif opt[0] in ["-i", "--info"]:
options["mode"] = 0
elif opt[0] in ["-r", "--report"]:
options["mode"] = 3
elif opt[0] in ["-n", "--no-process"]:
options["upload"] = 0
except StandardError:
printInfo()
sys.exit(1)
task_submit()
return
diff --git a/modules/bibharvest/lib/oaiharvestlib.py b/modules/bibharvest/lib/oaiharvestlib.py
index 5fd01eaae..55e8e8681 100644
--- a/modules/bibharvest/lib/oaiharvestlib.py
+++ b/modules/bibharvest/lib/oaiharvestlib.py
@@ -1,640 +1,641 @@
# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
oaiharvest implementation. See oaiharvest executable for entry point.
"""
__revision__ = "$Id$"
import marshal
import getopt
import getpass
import string
import os
import sre
import sys
import time
import signal
import traceback
import calendar
from invenio.config import \
bibconvert, \
bibupload, \
bindir, \
tmpdir, \
version
from invenio.bibindex_engine_config import *
from invenio.dbquery import run_sql, escape_string
from invenio.access_control_engine import acc_authorize_action
options = {} # global variable to hold task options
## precompile some often-used regexp for speed reasons:
## BUGFIX: the entity alternative in sre_html had lost its "&#" prefix,
## leaving "|?\w+;" -- an invalid pattern ("nothing to repeat") that made
## this module fail at import time; restored to match HTML/XML entities.
sre_subfields = sre.compile('\$\$\w')
sre_html = sre.compile("(?s)<[^>]*>|&#?\w+;")
# optional sign, a number, and a unit (days/hours/minutes/seconds):
sre_datetime_shift = sre.compile("([-\+]{0,1})([\d]+)([dhms])")
# working file prefix for harvested output:
tmpHARVESTpath = tmpdir + '/oaiharvest'
def write_message(msg, stream=sys.stdout):
    """Write MSG, prefixed by a timestamp, to STREAM and flush it.

    STREAM must be sys.stdout or sys.stderr; anything else is reported
    on sys.stderr and the message is dropped.
    """
    if stream not in (sys.stdout, sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(timestamp)
    stream.write("%s\n" % msg)
    stream.flush()
    return
def usage(code, msg=''):
    """Print usage help (optionally preceded by an error MSG) to stderr
    and exit the process with status CODE."""
    if msg:
        sys.stderr.write("Error: %s.\n" % msg)
    # NOTE: this help text is emitted verbatim on stderr.
    usagetext = """ Usage: oaiharvest [options]
 Examples:
      oaiharvest -r arxiv -s 24h
      oaiharvest -r pubmed -d 2005-05-05:2005-05-10 -t 10m
 Specific options:
 -r, --repository=REPOS_ONE,"REPOS TWO"   name of the OAI repositories to be harvested (default=all)
 -d, --dates=yyyy-mm-dd:yyyy-mm-dd   harvest repositories between specified dates (overrides repositories' last updated timestamps)
 Scheduling options:
 -u, --user=USER   user name to store task, password needed
 -s, --sleeptime=SLEEP   time after which to repeat tasks (no)
      e.g.: 1s, 30m, 24h, 7d
 -t, --time=TIME   moment for the task to be active (now)
      e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
 General options:
 -h, --help   print this help and exit
 -V, --version   print version and exit
 -v, --verbose=LEVEL   verbose level (from 0 to 9, default 1)
"""
    sys.stderr.write(usagetext)
    sys.exit(code)
def authenticate(user, header="oaiharvest Task Submission", action="runoaiharvest"):
"""Authenticate the user against the user database.
Check for its password, if it exists.
Check for action access rights.
Return user name upon authorization success,
do system exit upon authorization failure.
"""
print header
print "=" * len(header)
if user == "":
print >> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print >> sys.stdout, "\rUsername:", user
## first check user pw:
- res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
+ res = run_sql("select id,password from user where email=%s", (user,), 1) + \
+ run_sql("select id,password from user where nickname=%s", (user,), 1)
if not res:
print "Sorry, %s does not exist." % user
sys.exit(1)
else:
(uid_db, password_db) = res[0]
if password_db:
password_entered = getpass.getpass()
if password_db == password_entered:
pass
else:
print "Sorry, wrong credentials for %s." % user
sys.exit(1)
## secondly check authorization for the action:
(auth_code, auth_message) = acc_authorize_action(uid_db, action)
if auth_code != 0:
print auth_message
sys.exit(1)
return user
def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Return a date string rendered with FORMAT_STRING.

    VAR is either an absolute date in the same format, or a relative
    shift with respect to now such as "+2h", "-30m", "7d".
    """
    shift = sre_datetime_shift.match(var)
    if shift:
        sign_char, amount, unit = shift.groups()
        seconds_per_unit = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
        delta = float(amount) * seconds_per_unit[unit]
        if sign_char == "-":
            delta = -delta
        when = time.localtime(time.time() + delta)
    else:
        # absolute date: round-trip through strptime to validate it
        when = time.strptime(var, format_string)
    return time.strftime(format_string, when)
def task_run(row):
    """Run the harvesting task. The row argument is the Bibharvest task
    queue row, containing id, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    global options
    reposlist = []
    datelist = []
    dateflag = 0
    # read from SQL row:
    task_id = row[0]
    task_proc = row[1]
    options = marshal.loads(row[6])
    task_status = row[7]
    # sanity check:
    if task_proc != "oaiharvest":
        write_message("The task #%d does not seem to be a oaiharvest task." % task_id, sys.stderr)
        return 0
    if task_status != "WAITING":
        write_message("The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr)
        return 0
    # we can run the task now:
    if options["verbose"]:
        write_message("Task #%d started." % task_id)
    task_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    task_update_status("RUNNING")
    # install signal handlers
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    ### go ahead: build up the reposlist
    if options["repository"] != None:
        ### user requests harvesting from selected repositories
        write_message("harvesting from selected repositories")
        for reposname in options["repository"]:
            row = get_row_from_reposname(reposname)
            if row==[]:
                write_message("source name " + reposname + " is not valid")
                continue
            else:
                reposlist.append(get_row_from_reposname(reposname))
    else:
        ### user requests harvesting from all repositories
        write_message("harvesting from all repositories in the database")
        reposlist = get_all_rows_from_db()
    ### go ahead: check if user requested from-until harvesting
    if options["dates"]:
        ### for each repos simply perform a from-until date harvesting... no need to update anything
        dateflag = 1
        for element in options["dates"]:
            datelist.append(element)
    # each repos is a one-element [row] wrapper; row layout (see
    # get_all_rows_from_db): [0]=id [1]=baseurl [2]=metadataprefix
    # [5]=bibconvertcfgfile [6]=name [7]=lastrun [8]=frequency [9]=postprocess
    for repos in reposlist:
        postmode = str(repos[0][9])
        if postmode=="h" or postmode=="h-c" or postmode=="h-u" or postmode=="h-c-u":
            harvestpath = tmpdir + "/oaiharvest" + str(os.getpid())
            if dateflag == 1:
                # explicit from-until range given on the command line:
                res = call_bibharvest(prefix=repos[0][2], baseurl=repos[0][1], harvestpath=harvestpath, fro=str(datelist[0]), until=str(datelist[1]))
                if res==0 :
                    write_message("source " + str(repos[0][6]) + " was harvested from " + str(datelist[0]) + " to " + str(datelist[1]))
                else :
                    write_message("an error occurred while harvesting from source " + str(repos[0][6]) + " for the dates chosen")
                    continue
            elif dateflag != 1 and repos[0][7] == None and repos[0][8] != 0:
                # lastrun is NULL: repository was never harvested before
                write_message("source " + str(repos[0][6]) + " was never harvested before - harvesting whole repository")
                res = call_bibharvest(prefix=repos[0][2], baseurl=repos[0][1], harvestpath=harvestpath)
                if res==0 :
                    update_lastrun(repos[0][0])
                else :
                    write_message("an error occurred while harvesting from source " + str(repos[0][6]))
                    continue
            elif dateflag != 1 and repos[0][8] != 0:
                ### check that update is actually needed, i.e. lastrun+frequency>today
                timenow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                lastrundate = sre.sub(r'\.[0-9]+$', '', str(repos[0][7])) # remove trailing .00
                timeinsec = int(repos[0][8])*60*60
                updatedue = add_timestamp_and_timelag(lastrundate, timeinsec)
                proceed = compare_timestamps_with_tolerance(updatedue, timenow)
                if proceed==0 or proceed==-1 : #update needed!
                    write_message("source " + str(repos[0][6]) + " is going to be updated")
                    fromdate = str(repos[0][7])
                    fromdate = fromdate.split()[0] # get rid of time of the day for the moment
                    res = call_bibharvest(prefix=repos[0][2], baseurl=repos[0][1], harvestpath=harvestpath, fro=fromdate)
                    if res==0 :
                        update_lastrun(repos[0][0])
                    else :
                        write_message("an error occurred while harvesting from source " + str(repos[0][6]))
                        continue
                else:
                    write_message("source " + str(repos[0][6]) + " does not need updating")
                    continue
            elif dateflag != 1 and repos[0][8] == 0:
                # frequency 0 means "Never": skip this repository
                write_message("source " + str(repos[0][6]) + " has frequency set to 'Never' so it will not be updated")
                continue
            # postprocessing after a successful harvest, per postmode:
            if postmode=="h-u":
                res = call_bibupload(convertpath=harvestpath)
                if res==0 :
                    write_message("material harvested from source " + str(repos[0][6]) + " was successfully uploaded")
                else :
                    write_message("an error occurred while uploading harvest from " + str(repos[0][6]))
                    continue
            if postmode=="h-c" or postmode=="h-c-u":
                convertpath = tmpdir + "/bibconvertrun" + str(os.getpid())
                res = call_bibconvert(config=str(repos[0][5]), harvestpath=harvestpath, convertpath=convertpath)
                if res==0 :
                    write_message("material harvested from source " + str(repos[0][6]) + " was successfully converted")
                else :
                    write_message("an error occurred while converting from " + str(repos[0][6]))
                    continue
            if postmode=="h-c-u":
                res = call_bibupload(convertpath=convertpath)
                if res==0 :
                    write_message("material harvested from source " + str(repos[0][6]) + " was successfully uploaded")
                else :
                    write_message("an error occurred while uploading harvest from " + str(repos[0][6]))
                    continue
        elif postmode not in ["h", "h-c", "h-u", "h-c-u"] : ### this should not happen
            write_message("invalid postprocess mode: " + postmode + " skipping repository")
            continue
    task_update_status("DONE")
    if options["verbose"]:
        write_message("Task #%d finished." % task_id)
    return 1
def add_timestamp_and_timelag(timestamp,
                              timelag):
    """Return TIMESTAMP ("YYYY-MM-DD HH:MM:SS", possibly carrying a
    trailing fractional part) shifted forward by TIMELAG seconds."""
    # drop any trailing fractional seconds (e.g. ".00"):
    clean = sre.sub(r'\.[0-9]+$', '', timestamp)
    # convert to Unix epoch seconds, apply the lag, format back:
    epoch = calendar.timegm(time.strptime(clean, "%Y-%m-%d %H:%M:%S"))
    return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(epoch + timelag))
def update_lastrun(index):
""" A method that updates the lastrun of a repository successfully harvested """
try:
today = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
sql = 'UPDATE oaiHARVEST SET lastrun="%s" WHERE id=%s' % (today, index)
res = run_sql(sql)
return 1
except StandardError, e:
return (0,e)
def call_bibharvest(prefix, baseurl, harvestpath, fro="", until=""):
    """Run the bibharvest executable to OAI-harvest BASEURL using
    metadata prefix PREFIX, optionally restricted to the FRO..UNTIL date
    range, then merge its per-chunk output files into HARVESTPATH.
    Return 0 on success, or a (0, exception) tuple on failure."""
    try:
        command = '%s/bibharvest -o %s -v ListRecords -p %s ' % (bindir, harvestpath, prefix)
        if fro!="":
            command += '-f %s ' % fro
        if until!="":
            command += '-u %s ' % until
        command += baseurl
        print "Start harvesting"
        #print command
        ret = os.system(command)
        print "Harvesting finished, merging files"
        # bibharvest may split its output into several files in the same
        # directory whose names all start with the requested filename;
        # concatenate them (in sorted order) into HARVESTPATH itself:
        harvestdir, filename = os.path.split(harvestpath)
        #print "get files"
        files = os.listdir(harvestdir)
        #print "sort file"
        files.sort()
        #print "open dest file"
        hf = file(harvestpath, 'w')
        for f in files:
            if f[:len(filename)] == filename:
                print "processing file %s"%f
                rf = file(os.path.join(harvestdir, f), 'r')
                hf.write(rf.read())
                hf.write("\n")
                rf.close()
                #os.remove(os.path.join(harvestdir, f))
        hf.close()
        print "Files merged"
        return 0
    except StandardError, e:
        return (0,e)
def call_bibconvert(config, harvestpath, convertpath):
    """Run BibConvert with configuration file CONFIG, feeding it the
    harvested file HARVESTPATH and redirecting the converted output to
    CONVERTPATH.  Always returns 0."""
    cmd = """%s/bibconvert -c %s < %s > %s """ % (bindir, config, harvestpath, convertpath)
    os.popen(cmd)
    return 0
def call_bibupload(convertpath):
    """Upload the file CONVERTPATH into the database by invoking the
    bibupload command-line tool; return its os.system() exit status."""
    cmd = '%s/bibupload -i %s ' % (bindir, convertpath)
    return os.system(cmd)
def get_row_from_reposname(reposname):
""" Returns all information about a row (OAI source) from the source name """
try:
sql = 'select * from oaiHARVEST where name="%s"' % escape_string(reposname)
res = run_sql(sql)
reposdata = []
for element in res:
reposdata.append(element)
return reposdata
except StandardError, e:
return (0,e)
def get_all_rows_from_db():
""" This method retrieves the full database of repositories and returns a list containing (in exact order):
| id | baseurl | metadataprefix | arguments | comment | bibconvertcfgfile | name | lastrun | frequency | postprocess |
"""
try:
reposlist = []
sql = """select id from oaiHARVEST"""
idlist = run_sql(sql)
for index in idlist:
sql = """select * from oaiHARVEST where id=%s""" % index
reposelements = run_sql(sql)
repos = []
for element in reposelements:
repos.append(element)
reposlist.append(repos)
return reposlist
except StandardError, e:
return (0,e)
def compare_timestamps_with_tolerance(timestamp1,
                                      timestamp2,
                                      tolerance=0):
    """Compare two '2005-03-31 17:37:26'-style timestamps, allowing a
    TOLERANCE in seconds.  Return -1 if TIMESTAMP1 < TIMESTAMP2 -
    TOLERANCE, 1 if TIMESTAMP1 > TIMESTAMP2 + TOLERANCE, else 0."""
    def _epoch_seconds(stamp):
        # strip trailing fractional seconds, then convert to epoch time:
        stamp = sre.sub(r'\.[0-9]+$', '', stamp)
        return calendar.timegm(time.strptime(stamp, "%Y-%m-%d %H:%M:%S"))
    seconds1 = _epoch_seconds(timestamp1)
    seconds2 = _epoch_seconds(timestamp2)
    if seconds1 < seconds2 - tolerance:
        return -1
    if seconds1 > seconds2 + tolerance:
        return 1
    return 0
def command_line():
global options
long_flags =["repository=", "dates="
"user=","sleeptime=","time=",
"help", "version", "verbose="]
short_flags ="r:d:u:s:t:hVv:"
format_string = "%Y-%m-%d %H:%M:%S"
repositories = None
dates = None
sleeptime = ""
try:
opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags)
except getopt.GetoptError, err:
write_message(err, sys.stderr)
usage(1)
if args:
usage(1)
options={"sleeptime":0, "verbose":1, "repository":0, "dates":0}
sched_time = time.strftime(format_string)
user = ""
# Check for key options
try:
for opt in opts:
if opt == ("-h","") or opt == ("--help",""):
usage(1)
elif opt == ("-V","") or opt == ("--version",""):
print __revision__
sys.exit(1)
elif opt[0] in ["--verbose", "-v"]:
options["verbose"] = int(opt[1])
elif opt[0] in [ "-r", "--repository" ]:
repositories = opt[1]
elif opt[0] in [ "-d", "--dates" ]:
dates = opt[1]
elif opt[0] in [ "-u", "--user"]:
user = opt[1]
elif opt[0] in [ "-s", "--sleeptime" ]:
get_datetime(opt[1]) # see if it is a valid shift
sleeptime= opt[1]
elif opt[0] in [ "-t", "--time" ]:
sched_time= get_datetime(opt[1])
else: usage(1)
except StandardError, e:
write_message(e, sys.stderr)
sys.exit(1)
options["repository"]=get_repository_names(repositories)
if dates != None:
options["dates"]=get_dates(dates)
if dates != None and options["dates"]==None:
write_message("Date format not valid. Quitting task...")
sys.exit(1)
user = authenticate(user)
if options["verbose"] >= 9:
print ""
write_message("storing task options %s\n" % options)
## sanity check: remove eventual "task" option:
if options.has_key("task"):
del options["task"]
new_task_id = run_sql("""INSERT INTO schTASK (proc,user,runtime,sleeptime,arguments,status)
VALUES ('oaiharvest',%s,%s,%s,%s,'WAITING')""",
(user, sched_time, sleeptime, marshal.dumps(options)))
## update task number:
options["task"] = new_task_id
run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), new_task_id))
print "Task #%d was successfully scheduled for execution." % new_task_id
return
def get_dates(dates):
    """Validate and process the harvesting date range given on the
    command line as 'yyyy-mm-dd:yyyy-mm-dd'.
    Return [fromdate, untildate] on success and None on any error."""
    if not dates:
        return None
    datestring = string.split(dates, ":")
    if len(datestring) != 2:
        write_message("Dates have invalid format, not 'yyyy-mm-dd:yyyy-mm-dd'")
        return None
    twodates = []
    for date in datestring:
        ### perform some checks on the date format
        datechunks = string.split(date, "-")
        if len(datechunks) != 3:
            write_message("Dates have invalid format, not 'yyyy-mm-dd:yyyy-mm-dd'")
            return None
        try:
            # every component must be numeric and non-zero.
            # BUGFIX: dates with a zero component (e.g. "2005-00-10")
            # used to be skipped silently, crashing below with an
            # IndexError; they are now reported as invalid.
            if int(datechunks[0]) and int(datechunks[1]) and int(datechunks[2]):
                twodates.append(date)
            else:
                write_message("Dates have invalid format, not 'yyyy-mm-dd:yyyy-mm-dd'")
                return None
        except StandardError:
            write_message("Dates have invalid format, not 'yyyy-mm-dd:yyyy-mm-dd'")
            return None
    ## final check.. date1 must me smaller than date2
    date1 = str(twodates[0]) + " 01:00:00"
    date2 = str(twodates[1]) + " 01:00:00"
    if compare_timestamps_with_tolerance(date1, date2) != -1:
        write_message("First date must be before second date.")
        return None
    return twodates
def get_repository_names(repositories):
    """Validate and process the comma-separated repository names given
    on the command line.  Multi-word names may be wrapped in single or
    double quotes.  Return a list of names, or None when REPOSITORIES
    is empty/None."""
    if not repositories:
        return None
    repository_names = []
    for name in repositories.split(","):
        ### strip wrapping quotes from multi-word names.
        ### BUGFIX: the old test `name.find(quote) == len(name)` could
        ### never be true (find() returns at most len(name)-1), so the
        ### quotes were never stripped; use startswith/endswith instead.
        for quote in ("'", '"'):
            if len(name) > 1 and name.startswith(quote) and name.endswith(quote):
                name = name[1:-1]
                break
        repository_names.append(name)
    return repository_names
def task_sig_sleep(sig, frame):
    """Handle the 'sleep' signal sent by BibSched: report, mark the task
    SLEEPING and block until a wake-up signal arrives."""
    if options["verbose"] >= 9:
        details = "task_sig_sleep(), got signal %s frame %s" % (sig, frame)
        write_message(details)
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # block here until BibSched sends the wake-up signal:
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle the 'wakeup' signal sent by BibSched: mark the task as
    continuing its work."""
    if options["verbose"] >= 9:
        details = "task_sig_wakeup(), got signal %s frame %s" % (sig, frame)
        write_message(details)
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
"""Signal handler for the 'stop' signal sent by BibSched."""
if options["verbose"] >= 9:
write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame))
write_message("stopping...")
task_update_status("STOPPING")
errcode = 0
try:
task_sig_stop_commands()
write_message("stopped")
task_update_status("STOPPED")
except StandardError, err:
write_message("Error during stopping! %e" % err)
task_update_status("STOPPINGFAILED")
errcode = 1
sys.exit(errcode)
def task_sig_stop_commands():
    """Execute whatever commands are necessary to stop the task cleanly
    before quitting.  Called from the task_sig_stop() handler; at the
    moment there is no task-specific cleanup to perform."""
    write_message("stopping commands started")
    # no task-specific cleanup needed (yet)
    write_message("stopping commands ended")
def task_sig_suicide(sig, frame):
    """Handle the 'suicide' signal sent by BibSched: record the status
    transitions and terminate the process immediately."""
    if options["verbose"] >= 9:
        details = "task_sig_suicide(), got signal %s frame %s" % (sig, frame)
        write_message(details)
    write_message("suiciding myself now...")
    task_update_status("SUICIDING")
    write_message("suicided")
    task_update_status("SUICIDED")
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handle any other signal sent by the shell or the user: just log
    it and carry on."""
    # do nothing for unknown signals:
    note = "unknown signal %d (frame %s) ignored" % (sig, frame)
    write_message(note)
def task_update_progress(msg):
    """Update the progress column of this task's row in the BibSched
    task table; return the run_sql() result."""
    global options
    if options["verbose"] >= 9:
        write_message("Updating task progress to %s." % msg)
    task_id = options["task"]
    return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, task_id))
def task_update_status(val):
    """Update the status column of this task's row in the BibSched task
    table; return the run_sql() result."""
    global options
    if options["verbose"] >= 9:
        write_message("Updating task status to %s." % val)
    task_id = options["task"]
    return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, task_id))
def main():
"""Reads arguments and either runs the task, or starts user-interface (command line)."""
if len(sys.argv) == 2:
try:
task_id = int(sys.argv[1])
except StandardError:
command_line()
sys.exit()
res = run_sql("SELECT * FROM schTASK WHERE id='%d'" % (task_id), None, 1)
if not res:
write_message("Selected task not found.", sys.stderr)
sys.exit(1)
try:
if not task_run(res[0]):
write_message("Error occurred. Exiting.", sys.stderr)
except StandardError, e:
write_message("Unexpected error occurred: %s." % e, sys.stderr)
if options["verbose"] >= 9:
traceback.print_tb(sys.exc_info()[2])
write_message("Exiting.")
task_update_status("ERROR")
else:
command_line()
diff --git a/modules/bibindex/lib/bibindex_engine.py b/modules/bibindex/lib/bibindex_engine.py
index dfade3868..f2f42fe79 100644
--- a/modules/bibindex/lib/bibindex_engine.py
+++ b/modules/bibindex/lib/bibindex_engine.py
@@ -1,1659 +1,1660 @@
# -*- coding: utf-8 -*-
##
## $Id$
## BibIndxes bibliographic data, reference and fulltext indexing utility.
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibIndex indexing engine implementation. See bibindex executable for entry point.
"""
__revision__ = "$Id$"
import marshal
from zlib import compress,decompress
import string
import getopt
import getpass
import os
import sre
import sys
import time
import Numeric
import urllib2
import signal
import tempfile
import traceback
import cStringIO
from invenio.config import \
CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS, \
CFG_BIBINDEX_CHARS_PUNCTUATION, \
CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY, \
CFG_BIBINDEX_MIN_WORD_LENGTH, \
CFG_BIBINDEX_REMOVE_HTML_MARKUP, \
CFG_BIBINDEX_STEMMER_DEFAULT_LANGUAGE, \
CFG_MAX_RECID, \
version, \
weburl
from invenio.bibindex_engine_config import *
from invenio.search_engine import perform_request_search, strip_accents
from invenio.dbquery import run_sql, escape_string, DatabaseError
from invenio.access_control_engine import acc_authorize_action
from invenio.bibindex_engine_stopwords import is_stopword
from invenio.bibindex_engine_stemmer import stem
## import optional modules:
try:
    import psyco
    # NOTE(review): the names bound below (get_words_from_phrase,
    # WordTable, serialize_via_*, ...) are defined further down in this
    # module, so they do not exist yet when this top-level code runs;
    # the bare except swallows the resulting NameError and the psyco
    # bindings silently never take effect.  Kept as-is since psyco is
    # an optional speed-up anyway.
    psyco.bind(get_words_from_phrase)
    psyco.bind(WordTable.merge_with_old_recIDs)
    psyco.bind(serialize_via_numeric_array)
    psyco.bind(serialize_via_marshal)
    psyco.bind(deserialize_via_numeric_array)
    psyco.bind(deserialize_via_marshal)
except:
    pass
def write_message(msg, stream=sys.stdout):
    """Write MSG to STREAM (sys.stdout or sys.stderr), prefixed with a
    timestamp, and flush the stream.  Any other stream is rejected with
    a notice on sys.stderr."""
    if stream == sys.stdout or stream == sys.stderr:
        prefix = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
        stream.write(prefix)
        stream.write("%s\n" % msg)
        stream.flush()
    else:
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
    return
## precompile some often-used regexp for speed reasons:
## BUGFIX: the entity alternative in sre_html had lost its "&#" prefix,
## leaving "|?\w+;" -- an invalid pattern ("nothing to repeat") that made
## this module fail at import time; restored to match HTML/XML entities.
sre_subfields = sre.compile('\$\$\w')
sre_html = sre.compile("(?s)<[^>]*>|&#?\w+;")
sre_block_punctuation_begin = sre.compile(r"^"+CFG_BIBINDEX_CHARS_PUNCTUATION+"+")
sre_block_punctuation_end = sre.compile(CFG_BIBINDEX_CHARS_PUNCTUATION+"+$")
sre_punctuation = sre.compile(CFG_BIBINDEX_CHARS_PUNCTUATION)
sre_separators = sre.compile(CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS)
sre_datetime_shift = sre.compile("([-\+]{0,1})([\d]+)([dhms])")
nb_char_in_line = 50 # for verbose pretty printing
chunksize = 1000 # default size of chunks that the records will be treated by
wordTables = []
base_process_size = 4500 # process base size
options = {} # will hold task options
## Dictionary merging functions
def intersection(dict1, dict2):
    """Return the intersection of the two dictionaries: a dict whose
    keys are the keys common to both inputs, each mapped to 1."""
    # iterate over the smaller dict and probe the larger one:
    if len(dict1) > len(dict2):
        smaller, larger = dict2, dict1
    else:
        smaller, larger = dict1, dict2
    common = {}
    for key in smaller:
        if key in larger:
            common[key] = 1
    return common
def union(dict1, dict2):
    """Return the union of the two dictionaries: a dict whose keys are
    all keys of either input, each mapped to 1."""
    return dict.fromkeys(list(dict1) + list(dict2), 1)
def diff(dict1, dict2):
    """Return dict1 - dict2: the keys present in dict1 but absent from
    dict2, each mapped to 1."""
    remaining = [key for key in dict1 if key not in dict2]
    return dict.fromkeys(remaining, 1)
def list_union(list1, list2):
    """Return the union of the two lists as a list of distinct elements
    (order not guaranteed)."""
    seen = {}
    for element in list(list1) + list(list2):
        seen[element] = 1
    return seen.keys()
## safety function for killing slow DB threads:
def kill_sleepy_mysql_threads(max_threads=CFG_MAX_MYSQL_THREADS, thread_timeout=CFG_MYSQL_THREAD_TIMEOUT):
    """If there are more than MAX_THREADS DB threads, kill every thread
    that has been in a sleeping state for longer than THREAD_TIMEOUT
    seconds, logging a warning for each kill.  (This is useful for
    working around the max_connection problem that appears during
    indexation in some not-yet-understood cases.)"""
    threads = run_sql("SHOW FULL PROCESSLIST")
    if len(threads) <= max_threads:
        return
    for thread in threads:
        r_id, r_user, r_host, r_db, r_command, r_time, r_state, r_info = thread
        if r_command == "Sleep" and int(r_time) > thread_timeout:
            run_sql("KILL %s", (r_id,))
            if options["verbose"] >= 1:
                write_message("WARNING: too many DB threads, killing thread %s" % r_id)
    return
## MARC-21 tag/field access functions
def get_fieldvalues(recID, tag):
    """Returns list of values of the MARC-21 'tag' fields for the record
    'recID'."""
    bibXXx = "bib" + tag[0] + tag[1] + "x"
    bibrec_bibXXx = "bibrec_" + bibXXx
    # table names cannot be bound parameters, but recID and tag can --
    # bind them instead of interpolating, to avoid SQL injection:
    query = "SELECT value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id AND tag LIKE %%s" \
            % (bibXXx, bibrec_bibXXx)
    res = run_sql(query, (recID, tag))
    return [row[0] for row in res]
def get_associated_subfield_value(recID, tag, value, associated_subfield_code):
    """Return list of ASSOCIATED_SUBFIELD_CODE, if exists, for record
    RECID and TAG of value VALUE. Used by fulltext indexer only.
    Note: TAG must be 6 characters long (tag+ind1+ind2+sfcode),
    otherwise an empty string is returned.
    FIXME: what if many tag values have the same value but different
    associated_subfield_code? Better use bibrecord library for this.
    """
    out = ""
    if len(tag) != 6:
        return out
    bibXXx = "bib" + tag[0] + tag[1] + "x"
    bibrec_bibXXx = "bibrec_" + bibXXx
    # table names cannot be bound parameters; bind recID and the LIKE
    # pattern instead of interpolating them, to avoid SQL injection:
    query = """SELECT bb.field_number, b.tag, b.value FROM %s AS b, %s AS bb
               WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id AND tag LIKE %%s""" % \
            (bibXXx, bibrec_bibXXx)
    res = run_sql(query, (recID, tag[:-1] + '%'))
    # find the field number carrying (tag, value), then fetch its
    # associated subfield from the same field instance:
    field_number = -1
    for row in res:
        if row[1] == tag and row[2] == value:
            field_number = row[0]
    if field_number > 0:
        for row in res:
            if row[0] == field_number and row[1] == tag[:-1] + associated_subfield_code:
                out = row[2]
                break
    return out
def get_field_tags(field):
    """Returns a list of MARC tags for the field code 'field'.
    Returns empty list in case of error.
    Example: field='author', output=['100__%','700__%']."""
    # bind the field code instead of interpolating it (SQL injection):
    query = """SELECT t.value FROM tag AS t, field_tag AS ft, field AS f
               WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag
               ORDER BY ft.score DESC"""
    res = run_sql(query, (field,))
    return [row[0] for row in res]
## Fulltext word extraction functions
def get_fulltext_urls_from_html_page(htmlpagebody):
    """Parse HTMLPAGEBODY looking for url_directs referring to probable
    fulltexts, trying every file format extension defined by the global
    'CONV_PROGRAMS' structure.
    Return an array of [ext, url_direct] pairs.
    """
    found = []
    for ext in CONV_PROGRAMS.keys():
        # direct link to a file with this extension:
        direct_expr = sre.compile(r"\"(http://[\w]+\.+[\w]+[^\"'><]*\." + \
                                  ext + r")\"")
        match = direct_expr.search(htmlpagebody)
        if match:
            found.append([ext, match.group(1)])
            continue
        # FIXME: workaround for getfile, should use bibdoc tables
        getfile_expr = sre.compile(r"\"(http://.*getfile\.py\?.*format=" + ext + r"&version=.*)\"")
        match = getfile_expr.search(htmlpagebody)
        if match:
            found.append([ext, match.group(1)])
    return found
def get_words_from_fulltext(url_direct_or_indirect,
                            split=string.split,
                            lower=string.lower,
                            force_file_extension=None):
    """Returns all the words contained in the document specified by
    URL_DIRECT_OR_INDIRECT with the words being split by various
    SRE_SEPARATORS regexp set earlier.  If FORCE_FILE_EXTENSION is
    set (e.g. to "pdf"), then treat URL_DIRECT_OR_INDIRECT as a PDF
    file.  (This is interesting to index Indico for example.)  Note
    also that URL_DIRECT_OR_INDIRECT may be either a direct URL to
    the fulltext file or an URL to a setlink-like page body that
    presents the links to be indexed.  In the latter case the
    URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
    to fulltext documents, for all known file extensions as
    specified by global CONV_PROGRAMS config variable.
    """
    if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY and string.find(url_direct_or_indirect, weburl) < 0:
        return []
    if options["verbose"] >= 2:
        write_message("... reading fulltext files from %s started" % url_direct_or_indirect)
    fulltext_urls = None
    if not force_file_extension:
        url_direct = None
        fulltext_urls = None
        # check for direct link in url
        url_direct_or_indirect_ext = lower(split(url_direct_or_indirect,".")[-1])
        if url_direct_or_indirect_ext in CONV_PROGRAMS.keys():
            fulltext_urls = [(url_direct_or_indirect_ext,url_direct_or_indirect)]
        # Indirect url. Try to fetch the real fulltext(s)
        if not fulltext_urls:
            # read "setlink" data
            try:
                htmlpagebody = urllib2.urlopen(url_direct_or_indirect).read()
            except:
                sys.stderr.write("Error: Cannot read %s.\n" % url_direct_or_indirect)
                return []
            fulltext_urls = get_fulltext_urls_from_html_page(htmlpagebody)
            if options["verbose"] >= 9:
                write_message("... fulltext_urls = %s" % fulltext_urls)
    else:
        fulltext_urls = [[force_file_extension, url_direct_or_indirect]]
    words = {}
    # process as many urls as they were found:
    for (ext,url_direct) in fulltext_urls:
        if options["verbose"] >= 2:
            write_message(".... processing %s from %s started" % (ext,url_direct))
        # sanity check:
        if not url_direct:
            break;
        # read fulltext file: download it in 8kB chunks into a temp file
        try:
            url = urllib2.urlopen(url_direct)
        except:
            sys.stderr.write("Error: Cannot read %s.\n" % url_direct)
            break # try other fulltext files...
        tmp_fd, tmp_name = tempfile.mkstemp('invenio.tmp')
        data_chunk = url.read(8*1024)
        while data_chunk:
            os.write(tmp_fd, data_chunk)
            data_chunk = url.read(8*1024)
        os.close(tmp_fd)
        # try all available conversion programs according to their order:
        bingo = 0
        for conv_program in CONV_PROGRAMS[ext]:
            if os.path.exists(conv_program):
                # intelligence on how to run various conversion programs:
                cmd = "" # will keep the command to run
                bingo = 0 # had we success?
                if os.path.basename(conv_program) == "pdftotext":
                    cmd = "%s -enc UTF-8 %s %s.txt" % (conv_program, tmp_name, tmp_name)
                elif os.path.basename(conv_program) == "pstotext":
                    if ext == "ps.gz":
                        # is there gzip available?
                        if os.path.exists(CONV_PROGRAMS_HELPERS["gz"]):
                            cmd = "%s -cd %s | %s > %s.txt" \
                                  % (CONV_PROGRAMS_HELPERS["gz"], tmp_name, conv_program, tmp_name)
                    else:
                        cmd = "%s %s > %s.txt" \
                              % (conv_program, tmp_name, tmp_name)
                elif os.path.basename(conv_program) == "ps2ascii":
                    if ext == "ps.gz":
                        # is there gzip available?
                        if os.path.exists(CONV_PROGRAMS_HELPERS["gz"]):
                            cmd = "%s -cd %s | %s > %s.txt"\
                                  % (CONV_PROGRAMS_HELPERS["gz"], tmp_name,
                                     conv_program, tmp_name)
                    else:
                        cmd = "%s %s %s.txt" \
                              % (conv_program, tmp_name, tmp_name)
                elif os.path.basename(conv_program) == "antiword":
                    cmd = "%s %s > %s.txt" % (conv_program, tmp_name, tmp_name)
                elif os.path.basename(conv_program) == "catdoc":
                    cmd = "%s %s > %s.txt" % (conv_program, tmp_name, tmp_name)
                elif os.path.basename(conv_program) == "wvText":
                    cmd = "%s %s %s.txt" % (conv_program, tmp_name, tmp_name)
                elif os.path.basename(conv_program) == "ppthtml":
                    # is there html2text available?
                    if os.path.exists(CONV_PROGRAMS_HELPERS["html"]):
                        cmd = "%s %s | %s > %s.txt"\
                              % (conv_program, tmp_name,
                                 CONV_PROGRAMS_HELPERS["html"], tmp_name)
                    else:
                        cmd = "%s %s > %s.txt" \
                              % (conv_program, tmp_name, tmp_name)
                elif os.path.basename(conv_program) == "xlhtml":
                    # is there html2text available?
                    if os.path.exists(CONV_PROGRAMS_HELPERS["html"]):
                        cmd = "%s %s | %s > %s.txt" % \
                              (conv_program, tmp_name,
                               CONV_PROGRAMS_HELPERS["html"], tmp_name)
                    else:
                        cmd = "%s %s > %s.txt" % \
                              (conv_program, tmp_name, tmp_name)
                else:
                    sys.stderr.write("Error: Do not know how to handle %s conversion program.\n" % conv_program)
                # try to run it:
                try:
                    if options["verbose"] >= 9:
                        write_message("..... launching %s" % cmd)
                    errcode = os.system(cmd)
                    if errcode == 0 and os.path.exists("%s.txt" % tmp_name):
                        bingo = 1
                        break # bingo!
                    else:
                        write_message("Error while running %s for %s.\n" % (cmd, url_direct), sys.stderr)
                except:
                    write_message("Error running %s for %s.\n" % (cmd, url_direct), sys.stderr)
        # were we successful?  if so, collect the words of the converted
        # plain-text file into the WORDS dictionary:
        if bingo:
            tmp_name_txt_file = open("%s.txt" % tmp_name)
            for phrase in tmp_name_txt_file.xreadlines():
                for word in get_words_from_phrase(phrase):
                    if not words.has_key(word):
                        words[word] = 1;
            tmp_name_txt_file.close()
        else:
            if options["verbose"]:
                write_message("No conversion success for %s.\n" % (url_direct), sys.stderr)
        # delete temp files (they might not exist):
        try:
            os.unlink(tmp_name)
            os.unlink(tmp_name + ".txt")
        except StandardError:
            write_message("Error: Could not delete file. It didn't exist", sys.stderr)
        if options["verbose"] >= 2:
            write_message(".... processing %s from %s ended" % (ext,url_direct))
    if options["verbose"] >= 2:
        write_message("... reading fulltext files from %s ended" % url_direct_or_indirect)
    return words.keys()
# tagToFunctions mapping.  It offers an indirection level necessary for
# indexing fulltext.  The default is get_words_from_phrase.
tagToWordsFunctions = {'8564_u':get_words_from_fulltext}
def get_words_from_phrase(phrase, split=string.split):
    """Return list of words found in PHRASE. Note that the phrase is
    split into groups depending on the alphanumeric characters and
    punctuation characters definition present in the config file.
    """
    words = {}
    # Strip HTML markup first, but only bother when the phrase can
    # actually contain a tag, i.e. when a '<' is present.  (Testing for
    # the empty string, as before, made the check pass unconditionally.)
    if CFG_BIBINDEX_REMOVE_HTML_MARKUP and string.find(phrase, "<") > -1:
        phrase = sre_html.sub(' ', phrase)
    phrase = string.lower(phrase)
    # 1st split phrase into blocks according to whitespace
    for block in split(strip_accents(phrase)):
        # 2nd remove leading/trailing punctuation and add block:
        block = sre_block_punctuation_begin.sub("", block)
        block = sre_block_punctuation_end.sub("", block)
        if block:
            block = apply_stemming_and_stopwords_and_length_check(block)
            if block:
                words[block] = 1
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in sre_punctuation.split(block):
                subblock = apply_stemming_and_stopwords_and_length_check(subblock)
                if subblock:
                    words[subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in sre_separators.split(subblock):
                        alphanumeric_group = apply_stemming_and_stopwords_and_length_check(alphanumeric_group)
                        if alphanumeric_group:
                            words[alphanumeric_group] = 1
    return words.keys()
def apply_stemming_and_stopwords_and_length_check(word):
    """Return WORD after applying stemming and stopword and length checks.
    See the config file in order to influence these.
    """
    # stem only when a default stemming language is configured:
    if CFG_BIBINDEX_STEMMER_DEFAULT_LANGUAGE != "":
        word = stem(word, CFG_BIBINDEX_STEMMER_DEFAULT_LANGUAGE)
    # reject stopwords and words that are too short:
    if is_stopword(word) or len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH:
        return ""
    return word
def remove_subfields(s):
    "Removes subfields from string, e.g. 'foo $$c bar' becomes 'foo bar'."
    # the subfield markers matched are defined by the module-level
    # sre_subfields regexp; each match is replaced by a single space
    return sre_subfields.sub(' ', s)
def get_index_id(indexname):
    """Returns the words/phrase index id for INDEXNAME.
    Returns 0 in case there is no words table for this index.
    Example: field='author', output=4."""
    out = 0
    # use a bind parameter instead of string interpolation to avoid
    # SQL injection via the index name (consistent with run_sql usage
    # elsewhere in this file):
    query = """SELECT w.id FROM idxINDEX AS w
               WHERE w.name=%s LIMIT 1"""
    res = run_sql(query, (indexname,), 1)
    if res:
        out = res[0][0]
    return out
def get_index_tags(indexname):
    """Returns the list of tags that are indexed inside INDEXNAME.
    Returns empty list in case there are no tags indexed in this index.
    Note: uses get_field_tags() defined before.
    Example: field='author', output=['100__%', '700__%']."""
    out = []
    # bind parameter instead of string interpolation (SQL-injection safe):
    query = """SELECT f.code FROM idxINDEX AS w, idxINDEX_field AS wf,
               field AS f WHERE w.name=%s AND w.id=wf.id_idxINDEX
               AND f.id=wf.id_field"""
    res = run_sql(query, (indexname,))
    for row in res:
        out.extend(get_field_tags(row[0]))
    return out
def get_all_indexes():
    """Returns the list of the names of all defined words indexes.
    Returns empty list in case there are no tags indexed in this index.
    Example: output=['global', 'author']."""
    # one row per defined index; pick the single 'name' column of each
    rows = run_sql("""SELECT name FROM idxINDEX""")
    return [row[0] for row in rows]
def usage(code, msg=''):
    """Prints usage for this module to stderr and exits with CODE.
    MSG, when given, is printed as an error message first."""
    if msg:
        sys.stderr.write("Error: %s.\n" % msg)
    print >> sys.stderr, \
    """ Usage: %s [options]
 Examples:
       %s -a -i 234-250,293,300-500 -u admin@localhost
       %s -a -w author,fulltext -M 8192 -v3
       %s -d -m +4d -A on --flush=10000

 Indexing options:
 -a, --add                add or update words for selected records
 -d, --del                delete words for selected records
 -i, --id=low[-high]      select according to doc recID
 -m, --modified=from[,to] select according to modification date
 -c, --collection=c1[,c2] select according to collection

 Repairing options:
 -k, --check              check consistency for all records in the table(s)
 -r, --repair             try to repair all records in the table(s)

 Specific options:
 -w, --windex=w1[,w2]     word/phrase indexes to consider (all)
 -M, --maxmem=XXX         maximum memory usage in kB (no limit)
 -f, --flush=NNN          full consistent table flush after NNN records (10000)

 Scheduling options:
 -u, --user=USER          user name to store task, password needed
 -s, --sleeptime=SLEEP    time after which to repeat tasks (no)
                          e.g.: 1s, 30m, 24h, 7d
 -t, --time=TIME          moment for the task to be active (now)
                          e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26

 General options:
 -h, --help               print this help and exit
 -V, --version            print version and exit
 -v, --verbose=LEVEL      verbose level (from 0 to 9, default 1)
""" % ((sys.argv[0],) * 4)
    sys.exit(code)
def authenticate(user, header="BibIndex Task Submission", action="runbibindex"):
"""Authenticate the user against the user database.
Check for its password, if it exists.
Check for action access rights.
Return user name upon authorization success,
do system exit upon authorization failure.
"""
print header
print "=" * len(header)
if user == "":
print >> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print >> sys.stdout, "\rUsername:", user
## first check user pw:
- res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
+ res = run_sql("select id,password from user where email=%s", (user,), 1) + \
+ run_sql("select id,password from user where nickname=%s", (user,), 1)
if not res:
print "Sorry, %s does not exist." % user
sys.exit(1)
else:
(uid_db, password_db) = res[0]
if password_db:
password_entered = getpass.getpass()
if password_db == password_entered:
pass
else:
print "Sorry, wrong credentials for %s." % user
sys.exit(1)
## secondly check authorization for the action:
(auth_code, auth_message) = acc_authorize_action(uid_db, action)
if auth_code != 0:
print auth_message
sys.exit(1)
return user
def split_ranges(parse_string):
    """Parse a comma-separated list of recID ranges into a range list.

    PARSE_STRING is of the form "low1-high1,low2,low3-high3"; a single
    number N stands for the range [N, N].  Reversed bounds are swapped.
    Returns a list of [low, high] integer pairs.
    Raises ValueError on non-numeric input (as before)."""
    recIDs = []
    # str.split instead of the deprecated string-module functions:
    for arange in parse_string.split(","):
        parts = arange.split("-")
        if len(parts) == 1:
            low = high = int(parts[0])
        else:
            low, high = int(parts[0]), int(parts[1])
            if low > high:  # sanity check: accept reversed ranges too
                low, high = high, low
        recIDs.append([low, high])
    return recIDs
def get_word_tables(tables):
    """Return the list of word table descriptors to work on.

    TABLES is a comma-separated string of index names; when empty, all
    defined indexes are used.  Each descriptor is a one-item dict mapping
    the forward table name (e.g. 'idxWORD01F') to the list of MARC tags
    indexed in it.  Results are accumulated into the module-level
    wordTables list, which is also returned."""
    global wordTables
    if tables:
        # str.split instead of the deprecated string-module function:
        for index in tables.split(","):
            index_id = get_index_id(index)
            if index_id:
                wordTables.append({"idxWORD%02dF" % index_id: \
                                   get_index_tags(index)})
            else:
                write_message("Error: There is no %s words table." % index, sys.stderr)
    else:
        for index in get_all_indexes():
            index_id = get_index_id(index)
            wordTables.append({"idxWORD%02dF" % index_id: \
                               get_index_tags(index)})
    return wordTables
def get_date_range(var):
    """Returns the two dates contained as a low,high tuple.

    VAR is "date" or "date1,date2"; each date is normalized through
    get_datetime().  Returns (low, None) for a single date, (low, high)
    for a pair, and (None, None) otherwise."""
    # str.split instead of the deprecated string-module function:
    limits = var.split(",")
    if len(limits) == 1:
        return get_datetime(limits[0]), None
    if len(limits) == 2:
        return get_datetime(limits[0]), get_datetime(limits[1])
    return None, None
def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Returns a date string according to the format string.
    It can handle normal date strings and shifts with respect
    to now."""
    factors = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    shift = sre_datetime_shift.match(var)
    if shift:
        # relative shift such as '+15s' or '-3d' with respect to now:
        if shift.groups()[0] == "-":
            sign = -1
        else:
            sign = 1
        seconds = sign * factors[shift.groups()[2]] * float(shift.groups()[1])
        broken_down = time.localtime(time.time() + seconds)
    else:
        # absolute date: parse it (validating the format) and re-emit it
        broken_down = time.strptime(var, format_string)
    return time.strftime(format_string, broken_down)
def create_range_list(res):
    """Creates a range list from a recID select query result contained
    in res. The result is expected to have ascending numerical order."""
    # empty result set, or a first row carrying nothing -> no ranges
    if not res or not res[0]:
        return []
    first_id = res[0][0]
    ranges = [[first_id, first_id]]
    for row in res[1:]:
        rec_id = row[0]
        if rec_id == ranges[-1][1] + 1:
            # contiguous with the last range: extend it
            ranges[-1][1] = rec_id
        else:
            # gap detected: open a fresh range
            ranges.append([rec_id, rec_id])
    return ranges
def beautify_range_list(range_list):
    """Returns a non overlapping, maximal range list"""
    merged = []
    for candidate in range_list:
        for existing in merged:
            # candidate overlaps or is adjacent to existing:
            # widen existing in place to absorb it
            if candidate[0] <= existing[0] <= candidate[1] + 1 or \
               candidate[0] - 1 <= existing[1] <= candidate[1]:
                existing[0] = min(existing[0], candidate[0])
                existing[1] = max(existing[1], candidate[1])
                break
        else:
            # no overlap found: keep the candidate as its own range
            merged.append(candidate)
    return merged
def serialize_via_numeric_array(arr):
    """Serialize Numeric array into a compressed string."""
    # Numeric.dumps pickles the array to a byte string; compress shrinks it
    return compress(Numeric.dumps(arr))
def deserialize_via_numeric_array(string):
    """Decompress and deserialize string into a Numeric array.

    Note: the parameter name shadows the stdlib 'string' module
    within this function."""
    return Numeric.loads(decompress(string))
def serialize_via_marshal(obj):
    """Serialize Python object via marshal into a compressed string.

    The result is additionally SQL-escaped (escape_string) so that it
    can be embedded directly into hand-built INSERT statements (see
    WordTable.add_recID_range)."""
    return escape_string(compress(marshal.dumps(obj)))
def deserialize_via_marshal(string):
    """Decompress and deserialize string into a Python object via marshal.

    Note: the parameter name shadows the stdlib 'string' module
    within this function."""
    return marshal.loads(decompress(string))
class WordTable:
    """A class to hold the words table.

    Words are accumulated in memory in self.value as a mapping
    word -> {recID: sign}, where sign +1 means "add this record to the
    word's hitlist" and -1 means "remove it".  The accumulated chunk is
    periodically flushed to the forward (idxWORDxxF) and reverse
    (idxWORDxxR) database tables via put_into_db()."""

    def __init__(self, tablename, fields_to_index, separators="[^\s]"):
        "Creates words table instance."
        self.tablename = tablename  # forward table name, e.g. 'idxWORD01F'
        self.recIDs_in_mem = []  # list of [low, high] recID ranges currently held in memory
        self.fields_to_index = fields_to_index  # MARC tags whose values feed this index
        self.separators = separators  # word separator regexp (only referenced in commented-out calls below)
        self.value = {}  # word -> {recID: sign} accumulator

    def get_field(self, recID, tag):
        """Returns list of values of the MARC-21 'tag' fields for the
        record 'recID'."""
        out = []
        # derive the bibXXx/bibrec_bibXXx table names from the tag's
        # first two digits, per the Invenio bibliographic table layout:
        bibXXx = "bib" + tag[0] + tag[1] + "x"
        bibrec_bibXXx = "bibrec_" + bibXXx
        query = """SELECT value FROM %s AS b, %s AS bb
                WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id
                AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID, tag);
        res = run_sql(query)
        for row in res:
            out.append(row[0])
        return out

    def clean(self):
        "Cleans the words table."
        self.value={}

    def put_into_db(self, mode="normal"):
        """Updates the current words table in the corresponding DB
        idxFOO table.  Mode 'normal' means normal execution,
        mode 'emergency' means words index reverting to old state.
        """
        if options["verbose"]:
            write_message("%s %s wordtable flush started" % (self.tablename,mode))
            write_message('...updating %d words into %s started' % \
                (len(self.value), self.tablename))
        task_update_progress("%s flushed %d/%d words" % (self.tablename, 0, len(self.value)))
        # merge overlapping/adjacent recID ranges before touching the DB:
        self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem)
        if mode == "normal":
            # mark affected CURRENT reverse rows TEMPORARY so that a crash
            # in the middle of the flush can be detected and repaired:
            for group in self.recIDs_in_mem:
                query = """UPDATE %sR SET type='TEMPORARY' WHERE id_bibrec
                BETWEEN '%d' AND '%d' AND type='CURRENT'""" % \
                (self.tablename[:-1], group[0], group[1])
                if options["verbose"] >= 9:
                    write_message(query)
                run_sql(query)
        nb_words_total = len(self.value)
        nb_words_report = int(nb_words_total/10.0)  # report progress every ~10%
        nb_words_done = 0
        for word in self.value.keys():
            self.put_word_into_db(word)
            nb_words_done += 1
            if nb_words_report!=0 and ((nb_words_done % nb_words_report) == 0):
                if options["verbose"]:
                    write_message('......processed %d/%d words' % (nb_words_done, nb_words_total))
                task_update_progress("%s flushed %d/%d words" % (self.tablename, nb_words_done, nb_words_total))
        if options["verbose"] >= 9:
            write_message('...updating %d words into %s ended' % \
                (nb_words_total, self.tablename))
        if options["verbose"]:
            write_message('...updating reverse table %sR started' % self.tablename[:-1])
        if mode == "normal":
            # promote FUTURE rows to CURRENT and drop the TEMPORARY ones:
            for group in self.recIDs_in_mem:
                query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec
                BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \
                (self.tablename[:-1], group[0], group[1])
                if options["verbose"] >= 9:
                    write_message(query)
                run_sql(query)
                query = """DELETE FROM %sR WHERE id_bibrec
                BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \
                (self.tablename[:-1], group[0], group[1])
                if options["verbose"] >= 9:
                    write_message(query)
                run_sql(query)
            if options["verbose"] >= 9:
                write_message('End of updating wordTable into %s' % self.tablename)
        elif mode == "emergency":
            # revert: restore TEMPORARY rows to CURRENT, discard FUTURE ones:
            for group in self.recIDs_in_mem:
                query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec
                BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \
                (self.tablename[:-1], group[0], group[1])
                if options["verbose"] >= 9:
                    write_message(query)
                run_sql(query)
                query = """DELETE FROM %sR WHERE id_bibrec
                BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \
                (self.tablename[:-1], group[0], group[1])
                if options["verbose"] >= 9:
                    write_message(query)
                run_sql(query)
            if options["verbose"] >= 9:
                write_message('End of emergency flushing wordTable into %s' % self.tablename)
        if options["verbose"]:
            write_message('...updating reverse table %sR ended' % self.tablename[:-1])
        self.clean()
        self.recIDs_in_mem = []
        if options["verbose"]:
            write_message("%s %s wordtable flush ended" % (self.tablename, mode))
        task_update_progress("%s flush ended" % (self.tablename))

    def load_old_recIDs(self,word):
        """Load existing hitlist for the word from the database index files."""
        query = "SELECT hitlist FROM %s WHERE term=%%s" % self.tablename
        res = run_sql(query, (word,))
        if res:
            return deserialize_via_numeric_array(res[0][0])
        else:
            return None

    def merge_with_old_recIDs(self,word,set):
        """Merge the system numbers stored in memory (hash of recIDs with value +1 or -1
        according to whether to add/delete them) with those stored in the database index
        and received in set universe of recIDs for the given word.
        Return 0 in case no change was done to SET, return 1 in case SET was changed.
        """
        set_changed_p = 0
        for recID,sign in self.value[word].items():
            if sign == -1 and set[recID]==1:
                # delete recID if existent in set and if marked as to be deleted
                set[recID] = 0
                set_changed_p = 1
            elif set[recID] == 0:
                # add recID if not existent in set and if marked as to be added
                # NOTE(review): a sign of -1 with set[recID]==0 also falls
                # into this branch and re-adds the record -- confirm intended.
                set[recID] = 1
                set_changed_p = 1
        return set_changed_p

    def put_word_into_db(self, word):
        """Flush a single word to the database and delete it from memory"""
        set = self.load_old_recIDs(word)
        if set: # merge the word recIDs found in memory:
            if self.merge_with_old_recIDs(word,set) == 0:
                # nothing to update:
                if options["verbose"] >= 9:
                    write_message("......... unchanged hitlist for ``%s''" % word)
                pass
            else:
                # yes there were some new words:
                if options["verbose"] >= 9:
                    write_message("......... updating hitlist for ``%s''" % word)
                run_sql("UPDATE %s SET hitlist=%%s WHERE term=%%s" % self.tablename,
                        (serialize_via_numeric_array(set), word))
        else: # the word is new, will create new set:
            set = Numeric.zeros(CFG_MAX_RECID+1, Numeric.Int0)
            Numeric.put(set, self.value[word].keys(), 1)
            if options["verbose"] >= 9:
                write_message("......... inserting hitlist for ``%s''" % word)
            run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, %%s)" % self.tablename,
                    (word, serialize_via_numeric_array(set)))
        if not set: # never store empty words
            # NOTE(review): truth-testing a Numeric array here -- relies on
            # Numeric's array truth semantics; confirm behaviour is as intended.
            run_sql("DELETE from %s WHERE term=%%s" % self.tablename,
                    (word,))
        del self.value[word]

    def display(self):
        "Displays the word table."
        keys = self.value.keys()
        keys.sort()
        for k in keys:
            if options["verbose"]:
                write_message("%s: %s" % (k, self.value[k]))

    def count(self):
        "Returns the number of words in the table."
        return len(self.value)

    def info(self):
        "Prints some information on the words table."
        if options["verbose"]:
            write_message("The words table contains %d words." % self.count())

    def lookup_words(self, word=""):
        "Lookup word from the words table."
        if not word:
            done = 0
            while not done:
                try:
                    word = raw_input("Enter word: ")
                    done = 1
                except (EOFError, KeyboardInterrupt):
                    return
        if self.value.has_key(word):
            if options["verbose"]:
                write_message("The word '%s' is found %d times." \
                    % (word, len(self.value[word])))
        else:
            if options["verbose"]:
                write_message("The word '%s' does not exist in the word file."\
                              % word)

    def update_last_updated(self, starting_time=None):
        """Update last_updated column of the index table in the database.
        Puts starting time there so that if the task was interrupted for record download,
        the records will be reindexed next time."""
        if starting_time is None:
            return None
        if options["verbose"] >= 9:
            # NOTE(review): the '%s' below is never interpolated and
            # starting_time is passed as write_message()'s second positional
            # argument (elsewhere used as the stream) -- probably meant
            # "..." % starting_time; confirm against write_message().
            write_message("updating last_updated to %s...", starting_time)
        # self.tablename[-3:-1] extracts the two-digit index id from
        # a name like 'idxWORD01F':
        return run_sql("UPDATE idxINDEX SET last_updated=%s WHERE id=%s",
                       (starting_time, self.tablename[-3:-1],))

    def add_recIDs(self, recIDs):
        """Fetches records which id in the recIDs range list and adds
        them to the wordTable.  The recIDs range list is of the form:
        [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]].
        """
        global chunksize
        flush_count = 0
        records_done = 0
        records_to_go = 0
        for arange in recIDs:
            records_to_go = records_to_go + arange[1] - arange[0] + 1
        time_started = time.time() # will measure profile time
        for arange in recIDs:
            i_low = arange[0]
            chunksize_count = 0
            while i_low <= arange[1]:
                # calculate chunk group of recIDs and treat it:
                i_high = min(i_low+options["flush"]-flush_count-1,arange[1])
                i_high = min(i_low+chunksize-chunksize_count-1, i_high)
                try:
                    self.chk_recID_range(i_low, i_high)
                except StandardError, e:
                    write_message("Exception caught: %s" % e, sys.stderr)
                    if options["verbose"] >= 9:
                        traceback.print_tb(sys.exc_info()[2])
                    task_update_status("ERROR")
                    task_sig_stop_commands()
                    sys.exit(1)
                if options["verbose"]:
                    write_message("%s adding records #%d-#%d started" % \
                        (self.tablename, i_low, i_high))
                if CFG_CHECK_MYSQL_THREADS:
                    kill_sleepy_mysql_threads()
                task_update_progress("%s adding recs %d-%d" % (self.tablename, i_low, i_high))
                # delete-then-add makes the chunk update idempotent:
                self.del_recID_range(i_low, i_high)
                just_processed = self.add_recID_range(i_low, i_high)
                flush_count = flush_count + i_high - i_low + 1
                chunksize_count = chunksize_count + i_high - i_low + 1
                records_done = records_done + just_processed
                if options["verbose"]:
                    write_message("%s adding records #%d-#%d ended " % \
                        (self.tablename, i_low, i_high))
                if chunksize_count >= chunksize:
                    chunksize_count = 0
                # flush if necessary:
                if flush_count >= options["flush"]:
                    self.put_into_db()
                    self.clean()
                    if options["verbose"]:
                        write_message("%s backing up" % (self.tablename))
                    flush_count = 0
                    self.log_progress(time_started,records_done,records_to_go)
                # iterate:
                i_low = i_high + 1
        if flush_count > 0:
            self.put_into_db()
            self.log_progress(time_started,records_done,records_to_go)

    def add_recIDs_by_date(self, dates):
        """Add records that were modified between DATES[0] and DATES[1].
        If DATES is not set, then add records that were modified since
        the last update of the index.
        """
        if not dates:
            # fall back to the index's recorded last_updated timestamp:
            table_id = self.tablename[-3:-1]
            query = """SELECT last_updated FROM idxINDEX WHERE id='%s'
            """ % table_id
            res = run_sql(query)
            if not res:
                return
            if not res[0][0]:
                dates = ("0000-00-00", None)
            else:
                dates = (res[0][0], None)
        if dates[1] is None:
            res = run_sql("""SELECT b.id FROM bibrec AS b
                WHERE b.modification_date >= %s ORDER BY b.id ASC""",
                          (dates[0],))
        elif dates[0] is None:
            res = run_sql("""SELECT b.id FROM bibrec AS b
                WHERE b.modification_date <= %s ORDER BY b.id ASC""",
                          (dates[1],))
        else:
            res = run_sql("""SELECT b.id FROM bibrec AS b
                WHERE b.modification_date >= %s AND
                b.modification_date <= %s ORDER BY b.id ASC""",
                          (dates[0], dates[1]))
        alist = create_range_list(res)
        if not alist:
            if options["verbose"]:
                write_message( "No new records added. %s is up to date" % self.tablename)
        else:
            self.add_recIDs(alist)

    def add_recID_range(self, recID1, recID2):
        """Index records recID1..recID2: extract their words into memory
        and insert their FUTURE/CURRENT termlist rows into the reverse
        table.  Returns the number of records processed."""
        empty_list_string = serialize_via_marshal([])
        wlist = {}
        self.recIDs_in_mem.append([recID1,recID2])
        # secondly fetch all needed tags:
        for tag in self.fields_to_index:
            # choose the word-extraction function for this tag:
            if tag in tagToWordsFunctions.keys():
                get_words_function = tagToWordsFunctions[tag]
            else:
                get_words_function = get_words_from_phrase
            bibXXx = "bib" + tag[0] + tag[1] + "x"
            bibrec_bibXXx = "bibrec_" + bibXXx
            query = """SELECT bb.id_bibrec,b.value FROM %s AS b, %s AS bb
                    WHERE bb.id_bibrec BETWEEN %d AND %d
                    AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID1, recID2, tag)
            res = run_sql(query)
            for row in res:
                recID,phrase = row
                if not wlist.has_key(recID): wlist[recID] = []
                if tag == "8564_u":
                    # Special treatment for fulltext indexing. 8564
                    # $$u contains URL, and $$y link name. If $$y is
                    # actually a file name, that is if it ends with
                    # something like .pdf or .ppt, then $$u is treated
                    # as direct URL to the PDF file, and is indexed as
                    # such. This is useful to index Indico files.
                    # FIXME: this is a quick fix only. We should
                    # rather download all 856 $$u files and analyze
                    # content in order to decide how to index them
                    # (directly for Indico, indirectly for Setlink).
                    filename = get_associated_subfield_value(recID,'8564_u', phrase, 'y')
                    filename_extension = string.lower(string.split(filename, ".")[-1])
                    if filename_extension in CONV_PROGRAMS.keys():
                        new_words = get_words_function(phrase, force_file_extension=filename_extension) # ,self.separators
                    else:
                        new_words = get_words_function(phrase) # ,self.separators
                else:
                    new_words = get_words_function(phrase) # ,self.separators
                wlist[recID] = list_union(new_words,wlist[recID])
        # were there some words for these recIDs found?
        if len(wlist) == 0: return 0
        recIDs = wlist.keys()
        for recID in recIDs:
            # was this record marked as deleted?
            if "DELETED" in self.get_field(recID, "980__c"):
                wlist[recID] = []
                if options["verbose"] >= 9:
                    write_message("... record %d was declared deleted, removing its word list" % recID)
            if options["verbose"] >= 9:
                write_message("... record %d, termlist: %s" % (recID, wlist[recID]))
        # Using cStringIO for speed.
        # Build one multi-row INSERT of FUTURE termlist rows by hand
        # (values are escaped by serialize_via_marshal):
        query_factory = cStringIO.StringIO()
        qwrite = query_factory.write
        qwrite( "INSERT INTO %sR (id_bibrec,termlist,type) VALUES" % self.tablename[:-1])
        qwrite( "('" )
        qwrite( str(recIDs[0]) )
        qwrite( "','" )
        qwrite( serialize_via_marshal(wlist[recIDs[0]]) )
        qwrite( "','FUTURE')" )
        for recID in recIDs[1:]:
            qwrite(",('")
            qwrite(str(recID))
            qwrite("','")
            qwrite(serialize_via_marshal(wlist[recID]))
            qwrite("','FUTURE')")
        query = query_factory.getvalue()
        query_factory.close()
        run_sql(query)
        # Then insert placeholder CURRENT rows (ignored if already present):
        query_factory = cStringIO.StringIO()
        qwrite = query_factory.write
        qwrite("INSERT INTO %sR (id_bibrec,termlist,type) VALUES" % self.tablename[:-1])
        qwrite("('")
        qwrite(str(recIDs[0]))
        qwrite("','")
        # NOTE(review): the first record's CURRENT row stores its full
        # termlist while subsequent ones store an empty list -- confirm
        # this asymmetry is intended.
        qwrite(serialize_via_marshal(wlist[recIDs[0]]))
        qwrite("','CURRENT')")
        for recID in recIDs[1:]:
            qwrite( ",('" )
            qwrite( str(recID) )
            qwrite( "','" )
            qwrite( empty_list_string )
            qwrite( "','CURRENT')" )
        query = query_factory.getvalue()
        query_factory.close()
        try:
            run_sql(query)
        except DatabaseError:
            # ok, we tried to add an existent record. No problem
            pass
        # finally accumulate every (recID, word) pair in memory;
        # bind the method once for speed in the tight loop:
        put = self.put
        for recID in recIDs:
            for w in wlist[recID]:
                put(recID, w, 1)
        return len(recIDs)

    def log_progress(self, start, done, todo):
        """Calculate progress and store it.
        start: start time,
        done: records processed,
        todo: total number of records"""
        time_elapsed = time.time() - start
        # consistency check
        if time_elapsed == 0 or done > todo:
            return
        time_recs_per_min = done/(time_elapsed/60.0)
        if options["verbose"]:
            write_message("%d records took %.1f seconds to complete.(%1.f recs/min)"\
                % (done, time_elapsed, time_recs_per_min))
        if time_recs_per_min:
            if options["verbose"]:
                write_message("Estimated runtime: %.1f minutes" % \
                    ((todo-done)/time_recs_per_min))

    def put(self, recID, word, sign):
        "Adds/deletes a word to the word list."
        try:
            # words are lowercased and truncated to 50 characters:
            word = string.lower(word[:50])
            if self.value.has_key(word):
                # the word 'word' exist already: update sign
                self.value[word][recID] = sign
            else:
                self.value[word] = {recID: sign}
        except:
            write_message("Error: Cannot put word %s with sign %d for recID %s." % (word, sign, recID))

    def del_recIDs(self, recIDs):
        """Fetches records which id in the recIDs range list and adds
        them to the wordTable (with sign -1, i.e. marked for deletion).
        The recIDs range list is of the form:
        [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]].
        """
        count = 0
        for arange in recIDs:
            self.del_recID_range(arange[0],arange[1])
            # NOTE(review): count is accumulated (without the usual +1 per
            # inclusive range) but never used or returned -- confirm whether
            # it can be dropped.
            count = count + arange[1] - arange[0]
        self.put_into_db()

    def del_recID_range(self, low, high):
        """Deletes records with 'recID' system number between low
        and high from memory words index table."""
        if options["verbose"] > 2:
            write_message("%s fetching existing words for records #%d-#%d started" % \
                (self.tablename, low, high))
        self.recIDs_in_mem.append([low,high])
        query = """SELECT id_bibrec,termlist FROM %sR as bb WHERE bb.id_bibrec
        BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high)
        recID_rows = run_sql(query)
        for recID_row in recID_rows:
            recID = recID_row[0]
            wlist = deserialize_via_marshal(recID_row[1])
            # mark every word of the record with sign -1 (to be deleted):
            for word in wlist:
                self.put(recID, word, -1)
        if options["verbose"] > 2:
            write_message("%s fetching existing words for records #%d-#%d ended" % \
                (self.tablename, low, high))

    def report_on_table_consistency(self):
        """Check reverse words index tables (e.g. idxWORD01R) for
        interesting states such as 'TEMPORARY' state.
        Prints small report (no of words, no of bad words).
        """
        # find number of words:
        query = """SELECT COUNT(*) FROM %s""" % (self.tablename)
        res = run_sql(query, None, 1)
        if res:
            nb_words = res[0][0]
        else:
            nb_words = 0
        # find number of records:
        query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR""" % (self.tablename[:-1])
        res = run_sql(query, None, 1)
        if res:
            nb_records = res[0][0]
        else:
            nb_records = 0
        # report stats:
        if options["verbose"]:
            write_message("%s contains %d words from %d records" % (self.tablename, nb_words, nb_records))
        # find possible bad states in reverse tables:
        query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR WHERE type <> 'CURRENT'""" % (self.tablename[:-1])
        res = run_sql(query)
        if res:
            nb_bad_records = res[0][0]
        else:
            # could not even count: report an implausibly large number
            nb_bad_records = 999999999
        if nb_bad_records:
            write_message("EMERGENCY: %s needs to repair %d of %d records" % \
                (self.tablename, nb_bad_records, nb_records))
        else:
            if options["verbose"]:
                write_message("%s is in consistent state" % (self.tablename))
        return nb_bad_records

    def repair(self):
        """Repair the whole table"""
        # find possible bad states in reverse tables:
        query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR WHERE type <> 'CURRENT'""" % (self.tablename[:-1])
        res = run_sql(query, None, 1)
        if res:
            nb_bad_records = res[0][0]
        else:
            nb_bad_records = 0
        if nb_bad_records == 0:
            return
        query = """SELECT id_bibrec FROM %sR WHERE type <> 'CURRENT' ORDER BY id_bibrec""" \
                % (self.tablename[:-1])
        res = run_sql(query)
        recIDs = create_range_list(res)
        flush_count = 0
        records_done = 0
        records_to_go = 0
        for arange in recIDs:
            records_to_go = records_to_go + arange[1] - arange[0] + 1
        time_started = time.time() # will measure profile time
        for arange in recIDs:
            i_low = arange[0]
            chunksize_count = 0
            while i_low <= arange[1]:
                # calculate chunk group of recIDs and treat it:
                i_high = min(i_low+options["flush"]-flush_count-1,arange[1])
                i_high = min(i_low+chunksize-chunksize_count-1, i_high)
                try:
                    self.fix_recID_range(i_low, i_high)
                except StandardError, e:
                    write_message("Exception caught: %s" % e, sys.stderr)
                    if options["verbose"] >= 9:
                        traceback.print_tb(sys.exc_info()[2])
                    task_update_status("ERROR")
                    task_sig_stop_commands()
                    sys.exit(1)
                flush_count = flush_count + i_high - i_low + 1
                chunksize_count = chunksize_count + i_high - i_low + 1
                records_done = records_done + i_high - i_low + 1
                if chunksize_count >= chunksize:
                    chunksize_count = 0
                # flush if necessary:
                if flush_count >= options["flush"]:
                    self.put_into_db("emergency")
                    self.clean()
                    flush_count = 0
                    self.log_progress(time_started,records_done,records_to_go)
                # iterate:
                i_low = i_high + 1
        if flush_count > 0:
            self.put_into_db("emergency")
            self.log_progress(time_started,records_done,records_to_go)
        write_message("%s inconsistencies repaired." % self.tablename)

    def chk_recID_range(self, low, high):
        """Check if the reverse index table is in proper state"""
        ## check db
        query = """SELECT COUNT(*) FROM %sR WHERE type <> 'CURRENT'
        AND id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high)
        res = run_sql(query, None, 1)
        if res[0][0]==0:
            if options["verbose"]:
                write_message("%s for %d-%d is in consistent state"%(self.tablename,low,high))
            return # okay, words table is consistent
        ## inconsistency detected!
        write_message("EMERGENCY: %s inconsistencies detected..." % self.tablename)
        write_message("""EMERGENCY: Errors found. You should check consistency of the %s - %sR tables.\nRunning 'bibindex --repair' is recommended.""" \
            % (self.tablename, self.tablename[:-1]))
        raise StandardError

    def fix_recID_range(self, low, high):
        """Try to fix reverse index database consistency (e.g. table idxWORD01R) in the low,high doc-id range.
        Possible states for a recID follow:
        CUR TMP FUT: very bad things have happened: warn!
        CUR TMP    : very bad things have happened: warn!
        CUR     FUT: delete FUT (crash before flushing)
        CUR        : database is ok
            TMP FUT: add TMP to memory and del FUT from memory
                     flush (revert to old state)
            TMP    : very bad things have happened: warn!
                FUT: very bad things have happened: warn!
        """
        # collect, per recID, the set of row types present:
        state = {}
        query = "SELECT id_bibrec,type FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d'"\
                % (self.tablename[:-1], low, high)
        res = run_sql(query)
        for row in res:
            if not state.has_key(row[0]):
                state[row[0]]=[]
            state[row[0]].append(row[1])
        ok = 1 # will hold info on whether we will be able to repair
        for recID in state.keys():
            if not 'TEMPORARY' in state[recID]:
                if 'FUTURE' in state[recID]:
                    if 'CURRENT' not in state[recID]:
                        # FUT alone: unrepairable
                        write_message("EMERGENCY: Record %d is in inconsistent state. Can't repair it" % recID)
                        ok = 0
                    else:
                        # CUR FUT: drop every row and let reindexing recreate it
                        write_message("EMERGENCY: Inconsistency in record %d detected" % recID)
                        query = """DELETE FROM %sR
                        WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID)
                        run_sql(query)
                        write_message("EMERGENCY: Inconsistency in record %d repaired." % recID)
            else:
                if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]:
                    # TMP FUT: load terms into memory with the proper signs
                    # so that a later emergency flush reverts to the old state
                    self.recIDs_in_mem.append([recID,recID])
                    # Get the words file
                    query = """SELECT type,termlist FROM %sR
                    WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID)
                    if options["verbose"] >= 9:
                        write_message(query)
                    res = run_sql(query)
                    for row in res:
                        wlist = deserialize_via_marshal(row[1])
                        if options["verbose"] >= 9:
                            write_message("Words are %s " % wlist)
                        if row[0] == 'TEMPORARY':
                            sign = 1
                        else:
                            sign = -1
                        for word in wlist:
                            self.put(recID, word, sign)
                else:
                    write_message("EMERGENCY: %s for %d is in inconsistent state. Couldn't repair it." % (self.tablename, recID))
                    ok = 0
        if not ok:
            write_message("""EMERGENCY: Unrepairable errors found. You should check consistency
            of the %s - %sR tables. Deleting affected records is
            recommended.""" % (self.tablename, self.tablename[:-1]))
            raise StandardError
def task_run(row):
    """Run the indexing task.  The row argument is the BibSched task
    queue row, containing id, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    global options, wordTables, stemmer, stopwords
    # read from SQL row:
    task_id = row[0]
    task_proc = row[1]
    options = marshal.loads(row[6])  # options were marshalled at task submission time
    task_status = row[7]
    # sanity check: refuse rows that are not ours or not runnable:
    if task_proc != "bibindex":
        write_message("The task #%d does not seem to be a BibIndex task." % task_id, sys.stderr)
        return 0
    if task_status != "WAITING":
        write_message("The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr)
        return 0
    # we can run the task now:
    if options["verbose"]:
        write_message("Task #%d started." % task_id)
    # remember when we started, so that last_updated can later be set to a
    # time from *before* the indexing pass (see WordTable.update_last_updated):
    task_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    task_update_status("RUNNING")
    # install signal handlers
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    ## go ahead and treat each table :
    for table in options["windex"]:
        # each entry is a one-item dict {forward_table_name: [tags]}:
        wordTable = WordTable(table.keys()[0], table.values()[0])
        wordTable.report_on_table_consistency()
        try:
            if options["cmd"] == "del":
                if options["id"]:
                    wordTable.del_recIDs(options["id"])
                elif options["collection"]:
                    # resolve collection names to record IDs via the search engine:
                    l_of_colls = string.split(options["collection"], ",")
                    recIDs = perform_request_search(c=l_of_colls)
                    recIDs_range = []
                    for recID in recIDs:
                        recIDs_range.append([recID,recID])
                    wordTable.del_recIDs(recIDs_range)
                else:
                    write_message("Missing IDs of records to delete from index %s." % wordTable.tablename,
                                  sys.stderr)
                    raise StandardError
            elif options["cmd"] == "add":
                if options["id"]:
                    wordTable.add_recIDs(options["id"])
                elif options["collection"]:
                    l_of_colls = string.split(options["collection"], ",")
                    recIDs = perform_request_search(c=l_of_colls)
                    recIDs_range = []
                    for recID in recIDs:
                        recIDs_range.append([recID,recID])
                    wordTable.add_recIDs(recIDs_range)
                else:
                    wordTable.add_recIDs_by_date(options["modified"])
                    # only update last_updated if run via automatic mode:
                    wordTable.update_last_updated(task_starting_time)
            elif options["cmd"] == "repair":
                wordTable.repair()
            else:
                write_message("Invalid command found processing %s" % \
                              wordTable.tablename, sys.stderr)
                raise StandardError
        except StandardError, e:
            write_message("Exception caught: %s" % e, sys.stderr)
            if options["verbose"] >= 9:
                traceback.print_tb(sys.exc_info()[2])
            task_update_status("ERROR")
            task_sig_stop_commands()
            sys.exit(1)
        wordTable.report_on_table_consistency()
    # We are done. State it in the database, close and quit
    task_update_status("DONE")
    if options["verbose"]:
        write_message("Task #%d finished." % task_id)
    return 1
def command_line():
    """Parse command-line arguments and schedule a new 'bibindex' task row
    in the schTASK queue (status WAITING).

    Exits via usage() on bad arguments.  For the 'check' command the word
    tables are verified immediately and no task is scheduled.
    """
    global options
    long_flags =["add","del","id=","modified=","collection=", "windex=",
                 "check","repair","maxmem=", "flush=","user=","sleeptime=",
                 "time=","help", "version", "verbose="]
    short_flags ="adi:m:c:w:krM:f:u:s:t:hVv:"
    format_string = "%Y-%m-%d %H:%M:%S"
    tables = None
    sleeptime = ""
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags)
    except getopt.GetoptError, err:
        write_message(err, sys.stderr)
        usage(1)
    if args:
        usage(1)
    # default task options, overridden by the parsed flags below
    options={"cmd":"add", "id":[], "modified":[], "collection":[], "maxmem":0,
             "flush":10000, "sleeptime":0, "verbose":1 }
    sched_time = time.strftime(format_string)
    user = ""
    # Check for key options
    try:
        for opt in opts:
            if opt == ("-h","") or opt == ("--help",""):
                usage(1)
            elif opt == ("-V","") or opt == ("--version",""):
                print BIBINDEX_ENGINE_VERSION
                sys.exit(1)
            elif opt[0] in ["--verbose", "-v"]:
                options["verbose"] = int(opt[1])
            elif opt == ("-a","") or opt == ("--add",""):
                options["cmd"] = "add"
                if ("-x","") in opts or ("--del","") in opts:
                    usage(1)
            elif opt == ("-k","") or opt == ("--check",""):
                options["cmd"] = "check"
            elif opt == ("-r","") or opt == ("--repair",""):
                options["cmd"] = "repair"
            elif opt == ("-d","") or opt == ("--del",""):
                options["cmd"]="del"
            elif opt[0] in [ "-i", "--id" ]:
                # ranges accumulate across repeated -i flags
                options["id"] = options["id"] + split_ranges(opt[1])
            elif opt[0] in [ "-m", "--modified" ]:
                options["modified"] = get_date_range(opt[1])
            elif opt[0] in [ "-c", "--collection" ]:
                options["collection"] = opt[1]
            elif opt[0] in [ "-w", "--windex" ]:
                tables = opt[1]
            elif opt[0] in [ "-M", "--maxmem"]:
                options["maxmem"]=int(opt[1])
                if options["maxmem"] < base_process_size + 1000:
                    raise StandardError, "Memory usage should be higher than %d kB" % (base_process_size + 1000)
            elif opt[0] in [ "-f", "--flush"]:
                options["flush"]=int(opt[1])
            elif opt[0] in [ "-u", "--user"]:
                user = opt[1]
            elif opt[0] in [ "-s", "--sleeptime" ]:
                get_datetime(opt[1]) # see if it is a valid shift
                sleeptime= opt[1]
            elif opt[0] in [ "-t", "--time" ]:
                sched_time= get_datetime(opt[1])
            else: usage(1)
    except StandardError, e:
        write_message(e, sys.stderr)
        sys.exit(1)
    options["windex"]=get_word_tables(tables)
    # 'check' runs synchronously: report table consistency, schedule nothing
    if options["cmd"] == "check":
        for table in options["windex"]:
            wordTable = WordTable(table.keys()[0], table.values()[0])
            wordTable.report_on_table_consistency()
        return
    user = authenticate(user)
    if options["verbose"] >= 9:
        print ""
        write_message("storing task options %s\n" % options)
    ## sanity check: remove eventual "task" option:
    if options.has_key("task"):
        del options["task"]
    # insert first to obtain the task id, then store the id back inside the
    # marshalled arguments so the daemon can find itself
    new_task_id = run_sql("""INSERT INTO schTASK (proc,user,runtime,sleeptime,arguments,status)
                          VALUES ('bibindex',%s,%s,%s,%s,'WAITING')""",
                          (user, sched_time, sleeptime, marshal.dumps(options)))
    ## update task number:
    options["task"] = new_task_id
    run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), new_task_id))
    print "Task #%d was successfully scheduled for execution." % new_task_id
    return
def task_sig_sleep(sig, frame):
    """Handle the 'sleep' signal sent by BibSched: mark the task as
    sleeping and block until a wake-up signal arrives."""
    verbosity = options["verbose"]
    if verbosity >= 9:
        write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame))
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # block here; the SIGCONT handler resumes execution
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle the 'wakeup' signal sent by BibSched: resume after a sleep."""
    verbosity = options["verbose"]
    if verbosity >= 9:
        write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame))
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
"""Signal handler for the 'stop' signal sent by BibSched."""
if options["verbose"] >= 9:
write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame))
write_message("stopping...")
task_update_status("STOPPING")
errcode = 0
try:
task_sig_stop_commands()
write_message("stopped")
task_update_status("STOPPED")
except StandardError, err:
write_message("Error during stopping! %e" % err)
task_update_status("STOPPINGFAILED")
errcode = 1
sys.exit(errcode)
def task_sig_stop_commands():
    """Do all the commands necessary to stop the task before quitting.
    Useful for task_sig_stop() handler.
    """
    write_message("stopping commands started")
    # NOTE(review): 'wordTables' is not defined anywhere visible in this
    # file -- task_run() only uses a local 'wordTable'.  Presumably a
    # module-level list of open word tables is expected; confirm against
    # the original bibindex daemon before relying on this handler.
    for table in wordTables:
        table.put_into_db()
    write_message("stopping commands ended")
def task_sig_suicide(sig, frame):
    """Handle the 'suicide' signal sent by BibSched: report the kill,
    update the task status and terminate immediately."""
    if options["verbose"] >= 9:
        write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame))
    # log/state pairs are emitted in order, then the process exits
    for note, state in (("suiciding myself now...", "SUICIDING"),
                        ("suicided", "SUICIDED")):
        write_message(note)
        task_update_status(state)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Ignore any signal that has no dedicated BibSched handler."""
    # nothing to do beyond logging the unexpected signal
    write_message("unknown signal %d (frame %s) ignored" % (sig, frame))
def task_update_progress(msg):
    """Store a new progress message for this task in the schTASK table."""
    global options
    if options["verbose"] >= 9:
        write_message("Updating task progress to %s." % msg)
    query = "UPDATE schTASK SET progress=%s where id=%s"
    return run_sql(query, (msg, options["task"]))
def task_update_status(val):
    """Store a new status value for this task in the schTASK table."""
    global options
    if options["verbose"] >= 9:
        write_message("Updating task status to %s." % val)
    query = "UPDATE schTASK SET status=%s where id=%s"
    return run_sql(query, (val, options["task"]))
def test_fulltext_indexing():
    """Tests fulltext indexing programs on PDF, PS, DOC, PPT,
    XLS. Prints list of words and word table on the screen. Does not
    integrate anything into the database. Useful when debugging
    problems with fulltext indexing: call this function instead of main().

    NOTE(review): requires network access to doc.cern.ch; each call
    downloads and converts a document.
    """
    global options
    options["verbose"] = 9
    print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=atlnot&categ=Communication&id=com-indet-2002-012") # protected URL
    print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a00388&id=a00388s2t7") # XLS
    print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a02883&id=a02883s1t6/transparencies") # PPT
    print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a99149&id=a99149s1t10/transparencies") # DOC
    print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=preprint&categ=cern&id=lhc-project-report-601") # PDF
    sys.exit(0)
def test_word_separators(phrase="hep-th/0101001"):
"""Tests word separating policy on various input."""
print "%s:" % phrase
for word in get_words_from_phrase(phrase):
print "\t-> %s" % word
def main():
"""Reads arguments and either runs the task, or starts user-interface (command line)."""
if len(sys.argv) == 2:
try:
task_id = int(sys.argv[1])
except StandardError:
command_line()
sys.exit()
res = run_sql("SELECT * FROM schTASK WHERE id='%d'" % (task_id), None, 1)
if not res:
write_message("Selected task not found.", sys.stderr)
sys.exit(1)
try:
if not task_run(res[0]):
write_message("Error occurred. Exiting.", sys.stderr)
except StandardError, e:
write_message("Unexpected error occurred: %s." % e, sys.stderr)
if options["verbose"] >= 9:
traceback.print_tb(sys.exc_info()[2])
write_message("Exiting.")
task_update_status("ERROR")
else:
command_line()
diff --git a/modules/bibrank/bin/bibrank.in b/modules/bibrank/bin/bibrank.in
index 844b4dae7..60b37bd44 100644
--- a/modules/bibrank/bin/bibrank.in
+++ b/modules/bibrank/bin/bibrank.in
@@ -1,479 +1,480 @@
#!@PYTHON@
## -*- mode: python; coding: utf-8; -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibRank ranking daemon.
Usage: %s [options]
Ranking examples:
%s -wjif -a --id=0-30000,30001-860000 --verbose=9
%s -wjif -d --modified='2002-10-27 13:57:26'
%s -wwrd --rebalance --collection=Articles
%s -wwrd -a -i 234-250,293,300-500 -u admin
Ranking options:
-w, --run=r1[,r2] runs each rank method in the order given
-c, --collection=c1[,c2] select according to collection
-i, --id=low[-high] select according to doc recID
-m, --modified=from[,to] select according to modification date
-l, --lastupdate select according to last update
-a, --add add or update words for selected records
-d, --del delete words for selected records
-S, --stat show statistics for a method
 -R, --recalculate         recalculate weight data, used by word frequency method
should be used if ca 1% of the document has been changed
since last time -R was used
Repairing options:
-k, --check check consistency for all records in the table(s)
check if update of ranking data is necessary
-r, --repair try to repair all records in the table(s)
Scheduling options:
-u, --user=USER user name to store task, password needed
-s, --sleeptime=SLEEP time after which to repeat tasks (no)
e.g.: 1s, 30m, 24h, 7d
-t, --time=TIME moment for the task to be active (now)
e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
General options:
-h, --help print this help and exit
-V, --version print version and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
"""
__revision__ = "$Id$"
try:
import marshal
from zlib import compress,decompress
from string import split,translate,lower,upper
import getopt
import getpass
import string
import os
import sre
import sys
import time
import urllib
import signal
import tempfile
import traceback
import cStringIO
import re
import copy
import types
import ConfigParser
except ImportError, e:
import sys
try:
from invenio.dbquery import run_sql, escape_string
from invenio.bibrank_tag_based_indexer import *
from invenio.bibrank_word_indexer import *
from invenio.access_control_engine import acc_authorize_action
from invenio.search_engine import perform_request_search
except ImportError, e:
import sys
nb_char_in_line = 50 # for verbose pretty printing
chunksize = 1000 # default size of chunks that the records will be treated by
base_process_size = 4500 # process base size
bibrank_options = {} # will hold task options
def serialize_via_numeric_array_dumps(arr):
    """Dump a Numeric array to its raw string form.

    NOTE(review): 'Numeric' is not imported explicitly in this file --
    presumably it arrives via one of the star-imports above; confirm.
    """
    return Numeric.dumps(arr)
def serialize_via_numeric_array_compr(data):
    """Return *data* compressed with zlib.

    The parameter was renamed from 'str', which shadowed the builtin;
    callers pass it positionally so the interface is unchanged.
    """
    return compress(data)
def serialize_via_numeric_array_escape(str):
    """SQL-escape an already-compressed string via dbquery.escape_string."""
    # NOTE(review): the parameter shadows the builtin 'str'; harmless here
    # but worth renaming eventually.
    return escape_string(str)
def serialize_via_numeric_array(arr):
    """Serialize Numeric array into a compressed string.

    Pipeline: Numeric dump -> zlib compress -> SQL escape, ready for
    direct inclusion in an SQL statement.
    """
    return serialize_via_numeric_array_escape(serialize_via_numeric_array_compr(serialize_via_numeric_array_dumps(arr)))
def deserialize_via_numeric_array(string):
    """Decompress and deserialize string into a Numeric array."""
    # NOTE(review): the parameter shadows the imported 'string' module
    # inside this function; harmless locally.
    return Numeric.loads(decompress(string))
def serialize_via_marshal(obj):
    """Serialize Python object via marshal into a compressed string."""
    # marshal.dumps -> zlib.compress -> SQL escaping, ready for direct
    # inclusion in an SQL statement
    return escape_string(compress(marshal.dumps(obj)))
def deserialize_via_marshal(compressed):
    """Decompress *compressed* and unmarshal it back into a Python object.

    Inverse of serialize_via_marshal() minus the SQL escaping step.  The
    parameter was renamed from 'string', which shadowed the imported
    string module; callers pass it positionally.
    """
    return marshal.loads(decompress(compressed))
def authenticate(user, header="BibRank Task Submission", action="runbibrank"):
    """Authenticate *user* against the user table and check authorization
    for *action*; prompt for the username and password as needed.

    Returns the user name on success; exits the process on unknown user,
    wrong password or missing authorization.
    NOTE(review): the password is compared in plain text here.
    """
    print header
    print "=" * len(header)
    if user == "":
        print>> sys.stdout, "\rUsername: ",
        user = string.strip(string.lower(sys.stdin.readline()))
    else:
        print>> sys.stdout, "\rUsername:", user
-    res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
+    res = run_sql("select id,password from user where email=%s", (user,), 1) + \
+          run_sql("select id,password from user where nickname=%s", (user,), 1)
    if not res:
        print "Sorry, %s does not exist." % user
        sys.exit(1)
    else:
        (uid_db, password_db) = res[0]
        # an empty stored password means no password check is required
        if password_db:
            password_entered = getpass.getpass()
            if password_db == password_entered:
                pass
            else:
                print "Sorry, wrong credentials for %s." % user
                sys.exit(1)
    (auth_code, auth_message) = acc_authorize_action(uid_db, action)
    if auth_code != 0:
        print auth_message
        sys.exit(1)
    return user
def usage(code, msg=''):
    """Print usage help for the bibrank daemon to stderr and exit.

    code -- process exit status
    msg  -- optional error message printed first
    """
    if msg:
        sys.stderr.write("Error: %s.\n" % msg)
    print >> sys.stderr, \
    """ Usage: %s [options]
 Ranking examples:
 %s -wjif -a --id=0-30000,30001-860000 --verbose=9
 %s -wjif -d --modified='2002-10-27 13:57:26'
 %s -wjif --rebalance --collection=Articles
 %s -wsbr -a -i 234-250,293,300-500 -u admin
 Ranking options:
 -w, --run=r1[,r2]         runs each rank method in the order given
 -c, --collection=c1[,c2]  select according to collection
 -i, --id=low[-high]       select according to doc recID
 -m, --modified=from[,to]  select according to modification date
 -l, --lastupdate          select according to last update
 -a, --add                 add or update words for selected records
 -d, --del                 delete words for selected records
 -S, --stat                show statistics for a method
 -R, --recalculate         recalculate weigth data, used by word frequency method
                           should be used if ca 1%% of the document has been changed
                           since last time -R was used
 Repairing options:
 -k,  --check              check consistency for all records in the table(s)
                           check if update of ranking data is necessary
 -r, --repair              try to repair all records in the table(s)
 Scheduling options:
 -u, --user=USER           user name to store task, password needed
 -s, --sleeptime=SLEEP     time after which to repeat tasks (no)
                            e.g.: 1s, 30m, 24h, 7d
 -t, --time=TIME           moment for the task to be active (now)
                            e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
 General options:
 -h, --help                print this help and exit
 -V, --version             print version and exit
 -v, --verbose=LEVEL       verbose level (from 0 to 9, default 1)
""" % ((sys.argv[0],) * 5)
    sys.exit(code)
def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Return a date string formatted per *format_string*.

    *var* is either an absolute date in the same format, or a shift
    relative to now such as "+2h", "-7d" or "30m" (d/h/m/s units).

    Raises ValueError when an absolute date does not match
    *format_string*.
    """
    # use the standard 're' module (already imported in this file);
    # 'sre' is a deprecated internal alias
    shift_re = re.compile(r"([-\+]{0,1})([\d]+)([dhms])")
    factors = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    m = shift_re.match(var)
    if m:
        sign = m.groups()[0] == "-" and -1 or 1
        factor = factors[m.groups()[2]]
        value = float(m.groups()[1])
        date = time.localtime(time.time() + sign * factor * value)
    else:
        date = time.strptime(var, format_string)
    return time.strftime(format_string, date)
def task_sig_sleep(sig, frame):
    """Handle BibSched's 'sleep' signal: flag the task as sleeping and
    suspend execution until a wake-up signal is delivered."""
    verbosity = bibrank_options["verbose"]
    if verbosity >= 9:
        write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame))
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # block here; the SIGCONT handler resumes execution
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle BibSched's 'wakeup' signal: resume after a sleep."""
    verbosity = bibrank_options["verbose"]
    if verbosity >= 9:
        write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame))
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop_commands():
    """Run the shutdown steps needed before the task quits; used by the
    'stop' signal handler."""
    # bibrank keeps no in-memory state to flush, so only the bracketing
    # messages are logged
    for note in ("stopping commands started", "stopping commands ended"):
        write_message(note)
def task_sig_suicide(sig, frame):
    """Handle BibSched's 'suicide' signal: record the kill and terminate."""
    global bibrank_options
    if bibrank_options["verbose"] >= 9:
        write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame))
    # log/state pairs are emitted in order, then the process exits
    for note, state in (("suiciding myself now...", "SUICIDING"),
                        ("suicided", "SUICIDED")):
        write_message(note)
        task_update_status(state)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Ignore any signal that has no dedicated BibSched handler."""
    # nothing to do beyond logging the unexpected signal
    write_message("unknown signal %d (frame %s) ignored" % (sig, frame))
def task_update_progress(msg):
    """Update the progress column of this task's schTASK row.

    Security fix: *msg* is passed as a bound query parameter instead of
    being escape_string()-ed and interpolated into the SQL text; this
    also matches how the sibling daemons update schTASK.
    """
    global bibrank_options
    if bibrank_options["verbose"] >= 9:
        write_message("Updating task progress to %s." % msg)
    run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, bibrank_options["task"]))
    return
def task_update_status(val):
    """Update the status column of this task's schTASK row.

    Security fix: *val* is passed as a bound query parameter instead of
    being escape_string()-ed and interpolated into the SQL text; this
    also matches how the sibling daemons update schTASK.
    """
    global bibrank_options
    if bibrank_options["verbose"] >= 9:
        write_message("Updating task status to %s." % val)
    run_sql("UPDATE schTASK SET status=%s where id=%s", (val, bibrank_options["task"]))
    return
def split_ranges(parse_string):
    """Parse a recID selection such as "1-5,7,10-12" into a list of
    inclusive [low, high] pairs, e.g. [[1, 5], [7, 7], [10, 12]].

    A reversed range like "9-2" is normalised to [2, 9].  Fixes: the
    loop variable no longer shadows the builtin 'range', and str.split
    replaces the deprecated string.split.
    """
    recIDs = []
    for chunk in parse_string.split(","):
        endpoints = chunk.split("-")
        if len(endpoints) == 1:
            value = int(endpoints[0])
            recIDs.append([value, value])
        else:
            low, high = int(endpoints[0]), int(endpoints[1])
            if low > high:  # sanity check: keep the pair ordered
                low, high = high, low
            recIDs.append([low, high])
    return recIDs
def get_date_range(var):
    """Return the (low, high) datetime strings encoded in *var*.

    *var* is "date" or "date1,date2"; each part is normalised through
    get_datetime().  A single date yields (low, None).

    Raises ValueError (a StandardError, caught by command_line()) when
    more than two dates are supplied; the original silently returned
    None in that case.
    """
    limits = var.split(",")
    if len(limits) == 1:
        return get_datetime(limits[0]), None
    if len(limits) == 2:
        return get_datetime(limits[0]), get_datetime(limits[1])
    raise ValueError("expected at most two dates, got %r" % var)
def command_line():
    """Storing the task together with the parameters given.

    Parses the bibrank command-line flags, authenticates the user and
    inserts a 'bibrank' row (status WAITING) into schTASK; exits via
    usage() on bad arguments.
    """
    global bibrank_options
    long_flags = ["lastupdate","add","del","repair","maxmem", "flush","stat", "rebalance", "id=", "collection=", "check", "modified=", "update", "run=", "user=", "sleeptime=", "time=", "help", "version", "verbose="]
    short_flags = "ladSi:m:c:kUrRM:f:w:u:s:t:hVv:"
    format_string = "%Y-%m-%d %H:%M:%S"
    sleeptime = ""
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags)
    except getopt.GetoptError, err:
        write_message(err, sys.stderr)
        usage(1)
    if args:
        usage(1)
    # default task options; "run" is pre-filled below with every rank
    # method registered in rnkMETHOD and may be overridden by -w/--run
    bibrank_options = {"quick":"yes","cmd":"add","flush":100000,"validset":"", "collection":[], "id":[], "check": "", "stat":"", "modified":"", "last_updated":"last_updated","run":"", "verbose":1}
    res = run_sql("SELECT name from rnkMETHOD")
    bibrank_options["run"] = []
    for (name,) in res:
        bibrank_options["run"].append(name)
    sched_time = time.strftime(format_string)
    user = ""
    try:
        for opt in opts:
            if opt == ("-h","") or opt == ("--help",""):
                usage(1)
            elif opt == ("-V","") or opt == ("--version",""):
                print __revision__
                sys.exit(1)
            elif opt[0] in ["--verbose", "-v"]:
                bibrank_options["verbose"] = int(opt[1])
            elif opt == ("-a","") or opt == ("--add",""):
                bibrank_options["cmd"] = "add"
                if ("-x","") in opts or ("--del","") in opts:
                    usage(1)
            elif opt[0] in ["--run", "-w"]:
                # explicit method list replaces the rnkMETHOD default
                bibrank_options["run"] = []
                run = split(opt[1],",")
                for key in range(0,len(run)):
                    bibrank_options["run"].append(run[key])
            elif opt == ("-r","") or opt == ("--repair",""):
                bibrank_options["cmd"] = "repair"
            elif opt == ("-d","") or opt == ("--del",""):
                bibrank_options["cmd"]="del"
            elif opt[0] in [ "-u", "--user"]:
                user = opt[1]
            elif opt[0] in [ "-k", "--check"]:
                bibrank_options["cmd"]= "check"
            elif opt[0] in [ "-S", "--stat"]:
                bibrank_options["cmd"] = "stat"
            elif opt[0] in [ "-i", "--id" ]:
                # explicit id selection disables date-driven selection
                bibrank_options["id"] = bibrank_options["id"] + split_ranges(opt[1])
                bibrank_options["last_updated"] = ""
            elif opt[0] in [ "-c", "--collection" ]:
                bibrank_options["collection"] = opt[1]
            elif opt[0] in [ "-R", "--rebalance"]:
                bibrank_options["quick"] = "no"
            elif opt[0] in [ "-f", "--flush"]:
                bibrank_options["flush"]=int(opt[1])
            elif opt[0] in [ "-M", "--maxmem"]:
                bibrank_options["maxmem"]=int(opt[1])
                if bibrank_options["maxmem"] < base_process_size + 1000:
                    raise StandardError, "Memory usage should be higher than %d kB" % (base_process_size + 1000)
            elif opt[0] in [ "-m", "--modified" ]:
                bibrank_options["modified"] = get_date_range(opt[1]) #2002-10-27 13:57:26
                bibrank_options["last_updated"] = ""
            elif opt[0] in [ "-l", "--lastupdate" ]:
                bibrank_options["last_updated"] = "last_updated"
            elif opt[0] in [ "-s", "--sleeptime" ]:
                get_datetime(opt[1]) # see if it is a valid shift
                sleeptime=opt[1]
            elif opt[0] in [ "-t", "--time" ]:
                sched_time = get_datetime(opt[1])
            else:
                usage(1)
    except StandardError, e:
        write_message(e, sys.stderr)
        sys.exit(1)
    user = authenticate(user)
    if bibrank_options["verbose"]>=9:
        write_message("Storing task options %s" % bibrank_options)
    ## sanity check: remove eventual "task" option:
    if bibrank_options.has_key("task"):
        del bibrank_options["task"]
    # insert first to obtain the task id, then store the id back inside the
    # marshalled arguments so the daemon can find itself
    new_task_id = run_sql("""INSERT INTO schTASK (proc,user,runtime,sleeptime,arguments,status) VALUES ('bibrank',%s,%s,%s,%s,'WAITING')""",
                          (user, sched_time, sleeptime, marshal.dumps(bibrank_options)))
    ## update task number:
    bibrank_options["task"] = new_task_id
    run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(bibrank_options), new_task_id))
    print "Task #%d was successfully scheduled for execution." % new_task_id
    return
def task_run(row):
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    global bibrank_options
    task_id = row[0]
    task_proc = row[1]
    # row[6] holds the marshalled options dict; row[7] the current status
    bibrank_options = marshal.loads(row[6])
    task_status = row[7]
    # install signal handlers
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    if task_proc != "bibrank":
        write_message("-The task #%d does not seem to be a BibRank task." % task_id, sys.stderr)
        return 0
    if task_status != "WAITING":
        write_message("The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr)
        return 0
    if bibrank_options["verbose"]:
        write_message("Task #%d started." % task_id)
    task_update_status("RUNNING")
    try:
        bibrank_options = marshal.loads(row[6])
        # each requested rank method has its own .cfg under etcdir/bibrank/;
        # NOTE(review): 'etcdir' presumably comes from a star-import above,
        # and 'file' shadows the builtin -- confirm/clean up eventually
        for key in bibrank_options["run"]:
            write_message("")
            file = etcdir + "/bibrank/" + key + ".cfg"
            if bibrank_options["verbose"] >= 9:
                write_message("Getting configuration from file: %s" % file)
            config = ConfigParser.ConfigParser()
            try:
                config.readfp(open(file))
            except StandardError, e:
                write_message("Cannot find configurationfile: %s. The rankmethod may also not be registered using the BibRank Admin Interface." % file, sys.stderr)
                raise StandardError
            #Using the function variable to call the function related to the rank method
            cfg_function = config.get("rank_method", "function")
            func_object = globals().get(cfg_function)
            if func_object:
                func_object(row, key)
            else:
                write_message("Cannot run method '%s', no function to call" % key)
    except StandardError, e:
        write_message("\nException caught: %s" % e, sys.stderr)
        traceback.print_tb(sys.exc_info()[2])
        task_update_status("ERROR")
        sys.exit(1)
    task_update_status("DONE")
    if bibrank_options["verbose"]:
        write_message("Task #%d finished." % task_id)
    return 1
def main():
global bibrank_options
if len(sys.argv) == 2:
try:
id = int(sys.argv[1])
except StandardError, err:
command_line()
sys.exit()
res = run_sql("SELECT * FROM schTASK WHERE id='%d'" % (id), None, 1)
if not res:
write_message("Selected task not found.", sys.stderr)
sys.exit(1)
try:
if not task_run(res[0]):
write_message("Error occurred. Exiting.", sys.stderr)
except StandardError, e:
write_message("Unexpected error occurred: %s." % e, sys.stderr)
write_message("Traceback is:")
traceback.print_tb(sys.exc_info()[2])
write_message("Exiting.")
task_update_status("ERROR")
else:
command_line()
if __name__ == "__main__":
main()
diff --git a/modules/bibsched/lib/bibtaskex.py b/modules/bibsched/lib/bibtaskex.py
index 340b75fff..f430a6760 100644
--- a/modules/bibsched/lib/bibtaskex.py
+++ b/modules/bibsched/lib/bibtaskex.py
@@ -1,339 +1,340 @@
# -*- coding: utf-8 -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""CDS Invenio Bibliographic Task Example.
Demonstrates BibTask <-> BibSched connectivity, signal handling,
error handling, etc.
"""
__revision__ = "$Id$"
import sys
from invenio.dbquery import run_sql
from invenio.access_control_engine import acc_authorize_action
import getopt
import getpass
import marshal
import signal
import sre
import string
import time
import traceback
options = {} # global variable to hold task options
cfg_n_default = 30 # how many Fibonacci numbers to calculate if none submitted?
def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Return a date string formatted per *format_string*.

    *var* is either an absolute date in the same format, or a shift
    relative to now such as "+2h", "-7d" or "30m" (d/h/m/s units).

    Raises ValueError when an absolute date does not match
    *format_string*.
    """
    import re  # local import: 're' replaces the deprecated 'sre' alias
    shift_re = re.compile(r"([-\+]{0,1})([\d]+)([dhms])")
    factors = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    m = shift_re.match(var)
    if m:
        sign = m.groups()[0] == "-" and -1 or 1
        factor = factors[m.groups()[2]]
        value = float(m.groups()[1])
        date = time.localtime(time.time() + sign * factor * value)
    else:
        date = time.strptime(var, format_string)
    return time.strftime(format_string, date)
def write_message(msg, stream=sys.stdout):
    """Write *msg* with a timestamp prefix to *stream* (sys.stdout or
    sys.stderr) and flush it immediately; complain on any other stream."""
    if stream not in (sys.stdout, sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    stamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(stamp)
    stream.write("%s\n" % msg)
    stream.flush()
    return
def task_sig_sleep(sig, frame):
    """Handle BibSched's 'sleep' signal: flag the task as sleeping and
    suspend execution until a wake-up signal is delivered."""
    verbosity = options["verbose"]
    if verbosity >= 9:
        write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame))
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # block here; the SIGCONT handler resumes execution
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle BibSched's 'wakeup' signal: resume after a sleep."""
    verbosity = options["verbose"]
    if verbosity >= 9:
        write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame))
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
    """Handle BibSched's 'stop' signal: simulate flushing and closing work,
    mark the task STOPPED and exit cleanly."""
    if options["verbose"] >= 9:
        write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame))
    write_message("stopping...")
    task_update_status("STOPPING")
    # pretend to flush caches and close tables, as a real task would
    for note, pause in (("flushing cache or whatever...", 3),
                        ("closing tables or whatever...", 1)):
        write_message(note)
        time.sleep(pause)
    write_message("stopped")
    task_update_status("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Handle BibSched's 'suicide' signal: record the kill and terminate."""
    if options["verbose"] >= 9:
        write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame))
    # log/state pairs are emitted in order, then the process exits
    for note, state in (("suiciding myself now...", "SUICIDING"),
                        ("suicided", "SUICIDED")):
        write_message(note)
        task_update_status(state)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Ignore any signal that has no dedicated BibSched handler."""
    # nothing to do beyond logging the unexpected signal
    write_message("unknown signal %d (frame %s) ignored" % (sig, frame))
def fib(n):
    """Return the n-th Fibonacci number, with fib(0) == fib(1) == 1.

    Rewritten iteratively: the original double recursion took O(phi**n)
    time; this runs in O(n) and cannot hit the recursion limit.  Any
    n < 2 (including negatives) returns 1, as before.
    """
    prev, cur = 1, 1
    for _ in range(max(0, n - 1)):
        prev, cur = cur, prev + cur
    return cur
def authenticate(user, header="BibTaskEx Task Submission", action="runbibtaskex"):
    """Authenticate the user against the user database.
    Check for its password, if it exists.
    Check for action access rights.
    Return user name upon authorization success,
    do system exit upon authorization failure.

    NOTE(review): the password is compared in plain text here.
    """
    print header
    print "=" * len(header)
    if user == "":
        print >> sys.stdout, "\rUsername: ",
        user = string.strip(string.lower(sys.stdin.readline()))
    else:
        print >> sys.stdout, "\rUsername:", user
    ## first check user pw:
-    res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
+    res = run_sql("select id,password from user where email=%s", (user,), 1) + \
+          run_sql("select id,password from user where nickname=%s", (user,), 1)
    if not res:
        print "Sorry, %s does not exist." % user
        sys.exit(1)
    else:
        (uid_db, password_db) = res[0]
        # an empty stored password means no password check is required
        if password_db:
            password_entered = getpass.getpass()
            if password_db == password_entered:
                pass
            else:
                print "Sorry, wrong credentials for %s." % user
                sys.exit(1)
    ## secondly check authorization for the action:
    (auth_code, auth_message) = acc_authorize_action(uid_db, action)
    if auth_code != 0:
        print auth_message
        sys.exit(1)
    return user
def task_submit():
    """Submits task to the BibSched task queue. This is what people will be invoking via command line.

    Returns the id of the newly inserted schTASK row; the id is also
    stored back inside the marshalled arguments so the daemon can find
    itself later.
    """
    global options
    ## sanity check: remove eventual "task" option:
    if options.has_key("task"):
        del options["task"]
    ## authenticate user:
    user = authenticate(options.get("user", ""))
    ## submit task:
    if options["verbose"] >= 9:
        print ""
        write_message("storing task options %s\n" % options)
    task_id = run_sql("""INSERT INTO schTASK (id,proc,user,runtime,sleeptime,status,arguments)
                         VALUES (NULL,'bibtaskex',%s,%s,%s,'WAITING',%s)""",
                      (user, options["runtime"], options["sleeptime"], marshal.dumps(options)))
    ## update task number:
    options["task"] = task_id
    run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id))
    write_message("Task #%d submitted." % task_id)
    return task_id
def task_update_progress(msg):
    """Store a new progress message for this task in the schTASK table."""
    global options
    if options["verbose"] >= 9:
        write_message("Updating task progress to %s." % msg)
    query = "UPDATE schTASK SET progress=%s where id=%s"
    return run_sql(query, (msg, options["task"]))
def task_update_status(val):
    """Store a new status value for this task in the schTASK table."""
    global options
    if options["verbose"] >= 9:
        write_message("Updating task status to %s." % val)
    query = "UPDATE schTASK SET status=%s where id=%s"
    return run_sql(query, (val, options["task"]))
def task_read_status(task_id):
    """Return the status column of task *task_id* from schTASK, or
    'UNKNOWN' when the task row cannot be found."""
    res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1)
    try:
        out = res[0][0]
    except (IndexError, TypeError):
        # no such row (or unexpected result shape); the bare 'except'
        # was narrowed to the errors the indexing can actually raise
        out = 'UNKNOWN'
    return out
def task_get_options(id):
    """Returns options for the task 'id' read from the BibSched task queue table.

    Exits the process when the task row does not exist or its marshalled
    arguments cannot be decoded.
    """
    out = {}
    res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibtaskex'", (id,))
    try:
        out = marshal.loads(res[0][0])
    except (IndexError, TypeError, ValueError, EOFError):
        # narrowed from a bare 'except' (which also swallowed SystemExit
        # and KeyboardInterrupt): missing row or corrupt marshal data
        write_message("Error: BibTaskEx task %d does not seem to exist." % id, sys.stderr)
        sys.exit(1)
    return out
def task_run(task_id):
    """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call.
    The task prints Fibonacci numbers for up to NUM on the stdout, and some messages on stderr.
    Return 1 in case of success and 0 in case of failure."""
    global options
    options = task_get_options(task_id) # get options from BibSched task table
    ## check task id:
    if not options.has_key("task"):
        write_message("Error: The task #%d does not seem to be a BibTaskEx task." % task_id, sys.stderr)
        return 0
    ## check task status:
    task_status = task_read_status(task_id)
    if task_status != "WAITING":
        write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr)
        return 0
    ## we can run the task now:
    if options["verbose"]:
        write_message("Task #%d started." % task_id)
    task_update_status("RUNNING")
    ## initialize signal handler:
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    ## run the task:
    if options.has_key("number"):
        n = options["number"]
    else:
        n = cfg_n_default
    if options["verbose"] >= 9:
        write_message("Printing %d Fibonacci numbers." % n)
    for i in range(0, n):
        # every 4th (and otherwise every 5th) iteration emits a fake error
        # to demonstrate stderr reporting at different verbosity levels
        if i > 0 and i % 4 == 0:
            if options["verbose"] >= 3:
                write_message("Error: water in the CPU. Ignoring and continuing.", sys.stderr)
        elif i > 0 and i % 5 == 0:
            if options["verbose"]:
                write_message("Error: floppy drive dropped on the floor. Ignoring and continuing.", sys.stderr)
        if options["verbose"]:
            write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        time.sleep(1)
    ## we are done:
    task_update_progress("Done %d out of %d." % (n, n))
    task_update_status("DONE")
    if options["verbose"]:
        write_message("Task #%d finished." % task_id)
    return 1
def usage(exitcode=1, msg=""):
    """Write the command-line help to stderr and exit with EXITCODE.
    When MSG is non-empty it is printed first as an error line."""
    stream = sys.stderr
    if msg:
        stream.write("Error: %s.\n" % msg)
    for line in (
        "Usage: %s [options]\n" % sys.argv[0],
        "Command options:\n",
        " -n, --number=NUM\t Print Fibonacci numbers for up to NUM. [default=%d]\n" % cfg_n_default,
        "Scheduling options:\n",
        " -u, --user=USER \t User name to submit the task as, password needed.\n",
        " -t, --runtime=TIME \t Time to execute the task (now), e.g.: +15s, 5m, 3h, 2002-10-27 13:57:26\n",
        " -s, --sleeptime=SLEEP \t Sleeping frequency after which to repeat task (no), e.g.: 30m, 2h, 1d\n",
        "General options:\n",
        " -h, --help \t\t Print this help.\n",
        " -V, --version \t\t Print version information.\n",
        " -v, --verbose=LEVEL \t Verbose level (0=min, 1=default, 9=max).\n",
    ):
        stream.write(line)
    sys.exit(exitcode)
def main():
"""Main function that analyzes command line input and calls whatever is appropriate.
Useful for learning on how to write BibSched tasks."""
global options
## parse command line:
if len(sys.argv) == 2 and sys.argv[1].isdigit():
## A - run the task
task_id = int(sys.argv[1])
try:
if not task_run(task_id):
write_message("Error occurred. Exiting.", sys.stderr)
except StandardError, e:
write_message("Unexpected error occurred: %s." % e, sys.stderr)
write_message("Traceback is:", sys.stderr)
traceback.print_tb(sys.exc_info()[2])
write_message("Exiting.", sys.stderr)
task_update_status("ERROR")
else:
## B - submit the task
# set default values:
options = {}
options["runtime"] = time.strftime("%Y-%m-%d %H:%M:%S")
options["sleeptime"] = ""
options["verbose"] = 1
# set user-defined options:
try:
opts, args = getopt.getopt(sys.argv[1:], "hVv:n:u:s:t:", ["help", "version", "verbose=", "number=", "user=", "sleep=", "time="])
except getopt.GetoptError, err:
usage(1, err)
try:
for opt in opts:
if opt[0] in ["-h", "--help"]:
usage(0)
elif opt[0] in ["-V", "--version"]:
print __revision__
sys.exit(0)
elif opt[0] in [ "-u", "--user"]:
options["user"] = opt[1]
elif opt[0] in ["-v", "--verbose"]:
options["verbose"] = int(opt[1])
elif opt[0] in ["-n", "--number"]:
options["number"] = int(opt[1])
elif opt[0] in [ "-s", "--sleeptime" ]:
get_datetime(opt[1]) # see if it is a valid shift
options["sleeptime"] = opt[1]
elif opt[0] in [ "-t", "--runtime" ]:
options["runtime"] = get_datetime(opt[1])
else:
usage(1)
except StandardError, e:
usage(e)
task_submit()
return
### okay, here we go:
if __name__ == '__main__':
    # run the CLI entry point only when executed as a script, not on import
    main()
diff --git a/modules/bibupload/lib/bibupload.py b/modules/bibupload/lib/bibupload.py
index cc18a8bb7..828c4c592 100644
--- a/modules/bibupload/lib/bibupload.py
+++ b/modules/bibupload/lib/bibupload.py
@@ -1,1301 +1,1302 @@
# -*- coding: utf-8 -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
# pylint: disable-msg=C0301
"""
BibUpload: Receive MARC XML file and update the appropriate database
tables according to options.
Usage: bibupload [options] input.xml
Examples:
$ bibupload -i input.xml
Options:
-a, --append new fields are appended to the existing record
-c, --correct fields are replaced by the new ones in the existing record
-f, --format takes only the FMT fields into account. Does not update
-i, --insert insert the new record in the database
-r, --replace the existing record is entirely replaced by the new one
-z, --reference update references (update only 999 fields)
-s, --stage=STAGE stage to start from in the algorithm (0: always done; 1: FMT tags;
2: FFT tags; 3: BibFmt; 4: Metadata update; 5: time update)
-n, --notimechange do not change record last modification date when updating
Scheduling options:
-u, --user=USER user name to store task, password needed
General options:
-h, --help print this help and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
-V --version print the script version
"""
__revision__ = "$Id$"
import os
import sys
import getopt
import getpass
import signal
import string
import marshal
import time
import traceback
from zlib import compress
import MySQLdb
import re
from invenio.bibupload_config import *
from invenio.access_control_engine import acc_authorize_action
from invenio.dbquery import run_sql, \
Error
from invenio.bibrecord import create_records, \
create_record, \
record_add_field, \
record_delete_field, \
record_xml_output, \
record_get_field_instances, \
field_get_subfield_values
from invenio.dateutils import convert_datestruct_to_datetext
from invenio.search_engine import print_record
from invenio.config import filedir, \
filedirsize, \
htdocsurl
# Global variables
# Task options, filled in by parse_command() (CLI) or task_get_options() (queue):
options = {}
options['mode'] = None               # upload mode: insert/replace/correct/append/reference/format
options['verbose'] = 1               # verbosity level, 0 (silent) to 9 (debug)
options['tag'] = None                # optional single MARC tag to restrict -a/-c to
options['file_path'] = None          # absolute path of the input MARC XML file
options['notimechange'] = 0          # when 1, keep the record modification date untouched
options['stage_to_start_from'] = 1   # first bibupload() stage to execute (see --stage)
#Statistic variables
# counters reported at end of run by print_out_bibupload_statistics():
stat = {}
stat['nb_records_to_upload'] = 0
stat['nb_records_updated'] = 0
stat['nb_records_inserted'] = 0
stat['nb_errors'] = 0
stat['exectime'] = time.localtime()  # start time, used to compute elapsed time
### bibsched task related functions:
def write_message(msg, stream=sys.stdout, verbose=1):
    """Write message and flush output stream (may be sys.stdout or
    sys.stderr). Useful for debugging stuff. Do not print anything
    if the global verbose option is lower than VERBOSE.
    """
    if stream not in (sys.stdout, sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    if options['verbose'] >= verbose:
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
        stream.write(timestamp)
        stream.write("%s\n" % msg)
        stream.flush()
    return
def task_sig_sleep(sig, frame):
    """Signal handler for the 'sleep' signal sent by BibSched."""
    if options["verbose"] >= 9:
        details = "task_sig_sleep(), got signal %s frame %s" % (sig, frame)
        write_message(details)
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # block here until BibSched delivers the wake-up signal (SIGCONT)
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Signal handler for the 'wakeup' signal sent by BibSched."""
    if options["verbose"] >= 9:
        details = "task_sig_wakeup(), got signal %s frame %s" % (sig, frame)
        write_message(details)
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
    """Signal handler for the 'stop' signal sent by BibSched."""
    if options["verbose"] >= 9:
        details = "task_sig_stop(), got signal %s frame %s" % (sig, frame)
        write_message(details)
    write_message("stopping...")
    task_update_status("STOPPING")
    # give pending work a moment to settle before reporting STOPPED
    write_message("flushing cache or whatever...")
    time.sleep(3)
    write_message("closing tables or whatever...")
    time.sleep(1)
    write_message("stopped")
    task_update_status("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Signal handler for the 'suicide' signal sent by BibSched."""
    if options["verbose"] >= 9:
        details = "task_sig_suicide(), got signal %s frame %s" % (sig, frame)
        write_message(details)
    write_message("suiciding myself now...")
    task_update_status("SUICIDING")
    write_message("suicided")
    task_update_status("SUICIDED")
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Signal handler for the other unknown signals sent by shell or user."""
    # do nothing for unknown signals, just log them:
    write_message("unknown signal %d (frame %s) ignored" % (sig, frame))
def authenticate(user, header="BibUpload Task Submission", action="runbibupload"):
"""Authenticate the user against the user database.
Check for its password, if it exists.
Check for action access rights.
Return user name upon authorization success,
do system exit upon authorization failure.
"""
# FIXME: for the time being do not authenticate but always let the
# tasks in, because of automated inserts. Maybe we shall design
# an internal user here that will always be let in.
return user
print header
print "=" * len(header)
if user == "":
print >> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print >> sys.stdout, "\rUsername:", user
## first check user pw:
- res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
+ res = run_sql("select id,password from user where email=%s", (user,), 1) + \
+ run_sql("select id,password from user where nickname=%s", (user,), 1)
if not res:
print "Sorry, %s does not exist." % user
sys.exit(1)
else:
(uid_db, password_db) = res[0]
if password_db:
password_entered = getpass.getpass()
if password_db == password_entered:
pass
else:
print "Sorry, wrong credentials for %s." % user
sys.exit(1)
## secondly check authorization for the action:
(auth_code, auth_message) = acc_authorize_action(uid_db, action)
if auth_code != 0:
print auth_message
sys.exit(1)
return user
def task_submit():
    """Submits task to the BibSched task queue. This is what people will be invoking via command line.

    Returns the id of the newly created schTASK row."""
    global options
    ## sanity check: remove eventual "task" option:
    if options.has_key("task"):
        del options["task"]
    ## authenticate user:
    user = authenticate(options.get("user", ""))
    ## submit task:
    if options["verbose"] >= 9:
        print ""
        write_message("storing task options %s\n" % options)
    # insert first to obtain the auto-increment task id...
    task_id = run_sql("""INSERT INTO schTASK (id,proc,user,runtime,sleeptime,status,arguments)
                         VALUES (NULL,'bibupload',%s,%s,%s,'WAITING',%s)""",
                      (user, options["runtime"], options["sleeptime"], marshal.dumps(options)))
    ## update task number:
    # ...then re-store the options with the task id embedded, so the worker
    # process can find its own id inside the marshalled arguments blob
    options["task"] = task_id
    run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id))
    write_message("Task #%d submitted." % task_id)
    return task_id
def task_update_progress(msg):
    """Updates progress information in the BibSched task table."""
    global options
    debugging = options["verbose"] >= 9
    if debugging:
        write_message("Updating task progress to %s." % msg)
    return run_sql("UPDATE schTASK SET progress=%s WHERE id=%s",
                   (msg, options["task"]))
def task_update_status(val):
    """Updates status information in the BibSched task table."""
    global options
    debugging = options["verbose"] >= 9
    if debugging:
        write_message("Updating task status to %s." % val)
    return run_sql("UPDATE schTASK SET status=%s WHERE id=%s",
                   (val, options["task"]))
def task_read_status(task_id):
    """Read status information in the BibSched task table.

    Returns the status string of task TASK_ID, or 'UNKNOWN' when the
    task row does not exist."""
    res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1)
    try:
        out = res[0][0]
    except (IndexError, TypeError):
        # fix: was a bare `except:` that also swallowed real failures
        # (e.g. SystemExit, database errors); only a missing/empty result
        # should map to UNKNOWN
        out = 'UNKNOWN'
    return out
def task_get_options(task_id):
    """Returns options for the task 'task_id' read from the BibSched task queue table.

    Exits the process when the task row is missing or its marshalled
    arguments blob cannot be decoded."""
    out = {}
    res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibupload'", (task_id,))
    try:
        out = marshal.loads(res[0][0])
    except (IndexError, TypeError, ValueError, EOFError):
        # fix: narrowed a bare `except:`; IndexError = no such task,
        # the others = corrupt marshal payload
        write_message("Error: BibUpload task %d does not seem to exist." % task_id, sys.stderr)
        sys.exit(1)
    return out
def task_run(task_id):
    """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call.
    Opens the MARC XML file given in the task options and uploads every
    record in it via bibupload().
    Return 1 in case of success and 0 in case of failure."""
    global options, stat
    options = task_get_options(task_id) # get options from BibSched task table
    ## check task id: a real BibSched row always carries its own id under "task"
    if not options.has_key("task"):
        write_message("Error: The task #%d does not seem to be a BibUpload task." % task_id, sys.stderr)
        return 0
    ## check task status: refuse to run a task that is not queued as WAITING
    task_status = task_read_status(task_id)
    if task_status != "WAITING":
        write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr)
        return 0
    ## we can run the task now:
    if options["verbose"]:
        write_message("Task #%d started." % task_id)
    task_update_status("RUNNING")
    ## initialize signal handler so BibSched can sleep/stop/kill this process:
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    ## run the task:
    error = 0
    write_message("BibUpload Mode "+options['mode']+" has been choosen.", verbose=2)
    write_message("STAGE 0:", verbose=2)
    if options['file_path'] != None:
        recs = xml_marc_to_records(open_marc_file(options['file_path']))
        stat['nb_records_to_upload'] = len(recs)
        write_message(" -Open XML marc: DONE", verbose=2)
        if recs != None:
            #We proceed each record by record
            for record in recs:
                # bibupload() returns (error_code, rec_id)
                error = bibupload(record)
                if error[0] == 1:
                    stat['nb_errors'] += 1
                task_update_progress("Done %d out of %d." % (stat['nb_records_inserted'] + stat['nb_records_updated'],
                                                            stat['nb_records_to_upload']))
        else:
            write_message(" Error bibupload failed: No record found", verbose=1, stream=sys.stderr)
    if options['verbose'] >= 1:
        #Print out the statistics
        print_out_bibupload_statistics()
    # Check if they were errors
    if stat['nb_errors'] >= 1:
        task_update_status("DONE WITH ERRORS")
    else:
        ## we are done:
        task_update_status("DONE")
    if options["verbose"]:
        write_message("Task #%d finished." % task_id)
    return 1
### bibupload engine functions:
def parse_command():
    """Analyze the command line and retrieve arguments (xml file,
    mode, etc) into global options variable.
    Return 0 in case everything went well, 1 in case of errors, 2
    in case only help or version number were asked for.
    """
    # NOTE: the module-level `options` dict is mutated in place, so no
    # `global` statement is needed here.
    # FIXME: add treatment of `time'
    try:
        opts, args = getopt.getopt(sys.argv[1:], "i:r:c:a:z:s:f:u:hv:Vn",
                                   [
                                     "insert=",
                                     "replace=",
                                     "correct=",
                                     "append=",
                                     "reference=",
                                     "stage=",
                                     "format=",
                                     "user=",
                                     "help",
                                     "verbose",
                                     "version",
                                     "notimechange",
                                   ])
    except getopt.GetoptError,erro:
        usage()
        write_message("Stage 0 error: %s" % erro, verbose=1, stream=sys.stderr)
        return 1
    # set the proper mode depending on the argument value
    for opt, opt_value in opts:
        # Verbose mode option
        if opt in ["-v", "--verbose"]:
            try:
                options['verbose'] = int(opt_value)
            except ValueError:
                write_message("Failed: enter a valid number for verbose mode (between 0 and 9).", verbose=1, stream=sys.stderr)
                return 1
        # stage mode option
        if opt in ["-s", "--stage"]:
            try:
                options['stage_to_start_from'] = int(opt_value)
            except ValueError:
                write_message("Failed: enter a valid number for the stage to start from(>0).", verbose=1, stream=sys.stderr)
                return 1
        # No time change option
        if opt in ["-n", "--notimechange"]:
            options['notimechange'] = 1
        # Insert mode option
        if opt in ["-i", "--insert"]:
            options['mode'] = 'insert'
            options['file_path'] = os.path.abspath(opt_value)
        # Replace mode option
        if opt in ["-r", "--replace"]:
            # We check if there is not the mode insert after replace
            # (i.e. `-r -i file.xml` means: replace, or insert when missing)
            if opt_value == '-i' or opt_value == '--insert':
                options['replace_insert'] = 1
                options['mode'] = 'replace_insert'
                options['file_path'] = os.path.abspath(args[0])
            else:
                # Creation of the records from the xml Marc in argument
                options['mode'] = 'replace'
                options['file_path'] = os.path.abspath(opt_value)
        # Correct mode option
        if opt in ["-c", "--correct"]:
            # We check if there is not just a special tag to correct:
            # `-c 909 file.xml` restricts the correction to tag 909,
            # `-c FMT file.xml` / `-c FFT file.xml` to those special tags,
            # otherwise the option value is the file itself
            try:
                if int(opt_value) > 0 and int(opt_value) < 999:
                    options['mode'] = 'correct'
                    options['tag'] = opt_value
                    options['file_path'] = os.path.abspath(args[0])
            except ValueError:
                if opt_value == 'FMT' or opt_value == 'FFT':
                    options['mode'] = 'correct'
                    options['tag'] = opt_value
                    options['file_path'] = os.path.abspath(args[0])
                else:
                    options['mode'] = 'correct'
                    options['file_path'] = os.path.abspath(opt_value)
        # Append mode option
        if opt in ["-a", "--append"]:
            # We check if there is not just a special tag to append
            # (same tag/FMT/FFT convention as --correct above)
            try:
                if int(opt_value) > 0 and int(opt_value) < 999:
                    options['mode'] = 'append'
                    options['tag'] = opt_value
                    options['file_path'] = os.path.abspath(args[0])
            except ValueError:
                if opt_value == 'FMT' or opt_value == 'FFT':
                    options['mode'] = 'append'
                    options['tag'] = opt_value
                    options['file_path'] = os.path.abspath(args[0])
                else:
                    options['mode'] = 'append'
                    options['file_path'] = os.path.abspath(opt_value)
        # Reference mode option
        if opt in ["-z", "--reference"]:
            options['mode'] = 'reference'
            options['file_path'] = os.path.abspath(opt_value)
        # Format mode option
        if opt in ["-f", "--format"]:
            options['mode'] = 'format'
            options['file_path'] = os.path.abspath(opt_value)
        # Detection of user
        if opt in ["-u", "--user"]:
            options['user'] = opt_value
        # Help mode option
        if opt in ["-h", "--help"]:
            usage()
            return 2
        # Version mode option
        if opt in ["-V", "--version"]:
            write_message(__revision__, verbose=1)
            return 2
    # a mode and an input file are both mandatory:
    if options['mode'] == None:
        write_message("Please specify at least one update/insert mode!")
        return 1
    if options['file_path'] == None:
        write_message("Missing filename! -h for help.")
        return 1
    return 0
def bibupload(record):
    """Main function: process a record and fit it in the tables
    bibfmt, bibrec, bibrec_bibxxx, bibxxx with proper record
    metadata.
    Return (error_code, recID) of the processed record.

    The work is organised in stages 0-5; options['stage_to_start_from']
    allows skipping the earlier ones (see --stage on the command line).
    """
    error = None
    #If there are special tags to proceed check if it exists in the record
    if options['tag'] != None and not(record.has_key(options['tag'])):
        write_message(" Failed: Tag not found, enter a valid tag to update.", verbose=1, stream=sys.stderr)
        return (1, -1)
    #Extraction of the Record Id (from tag 001 or the external SYSNO tag)
    rec_id = retrieve_rec_id(record)
    if rec_id == -1:
        return (1, -1)
    else:
        write_message(" -Retrieve record Id (found %s): DONE." % rec_id, verbose=2)
    write_message(" -Check if the xml marc file is already in the database: DONE" , verbose=2)
    # Reference mode check if there are reference tag
    if options['mode'] == 'reference':
        error = extract_tag_from_record(record, CFG_BIBUPLOAD_REFERENCE_TAG)
        if error == None:
            write_message(" Failed: No reference tags has been found...", verbose=1, stream=sys.stderr)
            return (1, -1)
        else:
            error = None
            write_message(" -Check if reference tags exist: DONE", verbose=2)
    if options['mode'] == 'insert':
        # Insert the record into the bibrec databases to have a recordId
        rec_id = create_new_record()
        write_message(" -Creation of a new record id (%d): DONE" % rec_id, verbose=2)
        # we add the record Id control field to the record
        error = record_add_field(record, '001', '', '', rec_id, [], 0)
        if error == None:
            write_message(" Failed: " \
                "Error during adding the 001 controlfield " \
                "to the record", verbose=1, stream=sys.stderr)
            return (1, rec_id)
        else:
            error = None
    elif options['mode'] != 'insert' and options['mode'] != 'format' and options['stage_to_start_from'] != 5:
        # Update Mode
        #Retrieve the old record to update
        rec_old = create_record(print_record(int(rec_id),'xm'), 2)[0]
        if rec_old == None:
            write_message(" Failed during the creation of the old record!", verbose=1, stream=sys.stderr)
            return (1, rec_id)
        else:
            write_message(" -Retrieve the old record to update: DONE", verbose=2)
        #Delete tags to correct in the record
        if options['mode'] == 'correct' or options['mode'] == 'reference':
            delete_tags_to_correct(record, rec_old)
            write_message(" -Delete the old tags to correct in the old record: DONE", verbose=2)
        # Append new tag to the old record and update the new record with the old_record modified
        if options['mode'] == 'append' or options['mode'] == 'correct' or options['mode'] == 'reference':
            record = append_new_tag_to_old_record(record, rec_old)
            write_message(" -Append new tags to the old record: DONE", verbose=2)
        # now we clear all the rows from bibrec_bibxxx from the old record
        # (they will be re-created from scratch in stage 4)
        delete_bibrec_bibxxx(rec_old, rec_id)
        write_message(" -Clean bibrec_bibxxx: DONE", verbose=2)
    write_message(" -Stage COMPLETED", verbose=2)
    #Have a look if we have FMT tags
    write_message("Stage 1: Start (Insert of FMT tags if exist).", verbose=2)
    if options['stage_to_start_from'] <= 1 and extract_tag_from_record(record, 'FMT') != None:
        # insert_fmt_tags returns None on failure, 0 when mode 'format'
        # finished all its work, or the record stripped of its FMT tags
        record = insert_fmt_tags(record, rec_id)
        if record == None:
            write_message(" Stage 1 failed: Error while inserting FMT tags", verbose=1, stream=sys.stderr)
            return (1, rec_id)
        elif record == 0:
            #Mode format finished
            stat['nb_records_updated'] += 1
            return (0, rec_id)
        write_message(" -Stage COMPLETED", verbose=2)
    else:
        write_message(" -Stage NOT NEEDED", verbose=2)
    #Have a look if we have FFT tags
    write_message("Stage 2: Start (Process FFT tags if exist).", verbose=2)
    if options['stage_to_start_from'] <= 2 and extract_tag_from_record(record, 'FFT') != None:
        if options['mode'] == 'insert' or options['mode'] == 'append':
            record = insert_fft_tags(record, rec_id)
        else:
            record = update_fft_tag(record, rec_id)
        write_message(" -Stage COMPLETED", verbose=2)
    else:
        write_message(" -Stage NOT NEEDED", verbose=2)
    # Update of the BibFmt
    write_message("Stage 3: Start (Update bibfmt).", verbose=2)
    if options['stage_to_start_from'] <= 3:
        # format the single record as xml
        rec_xml_new = record_xml_output(record)
        #Update bibfmt with the format xm of this record
        if options['mode'] != 'format':
            error = update_bibfmt_format(rec_id, rec_xml_new, 'xm')
            if error == 1:
                write_message(" Failed: error during update_bibfmt_format", verbose=1, stream=sys.stderr)
                return (1, rec_id)
        write_message(" -Stage COMPLETED", verbose=2)
    # Update the database MetaData
    write_message("Stage 4: Start (Update the database with the metadata).", verbose=2)
    if options['stage_to_start_from'] <= 4:
        update_database_with_metadata(record, rec_id)
        write_message(" -Stage COMPLETED", verbose=2)
    else:
        write_message(" -Stage NOT NEEDED", verbose=2)
    # Finally we update the bibrec table with the current date
    write_message("Stage 5: Start (Update bibrec table with current date).", verbose=2)
    if options['stage_to_start_from'] <= 5 and options['mode'] != 'insert' and options['notimechange'] == 0:
        now = convert_datestruct_to_datetext(time.localtime())
        write_message(" -Retrieve current localtime: DONE", verbose=2)
        update_bibrec_modif_date(now, rec_id)
        write_message(" -Stage COMPLETED", verbose=2)
    else:
        write_message(" -Stage NOT NEEDED", verbose=2)
    #Increase statistics
    if options['mode'] == 'insert':
        stat['nb_records_inserted'] += 1
    else:
        stat['nb_records_updated'] += 1
    #Upload of this record finish
    write_message("Record "+str(rec_id)+" DONE", verbose=1)
    return (0, rec_id)
def usage():
    """Print the bibupload command-line help text to standard output."""
    print """Receive MARC XML file and update appropriate database tables according to options.
Usage: bibupload [options] input.xml
Examples:
$ bibupload -i input.xml
Options:
-a, --append new fields are appended to the existing record
-c, --correct fields are replaced by the new ones in the existing record
-f, --format takes only the FMT fields into account. Does not update
-i, --insert insert the new record in the database
-r, --replace the existing record is entirely replaced by the new one
-z, --reference update references (update only 999 fields)
-s, --stage=STAGE stage to start from in the algorithm (0: always done; 1: FMT tags;
2: FFT tags; 3: BibFmt; 4: Metadata update; 5: time update)
-n, --notimechange do not change record last modification date when updating
Scheduling options:
-u, --user=USER user name to store task, password needed
General options:
-h, --help print this help and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
-V --version print the script version
"""
def print_out_bibupload_statistics():
    """Print the statistics of the process"""
    elapsed = time.time() - time.mktime(stat['exectime'])
    summary = "Task stats: %(nb_input)d input records, %(nb_updated)d updated, " \
              "%(nb_inserted)d inserted, %(nb_errors)d errors. Time %(nb_sec).2f sec." % {
                  'nb_input': stat['nb_records_to_upload'],
                  'nb_updated': stat['nb_records_updated'],
                  'nb_inserted': stat['nb_records_inserted'],
                  'nb_errors': stat['nb_errors'],
                  'nb_sec': elapsed}
    write_message(summary)
def open_marc_file(path):
"""Open a file and return the data"""
try:
# open the file containing the marc document
marc_file = open(path,'r')
marc = marc_file.read()
marc_file.close()
except IOError, erro:
write_message("Error: %s" % erro, verbose=1, stream=sys.stderr)
write_message("Exiting.", sys.stderr)
task_update_status("ERROR")
sys.exit(1)
return marc
def xml_marc_to_records(xml_marc):
    """create the records"""
    # Creation of the records from the xml Marc in argument
    recs = create_records(xml_marc, 1, 1)
    if not recs:
        write_message("Error: Cannot parse MARCXML file.", verbose=1, stream=sys.stderr)
    elif recs[0][0] == None:
        write_message("Error: MARCXML file has wrong format: %s" % recs[0][2], verbose=1, stream=sys.stderr)
    else:
        # keep only the record structure out of each (record, status, errors) triple
        return map((lambda x:x[0]), recs)
    # shared failure path: mark the task as failed and stop
    write_message("Exiting.", sys.stderr)
    task_update_status("ERROR")
    sys.exit(1)
def find_record_bibrec(rec_id):
""" receives the record ID and returns if this record exist in bibrec """
query = """SELECT id FROM bibrec WHERE id = %s"""
params = (rec_id,)
try:
res = run_sql(query, params)
except Error, error:
write_message(" Error during find_record_bibrec function : %s " % error, verbose=1, stream=sys.stderr)
if len(res):
return res
else:
return None
def find_record_format(rec_id, format):
"""Look whether record REC_ID is formatted in FORMAT,
i.e. whether FORMAT exists in the bibfmt table for this record.
Return the number of times it is formatted: 0 if not, 1 if yes,
2 if found more than once (should never occur).
"""
out = 0
query = """SELECT COUNT(id) FROM bibfmt WHERE id_bibrec=%s AND format=%s"""
params = (rec_id, format)
res = []
try:
res = run_sql(query, params)
out = res[0][0]
except Error, error:
write_message(" Error during find_record_format() : %s " % error, verbose=1, stream=sys.stderr)
return out
def find_record_bibfmt(marc):
""" receives the xmlmarc containing a record and returns the id in bibrec if the record exists in bibfmt"""
# compress the marc value
pickled_marc = MySQLdb.escape_string(compress(marc))
query = """SELECT id_bibrec FROM bibfmt WHERE value = %s"""
# format for marc xml is xm
params = (pickled_marc,)
try:
res = run_sql(query, params)
except Error, error:
write_message(" Error during find_record_bibfmt function : %s " % error, verbose=1, stream=sys.stderr)
if len(res):
return res
else:
return None
def find_record_from_sysno(sysno):
"""receive the sysno number and return the record id"""
table_name = 'bib'+CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:2]+'x'
query = """SELECT DISTINCT id FROM `%s` WHERE value = '%s'"""
params = (table_name, sysno)
try:
res = run_sql(query % params)
except Error, error:
write_message(" Error during find_record_from_sysno function 1st query : %s " % error, verbose=1, stream=sys.stderr)
if len(res):
table_name = 'bibrec_bib'+CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:2]+'x'
query = """SELECT DISTINCT id_bibrec FROM `%s` WHERE id_bibxxx = '%s'"""
params = (table_name, res[0][0])
try:
res = run_sql(query % params)
except Error, error:
write_message(" Error during find_record_from_sysno function 2nd query : %s " % error, verbose=1, stream=sys.stderr)
if len(res):
return res[0][0]
else:
return None
else:
return None
def extract_tag_from_record(record, tag_number):
    """Return the content stored under TAG_NUMBER in RECORD, or None
    when RECORD is empty/None or does not contain the tag."""
    if not record:
        return None
    return record.get(tag_number, None)
def retrieve_rec_id(record):
    """Retrieve the record Id from a record by using tag 001 or SYSNO

    Returns the record id, -1 on a mode/record mismatch, or None when
    no id could be determined.  May flip options['mode'] from
    'replace_insert' to 'insert' when the record is not yet known."""
    rec_id = None
    tag = None
    #1st step: we look for the tag 001
    tag = extract_tag_from_record(record, '001')
    if tag != None:
        #We exctract the record Id from the Tag
        rec_id = tag[0][3]
        #if we are in insert mode => error
        if options['mode'] == 'insert':
            write_message(" Failed : Error tag 001 found in the xml submitted"\
                ", you should use the option replace, correct or append"\
                " to replace an existing record. -h for help.", verbose=1, stream=sys.stderr)
            return -1
        #if we found the rec id and we are not in insert mode => continue
        elif options['mode'] != 'insert':
            #we try to find the rec_id in the table bibrec
            if find_record_bibrec(rec_id) != None:
                return rec_id
            else:
                #The record doesn't exist yet. We will try to check the SYSNO id
                rec_id = None
    else:
        write_message(" -Tag 001 not found in the xml marc file.", verbose=9)
    if rec_id == None:
        #2nd step we look for the sysno code
        sysno = extract_tag_from_record(record, CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG)
        if sysno != None:
            #retrieve the SYSNO code from the tuple SYSNO
            # (first field instance, first subfield, subfield value)
            sysno = sysno[0][0][0][1]
            write_message(" Check if the SYSNO id "+sysno+" exist in the database", verbose=9)
            #Retrieve the rec id from the database
            rec_id = find_record_from_sysno(sysno)
            if rec_id != None and options['mode'] == 'insert':
                write_message(" Failed : Record id found in the database: Please choose another mode than insert. -h for help.",
                              verbose=1, stream=sys.stderr)
                return -1
            elif rec_id == None and options['mode'] != 'insert':
                # unknown record in an update mode: only replace_insert may
                # silently fall back to inserting it
                if options['mode'] != 'replace_insert':
                    write_message(" Failed : Record Id not found even with SYSNO id..."\
                        "Please insert the file before updating it."\
                        " -h for help", verbose=1, stream=sys.stderr)
                    return -1
                else:
                    options['mode'] = 'insert'
            else:
                return rec_id
        if sysno == None and options['mode'] != 'insert':
            if options['mode'] != 'replace_insert':
                write_message(" Failed : SYSNO tag not found in the xml marc file."\
                    "Please insert the file before updating it."\
                    " -h for help", verbose=1, stream=sys.stderr)
                return -1
            else:
                options['mode'] = 'insert'
    return rec_id
### Insert functions
def create_new_record():
"""Create new record in the database"""
now = convert_datestruct_to_datetext(time.localtime())
query = """INSERT INTO bibrec (creation_date, modification_date)
VALUES (%s, %s)"""
params = (now, now)
try:
rec_id = run_sql(query, params)
return rec_id
except Error, error:
write_message(" Error during the creation_new_record function : %s " % error, verbose=1, stream=sys.stderr)
return None
def insert_bibfmt(id_bibrec, marc, format):
"""Insert the format in the table bibfmt"""
# compress the marc value
#pickled_marc = MySQLdb.escape_string(compress(marc))
pickled_marc = compress(marc)
# get the current time
now = convert_datestruct_to_datetext(time.localtime())
query = """INSERT INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)"""
try:
row_id = run_sql(query, (id_bibrec, format, now, pickled_marc))
return row_id
except Error, error:
write_message(" Error during the insert_bibfmt function : %s " % error, verbose=1, stream=sys.stderr)
return None
def insert_record_bibxxx(tag, value):
"""Insert the record into bibxxx"""
#determine into which table one should insert the record
table_name = 'bib'+tag[0:2]+'x'
# check if the tag, value combination exists in the table
query = """SELECT id FROM %s """ % table_name
query += """ WHERE tag=%s AND value=%s"""
params = (tag, value)
try:
res = run_sql(query, params)
except Error, error:
write_message(" Error during the insert_record_bibxxx function : %s " % error, verbose=1, stream=sys.stderr)
if len(res):
# get the id of the row, if it exists
row_id = res[0][0]
return (table_name, row_id)
else:
# necessary to insert the tag, value into bibxxx table
query = """INSERT INTO %s """ % table_name
query += """ (tag, value) values (%s , %s)"""
params = (tag, value)
try:
row_id = run_sql(query, params)
except Error, error:
write_message(" Error during the insert_record_bibxxx function : %s " % error, verbose=1, stream=sys.stderr)
return (table_name, row_id)
def insert_record_bibrec_bibxxx(table_name, id_bibxxx, field_number, id_bibrec):
"""Insert the record into bibrec_bibxxx"""
#determine into which table one should insert the record
full_table_name = 'bibrec_'+ table_name
# insert the proper row into the table
query = """INSERT INTO %s """ % full_table_name
query += """(id_bibrec,id_bibxxx, field_number) values (%s , %s, %s)"""
params = (id_bibrec, id_bibxxx, field_number)
try:
res = run_sql(query, params)
except Error, error:
write_message(" Error during the insert_record_bibrec_bibxxx function 2nd query : %s " % error, verbose=1, stream=sys.stderr)
return res
def insert_fft_tags(record, rec_id):
"""Process and insert FFT tags"""
tuple_list = None
tuple_list = extract_tag_from_record(record, 'FFT')
#If there is a FFT TAG :)
if tuple_list != None:
for single_tuple in tuple_list:
# Get the inside of the FFT file
docpath = single_tuple[0][0][1]
docname = re.sub("\..*", "", os.path.basename(docpath))
extension = re.sub("^[^\.]*.", "", os.path.basename(docpath)).lower()
#Create a new docId
try:
bib_doc_id = run_sql("insert into bibdoc (docname,creation_date,modification_date) values(%s,NOW(),NOW())", (docname,))
write_message(" -Insert of the file %s into bibdoc : DONE" % docname, verbose=2)
except Error, error:
write_message(" Error during the insert_fft_tags function : %s " % error, verbose=1, stream=sys.stderr)
if bib_doc_id != None:
#we link the document to the record if a rec_id was specified
if rec_id != "":
#TO FIX doc_type : main or additional, fron where the information come from?
doc_type = ""
try:
res = run_sql("insert into bibrec_bibdoc values(%s,%s,%s)", (rec_id, bib_doc_id, doc_type))
if res == None:
write_message(" Failed during creation of link between doc Id and rec Id).", verbose=1, stream=sys.stderr)
else:
write_message(" -Insert of the link bibrec bibdoc for %s : DONE" % docname, verbose=2)
except Error, error:
write_message(" Error during the insert_fft_tags function : %s " % error, verbose=1, stream=sys.stderr)
else:
write_message(" Failed during creation of the new doc Id.", verbose=1, stream=sys.stderr)
#Move the file to the correct place
# Variables from the config file
archivepath = filedir
archivesize = filedirsize
url_path = None
group = "g"+str(int(int(bib_doc_id)/archivesize))
basedir = "%s/%s/%s" % (archivepath, group, bib_doc_id)
# we create the corresponding storage directory
if not os.path.exists(basedir):
try:
os.makedirs(basedir)
write_message(" -Create a new directory %s : DONE" % basedir, verbose=2)
except OSError, error:
write_message(" Error making the directory : %s " % error, verbose=1, stream=sys.stderr)
# and save the father record id if it exists
if rec_id != "":
try:
filep = open("%s/.recid" % basedir, "w")
filep.write(str(bib_doc_id))
filep.close()
except IOError, error:
write_message(" Error writing the file : %s " % error, verbose=1, stream=sys.stderr)
#Move the file to the good directory
try:
os.system("mv %s %s" % (docpath, basedir))
write_message(" -Move the file %s : DONE" % docname, verbose=2)
except OSError, error:
write_message(" Error moving the file : %s " % error, verbose=1, stream=sys.stderr)
#Create the Url Path
url_path = htdocsurl+"/record/"+str(rec_id)+"/files/"+docname+"."+extension
#add tag 856 to the xml marc to proceed
subfield_list = [('u', url_path), ('z', 'Access to Fulltext')]
newfield_number = record_add_field(record, "856", "4", "", "", subfield_list)
if newfield_number == None:
write_message(" Error when adding the field"+ single_tuple, verbose=1, stream=sys.stderr)
else:
write_message(" -Add the new tag 856 to the record for %s : DONE" % docname, verbose=2)
#Delete FFT tag :)
record_delete_field(record, 'FFT', '', '')
write_message(" -Delete FFT tag from source : DONE", verbose=2)
return record
def insert_fmt_tags(record, rec_id):
    """Process and insert FMT tags.

    Each FMT field carries a preformatted output value in subfield $g
    for the format name in subfield $f; each pair is pushed to the
    bibfmt table for record `rec_id` via update_bibfmt_format().

    Returns: the record with its FMT fields removed; 0 when running in
    'format' mode (only the FMT tags matter there); or None when
    'format' mode found no FMT tag at all.
    """
    fmt_fields = record_get_field_instances(record, 'FMT')
    if fmt_fields:
        for fmt_field in fmt_fields:
            # Get the f, g subfields of the FMT tag
            try:
                f_value = field_get_subfield_values(fmt_field, "f")[0]
            except IndexError:
                # subfield $f absent: fall back to an empty format name
                f_value = ""
            try:
                g_value = field_get_subfield_values(fmt_field, "g")[0]
            except IndexError:
                # subfield $g absent: fall back to an empty format value
                g_value = ""
            # Update the format
            res = update_bibfmt_format(rec_id, g_value, f_value)
            if res == 1:
                write_message(" Failed: Error during update_bibfmt", verbose=1, stream=sys.stderr)
            # If we are in format mode, we only care about the FMT tag
            if options['mode'] == 'format':
                return 0
        # We delete the FMT Tag of the record
        record_delete_field(record, 'FMT')
        write_message(" -Delete field FMT from record : DONE", verbose=2)
        return record
    elif options['mode'] == 'format':
        # format mode with nothing to format is reported as a failure
        write_message(" Failed: Format updated failed : No tag FMT found", verbose=1, stream=sys.stderr)
        return None
    else:
        return record
### Update functions
def update_bibrec_modif_date(now, bibrec_id):
"""Update the date of the record in bibrec table """
query = """UPDATE bibrec SET modification_date=%s WHERE id=%s"""
params = (now, bibrec_id)
try:
res = run_sql(query, params)
if res != 1:
write_message(" Failed : Sql error during the update of the bibrec modification_date", verbose=1, stream=sys.stderr)
else:
write_message(" -Update record modification date : DONE" , verbose=2)
except Error, error:
write_message(" Error during update_bibrec_modif_date function : %s" % error, verbose=1, stream=sys.stderr)
def update_bibfmt_format(id_bibrec, format_value, format_name):
"""Update the format in the table bibfmt"""
# We check if the format is already in bibFmt
nb_found = find_record_format(id_bibrec, format_name)
if nb_found == 1:
# Update the format
# get the current time
now = convert_datestruct_to_datetext(time.localtime())
# compress the format_value value
pickled_format_value = compress(format_value)
query = """UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s"""
params = (now, pickled_format_value, id_bibrec, format_name)
try:
row_id = run_sql(query, params)
if row_id == None:
write_message(" Failed: Error during update_bibfmt_format function", verbose=1, stream=sys.stderr)
return 1
else:
write_message(" -Update the format %s in bibfmt : DONE" % format_name , verbose=2)
return 0
except Error, error:
write_message(" Error during the update_bibfmt_format function : %s " % error, verbose=1, stream=sys.stderr)
elif nb_found > 1:
write_message(" Failed: Same format %s found several time in bibfmt for the same record." % format_name, verbose=1, stream=sys.stderr)
return 1
else:
# Insert the format information in BibFMT
res = insert_bibfmt(id_bibrec, format_value, format_name)
if res == None:
write_message(" Failed: Error during insert_bibfmt", verbose=1, stream=sys.stderr)
return 1
else:
write_message(" -Insert the format %s in bibfmt : DONE" % format_name , verbose=2)
return 0
def update_database_with_metadata(record, rec_id):
    """Update the database tables with the record and the record id given in parameter.

    For every non-special tag of `record`, builds the full MARC tag
    string (tag + indicators [+ subfield code], missing indicators
    stored as '_'), inserts each value into the matching bibxxx table
    and links it to `rec_id` through bibrec_bibxxx.  Only runs the
    inserts in 'insert', 'replace' and 'append' modes.
    """
    for tag in record.keys():
        # check if tag is not a control field : tag not in CFG_BIBUPLOAD_CONTROLFIELD_TAGS and
        if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS:
            # for each tag there is a list of tuples representing datafields
            tuple_list = record[tag]
            # this list should contain the elements of a full tag [tag, ind1, ind2, subfield_code]
            tag_list = []
            tag_list.append(tag)
            for single_tuple in tuple_list:
                # these are the contents of a single tuple
                subfield_list = single_tuple[0]
                ind1 = single_tuple[1]
                ind2 = single_tuple[2]
                # append the ind's to the full tag; missing indicators are
                # stored as '_'
                if (ind1 == ''):
                    tag_list.append('_')
                else:
                    tag_list.append(ind1)
                if (ind2 == ''):
                    tag_list.append('_')
                else:
                    tag_list.append(ind2)
                datafield_number = single_tuple[4]
                if tag in CFG_BIBUPLOAD_SPECIAL_TAGS:
                    # nothing to do for special tags (FFT, FMT)
                    pass
                elif tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS and tag != "001":
                    # controlfields carry their value directly in the tuple
                    value = single_tuple[3]
                    # get the full tag
                    full_tag = ''.join(tag_list)
                    # Others modes : update all the tag
                    if options['mode'] == 'insert' or options['mode'] == 'replace' or options['mode'] == 'append':
                        write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9)
                        # insert the tag and value into into bibxxx
                        (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value)
                        #print 'tname, bibrow', table_name, bibxxx_row_id;
                        if table_name == None or bibxxx_row_id == None:
                            write_message(" Failed : during insert_record_bibxxx", verbose=1, stream=sys.stderr)
                        # connect bibxxx and bibrec with the table bibrec_bibxxx
                        res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id)
                        if res == None:
                            write_message(" Failed : during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr)
                else:
                    # get the tag and value from the content of each subfield
                    for subfield in subfield_list:
                        subtag = subfield[0]
                        value = subfield[1]
                        tag_list.append(subtag)
                        # get the full tag
                        full_tag = ''.join(tag_list)
                        # Others modes : update all the tag
                        if options['mode'] == 'insert' or options['mode'] == 'replace' or options['mode'] == 'append':
                            write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9)
                            # insert the tag and value into into bibxxx
                            (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value)
                            if table_name == None or bibxxx_row_id == None:
                                write_message(" Failed : during insert_record_bibxxx", verbose=1, stream=sys.stderr)
                            # connect bibxxx and bibrec with the table bibrec_bibxxx
                            res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id)
                            if res == None:
                                write_message(" Failed : during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr)
                        # remove the subtag from the list
                        tag_list.pop()
                # drop ind2 and ind1 before the next instance of this tag
                tag_list.pop()
                tag_list.pop()
            # NOTE(review): the indentation of this last pop was ambiguous in
            # the original source; it is taken to drop the tag itself once
            # all of its instances have been processed -- confirm.
            tag_list.pop()
    write_message(" -Update the database with metadata : DONE", verbose=2)
def append_new_tag_to_old_record(record, rec_old):
    """Append new tags to a old record.

    When options['tag'] is set, only that tag is copied from `record`
    into `rec_old`; otherwise every tag is copied (in 'reference' mode
    only CFG_BIBUPLOAD_REFERENCE_TAG).  Tag 001 (the record id) is
    never copied.  Returns the modified old record.
    """
    if options['tag'] != None:
        tag = options['tag']
        if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS:
            if tag == '001':
                # the record id must never be overwritten
                pass
            else:
                # if it is a controlfield,just access the value
                for single_tuple in record[tag]:
                    controlfield_value = single_tuple[3]
                    # add the field to the old record
                    newfield_number = record_add_field(rec_old, tag, "", "", controlfield_value)
                    if newfield_number == None:
                        write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr)
        else:
            # For each tag there is a list of tuples representing datafields
            for single_tuple in record[tag]:
                # We retrieve the information of the tag
                subfield_list = single_tuple[0]
                ind1 = single_tuple[1]
                ind2 = single_tuple[2]
                # We add the datafield to the old record
                if options['verbose'] == 9:
                    print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list
                newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list)
                if newfield_number == None:
                    write_message("Error when adding the field"+tag, verbose=1, stream=sys.stderr)
    else:
        # Go through each tag in the appended record
        for tag in record.keys():
            # Reference mode append only reference tag
            if options['mode'] == 'reference':
                if tag == CFG_BIBUPLOAD_REFERENCE_TAG:
                    for single_tuple in record[tag]:
                        # We retrieve the information of the tag
                        subfield_list = single_tuple[0]
                        ind1 = single_tuple[1]
                        ind2 = single_tuple[2]
                        # We add the datafield to the old record
                        if options['verbose'] == 9:
                            print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list
                        newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list)
                        if newfield_number == None:
                            write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr)
            else:
                if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS:
                    if tag == '001':
                        # the record id must never be overwritten
                        pass
                    else:
                        # if it is a controlfield,just access the value
                        for single_tuple in record[tag]:
                            controlfield_value = single_tuple[3]
                            # add the field to the old record
                            newfield_number = record_add_field(rec_old, tag, "", "", controlfield_value)
                            if newfield_number == None:
                                write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr)
                else:
                    # For each tag there is a list of tuples representing datafields
                    for single_tuple in record[tag]:
                        # We retrieve the information of the tag
                        subfield_list = single_tuple[0]
                        ind1 = single_tuple[1]
                        ind2 = single_tuple[2]
                        # We add the datafield to the old record
                        if options['verbose'] == 9:
                            print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list
                        newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list)
                        if newfield_number == None:
                            write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr)
    return rec_old
def update_fft_tag(record, rec_id):
    """Replace the fulltext documents of a record by those of its FFT tags."""
    #TO IMPROVE: SELECT THE BIBDOC ID TO DELETE AND FIRST INSERT THE NEW FFT TAGS BEFORE DELETING THE OLD ONE
    # mark the old documents of this record as deleted ...
    delete_bibdoc(rec_id)
    # ... drop their links to the record ...
    delete_bibrec_bibdoc(rec_id)
    # ... remove the old fulltext URLs (tag 856, first indicator 4) ...
    record_delete_field(record, '856', '4')
    # ... and finally materialise the new FFT tags of the incoming record
    return insert_fft_tags(record, rec_id)
###Delete function
def delete_tags_to_correct(record, rec_old):
    """Delete from `rec_old` the tags that also appear in `record`."""
    wanted_tag = options['tag']
    # walk every tag of the incoming record
    for tag in record.keys():
        if wanted_tag == None:
            # all common tags are removed, except the record id (001)
            deletable = rec_old.has_key(tag) and tag != '001'
        else:
            # only the explicitly requested tag is removed
            deletable = rec_old.has_key(tag) and wanted_tag == tag
        if deletable:
            ind1 = rec_old[tag][0][1]
            ind2 = rec_old[tag][0][2]
            write_message(" Delete tag: "+tag+" ind1= "+ind1+" ind2= "+ind2, verbose=9)
            record_delete_field(rec_old, tag, ind1, ind2)
def delete_bibrec_bibxxx(record, id_bibrec):
"""Delete the database record from the table bibxxx given in parameters"""
# we clear all the rows from bibrec_bibxxx from the old record
for tag in record.keys():
if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS:
# for each name construct the bibrec_bibxxx table name
table_name = 'bibrec_bib'+tag[0:2]+'x'
# delete all the records with proper id_bibrec
query = """DELETE FROM `%s` where id_bibrec = %s"""
params = (table_name, id_bibrec)
try:
run_sql(query % params)
except Error, error:
write_message(" Error during the delete_bibrec_bibxxx function : %s " % error, verbose=1, stream=sys.stderr)
def delete_bibdoc(id_bibrec):
"""Delete document from bibdoc which correspond to the bibrec id given in parameter"""
query = """UPDATE bibdoc SET status='deleted' WHERE id IN (SELECT id_bibdoc FROM bibrec_bibdoc WHERE id_bibrec=%s)"""
params = (id_bibrec,)
try:
run_sql(query, params)
except Error, error:
write_message(" Error during the delete_bibdoc function : %s " % error, verbose=1, stream=sys.stderr)
def delete_bibrec_bibdoc(id_bibrec):
"""Delete the bibrec record from the table bibrec_bibdoc given in parameter"""
# delete all the records with proper id_bibrec
query = """DELETE FROM bibrec_bibdoc WHERE id_bibrec=%s"""
params = (id_bibrec,)
try:
run_sql(query, params)
except Error, error:
write_message(" Error during the delete_bibrec_bibdoc function : %s " % error, verbose=1, stream=sys.stderr)
def main():
    """main entry point for bibupload

    With a single numeric argument, runs the already-submitted task of
    that id; otherwise parses the command line and submits a new task.
    """
    global options
    ## parse command line:
    if len(sys.argv) == 2 and sys.argv[1].isdigit():
        ## A - run the task
        task_id = int(sys.argv[1])
        try:
            if not task_run(task_id):
                # NOTE(review): the stream is passed positionally here while
                # the rest of this file calls write_message(..., verbose=...,
                # stream=...); confirm write_message's second positional
                # parameter really is the stream.
                write_message("Error occurred. Exiting.", sys.stderr)
        except StandardError, erro:
            # catch-all boundary: report, mark the task as failed, keep going
            write_message("Unexpected error occurred: %s." % erro, sys.stderr)
            write_message("Traceback is:", sys.stderr)
            traceback.print_tb(sys.exc_info()[2])
            write_message("Exiting.", sys.stderr)
            task_update_status("ERROR")
    else:
        ## B - submit the task
        # set default values:
        options["runtime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        options["sleeptime"] = ""
        # set user-defined options:
        error = parse_command()
        if error == 0:
            task_submit()
        else:
            sys.exit(1)
    return
# run main() only when invoked as a script, not on import
if __name__ == "__main__":
    main()
diff --git a/modules/websearch/lib/websearch_webcoll.py b/modules/websearch/lib/websearch_webcoll.py
index 74789b498..e0dc5d388 100644
--- a/modules/websearch/lib/websearch_webcoll.py
+++ b/modules/websearch/lib/websearch_webcoll.py
@@ -1,1087 +1,1088 @@
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Creates CDS Invenio collection specific pages, using WML and MySQL configuration tables."""
__revision__ = "$Id$"
import calendar
import copy
import getopt
import getpass
import marshal
import signal
import sys
import cgi
import sre
import os
import string
import zlib
import Numeric
import time
import traceback
from invenio.config import \
CFG_CERN_SITE, \
CFG_WEBSEARCH_INSTANT_BROWSE, \
CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, \
cachedir, \
cdslang, \
cdsname, \
weburl
from invenio.messages import gettext_set_language, language_list_long
from invenio.search_engine import HitSet, search_pattern, get_creation_date, get_field_i18nname
from invenio.dbquery import run_sql, escape_string, Error, get_table_update_time
from invenio.access_control_engine import acc_authorize_action
from invenio.bibrank_record_sorter import get_bibrank_methods
from invenio.dateutils import convert_datestruct_to_dategui
from invenio.websearch_external_collections import \
external_collection_load_states, \
dico_collection_external_searches, \
external_collection_sort_engine_by_name
import invenio.template
websearch_templates = invenio.template.load('websearch')
## global vars
collection_house = {} # will hold collections we treat in this run of the program; a dict of {collname2, collobject1}, ...
options = {} # will hold task options, filled in by the command-line parser
# cfg_cache_last_updated_timestamp_tolerance -- cache timestamp
# tolerance (in seconds), to account for the fact that an admin might
# accidentally happen to edit the collection definitions at exactly
# the same second when some webcoll process was about to be started.
# In order to be safe, let's put an exaggerated timestamp tolerance
# value such as 20 seconds:
cfg_cache_last_updated_timestamp_tolerance = 20
# cfg_cache_last_updated_timestamp_file -- location of the cache
# timestamp file:
cfg_cache_last_updated_timestamp_file = "%s/collections/last_updated" % cachedir
def get_collection(colname):
    """Return collection object from the collection house for given colname.
    If does not exist, then create it."""
    try:
        return collection_house[colname]
    except KeyError:
        # first time we see this collection in this run: cache it
        collection_house[colname] = Collection(colname)
        return collection_house[colname]
## auxiliary functions:
def mymkdir(newdir, mode=0777):
"""works the way a good mkdir should :)
- already exists, silently complete
- regular file in the way, raise an exception
- parent directory(ies) does not exist, make them as well
"""
if os.path.isdir(newdir):
pass
elif os.path.isfile(newdir):
raise OSError("a file with the same name as the desired " \
"dir, '%s', already exists." % newdir)
else:
head, tail = os.path.split(newdir)
if head and not os.path.isdir(head):
mymkdir(head, mode)
if tail:
os.umask(022)
os.mkdir(newdir, mode)
def is_selected(var, fld):
"Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes."
if var == fld:
return " selected"
else:
return ""
def write_message(msg, stream=sys.stdout):
    """Write message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff."""
    # refuse anything but the two standard streams
    if stream is not sys.stdout and stream is not sys.stderr:
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    # prefix every message with a timestamp, then flush immediately
    stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
    stream.write("%s\n" % msg)
    stream.flush()
    return
def get_field(recID, tag):
    """Return the list of values of field `tag` for record `recID`.

    Looks the values up through the bibrec_bibXXx / bibXXx table pair
    selected by the first two digits of the tag.
    """
    digit = tag[0:2]
    # table names cannot be bound as SQL parameters; they derive from the
    # tag digits, not from user input
    bx = "bib%sx" % digit
    bibx = "bibrec_bib%sx" % digit
    # bind recID and tag instead of interpolating them into the SQL string
    query = "SELECT bx.value FROM %s AS bx, %s AS bibx" \
            " WHERE bibx.id_bibrec=%%s AND bx.id=bibx.id_bibxxx AND bx.tag=%%s" % (bx, bibx)
    res = run_sql(query, (recID, tag))
    return [row[0] for row in res]
def print_record(recID, format='hb', ln=cdslang):
    """Return record `recID` formatted according to `format`.

    Serves the preformatted value cached in the bibfmt table when
    available, otherwise assembles a default brief format out of
    selected MARC fields; always appends the record-links footer.
    """
    out = ""
    # HTML brief format by default; bind the values instead of
    # interpolating them into the SQL string as before
    query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s"
    res = run_sql(query, (recID, format), 1)
    if res:
        # record 'recID' is formatted in 'format', so print it
        out += "%s" % zlib.decompress(res[0][0])
    else:
        # record 'recID' does not exist in format 'format', so print some default format:
        # firstly, title:
        titles = get_field(recID, "245__a")
        # secondly, authors:
        authors = get_field(recID, "100__a") + get_field(recID, "700__a")
        # thirdly, date of creation:
        dates = get_field(recID, "260__c")
        # thirdly bis, report numbers:
        rns = get_field(recID, "037__a") + get_field(recID, "088__a")
        # fourthly, beginning of abstract:
        abstracts = get_field(recID, "520__a")
        # fifthly, fulltext link:
        urls_z = get_field(recID, "8564_z")
        urls_u = get_field(recID, "8564_u")
        out += websearch_templates.tmpl_record_body(
                 weburl = weburl,
                 titles = titles,
                 authors = authors,
                 dates = dates,
                 rns = rns,
                 abstracts = abstracts,
                 urls_u = urls_u,
                 urls_z = urls_z,
                 ln=ln)
    # at the end of HTML mode, print "Detailed record" and "Mark record" functions:
    out += websearch_templates.tmpl_record_links(
             weburl = weburl,
             recid = recID,
             ln = ln
             )
    return out
class Collection:
"Holds the information on collections (id,name,dbquery)."
def __init__(self, name=""):
"Creates collection instance by querying the DB configuration database about 'name'."
self.calculate_reclist_run_already = 0 # to speed things up wihtout much refactoring
self.update_reclist_run_already = 0 # to speed things up wihtout much refactoring
self.reclist_with_nonpublic_subcolls = HitSet()
if not name:
self.name = cdsname # by default we are working on the home page
self.id = 1
self.dbquery = None
self.nbrecs = None
self.reclist = HitSet()
else:
self.name = name
query = "SELECT id,name,dbquery,nbrecs,reclist FROM collection WHERE name='%s'" % escape_string(name)
try:
res = run_sql(query, None, 1)
if res:
self.id = res[0][0]
self.name = res[0][1]
self.dbquery = res[0][2]
self.nbrecs = res[0][3]
try:
self.reclist = HitSet(Numeric.loads(zlib.decompress(res[0][5])))
except:
self.reclist = HitSet()
else: # collection does not exist!
self.id = None
self.dbquery = None
self.nbrecs = None
self.reclist = HitSet()
except Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
sys.exit(1)
def get_name(self, ln=cdslang, name_type="ln", prolog="", epilog="", prolog_suffix=" ", epilog_suffix=""):
"""Return nicely formatted collection name for language LN.
The NAME_TYPE may be 'ln' (=long name), 'sn' (=short name), etc."""
out = prolog
i18name = ""
res = run_sql("SELECT value FROM collectionname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, name_type))
try:
i18name += res[0][0]
except IndexError:
pass
if i18name:
out += i18name
else:
out += self.name
out += epilog
return out
def get_ancestors(self):
"Returns list of ancestors of the current collection."
ancestors = []
id_son = self.id
while 1:
query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\
"WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son)
res = run_sql(query, None, 1)
if res:
col_ancestor = get_collection(res[0][1])
ancestors.append(col_ancestor)
id_son = res[0][0]
else:
break
ancestors.reverse()
return ancestors
def restricted_p(self):
"""Predicate to test if the collection is restricted or not. Return the contect of the
`restrited' column of the collection table (typically Apache group). Otherwise return
None if the collection is public."""
out = None
query = "SELECT restricted FROM collection WHERE id=%d" % self.id
res = run_sql(query, None, 1)
try:
out = res[0][0]
except:
pass
return out
def get_sons(self, type='r'):
"Returns list of direct sons of type 'type' for the current collection."
sons = []
id_dad = self.id
query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\
"WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC, c.name ASC" % (int(id_dad), type)
res = run_sql(query)
for row in res:
sons.append(get_collection(row[1]))
return sons
def get_descendants(self, type='r'):
"Returns list of all descendants of type 'type' for the current collection."
descendants = []
id_dad = self.id
query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\
"WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC" % (int(id_dad), type)
res = run_sql(query)
for row in res:
col_desc = get_collection(row[1])
descendants.append(col_desc)
descendants += col_desc.get_descendants()
return descendants
    def write_cache_file(self, filename='', filebody=''):
        """Write `filebody` to <cachedir>/collections/<id>/<filename>.html,
        creating the directory if needed; exits the process on I/O error."""
        # open file:
        dirname = "%s/collections/%d" % (cachedir, self.id)
        mymkdir(dirname)
        fullfilename = dirname + "/%s.html" % filename
        try:
            os.umask(022)
            f = open(fullfilename, "w")
        except IOError, v:
            try:
                # IOError typically unpacks into (errno, strerror)
                (code, message) = v
            except:
                # fall back for IOErrors that do not unpack into a pair
                code = 0
                message = v
            print "I/O Error: " + str(message) + " (" + str(code) + ")"
            sys.exit(1)
        # print user info:
        if options["verbose"] >= 6:
            write_message("... creating %s" % fullfilename)
            sys.stdout.flush()
        # print page body:
        f.write(filebody)
        # close file:
        f.close()
    def update_webpage_cache(self):
        """Create collection page header, navtrail, body (including left and right stripes) and footer, and
        call write_cache_file() afterwards to update the collection webpage cache."""
        ## do this for each language:
        for lang, lang_fullname in language_list_long():
            # load the right message language
            _ = gettext_set_language(lang)
            ## first, update navtrail:
            # (`as` toggles simple=0 / advanced=1 search interface; it was
            # a legal identifier in the Python version this file targets)
            for as in range(0, 2):
                self.write_cache_file("navtrail-as=%s-ln=%s" % (as, lang), self.create_navtrail_links(as, lang))
            ## second, update page body:
            for as in range(0, 2): # do both simple search and advanced search pages:
                body = websearch_templates.tmpl_webcoll_body(
                    ln=lang, collection=self.name,
                    te_portalbox = self.create_portalbox(lang, 'te'),
                    searchfor = self.create_searchfor(as, lang),
                    np_portalbox = self.create_portalbox(lang, 'np'),
                    narrowsearch = self.create_narrowsearch(as, lang, 'r'),
                    focuson = self.create_narrowsearch(as, lang, "v") + self.create_external_collections_box(),
                    instantbrowse = self.create_instant_browse(as=as, ln=lang),
                    ne_portalbox = self.create_portalbox(lang, 'ne')
                    )
                self.write_cache_file("body-as=%s-ln=%s" % (as, lang), body)
            ## third, write portalboxes:
            self.write_cache_file("portalbox-tp-ln=%s" % lang, self.create_portalbox(lang, "tp"))
            self.write_cache_file("portalbox-te-ln=%s" % lang, self.create_portalbox(lang, "te"))
            self.write_cache_file("portalbox-lt-ln=%s" % lang, self.create_portalbox(lang, "lt"))
            self.write_cache_file("portalbox-rt-ln=%s" % lang, self.create_portalbox(lang, "rt"))
            ## fourth, write 'last updated' information:
            self.write_cache_file("last-updated-ln=%s" % lang,
                                  convert_datestruct_to_dategui(time.localtime(),
                                                                ln=lang))
        return
def create_navtrail_links(self, as=0, ln=cdslang):
"""Creates navigation trail links, i.e. links to collection
ancestors (except Home collection). If as==1, then links to
Advanced Search interfaces; otherwise Simple Search.
"""
dads = []
for dad in self.get_ancestors():
if dad.name != cdsname: # exclude Home collection
dads.append((dad.name, dad.get_name(ln)))
return websearch_templates.tmpl_navtrail_links(
as=as, ln=ln, dads=dads)
def create_portalbox(self, lang=cdslang, position="rt"):
"""Creates portalboxes of language CDSLANG of the position POSITION by consulting DB configuration database.
The position may be: 'lt'='left top', 'rt'='right top', etc."""
out = ""
query = "SELECT p.title,p.body FROM portalbox AS p, collection_portalbox AS cp "\
" WHERE cp.id_collection=%d AND p.id=cp.id_portalbox AND cp.ln='%s' AND cp.position='%s' "\
" ORDER BY cp.score DESC" % (self.id, lang, position)
res = run_sql(query)
for row in res:
title, body = row[0], row[1]
if title:
out += websearch_templates.tmpl_portalbox(title = title,
body = body)
else:
# no title specified, so print body ``as is'' only:
out += body
return out
def create_narrowsearch(self, as=0, ln=cdslang, type="r"):
"""Creates list of collection descendants of type 'type' under title 'title'.
If as==1, then links to Advanced Search interfaces; otherwise Simple Search.
Suitable for 'Narrow search' and 'Focus on' boxes."""
# get list of sons and analyse it
sons = self.get_sons(type)
if not sons:
return ''
# get descendents
descendants = self.get_descendants(type)
grandsons = []
if CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS:
# load grandsons for each son
for son in sons:
grandsons.append(son.get_sons())
# return ""
return websearch_templates.tmpl_narrowsearch(
as = as,
ln = ln,
type = type,
father = self,
has_grandchildren = len(descendants)>len(sons),
sons = sons,
display_grandsons = CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS,
grandsons = grandsons
)
def create_external_collections_box(self, ln=cdslang):
external_collection_load_states()
if not dico_collection_external_searches.has_key(self.id):
return ""
engines_list = external_collection_sort_engine_by_name(dico_collection_external_searches[self.id])
return websearch_templates.tmpl_searchalso(ln, engines_list, self.id)
def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, as=0, ln=cdslang):
"Searches database and produces list of last 'rg' records."
if self.restricted_p():
return websearch_templates.tmpl_box_restricted_content(ln = ln)
else:
if self.nbrecs and self.reclist:
# firstly, get last 'rg' records:
recIDs = Numeric.nonzero(self.reclist._set)
passIDs = []
total = len(recIDs)
to_display = min(rg, total)
for idx in range(total-1, total-to_display-1, -1):
recid = recIDs[idx]
passIDs.append({'id': recid,
'body': print_record(recid, ln=ln),
'date': get_creation_date(recid, fmt="%Y-%m-%d
%H:%i")})
if self.nbrecs > rg:
url = websearch_templates.build_search_url(
cc=self.name, jrec=rg+1, ln=ln, as=as)
else:
url = ""
return websearch_templates.tmpl_instant_browse(
as=as, ln=ln, recids=passIDs, more_link=url)
return websearch_templates.tmpl_box_no_records(ln=ln)
def create_searchoptions(self):
"Produces 'Search options' portal box."
box = ""
query = """SELECT DISTINCT(cff.id_field),f.code,f.name FROM collection_field_fieldvalue AS cff, field AS f
WHERE cff.id_collection=%d AND cff.id_fieldvalue IS NOT NULL AND cff.id_field=f.id
ORDER BY cff.score DESC""" % self.id
res = run_sql(query)
if res:
for row in res:
field_id = row[0]
field_code = row[1]
field_name = row[2]
query_bis = """SELECT fv.value,fv.name FROM fieldvalue AS fv, collection_field_fieldvalue AS cff
WHERE cff.id_collection=%d AND cff.type='seo' AND cff.id_field=%d AND fv.id=cff.id_fieldvalue
ORDER BY cff.score_fieldvalue DESC, cff.score DESC, fv.name ASC""" % (self.id, field_id)
res_bis = run_sql(query_bis)
if res_bis:
values = [{'value' : '', 'text' : 'any' + field_name}] # FIXME: internationalisation of "any"
for row_bis in res_bis:
values.append({'value' : cgi.escape(row_bis[0], 1), 'text' : row_bis[1]})
box += websearch_templates.tmpl_select(
fieldname = field_code,
values = values
)
return box
def create_sortoptions(self, ln=cdslang):
"Produces 'Sort options' portal box."
# load the right message language
_ = gettext_set_language(ln)
box = ""
query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff
WHERE id_collection=%d AND cff.type='soo' AND cff.id_field=f.id
ORDER BY cff.score DESC, f.name ASC""" % self.id
values = [{'value' : '', 'text': "- %s -" % _("latest first")}]
res = run_sql(query)
if res:
for row in res:
values.append({'value' : row[0], 'text': row[1]})
else:
for tmp in ('title', 'author', 'report number', 'year'):
values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
box = websearch_templates.tmpl_select(
fieldname = 'sf',
css_class = 'address',
values = values
)
box += websearch_templates.tmpl_select(
fieldname = 'so',
css_class = 'address',
values = [
{'value' : 'a' , 'text' : _("asc.")},
{'value' : 'd' , 'text' : _("desc.")}
]
)
return box
def create_rankoptions(self, ln=cdslang):
"Produces 'Rank options' portal box."
# load the right message language
_ = gettext_set_language(ln)
values = [{'value' : '', 'text': "- %s %s -" % (string.lower(_("OR")), _("rank by"))}]
for (code, name) in get_bibrank_methods(self.id, ln):
values.append({'value' : code, 'text': name})
box = websearch_templates.tmpl_select(
fieldname = 'sf',
css_class = 'address',
values = values
)
return box
def create_displayoptions(self, ln=cdslang):
"Produces 'Display options' portal box."
# load the right message language
_ = gettext_set_language(ln)
values = []
for i in ['10', '25', '50', '100', '250', '500']:
values.append({'value' : i, 'text' : i + ' ' + _("results")})
box = websearch_templates.tmpl_select(
fieldname = 'rg',
css_class = 'address',
values = values
)
if self.get_sons():
box += websearch_templates.tmpl_select(
fieldname = 'sc',
css_class = 'address',
values = [
{'value' : '1' , 'text' : _("split by collection")},
{'value' : '0' , 'text' : _("single list")}
]
)
return box
def create_formatoptions(self, ln=cdslang):
"Produces 'Output format options' portal box."
# load the right message language
_ = gettext_set_language(ln)
box = ""
values = []
query = """SELECT f.code,f.name FROM format AS f, collection_format AS cf
WHERE cf.id_collection=%d AND cf.id_format=f.id ORDER BY cf.score DESC, f.name ASC""" % self.id
res = run_sql(query)
if res:
for row in res:
values.append({'value' : row[0], 'text': row[1]})
else:
values.append({'value' : 'hb', 'text' : "HTML %s" % _("brief")})
box = websearch_templates.tmpl_select(
fieldname = 'of',
css_class = 'address',
values = values
)
return box
def create_searchwithin_selection_box(self, fieldname='f', value='', ln='en'):
"Produces 'search within' selection box for the current collection."
# get values
query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff
WHERE cff.type='sew' AND cff.id_collection=%d AND cff.id_field=f.id
ORDER BY cff.score DESC, f.name ASC""" % self.id
res = run_sql(query)
values = [{'value' : '', 'text' : get_field_i18nname("any field", ln)}]
if res:
for row in res:
values.append({'value' : row[0], 'text' : row[1]})
else:
if CFG_CERN_SITE:
for tmp in ['title', 'author', 'abstract', 'report number', 'year']:
values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
else:
for tmp in ['title', 'author', 'abstract', 'keyword', 'report number', 'year', 'fulltext', 'reference']:
values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
return websearch_templates.tmpl_searchwithin_select(
fieldname = fieldname,
ln = ln,
selected = value,
values = values
)
def create_searchexample(self):
"Produces search example(s) for the current collection."
out = "$collSearchExamples = getSearchExample(%d, $se);" % self.id
return out
def create_searchfor(self, as=0, ln=cdslang):
"Produces either Simple or Advanced 'Search for' box for the current collection."
if as == 1:
return self.create_searchfor_advanced(ln)
else:
return self.create_searchfor_simple(ln)
def create_searchfor_simple(self, ln=cdslang):
"Produces simple 'Search for' box for the current collection."
return websearch_templates.tmpl_searchfor_simple(
ln=ln,
collection_id = self.name,
collection_name=self.get_name(ln=ln),
record_count=self.nbrecs,
middle_option = self.create_searchwithin_selection_box(ln=ln),
)
def create_searchfor_advanced(self, ln=cdslang):
"Produces advanced 'Search for' box for the current collection."
return websearch_templates.tmpl_searchfor_advanced(
ln = ln,
collection_id = self.name,
collection_name=self.get_name(ln=ln),
record_count=self.nbrecs,
middle_option_1 = self.create_searchwithin_selection_box('f1', ln=ln),
middle_option_2 = self.create_searchwithin_selection_box('f2', ln=ln),
middle_option_3 = self.create_searchwithin_selection_box('f3', ln=ln),
searchoptions = self.create_searchoptions(),
sortoptions = self.create_sortoptions(ln),
rankoptions = self.create_rankoptions(ln),
displayoptions = self.create_displayoptions(ln),
formatoptions = self.create_formatoptions(ln)
)
    def calculate_reclist(self):
        """Calculate, set and return the (reclist, reclist_with_nonpublic_subcolls) tuple for given collection.

        The first set covers public descendants only (safe to store in the
        DB); the second also includes nonpublic descendants and is used for
        the displayed total.  Results are memoized via
        self.calculate_reclist_run_already.
        """
        if self.calculate_reclist_run_already:
            # do we have to recalculate?
            return (self.reclist, self.reclist_with_nonpublic_subcolls)
        if options["verbose"] >= 6:
            write_message("... calculating reclist of %s" % self.name)
        reclist = HitSet() # will hold results for public sons only; good for storing into DB
        reclist_with_nonpublic_subcolls = HitSet() # will hold results for both public and nonpublic sons; good for deducing total
                                                   # number of documents
        if not self.dbquery:
            # A - collection does not have dbquery, so query recursively all its sons
            # that are either non-restricted or that have the same restriction rules
            for coll in self.get_sons():
                coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist()
                if ((coll.restricted_p() is None) or
                    (coll.restricted_p() == self.restricted_p())):
                    # add this reclist ``for real'' only if it is public
                    reclist.union(coll_reclist)
                # the nonpublic total always accumulates every son
                reclist_with_nonpublic_subcolls.union(coll_reclist_with_nonpublic_subcolls)
        else:
            # B - collection does have dbquery, so compute it:
            reclist = search_pattern(None, self.dbquery)
            reclist_with_nonpublic_subcolls = copy.deepcopy(reclist)
        # deduce the number of records:
        reclist.calculate_nbhits()
        reclist_with_nonpublic_subcolls.calculate_nbhits()
        # store the results:
        self.nbrecs = reclist_with_nonpublic_subcolls._nbhits
        self.reclist = reclist
        self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls
        # last but not least, update the speed-up flag:
        self.calculate_reclist_run_already = 1
        # return the two sets:
        return (self.reclist, self.reclist_with_nonpublic_subcolls)
    def update_reclist(self):
        "Update the record universe for given collection; nbrecs, reclist of the collection table."
        if self.update_reclist_run_already:
            # do we have to reupdate?
            return 0
        if options["verbose"] >= 6:
            write_message("... updating reclist of %s (%s recs)" % (self.name, self.nbrecs))
            sys.stdout.flush()
        try:
            # reclist is stored Numeric-marshalled then zlib-compressed,
            # SQL-escaped because the query is built by string formatting
            query = "UPDATE collection SET nbrecs=%d, reclist='%s' WHERE id=%d" % \
                    (self.nbrecs, escape_string(zlib.compress(Numeric.dumps(self.reclist._set))), self.id)
            run_sql(query)
            self.reclist_updated_since_start = 1
        except Error, e:
            print "Database Query Error %d: %s." % (e.args[0], e.args[1])
            sys.exit(1)
        # last but not least, update the speed-up flag:
        self.update_reclist_run_already = 1
        return 0
def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
    """Return a date string formatted according to FORMAT_STRING.

    VAR may be either an absolute date string in FORMAT_STRING format,
    or a shift relative to now such as "+15s", "30m", "3h", "-1d"
    (seconds/minutes/hours/days).

    @param var: date string or shift expression
    @param format_string: strftime/strptime format used for input and output
    @return: formatted date string
    """
    # raw string avoids deprecated escape sequences; "?" replaces "{0,1}"
    shift_re = sre.compile(r"([-+]?)([\d]+)([dhms])")
    factors = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    m = shift_re.match(var)
    if m:
        # relative shift: sign * value * unit-factor seconds from now
        sign = m.groups()[0] == "-" and -1 or 1
        factor = factors[m.groups()[2]]
        value = float(m.groups()[1])
        date = time.localtime(time.time() + sign * factor * value)
    else:
        # absolute date: normalize by round-tripping through strptime
        date = time.strptime(var, format_string)
    return time.strftime(format_string, date)
def get_current_time_timestamp():
    """Return a 'YYYY-MM-DD HH:MM:SS' timestamp for the current local time."""
    now = time.localtime()
    return time.strftime("%Y-%m-%d %H:%M:%S", now)
def compare_timestamps_with_tolerance(timestamp1,
                                      timestamp2,
                                      tolerance=0):
    """Compare two timestamps of the form '2005-03-31 17:37:26' within
    TOLERANCE seconds.

    Return -1 if TIMESTAMP1 < TIMESTAMP2 - TOLERANCE, 1 if
    TIMESTAMP1 > TIMESTAMP2 + TOLERANCE, and 0 otherwise.
    """
    def _to_epoch(stamp):
        # drop any trailing fractional seconds, then convert to epoch seconds
        stamp = sre.sub(r'\.[0-9]+$', '', stamp)
        return calendar.timegm(time.strptime(stamp, "%Y-%m-%d %H:%M:%S"))
    t1 = _to_epoch(timestamp1)
    t2 = _to_epoch(timestamp2)
    if t1 < t2 - tolerance:
        return -1
    if t1 > t2 + tolerance:
        return 1
    return 0
def get_database_last_updated_timestamp():
    """Return last updated timestamp for collection-related and
    record-related database tables.
    """
    # table names may contain SQL wildcards ('%') understood by the helper
    tables = ('bibrec', 'bibfmt', 'idxWORD%', 'collection%',
              'portalbox', 'field%', 'format%', 'rnkMETHODNAME')
    return max([get_table_update_time(table) for table in tables])
def get_cache_last_updated_timestamp():
    """Return last updated cache timestamp.

    Falls back to the epoch ('1970-01-01 00:00:00') when the timestamp
    file is missing or unreadable.
    """
    try:
        f = open(cfg_cache_last_updated_timestamp_file, "r")
    except IOError:
        # no cache file yet (or unreadable): report "never updated";
        # narrowed from a bare except that swallowed everything
        return "1970-01-01 00:00:00"
    try:
        timestamp = f.read()
    finally:
        # make sure the handle is released even if the read fails
        f.close()
    return timestamp
def set_cache_last_updated_timestamp(timestamp):
    """Set last updated cache timestamp to TIMESTAMP.

    Best-effort: write failures are ignored.  (Previously an unopenable
    file led to `except: pass` followed by a write on an unbound file
    object, crashing with NameError instead of being ignored.)

    @return: timestamp, unchanged
    """
    try:
        f = open(cfg_cache_last_updated_timestamp_file, "w")
    except IOError:
        return timestamp
    try:
        f.write(timestamp)
    finally:
        f.close()
    return timestamp
def task_sig_sleep(sig, frame):
    """Signal handler for the 'sleep' signal sent by BibSched."""
    verbose = options["verbose"]
    if verbose >= 9:
        write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame))
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # block here until BibSched delivers a wake-up signal
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Signal handler for the 'wakeup' signal sent by BibSched."""
    verbose = options["verbose"]
    if verbose >= 9:
        write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame))
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
    """Signal handler for the 'stop' signal sent by BibSched.

    Marks the task STOPPING, then STOPPED, and exits the process.
    """
    if options["verbose"] >= 9:
        write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame))
    write_message("stopping...")
    task_update_status("STOPPING")
    # FIXME: is there any cleanup to be done between STOPPING and STOPPED?
    # (a redundant `pass` statement used to sit here)
    task_update_status("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Signal handler for the 'suicide' signal sent by BibSched."""
    verbose = options["verbose"]
    if verbose >= 9:
        write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame))
    write_message("suiciding myself now...")
    task_update_status("SUICIDING")
    # record completion before terminating the process
    write_message("suicided")
    task_update_status("SUICIDED")
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Signal handler for the other unknown signals sent by shell or user."""
    # deliberately a no-op apart from logging
    write_message("unknown signal %d (frame %s) ignored" % (sig, frame))
def authenticate(user, header="WebColl Task Submission", action="runwebcoll"):
"""Authenticate the user against the user database.
Check for its password, if it exists.
Check for action access rights.
Return user name upon authorization success,
do system exit upon authorization failure.
"""
print header
print "=" * len(header)
if user == "":
print >> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print >> sys.stdout, "\rUsername:", user
## first check user pw:
- res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
+ res = run_sql("select id,password from user where email=%s", (user,), 1) + \
+ run_sql("select id,password from user where nickname=%s", (user,), 1)
if not res:
print "Sorry, %s does not exist." % user
sys.exit(1)
else:
(uid_db, password_db) = res[0]
if password_db:
password_entered = getpass.getpass()
if password_db == password_entered:
pass
else:
print "Sorry, wrong credentials for %s." % user
sys.exit(1)
## secondly check authorization for the action:
(auth_code, auth_message) = acc_authorize_action(uid_db, action)
if auth_code != 0:
print auth_message
sys.exit(1)
return user
def task_submit():
"""Submits task to the BibSched task queue. This is what people will be invoking via command line."""
global options
## sanity check: remove eventual "task" option:
if options.has_key("task"):
del options["task"]
## authenticate user:
user = authenticate(options.get("user", ""))
## submit task:
if options["verbose"] >= 9:
print ""
write_message("storing task options %s\n" % options)
task_id = run_sql("""INSERT INTO schTASK (id,proc,user,runtime,sleeptime,status,arguments)
VALUES (NULL,'webcoll',%s,%s,%s,'WAITING',%s)""",
(user, options["runtime"], options["sleeptime"], marshal.dumps(options)))
## update task number:
options["task"] = task_id
run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id))
write_message("Task #%d submitted." % task_id)
return task_id
def task_update_progress(msg):
    """Updates progress information in the BibSched task table."""
    global options
    task_id = options["task"]
    return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, task_id))
def task_update_status(val):
    """Updates status information in the BibSched task table."""
    global options
    task_id = options["task"]
    return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, task_id))
def task_read_status(task_id):
    """Read status information in the BibSched task table.

    @param task_id: id of the task in schTASK
    @return: status string, or 'UNKNOWN' when the task does not exist
    """
    res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1)
    try:
        out = res[0][0]
    except (IndexError, TypeError):
        # empty result set (no such task) or unexpected result shape;
        # narrowed from a bare except that also swallowed KeyboardInterrupt
        out = 'UNKNOWN'
    return out
def task_get_options(id):
    """Returns options for the task 'id' read from the BibSched task queue table.

    Exits the process when the task does not exist or its stored
    arguments cannot be unmarshalled.
    """
    out = {}
    res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='webcoll'", (id,))
    try:
        out = marshal.loads(res[0][0])
    except (IndexError, TypeError, ValueError, EOFError):
        # empty result set or corrupted marshal payload; narrowed from a
        # bare except that also swallowed KeyboardInterrupt/SystemExit
        write_message("Error: WebColl task %d does not seem to exist." % id)
        sys.exit(1)
    return out
def task_run(task_id):
    """Run the WebColl task by fetching arguments from the BibSched task queue.
    This is what BibSched will be invoking via daemon call.
    The task will update collection reclist cache and collection web pages for
    given collection. (default is all).
    Arguments described in usage() function.
    Return 1 in case of success and 0 in case of failure."""
    global options
    task_run_start_timestamp = get_current_time_timestamp()
    options = task_get_options(task_id) # get options from BibSched task table
    ## check task id:
    if not options.has_key("task"):
        write_message("Error: The task #%d does not seem to be a WebColl task." % task_id)
        return 0
    ## check task status:
    task_status = task_read_status(task_id)
    if task_status != "WAITING":
        write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status))
        return 0
    ## we can run the task now:
    if options["verbose"]:
        write_message("Task #%d started." % task_id)
    task_update_status("RUNNING")
    ## initialize signal handler:
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    colls = []
    # decide whether we need to run or not, by comparing last updated timestamps:
    if options["verbose"] >= 3:
        write_message("Database timestamp is %s." % get_database_last_updated_timestamp())
        write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp())
    if options.has_key("force") or \
       compare_timestamps_with_tolerance(get_database_last_updated_timestamp(),
                                         get_cache_last_updated_timestamp(),
                                         cfg_cache_last_updated_timestamp_tolerance) >= 0:
        ## either forced update was requested or cache is not up to date, so recreate it:
        # firstly, decide which collections to do:
        if options.has_key("collection"):
            coll = get_collection(options["collection"])
            if coll.id == None:
                usage(1, 'Collection %s does not exist' % coll.name)
            colls.append(coll)
        else:
            # no explicit collection requested: process all of them
            res = run_sql("SELECT name FROM collection ORDER BY id")
            for row in res:
                colls.append(get_collection(row[0]))
        # secondly, update collection reclist cache:
        i = 0
        for coll in colls:
            i += 1
            if options["verbose"]:
                write_message("%s / reclist cache update" % coll.name)
            coll.calculate_reclist()
            coll.update_reclist()
            task_update_progress("Part 1/2: done %d/%d" % (i, len(colls)))
        # thirdly, update collection webpage cache:
        i = 0
        for coll in colls:
            i += 1
            if options["verbose"]:
                write_message("%s / web cache update" % coll.name)
            coll.update_webpage_cache()
            task_update_progress("Part 2/2: done %d/%d" % (i, len(colls)))
        # finally update the cache last updated timestamp:
        # (but only when all collections were updated, not when only
        # some of them were forced-updated as per admin's demand)
        if not options.has_key("collection"):
            set_cache_last_updated_timestamp(task_run_start_timestamp)
            if options["verbose"] >= 3:
                write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp())
    else:
        ## cache up to date, we don't have to run
        if options["verbose"]:
            write_message("Collection cache is up to date, no need to run.")
        pass
    ## we are done:
    task_update_progress("Done.")
    task_update_status("DONE")
    if options["verbose"]:
        write_message("Task #%d finished." % task_id)
    return 1
def usage(exitcode=1, msg=""):
    """Print usage info to stderr and exit with EXITCODE.

    @param exitcode: process exit status passed to sys.exit()
    @param msg: optional error message printed before the usage text
    """
    if msg:
        sys.stderr.write("Error: %s.\n" % msg)
    sys.stderr.write("Usage: %s [options]\n" % sys.argv[0])
    sys.stderr.write("Command options:\n")
    sys.stderr.write(" -c, --collection\t Update cache for the given collection only. [all]\n")
    sys.stderr.write(" -f, --force\t Force update even if cache is up to date. [no]\n")
    sys.stderr.write("Scheduling options:\n")
    sys.stderr.write(" -u, --user=USER \t User name to submit the task as, password needed.\n")
    sys.stderr.write(" -t, --runtime=TIME \t Time to execute the task (now), e.g.: +15s, 5m, 3h, 2002-10-27 13:57:26\n")
    sys.stderr.write(" -s, --sleeptime=SLEEP \t Sleeping frequency after which to repeat task (no), e.g.: 30m, 2h, 1d\n")
    sys.stderr.write("General options:\n")
    sys.stderr.write(" -h, --help \t\t Print this help.\n")
    sys.stderr.write(" -V, --version \t\t Print version information.\n")
    sys.stderr.write(" -v, --verbose=LEVEL \t Verbose level (from 0 to 9, default 1).\n")
    # typo fixed below: "desdendants" -> "descendants"
    sys.stderr.write("""Description: %s updates the collection cache
(record universe for a given collection plus web page elements)
based on WML and DB configuration parameters.
If the collection name is passed as the second argument, it'll update
this collection only. If the collection name is immediately followed
by a plus sign, it will also update all its descendants. The
top-level collection name may be entered as the void string.\n""" % sys.argv[0])
    sys.exit(exitcode)
def main():
    """Main function that analyzes command line input and calls whatever is appropriate.
    Useful for learning on how to write BibSched tasks."""
    global options
    ## parse command line:
    if len(sys.argv) == 2 and sys.argv[1].isdigit():
        ## A - run the task (invoked by BibSched with a numeric task id)
        task_id = int(sys.argv[1])
        try:
            if not task_run(task_id):
                write_message("Error occurred. Exiting.", sys.stderr)
        except StandardError, e:
            write_message("Unexpected error occurred: %s." % e, sys.stderr)
            write_message("Traceback is:", sys.stderr)
            traceback.print_tb(sys.exc_info()[2])
            write_message("Exiting.", sys.stderr)
            task_update_status("ERROR")
    else:
        ## B - submit the task
        # set default values:
        options["runtime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        options["verbose"] = 1
        options["sleeptime"] = ""
        # set user-defined options:
        try:
            opts, args = getopt.getopt(sys.argv[1:], "hVv:u:s:t:c:f",
                                       ["help", "version", "verbose=","user=","sleep=","time=","collection=","force"])
        except getopt.GetoptError, err:
            usage(1, err)
        try:
            for opt in opts:
                if opt[0] in ["-h", "--help"]:
                    usage(0)
                elif opt[0] in ["-V", "--version"]:
                    print __revision__
                    sys.exit(0)
                elif opt[0] in [ "-u", "--user"]:
                    options["user"] = opt[1]
                elif opt[0] in ["-v", "--verbose"]:
                    options["verbose"] = int(opt[1])
                elif opt[0] in [ "-s", "--sleeptime" ]:
                    get_datetime(opt[1]) # see if it is a valid shift
                    options["sleeptime"] = opt[1]
                elif opt[0] in [ "-t", "--runtime" ]:
                    options["runtime"] = get_datetime(opt[1])
                elif opt[0] in [ "-c", "--collection"]:
                    options["collection"] = opt[1]
                elif opt[0] in [ "-f", "--force"]:
                    options["force"] = 1
                else:
                    usage(1)
        except StandardError, e:
            # NOTE(review): passes the exception object as the exit code;
            # usage(1, e) was probably intended -- confirm before changing.
            usage(e)
        task_submit()
    return
### okay, here we go:
# standard script entry point: run main() only when executed directly
if __name__ == '__main__':
    main()
diff --git a/modules/websession/lib/webuser.py b/modules/websession/lib/webuser.py
index f5bf04183..ee36a0dc5 100644
--- a/modules/websession/lib/webuser.py
+++ b/modules/websession/lib/webuser.py
@@ -1,738 +1,739 @@
# -*- coding: utf-8 -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
This file implements all methods necessary for working with users and
sessions in CDS Invenio. Contains methods for logging/registration
when a user log/register into the system, checking if it is a guest
user or not.
At the same time this presents all the stuff it could need with
sessions managements, working with websession.
It also contains Apache-related user authentication stuff.
"""
__revision__ = "$Id$"
from marshal import loads, dumps
from zlib import compress, decompress
import time
import os
import crypt
import string
import smtplib
import sre
from invenio.config import \
CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS, \
CFG_ACCESS_CONTROL_LEVEL_GUESTS, \
CFG_ACCESS_CONTROL_LEVEL_SITE, \
CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN, \
CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS, \
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT, \
CFG_APACHE_GROUP_FILE, \
CFG_APACHE_PASSWORD_FILE, \
adminemail, \
cdslang, \
cdsname, \
supportemail, \
sweburl, \
tmpdir, \
version, \
weburl
from invenio import session, websession
from invenio.dbquery import run_sql, escape_string, OperationalError
from invenio.websession import pSession, pSessionMapping
from invenio.session import SessionError
from invenio.access_control_config import *
from invenio.access_control_engine import acc_authorize_action
from invenio.access_control_admin import acc_findUserRoleActions
from invenio.messages import gettext_set_language
import invenio.template
tmpl = invenio.template.load('websession')
# nicknames containing a comma, apostrophe or '@' are rejected:
sre_invalid_nickname = sre.compile(""".*[,'@]+.*""")
# pylint: disable-msg=C0301
def createGuestUser():
    """Insert a fresh guest user row and return its id.

    The 'note' column reflects the guest-access policy in force:
    '1' when guests are unrestricted, '0' otherwise.
    Returns None when the INSERT fails or guests are disallowed.
    """
    level = CFG_ACCESS_CONTROL_LEVEL_GUESTS
    if level == 0:
        query = "insert into user (email, note) values ('', '1')"
    elif level >= 1:
        query = "insert into user (email, note) values ('', '0')"
    else:
        return None
    try:
        return run_sql(query)
    except OperationalError:
        return None
def page_not_authorized(req, referer='', uid='', text='', navtrail='', ln=cdslang):
"""Show error message when account is not activated"""
from invenio.webpage import page
_ = gettext_set_language(ln)
if not CFG_ACCESS_CONTROL_LEVEL_SITE:
title = CFG_WEBACCESS_MSGS[5]
if not uid:
uid = getUid(req)
try:
res = run_sql("SELECT email FROM user WHERE id=%s" % uid)
if res and res[0][0]:
if text:
body = text
else:
body = "%s %s" % (CFG_WEBACCESS_WARNING_MSGS[9] % res[0][0],
("%s %s" % (CFG_WEBACCESS_MSGS[0] % referer, CFG_WEBACCESS_MSGS[1])))
else:
if text:
body = text
else:
if CFG_ACCESS_CONTROL_LEVEL_GUESTS == 1:
body = CFG_WEBACCESS_MSGS[3]
else:
body = CFG_WEBACCESS_WARNING_MSGS[4] + CFG_WEBACCESS_MSGS[2]
except OperationalError, e:
body = _("Database problem") + ': ' + str(e)
elif CFG_ACCESS_CONTROL_LEVEL_SITE == 1:
title = CFG_WEBACCESS_MSGS[8]
body = "%s %s" % (CFG_WEBACCESS_MSGS[7], CFG_WEBACCESS_MSGS[2])
elif CFG_ACCESS_CONTROL_LEVEL_SITE == 2:
title = CFG_WEBACCESS_MSGS[6]
body = "%s %s" % (CFG_WEBACCESS_MSGS[4], CFG_WEBACCESS_MSGS[2])
return page(title=title,
uid=getUid(req),
body=body,
navtrail=navtrail,
req=req)
def getUid (req):
    """Return user ID taking it from the cookie of the request.
    Includes control mechanism for the guest users, inserting in
    the database table when need be, raising the cookie back to the
    client.
    User ID is set to 0 when client refuses cookie or we are in the
    read-only site operation mode.
    User ID is set to -1 when we are in the permission denied site
    operation mode.
    getUid(req) -> userId
    """
    if CFG_ACCESS_CONTROL_LEVEL_SITE == 1: return 0
    if CFG_ACCESS_CONTROL_LEVEL_SITE == 2: return -1
    guest = 0
    sm = session.MPSessionManager(pSession, pSessionMapping())
    try:
        s = sm.get_session(req)
    except SessionError:
        # invalid/stale cookie: revoke and open a fresh session
        sm.revoke_session_cookie (req)
        s = sm.get_session(req)
    userId = s.getUid()
    if userId == -1: # first time, so create a guest user
        s.setUid(createGuestUser())
        userId = s.getUid()
        guest = 1
    sm.maintain_session(req, s)
    if guest == 0:
        guest = isGuestUser(userId)
    if guest:
        if CFG_ACCESS_CONTROL_LEVEL_GUESTS == 0:
            return userId
        elif CFG_ACCESS_CONTROL_LEVEL_GUESTS >= 1:
            return -1
    else:
        # parameterized query (was built via "%s" % userId string formatting)
        res = run_sql("SELECT note FROM user WHERE id=%s", (userId,))
        if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 0:
            return userId
        elif CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS >= 1 and res and res[0][0] in [1, "1"]:
            # account exists and is activated (note == 1)
            return userId
        else:
            return -1
def setUid(req, uid):
    """Store UID in the request's session and refresh the session cookie.

    @return: uid, unchanged
    """
    manager = session.MPSessionManager(pSession, pSessionMapping())
    try:
        sess = manager.get_session(req)
    except SessionError:
        # invalid/stale cookie: revoke it and start a fresh session
        manager.revoke_session_cookie(req)
        sess = manager.get_session(req)
    sess.setUid(uid)
    manager.maintain_session(req, sess)
    return uid
def get_user_info(uid, ln=cdslang):
    """Get infos for a given user.
    @param uid: user id (int)
    @return tuple: (uid, nickname, display_name)
    """
    _ = gettext_set_language(ln)
    # parameterized query (was built via "%i" % uid string formatting)
    res = run_sql("""SELECT id, nickname
                     FROM user
                     WHERE id=%s""", (uid,))
    if res and res[0]:
        user = list(res[0])
        if user[1]:
            # nickname set: it doubles as the display name
            user.append(user[1])
        else:
            # no nickname: fall back to the numeric id
            user[1] = str(user[0])
            user.append(_("user") + ' #' + str(user[0]))
        return tuple(user)
    return (uid, '', _("N/A"))
def isGuestUser(uid):
    """Return 1 when UID belongs to a guest user (empty email), else 0.

    Database errors are treated as "guest" (the conservative answer).
    """
    try:
        rows = run_sql("select email from user where id=%s", (uid,))
    except OperationalError:
        return 1
    if rows and rows[0][0]:
        return 0
    return 1
def isUserSubmitter(uid):
    """Return 1 if the user has at least one submission record, else 0."""
    u_email = get_email(uid)
    rows = run_sql("select * from sbmSUBMISSIONS where email=%s", (u_email,))
    if rows:
        return 1
    return 0
def isUserReferee(uid):
    """Return 1 if UID may referee any doctype/category, else 0."""
    for row in run_sql("select sdocname from sbmDOCTYPE"):
        doctype = row[0]
        # check the catch-all category first
        (auth_code, auth_message) = acc_authorize_action(uid, "referee", doctype=doctype, categ="*")
        if auth_code == 0:
            return 1
        # then each concrete category of this doctype
        for row2 in run_sql("select sname from sbmCATEGORIES where doctype=%s", (doctype,)):
            (auth_code, auth_message) = acc_authorize_action(uid, "referee", doctype=doctype, categ=row2[0])
            if auth_code == 0:
                return 1
    return 0
def isUserAdmin(uid):
"Return 1 if the user UID has some admin rights; 0 otherwise."
out = 0
if acc_findUserRoleActions(uid):
out = 1
return out
def nickname_valid_p(nickname):
    """Check whether wanted NICKNAME supplied by the user is valid.
    At the moment we just check whether it is not empty, does not
    start or end with a blank, does not contain ',', "'" or '@'
    (see sre_invalid_nickname above), and is not equal to `guest'.
    Return 1 if nickname is okay, return 0 if it is not.
    """
    # guard-clause style: reject on the first failed criterion
    if not nickname:
        return 0
    if nickname.startswith(' ') or nickname.endswith(' '):
        return 0
    if nickname.lower() == 'guest':
        return 0
    if sre_invalid_nickname.match(nickname):
        return 0
    return 1
def email_valid_p(email):
    """Check whether wanted EMAIL address supplied by the user is valid.
    At the moment we just check whether it contains '@' and whether
    it doesn't contain blanks. We also check the email domain if
    CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN is set.
    Return 1 if email is okay, return 0 if it is not.
    """
    # '@' must exist and not be the first character; no spaces allowed
    if email.find("@") <= 0 or email.find(" ") > 0:
        return 0
    if CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN and \
       not email.endswith(CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN):
        return 0
    return 1
def registerUser(req, email, passw, nickname, register_without_nickname=False):
    """Register user with the desired values of NICKNAME, EMAIL and
    PASSW.
    If REGISTER_WITHOUT_NICKNAME is set to True, then ignore
    desired NICKNAME and do not set any. This is suitable for
    external authentications so that people can login without
    having to register an internal account first.
    Return 0 if the registration is successful, 1 if email is not
    valid, 2 if nickname is not valid, 3 if email is already in the
    database, 4 if nickname is already in the database, 5 when
    users cannot register themselves because of the site policy.
    """
    # is email valid?
    if not email_valid_p(email):
        return 1
    # is email already taken?
    res = run_sql("SELECT * FROM user WHERE email=%s", (email,))
    if len(res) > 0:
        return 3
    if register_without_nickname:
        # ignore desired nick and use default empty string one:
        nickname = ""
    else:
        # is nickname valid?
        if not nickname_valid_p(nickname):
            return 2
        # is nickname already taken?
        res = run_sql("SELECT * FROM user WHERE nickname=%s", (nickname,))
        if len(res) > 0:
            return 4
    # okay, go on and register the user: the activation flag depends on
    # the site's account policy
    if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 0:
        activated = 1
    elif CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 1:
        activated = 0
    elif CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS >= 2:
        return 5
    user_preference = get_default_user_preferences()
    # NOTE(review): the password appears to be stored as received here --
    # confirm that hashing/encryption happens elsewhere before this point.
    setUid(req, run_sql("INSERT INTO user (nickname, email, password, note, settings) VALUES (%s,%s,%s,%s,%s)",
                        (nickname, email, passw, activated, serialize_via_marshal(user_preference),)))
    if CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT:
        sendNewUserAccountWarning(email, email, passw)
    if CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS:
        sendNewAdminAccountWarning(email, adminemail)
    return 0
def updateDataUser(uid, email, password, nickname):
    """Update user data. Used when a user changed his email or password or nickname.

    @return: 0 for the guest account (never modified), 1 otherwise
    """
    if email == 'guest':
        return 0
    if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS >= 2:
        # site policy forbids changing email: update password only
        run_sql("update user set password=%s where id=%s", (password, uid))
    else:
        run_sql("update user set email=%s,password=%s where id=%s", (email, password, uid))
    if nickname:
        run_sql("update user set nickname=%s where id=%s", (nickname, uid))
    return 1
def loginUser(req, p_un, p_pw, login_method):
    """It is a first simple version for the authentication of user. It returns the id of the user,
    for checking afterwards if the login is correct.

    Returns a tuple (query_result, p_email, p_pw, message_code) where
    message_code 0 means success and 10-14 are the various failure modes
    (wrong external credentials, unknown login method, registration
    failure, unknown user, ...).
    """
    # p_un passed may be an email or a nickname:
    p_email = get_email_from_username(p_un)
    # go on with the old stuff based on p_email:
    user_prefs = get_user_preferences(emailUnique(p_email))
    if user_prefs and login_method != user_prefs["login_method"]:
        if CFG_EXTERNAL_AUTHENTICATION.has_key(user_prefs["login_method"]):
            # user must authenticate via his preferred external method
            return ([], p_email, p_pw, 11)
    if not CFG_EXTERNAL_AUTHENTICATION.has_key(login_method):
        return ([], p_email, p_pw, 12)
    if CFG_EXTERNAL_AUTHENTICATION[login_method][0]:
        # delegate the credential check to the external authentication system
        p_email = CFG_EXTERNAL_AUTHENTICATION[login_method][0].auth_user(p_email, p_pw)
        if p_email:
            p_pw = givePassword(p_email)
            if not p_pw or p_pw < 0:
                # no local account yet: auto-register with a random password
                import random
                p_pw = int(random.random() * 1000000)
                if registerUser(req, p_email, p_pw, "", register_without_nickname=True) != 0:
                    return ([], p_email, p_pw, 13)
                else:
                    query_result = run_sql("SELECT id from user where email=%s and password=%s", (p_email, p_pw,))
                    user_prefs = get_user_preferences(query_result[0][0])
                    user_prefs["login_method"] = login_method
                    set_user_preferences(query_result[0][0], user_prefs)
        else:
            # external system rejected the credentials
            return ([], p_email, p_pw, 10)
    query_result = run_sql("SELECT id from user where email=%s and password=%s", (p_email, p_pw,))
    if query_result:
        prefered_login_method = get_user_preferences(query_result[0][0])['login_method']
    else:
        return ([], p_email, p_pw, 14)
    if login_method != prefered_login_method:
        if CFG_EXTERNAL_AUTHENTICATION.has_key(prefered_login_method):
            return ([], p_email, p_pw, 11)
    return (query_result, p_email, p_pw, 0)
def logoutUser(req):
    """Log the current user out by attaching a fresh guest identity to
    the session.  Return the new guest uid."""
    getUid(req)
    session_manager = session.MPSessionManager(pSession, pSessionMapping())
    try:
        sess = session_manager.get_session(req)
    except SessionError:
        # stale/invalid cookie: revoke it and open a brand new session
        session_manager.revoke_session_cookie(req)
        sess = session_manager.get_session(req)
    guest_uid = createGuestUser()
    sess.setUid(guest_uid)
    session_manager.maintain_session(req, sess)
    return guest_uid
def username_exists_p(username):
    """Check if USERNAME exists in the system. Username may be either
    nickname or email.
    Return 1 if it does exist, 0 if it does not.
    """
    # this block contained an unresolved diff hunk; resolved to the
    # post-image: two separate indexed lookups instead of one OR-ed query
    if username == "":
        # return not exists if asked for guest users
        return 0
    res = run_sql("SELECT email FROM user WHERE email=%s", (username,)) + \
          run_sql("SELECT email FROM user WHERE nickname=%s", (username,))
    if len(res) > 0:
        return 1
    return 0
def emailUnique(p_email):
    """Check if the email address only exists once. If yes, return userid, if not, -1
    """
    rows = run_sql("select id, email from user where email=%s", (p_email,))
    nb_matches = len(rows)
    if nb_matches == 0:
        return 0
    if nb_matches == 1:
        return rows[0][0]
    # several accounts share this address
    return -1
def nicknameUnique(p_nickname):
    """Check if the nickname only exists once. If yes, return userid, if not, -1
    """
    rows = run_sql("select id, nickname from user where nickname=%s", (p_nickname,))
    nb_matches = len(rows)
    if nb_matches == 0:
        return 0
    if nb_matches == 1:
        return rows[0][0]
    # several accounts share this nickname
    return -1
def update_Uid(req, p_email, p_pw):
    """Bind the session to the account matching P_EMAIL/P_PW.  Used when
    a guest user logs in successfully.  Return the uid."""
    rows = run_sql("select id from user where email=%s and password=%s",
                   (p_email, p_pw))
    uid = int(rows[0][0])
    setUid(req, uid)
    return uid
def givePassword(email):
    """Return the stored password for EMAIL (used e.g. to mail it back
    to the user), or -999 when no such user exists."""
    rows = run_sql("select password from user where email =%s", (email,))
    if rows:
        return rows[0][0]
    return -999
def sendNewAdminAccountWarning(newAccountEmail, sendTo, ln=cdslang):
    """Send an email to the address given by sendTo about the new account newAccountEmail.

    Return 1 on success, 0 when the SMTP server refused the recipient.
    """
    _ = gettext_set_language(ln)
    # NOTE(review): the envelope addresses keep their "From: "/"To: "
    # header prefixes as in the original code -- verify the local MTA
    # tolerates them.
    fromaddr = "From: %s" % supportemail
    toaddrs = "To: %s" % sendTo
    to = toaddrs + "\n"
    sub = "Subject: New account on '%s'" % cdsname
    if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 1:
        sub += " - PLEASE ACTIVATE"
    sub += "\n\n"
    body = "A new account has been created on '%s'" % cdsname
    if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 1:
        body += " and is awaiting activation"
    body += ":\n\n"
    body += " Username/Email: %s\n\n" % newAccountEmail
    body += "You can approve or reject this account request at: %s/admin/webaccess/webaccessadmin.py/manageaccounts\n" % weburl
    body += "\n---------------------------------"
    body += "\n%s" % cdsname
    body += "\nContact: %s" % supportemail
    msg = to + sub + body
    server = smtplib.SMTP('localhost')
    server.set_debuglevel(1)
    # always close the SMTP connection, even when the recipient is
    # refused (the original leaked the connection on that path)
    try:
        try:
            server.sendmail(fromaddr, toaddrs, msg)
        except smtplib.SMTPRecipientsRefused:
            return 0
    finally:
        server.quit()
    return 1
def sendNewUserAccountWarning(newAccountEmail, sendTo, password, ln=cdslang):
    """Send an email to the address given by sendTo about the new account newAccountEmail.

    The password is masked with asterisks in the mail body.  Return 1 on
    success, 0 when the SMTP server refused the recipient.
    """
    _ = gettext_set_language(ln)
    # NOTE(review): the envelope addresses keep their "From: "/"To: "
    # header prefixes as in the original code -- verify the local MTA
    # tolerates them.
    fromaddr = "From: %s" % supportemail
    toaddrs = "To: %s" % sendTo
    to = toaddrs + "\n"
    sub = "Subject: Your account created on '%s'\n\n" % cdsname
    body = "You have created a new account on '%s':\n\n" % cdsname
    body += " Username/Email: %s\n" % newAccountEmail
    body += " Password: %s\n\n" % ("*" * len(password))
    if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS >= 1:
        body += "This account is awaiting approval by the site administrators and therefore cannot be used as of yet.\nYou will receive an email notification as soon as your account request has been processed.\n"
    body += "\n---------------------------------"
    body += "\n%s" % cdsname
    body += "\nContact: %s" % supportemail
    msg = to + sub + body
    server = smtplib.SMTP('localhost')
    server.set_debuglevel(1)
    # always close the SMTP connection, even when the recipient is
    # refused (the original leaked the connection on that path)
    try:
        try:
            server.sendmail(fromaddr, toaddrs, msg)
        except smtplib.SMTPRecipientsRefused:
            return 0
    finally:
        server.quit()
    return 1
def get_email(uid):
    """Return email address of the user uid. Return string 'guest' in case
    the user is not found."""
    rows = run_sql("SELECT email FROM user WHERE id=%s", (uid,), 1)
    if rows and rows[0][0]:
        return rows[0][0]
    return "guest"
def get_email_from_username(username):
    """Return email address of the user corresponding to USERNAME.
    The username may be either nickname or email. Return USERNAME
    untouched if not found in the database or if found several
    matching entries.
    """
    # this block contained an unresolved diff hunk; resolved to the
    # post-image: two separate indexed lookups (email first, nickname next)
    out = username
    res = run_sql("SELECT email FROM user WHERE email=%s", (username,), 1) + \
          run_sql("SELECT email FROM user WHERE nickname=%s", (username,), 1)
    if res and len(res) == 1:
        out = res[0][0]
    return out
def get_password(uid):
    """Return password of the user uid. Return None in case
    the user is not found."""
    rows = run_sql("SELECT password FROM user WHERE id=%s", (uid,), 1)
    if rows and rows[0][0]:
        return rows[0][0]
    return None
def get_nickname(uid):
    """Return nickname of the user uid. Return None in case
    the user is not found."""
    rows = run_sql("SELECT nickname FROM user WHERE id=%s", (uid,), 1)
    if rows and rows[0][0]:
        return rows[0][0]
    return None
def get_nickname_or_email(uid):
    """Return nickname (preferred) or the email address of the user uid.
    Return string 'guest' in case the user is not found."""
    rows = run_sql("SELECT nickname, email FROM user WHERE id=%s", (uid,), 1)
    if rows and rows[0]:
        nickname, email = rows[0]
        if nickname:
            return nickname
        if email:
            return email
    return "guest"
def create_userinfobox_body(req, uid, language="en"):
    """Create user info box body for user UID in language LANGUAGE."""
    url_referer = weburl
    if req:
        # prefer the secure site URL when the request came in over HTTPS
        if req.subprocess_env.has_key('HTTPS') \
           and req.subprocess_env['HTTPS'] == 'on':
            url_referer = sweburl + req.unparsed_uri
        else:
            url_referer = weburl + req.unparsed_uri
    try:
        return tmpl.tmpl_create_userinfobox(ln=language,
                                            url_referer=url_referer,
                                            guest=isGuestUser(uid),
                                            username=get_nickname_or_email(uid),
                                            submitter=isUserSubmitter(uid),
                                            referee=isUserReferee(uid),
                                            admin=isUserAdmin(uid))
    except OperationalError:
        # database hiccup: degrade to an empty box rather than fail the page
        return ""
def list_registered_users():
    """List all registered users.

    Return the raw run_sql result: a sequence of (id, email) rows for
    every user with a non-empty email address (i.e. non-guest accounts).
    """
    return run_sql("SELECT id,email FROM user where email!=''")
def list_users_in_role(role):
    """List all users of a given role (see table accROLE)
    @param role: role of user (string)
    @return list of uids
    """
    # use a bound query parameter instead of manual escape_string()
    # interpolation, avoiding any SQL-quoting pitfalls in the role name
    query = """SELECT uacc.id_user
               FROM user_accROLE uacc JOIN accROLE acc
               ON uacc.id_accROLE=acc.id
               WHERE acc.name=%s"""
    res = run_sql(query, (role,))
    if res:
        return map(lambda x: int(x[0]), res)
    return []
def list_users_in_roles(role_list):
    """List all users of given roles (see table accROLE)
    @param role_list: list of roles [string]
    @return list of uids
    """
    if not (type(role_list) is list or type(role_list) is tuple):
        # accept a single role name as well
        role_list = [role_list]
    where_clause = ''
    if len(role_list) > 0:
        # one bound parameter per role name instead of manual
        # escape_string() interpolation
        where_clause = 'WHERE ' + ' OR '.join(['acc.name=%s'] * len(role_list))
    query = """SELECT distinct(uacc.id_user)
               FROM user_accROLE uacc JOIN accROLE acc
               ON uacc.id_accROLE=acc.id
               %s""" % where_clause
    if role_list:
        res = run_sql(query, tuple(role_list))
    else:
        # no roles given: list users having any role
        res = run_sql(query)
    if res:
        return map(lambda x: int(x[0]), res)
    return []
## --- follow some functions for Apache user/group authentication
def auth_apache_user_p(user, password, apache_password_file=CFG_APACHE_PASSWORD_FILE):
    """Check whether user-supplied credentials correspond to valid
    Apache password data file. Return 0 in case of failure, 1 in case
    of success."""
    try:
        if not apache_password_file.startswith("/"):
            # relative paths are resolved under the site tmp directory
            apache_password_file = tmpdir + "/" + apache_password_file
        # NOTE(review): 'user' is embedded in a grep regex; unusual
        # characters in the username could alter the match -- verify
        # usernames are restricted upstream.
        dummy, pipe_output = os.popen2(["grep", "^" + user + ":", apache_password_file], 'r')
        line = pipe_output.readlines()[0]
        password_apache = string.split(string.strip(line), ":")[1]
    except: # no pw found, so return not-allowed status
        return 0
    # pass the full stored hash as the salt: correct for traditional
    # two-character DES salts and also for extended formats such as
    # $1$ MD5 crypt (the previous salt[:2] slice broke those)
    if crypt.crypt(password, password_apache) == password_apache:
        return 1
    return 0
def auth_apache_user_in_groups(user, apache_group_file=CFG_APACHE_GROUP_FILE):
    """Return list of Apache groups to which Apache user belong."""
    groups = []
    try:
        if not apache_group_file.startswith("/"):
            # relative paths are resolved under the site tmp directory
            apache_group_file = tmpdir + "/" + apache_group_file
        # NOTE(review): grep matches 'user' anywhere on the line, so a
        # username that is a substring of another may over-match
        dummy, pipe_output = os.popen2(["grep", user, apache_group_file], 'r')
        for line in pipe_output.readlines():
            groups.append(string.split(string.strip(line), ":")[0])
    except: # no groups found, so return empty list
        pass
    return groups
def auth_apache_user_collection_p(user, password, coll):
    """Check whether user-supplied credentials correspond to valid
    Apache password data file, and whether this user is authorized to
    see the given collections. Return 0 in case of failure, 1 in case
    of success."""
    from invenio.search_engine import coll_restricted_p, coll_restricted_group
    if not auth_apache_user_p(user, password):
        # bad credentials
        return 0
    if not coll_restricted_p(coll):
        # public collection: any authenticated Apache user may see it
        return 1
    if coll_restricted_group(coll) in auth_apache_user_in_groups(user):
        return 1
    return 0
def get_user_preferences(uid):
    """Return the settings dict stored for user uid, the site defaults
    when the stored blob cannot be decoded, or None for unknown uids."""
    rows = run_sql("SELECT id, settings FROM user WHERE id=%s", (uid,))
    if not rows:
        return None
    try:
        return deserialize_via_marshal(rows[0][1])
    except:
        # corrupted or legacy blob: fall back to the default preferences
        return get_default_user_preferences()
def set_user_preferences(uid, pref):
    # Persist the settings dict for user uid.
    # NOTE(review): serialize_via_marshal() already escape_string()s the
    # blob, hence the direct string interpolation here; switching to a
    # bound %s parameter alone would double-escape the stored value.
    run_sql("UPDATE user SET settings='%s' WHERE id=%s" % (serialize_via_marshal(pref), uid))
def get_default_user_preferences():
    """Build the default settings dict: 'login_method' is the first
    external authentication system flagged as default, else ''."""
    prefs = {'login_method': ''}
    for system in CFG_EXTERNAL_AUTHENTICATION.keys():
        if CFG_EXTERNAL_AUTHENTICATION[system][1]:
            prefs['login_method'] = system
            break
    return prefs
def serialize_via_marshal(obj):
    """Serialize Python object via marshal into a compressed string.

    The result is also escape_string()ed, i.e. ready to be embedded
    directly inside an SQL statement (see set_user_preferences()).
    """
    return escape_string(compress(dumps(obj)))
def deserialize_via_marshal(string):
    """Decompress and deserialize string into a Python object via marshal.

    NOTE(review): the parameter name shadows the 'string' module inside
    this function; harmless here, but worth renaming eventually.
    """
    return loads(decompress(string))