diff --git a/modules/bibformat/bin/bibreformat.in b/modules/bibformat/bin/bibreformat.in index 1fa299fe5..88c87f7a5 100644 --- a/modules/bibformat/bin/bibreformat.in +++ b/modules/bibformat/bin/bibreformat.in @@ -1,756 +1,757 @@ #!@PYTHON@ ## -*- mode: python; coding: utf-8; -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Call BibFormat engine and create HTML brief (and other) formats for bibliographic records. Upload formats via BibUpload.""" __revision__ = "$Id$" ## import interesting modules: try: import sys from invenio.dbquery import run_sql, escape_string from invenio.config import * from invenio.search_engine import perform_request_search from invenio.search_engine import print_record, encode_for_xml from invenio.access_control_engine import acc_authorize_action from invenio.bibformat import format_record from invenio.bibformat_utils import encode_for_xml from invenio.bibformat_config import CFG_BIBFORMAT_USE_OLD_BIBFORMAT from invenio.bibrecord import create_records import getopt import getpass import marshal import signal import string import sys import os import re import time except ImportError, e: print "Error: %s" % e import sys sys.exit(1) options = {} # global variable to hold task options sql_queries = [] # holds SQL queries to be executed cds_query = {} # holds CDS query parameters (fields, collection, pattern) process_format = 0 # flag, process records without created format process = 1 # flag, process records (unless count only) fmt = "hb" # default format to be processed sleeptime = "" # default sleeptime format_string = "%Y-%m-%d %H:%M:%S" # date/time format sched_time = time.strftime(format_string) # scheduled execution time in the date/time format ### run the bibreformat task bibsched scheduled ### def bibreformat_task(sql, sql_queries, cds_query, process_format): global process, fmt t1 = os.times()[4] ### Query the database ### if process_format: print "Querying database for records with missing format ..." without_format = without_fmt(sql) recIDs = [] if cds_query['field'] != "" or \ cds_query['collection'] != "" or \ cds_query['pattern'] != "": print "Querying database for records with old format (CDS query)..." res = perform_request_search(req=None, of='id', c=cds_query['collection'], p=cds_query['pattern'], f=cds_query['field']) for item in res: recIDs.append(item) for sql_query in sql_queries: print "Querying database for records with old format (SQL query) ..." 
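## --- hedged illustration (not part of the patch) ---------------------------
## A minimal standalone sketch of the record-selection step above: record IDs
## coming from a CDS query (perform_request_search) and from raw SQL queries
## are merged into one de-duplicated list.  The helper name collect_recids is
## illustrative only; perform_request_search and run_sql are used with the
## same signatures as in the surrounding code.
from invenio.search_engine import perform_request_search
from invenio.dbquery import run_sql

def collect_recids(cds_query, sql_queries):
    recids = {}
    if cds_query['field'] or cds_query['collection'] or cds_query['pattern']:
        for recid in perform_request_search(req=None, of='id',
                                            c=cds_query['collection'],
                                            p=cds_query['pattern'],
                                            f=cds_query['field']):
            recids[recid] = 1
    for query in sql_queries:
        for row in run_sql(query):
            recids[row[0]] = 1
    return recids.keys()
## ---------------------------------------------------------------------------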
res = run_sql(sql_query) for item in res: recIDs.append(item[0]) ### list of corresponding record IDs was retrieved ### bibformat the records selected if process_format: print "Records to be processed: %d" % (len(recIDs)+len(without_format)) print "Out of it records without created format: %d" % len(without_format) else: print "Records to be processed: %d" % (len(recIDs)) ### Initialize main loop total_rec = 0 # Total number of records xml_content = '' # hold the contents tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call ### Iterate over all records prepared in lists I (option) if process: if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this when migration from php to python bibformat is done iterate_over_old(recIDs, weburl, fmt) else: iterate_over_new(recIDs, weburl, fmt) ### Iterate over all records prepared in list II (no_format) if process_format and process: if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this when migration from php to python bibformat is done iterate_over_old(without_format, weburl, fmt) else: iterate_over_new(without_format, weburl, fmt) ### Final statistics t2 = os.times()[4] elapsed = t2 - t1 message = "total records processed: %d" % total_rec print message message = "total processing time: %2f sec" % elapsed print message message = "Time spent on external call (os.system):" print message message = " bibformat: %2f sec" % tbibformat print message message = " bibupload: %2f sec" % tbibupload print message ### Result set operations ### def lhdiff(l1, l2): "Does list difference via intermediate hash." d = {} ld = [] for e in l2: d[e]=1 for e in l1: if not d.has_key(e): ld.append(e) return ld ### Result set operations ### def ldiff(l1, l2): "Returns l1 - l2." ld = [] for e in l1: if not e in l2: ld.append(e) return ld ### Identify recIDs of records with missing format ### def without_fmt(sql): "List of record IDs to be reformated, not having the specified format yet" global fmt xm1, xm2, format1, format2 = [],[],[],[] q1 = sql['q1'] q2 = sql['q2'] ## get complete recID list of xm formatted records xm1 = run_sql(q1) for item in xm1: xm2.append(item[0]) ## get complete recID list of formatted records format1 = run_sql(q2) for item in format1: format2.append(item[0]) return lhdiff(xm2,format2) ### Bibreformat all selected records (using new python bibformat) ### (see iterate_over_old further down) def iterate_over_new(list, weburl, fmt): "Iterate odver list of IDs" n_it_rec = 0 # Number of records for current iteration n_it_max = 10000 # Number of max records in one iteration n_rec = 0 # Number of formatted records total_rec = len(list) # Total number of records formatted_records = '' # (string-)List of formatted record of an iteration for recID in list: n_rec +=1 n_it_rec += 1 message = "Processing record %d with format %s (New BibFormat)" % (recID, fmt) print message t11 = os.times()[4] message = "START bibformat external call" print message ### bibformat external call ### formatted_record = format_record(recID, fmt, on_the_fly=True) # Encapsulate record in xml tags that bibupload understands prologue = ''' %s %s ''' % (recID, fmt) epilogue = ''' ''' t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message formatted_records += prologue + encode_for_xml(formatted_record) + epilogue # every n_max record, upload all formatted records. 
# also upload if recID is last one if n_it_rec > n_it_max or n_rec == total_rec: #Save formatted records to disk for bibupload finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S')) filehandle = open(finalfilename,"w") filehandle.write(formatted_records) filehandle.close() ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message command = "%s/bibupload -f %s" % (bindir, finalfilename) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message #Reset iteration state n_it_rec = 0 xml_content = '' def iterate_over_old(list, weburl, fmt): "Iterate odver list of IDs" n_rec = 0 n_max = 10000 total_rec = 0 # Total number of records xml_content = '' # hold the contents tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call for record in list: n_rec = n_rec + 1 total_rec = total_rec + 1 message = "Processing record: %d" % (record) print message query = "id=%d&of=xm" % (record) count = 0 contents = print_record(record, 'xm') while (contents == "") and (count < 10): contents = print_record(record, 'xm') count = count + 1 time.sleep(10) if count == 10: sys.stderr.write("Failed to download %s from %s after 10 attempts... terminating" % (query, weburl)) sys.exit(0) xml_content = xml_content + contents if xml_content: if n_rec >= n_max: finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S')) filename = "%s/bibreformat.xml" % tmpdir filehandle = open(filename ,"w") filehandle.write(xml_content) filehandle.close() ### bibformat external call ### t11 = os.times()[4] message = "START bibformat external call" print message command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message tbibformat = tbibformat + (t22 - t11) ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message command = "%s/bibupload -f %s" % (bindir,finalfilename) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message tbibupload = tbibupload + (t22- t11) n_rec = 0 xml_content = '' ### Process the last re-formated chunk ### if n_rec > 0: print "Processing last record set (%d)" % n_rec finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S')) filename = "%s/bibreformat.xml" % tmpdir filehandle = open(filename ,"w") filehandle.write(xml_content) filehandle.close() ### bibformat external call ### t11 = os.times()[4] message = "START bibformat external call" print message command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message tbibformat = tbibformat + (t22 - t11) ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message command = "%s/bibupload -f %s" % (bindir,finalfilename) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message tbibupload = tbibupload + (t22- t11) return ### Bibshed compatibility procedures ### def get_date(var, format_string = "%Y-%m-%d %H:%M:%S"): """Returns a date 
string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re=re.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) print value date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def write_message(msg, stream=sys.stdout): """Prints message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff.""" if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream) def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame)) write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame)) write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame)) write_message("stopping...") task_update_status("STOPPING") write_message("flushing cache or whatever...") time.sleep(3) write_message("closing tables or whatever...") time.sleep(1) write_message("stopped") task_update_status("STOPPED") sys.exit(0) def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame)) write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" # do nothing for unknown signals: write_message("unknown signal %d (frame %s) ignored" % (sig, frame)) def authenticate(user, header="BibReformat Task Submission", action="runbibformat"): """Authenticate the user against the user database. Check for its password, if it exists. Check for action access rights. Return user name upon authorization success, do system exit upon authorization failure. """ print header print "=" * len(header) if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user ## first check user pw: - res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) + res = run_sql("select id,password from user where email=%s", (user,), 1) + \ + run_sql("select id,password from user where nickname=%s", (user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." 
% user sys.exit(1) ## secondly check authorization for the action: (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def task_submit(): """Submits task to the BibSched task queue. This is what people will be invoking via command line.""" global options, sched_time, sleep_time ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] ## authenticate user: user = authenticate(options.get("user", "")) ## submit task: task_id = run_sql("""INSERT INTO schTASK (id,proc,user,status,arguments,sleeptime,runtime) VALUES (NULL,'bibreformat',%s,'WAITING',%s,%s,%s)""", (user, marshal.dumps(options),sleeptime,escape_string(sched_time))) ## update task number: options["task"] = task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options),task_id)) write_message("Task #%d submitted." % task_id) return task_id def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global options return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, options["task"])) def task_update_status(val): """Updates status information in the BibSched task table.""" global options return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, options["task"])) def task_read_status(task_id): """Read status information in the BibSched task table.""" res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1) try: out = res[0][0] except: out = 'UNKNOWN' return out def task_get_options(id): """Returns options for the task 'id' read from the BibSched task queue table.""" out = {} res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibreformat'", (id,)) try: out = marshal.loads(res[0][0]) except: write_message("Error: BibReformat task %d does not seem to exist." % id) sys.exit(1) return out def task_run(task_id, process_format): """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call.""" global options, process, fmt, sched_time options = task_get_options(task_id) # get options from BibSched task table ## check task id: if not options.has_key("task"): write_message("Error: The task #%d does not seem to be a BibReformat task." 
% task_id) return ## initialize parameters if options.has_key("format"): fmt = options["format"] else: fmt = "hb" sql = { "all" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt, "last": "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt, "q1" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'", "q2" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt } if options.has_key("all"): sql_queries.append(sql['all']) if options.has_key("without"): process_format = 1 if options.has_key("noprocess"): process = 0 if options.has_key("last"): sql_queries.append(sql['last']) if options.has_key("collection"): cds_query['collection'] = options['collection'] else: cds_query['collection'] = "" if options.has_key("field"): cds_query['field'] = options['field'] else: cds_query['field'] = "" if options.has_key("pattern"): cds_query['pattern'] = options['pattern'] else: cds_query['pattern'] = "" ### sql commands to be executed during the script run ### ## check task status: task_status = task_read_status(task_id) if task_status != "WAITING": write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status)) return ## update task status: task_update_status("RUNNING") ## initialize signal handler: signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ## run the task: bibreformat_task(sql, sql_queries, cds_query, process_format) ## we are done: task_update_status("DONE") return def usage(exitcode=1, msg=""): """Prints usage info.""" if msg: sys.stderr.write("Error: %s.\n" % msg) sys.stderr.write("Usage: %s [options]\n" % sys.argv[0]) sys.stderr.write(" -u, --user=USER \t\t User name to submit the task as, password needed.\n") sys.stderr.write(" -h, --help \t\t Print this help.\n") sys.stderr.write(" -V, --version \t\t Print version information.\n") sys.stderr.write(" -v, --verbose=LEVEL \t\t Verbose level (0=min,1=normal,9=max).\n") sys.stderr.write(" -s, --sleeptime=SLEEP\t\t Time after which to repeat tasks (no)\n") sys.stderr.write(" -t, --time=DATE \t\t Moment for the task to be active (now).\n") sys.stderr.write(" -a, --all \t\t All records\n") sys.stderr.write(" -c, --collection \t\t Select records by collection\n") sys.stderr.write(" -f, --field \t\t Select records by field.\n") sys.stderr.write(" -p, --pattern \t\t Select records by pattern.\n") sys.stderr.write(" -o, --format \t\t Specify output format to be (re-)created. (default HB)\n") sys.stderr.write(" -n, --noprocess \t\t Count records to be processed only (no processing done)\n") sys.stderr.write("\n") sys.stderr.write(" Example: bibreformat -n Show how many records are to be bibreformated.") sys.stderr.write("\n") sys.exit(exitcode) def main(): """Main function that analyzes command line input and calls whatever is appropriate. 
Useful for learning on how to write BibSched tasks.""" global options, sched_time, sleeptime ## parse command line: if len(sys.argv) == 2 and sys.argv[1].isdigit(): ## A - run the task task_id = int(sys.argv[1]) process_format = 0 task_run(task_id, process_format) else: ## B - submit the task process_format = 0 options = {} # will hold command-line options options["verbose"] = 1 try: opts, args = getopt.getopt(sys.argv[1:], "hVv:u:ac:f:s:p:lo:nt:wl", ["help", "version", "verbose=","user=","all","collection=","field=","sleeptime=","pattern=","format=","noprocess","time=","without","last"]) except getopt.GetoptError, err: usage(1, err) clp = 0 # default parameters flag try: for opt in opts: if opt[0] in ["-h", "--help"]: usage(0) elif opt[0] in ["-V", "--version"]: print __revision__ sys.exit(0) elif opt[0] in [ "-u", "--user"]: options["user"] = opt[1] elif opt[0] in ["-v", "--verbose"]: options["verbose"] = int(opt[1]) elif opt[0] in ["-a", "--all"]: options["all"] = 1 options["without"] = 1 clp = 1 elif opt[0] in ["-c", "--collection"]: options["collection"]=opt[1] clp = 1 elif opt[0] in ["-n", "--noprocess"]: options["noprocess"] = 1 elif opt[0] in ["-f", "--field"]: options["field"] = opt[1] clp = 1 elif opt[0] in ["-p","--pattern"]: options["pattern"] = opt[1] clp = 1 elif opt[0] in ["-o","--format"]: options["format"] = opt[1] elif opt[0] in ["-s", "--sleeptime" ]: get_date(opt[1]) # see if it is a valid shift sleeptime = opt[1] elif opt[0] in [ "-t", "--time" ]: sched_time = get_date(opt[1]) if clp == 0: # default options["without"] = 1 options["last"] = 1 except StandardError, e: usage(e) task_submit() return ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibharvest/bin/oaiarchive.in b/modules/bibharvest/bin/oaiarchive.in index 4f8a40107..6303b779e 100644 --- a/modules/bibharvest/bin/oaiarchive.in +++ b/modules/bibharvest/bin/oaiarchive.in @@ -1,370 +1,371 @@ #!@PYTHON@ ## -*- mode: python; coding: utf-8; -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
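## --- hedged illustration (not part of the patch) ---------------------------
## The hunk applied to authenticate() in bibreformat.in above (and repeated in
## the files below) replaces the single "email=%s or nickname=%s" lookup with
## two separate run_sql() calls whose result rows are concatenated.  A
## standalone sketch of that pattern, assuming run_sql returns a sequence of
## rows as it does in the surrounding code; lookup_user is illustrative only.
from invenio.dbquery import run_sql

def lookup_user(user):
    rows = run_sql("select id,password from user where email=%s", (user,), 1) + \
           run_sql("select id,password from user where nickname=%s", (user,), 1)
    if not rows:
        return None        # unknown user
    return rows[0]         # (uid, password) of the first matching row
## ---------------------------------------------------------------------------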
""" OAI repository archive and management tool Usage: oaiarchive [options] Options: -o --oaiset= Specify setSpec -h --help Print this help -V --version Print version information and exit -a --add Add records to OAI repository -d --delete Remove records from OAI repository -r --report Print OAI repository status -i --info Give info about OAI set (default) -p --upload Upload records -n --no-process Do not upload records (default) Examples: Expose set -setname- via OAI gateway oaiarchive --oaiset='setname' --add --upload oaiarchive -apo 'setname' Remove records defined by set -setname- from OAI repository oaiarchive --oaiset='setname' --delete --upload oaiarchive -dpo 'setname' Expose entire repository via OAI gateway oaiarchive --oaiset=global --add --upload oaiarchive -apo global Print OAI repository status oaiarchive -r """ __revision__ = "$Id$" try: import sys from invenio.oaiarchive_engine import oaiarchive_task from invenio.oaiarchive_engine import printInfo from invenio.dbquery import run_sql, escape_string from invenio.config import * from invenio.search_engine import perform_request_search from invenio.search_engine import print_record from invenio.access_control_engine import acc_authorize_action import getopt import getpass import string import marshal import signal import time import re except ImportError, e: print "Error: %s" % e sys.exit(1) options = {} # global variable to hold task options sleeptime = "" # default sleeptime sched_time = time.strftime("%Y-%m-%d %H:%M:%S") # scheduled execution time in the date/time format ### Bibshed compatibility procedures ### def get_date(var, format_string = "%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re=re.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def write_message(msg, stream=sys.stdout): """Prints message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff.""" if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. 
[must be sys.stdout or sys.stderr]\n" % stream) def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame)) write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame)) write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame)) write_message("stopping...") task_update_status("STOPPING") write_message("flushing cache or whatever...") time.sleep(3) write_message("closing tables or whatever...") time.sleep(1) write_message("stopped") task_update_status("STOPPED") sys.exit(0) def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame)) write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" # do nothing for unknown signals: write_message("unknown signal %d (frame %s) ignored" % (sig, frame)) def authenticate(user, header="OAI Archive Task Submission", action="runoaiarchive"): """Authenticate the user against the user database. Check for its password, if it exists. Check for action access rights. Return user name upon authorization success, do system exit upon authorization failure. """ print header print "=" * len(header) if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user ## first check user pw: - res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) + res = run_sql("select id,password from user where email=%s", (user,), 1) + \ + run_sql("select id,password from user where nickname=%s", (user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." % user sys.exit(1) ## secondly check authorization for the action: (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def task_submit(): """Submits task to the BibSched task queue. This is what people will be invoking via command line.""" global options, sched_time, sleep_time ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] ## authenticate user: user = authenticate(options.get("user", "")) ## submit task: task_id = run_sql("""INSERT INTO schTASK (id,proc,user,status,arguments,sleeptime,runtime) VALUES (NULL,'oaiarchive',%s,'WAITING',%s,%s,%s)""", (user, marshal.dumps(options),sleeptime,escape_string(sched_time))) ## update task number: options["task"] = task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options),task_id)) write_message("Task #%d submitted." 
% task_id) return task_id def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global options return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, options["task"])) def task_update_status(val): """Updates status information in the BibSched task table.""" global options return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, options["task"])) def task_read_status(task_id): """Read status information in the BibSched task table.""" res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1) try: out = res[0][0] except: out = 'UNKNOWN' return out def task_get_options(id): """Returns options for the task 'id' read from the BibSched task queue table.""" out = {} res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='oaiarchive'", (id,)) try: out = marshal.loads(res[0][0]) except: write_message("Error: OAIarchive task %d does not seem to exist." % id) sys.exit(1) return out def task_run(task_id): """Runs the task""" global options options = task_get_options(task_id) # get options from BibSched task table ## check task id: if not options.has_key("task"): write_message("Error: The task #%d does not seem to be a OAI archive task." % task_id) return ## initialize parameters if options.has_key("option"): ### sql commands to be executed during the script run ### ## check task status: task_status = task_read_status(task_id) if task_status != "WAITING": write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status)) return ## update task status: task_update_status("RUNNING") ## initialize signal handler: signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ## run the task: oaiarchive_task(options) ## we are done: task_update_status("DONE") return ######################### def main(): """Main function that analyzes command line input and calls whatever is appropriate. 
Useful for learning on how to write BibSched tasks.""" global options, sched_time, sleeptime ## parse command line: if len(sys.argv) == 2 and sys.argv[1].isdigit(): ## A - run the task task_id = int(sys.argv[1]) task_run(task_id) else: ## B - submit the task options = {} # will hold command-line options options["verbose"] = 1 try: opts, args = getopt.getopt(sys.argv[1:], "hVv:u:s:t:ado:pirn", ["help", "version", "verbose=","user=","sleeptime=","time=","add","delete","oaiset=","upload","info","report","no-process"]) except getopt.GetoptError: printInfo() sys.exit(1) ## set defaults options["upload"] = 0 options["mode"] = 0 options["oaiset"] = "" options["nice"] = 0 try: for opt in opts: if opt[0] in ["-h", "--help"]: printInfo() sys.exit(0) elif opt[0] in ["-V", "--version"]: print __revision__ sys.exit(0) elif opt[0] in [ "-u", "--user"]: options["user"] = opt[1] elif opt[0] in ["-v", "--verbose"]: options["verbose"] = int(opt[1]) elif opt[0] in ["-s", "--sleeptime" ]: get_date(opt[1]) # see if it is a valid shift sleeptime = opt[1] elif opt[0] in [ "-t", "--time" ]: sched_time = get_date(opt[1]) elif opt[0] in ["-n", "--nice"]: options["nice"] = opt[1] elif opt[0] in ["-o", "--oaiset"]: options["oaiset"] = opt[1] elif opt[0] in ["-a", "--add"]: options["mode"] = 1 elif opt[0] in ["-d", "--delete"]: options["mode"] = 2 elif opt[0] in ["-p", "--upload"]: options["upload"] = 1 elif opt[0] in ["-i", "--info"]: options["mode"] = 0 elif opt[0] in ["-r", "--report"]: options["mode"] = 3 elif opt[0] in ["-n", "--no-process"]: options["upload"] = 0 except StandardError: printInfo() sys.exit(1) task_submit() return ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibharvest/lib/oaiharvestlib.py b/modules/bibharvest/lib/oaiharvestlib.py index 5fd01eaae..55e8e8681 100644 --- a/modules/bibharvest/lib/oaiharvestlib.py +++ b/modules/bibharvest/lib/oaiharvestlib.py @@ -1,640 +1,641 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ oaiharvest implementation. See oaiharvest executable for entry point. 
""" __revision__ = "$Id$" import marshal import getopt import getpass import string import os import sre import sys import time import signal import traceback import calendar from invenio.config import \ bibconvert, \ bibupload, \ bindir, \ tmpdir, \ version from invenio.bibindex_engine_config import * from invenio.dbquery import run_sql, escape_string from invenio.access_control_engine import acc_authorize_action options = {} # global variable to hold task options ## precompile some often-used regexp for speed reasons: sre_subfields = sre.compile('\$\$\w'); sre_html = sre.compile("(?s)<[^>]*>|&#?\w+;") sre_datetime_shift = sre.compile("([-\+]{0,1})([\d]+)([dhms])") tmpHARVESTpath = tmpdir + '/oaiharvest' def write_message(msg, stream=sys.stdout): """Write message and flush output stream (may be sys.stdout or sys.stderr).""" if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream) return def usage(code, msg=''): "Prints usage for this module." if msg: sys.stderr.write("Error: %s.\n" % msg) usagetext = """ Usage: oaiharvest [options] Examples: oaiharvest -r arxiv -s 24h oaiharvest -r pubmed -d 2005-05-05:2005-05-10 -t 10m Specific options: -r, --repository=REPOS_ONE,"REPOS TWO" name of the OAI repositories to be harvested (default=all) -d, --dates=yyyy-mm-dd:yyyy-mm-dd harvest repositories between specified dates (overrides repositories' last updated timestamps) Scheduling options: -u, --user=USER user name to store task, password needed -s, --sleeptime=SLEEP time after which to repeat tasks (no) e.g.: 1s, 30m, 24h, 7d -t, --time=TIME moment for the task to be active (now) e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26 General options: -h, --help print this help and exit -V, --version print version and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) """ sys.stderr.write(usagetext) sys.exit(code) def authenticate(user, header="oaiharvest Task Submission", action="runoaiharvest"): """Authenticate the user against the user database. Check for its password, if it exists. Check for action access rights. Return user name upon authorization success, do system exit upon authorization failure. """ print header print "=" * len(header) if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user ## first check user pw: - res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) + res = run_sql("select id,password from user where email=%s", (user,), 1) + \ + run_sql("select id,password from user where nickname=%s", (user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." % user sys.exit(1) ## secondly check authorization for the action: (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. 
It can handle normal date strings and shifts with respect to now.""" date = time.time() factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = sre_datetime_shift.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def task_run(row): """Run the harvesting task. The row argument is the Bibharvest task queue row, containing if, arguments, etc. Return 1 in case of success and 0 in case of failure. """ global options reposlist = [] datelist = [] dateflag = 0 # read from SQL row: task_id = row[0] task_proc = row[1] options = marshal.loads(row[6]) task_status = row[7] # sanity check: if task_proc != "oaiharvest": write_message("The task #%d does not seem to be a oaiharvest task." % task_id, sys.stderr) return 0 if task_status != "WAITING": write_message("The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr) return 0 # we can run the task now: if options["verbose"]: write_message("Task #%d started." % task_id) task_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) task_update_status("RUNNING") # install signal handlers signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ### go ahead: build up the reposlist if options["repository"] != None: ### user requests harvesting from selected repositories write_message("harvesting from selected repositories") for reposname in options["repository"]: row = get_row_from_reposname(reposname) if row==[]: write_message("source name " + reposname + " is not valid") continue else: reposlist.append(get_row_from_reposname(reposname)) else: ### user requests harvesting from all repositories write_message("harvesting from all repositories in the database") reposlist = get_all_rows_from_db() ### go ahead: check if user requested from-until harvesting if options["dates"]: ### for each repos simply perform a from-until date harvesting... no need to update anything dateflag = 1 for element in options["dates"]: datelist.append(element) for repos in reposlist: postmode = str(repos[0][9]) if postmode=="h" or postmode=="h-c" or postmode=="h-u" or postmode=="h-c-u": harvestpath = tmpdir + "/oaiharvest" + str(os.getpid()) if dateflag == 1: res = call_bibharvest(prefix=repos[0][2], baseurl=repos[0][1], harvestpath=harvestpath, fro=str(datelist[0]), until=str(datelist[1])) if res==0 : write_message("source " + str(repos[0][6]) + " was harvested from " + str(datelist[0]) + " to " + str(datelist[1])) else : write_message("an error occurred while harvesting from source " + str(repos[0][6]) + " for the dates chosen") continue elif dateflag != 1 and repos[0][7] == None and repos[0][8] != 0: write_message("source " + str(repos[0][6]) + " was never harvested before - harvesting whole repository") res = call_bibharvest(prefix=repos[0][2], baseurl=repos[0][1], harvestpath=harvestpath) if res==0 : update_lastrun(repos[0][0]) else : write_message("an error occurred while harvesting from source " + str(repos[0][6])) continue elif dateflag != 1 and repos[0][8] != 0: ### check that update is actually needed, i.e. 
lastrun+frequency>today timenow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) lastrundate = sre.sub(r'\.[0-9]+$', '', str(repos[0][7])) # remove trailing .00 timeinsec = int(repos[0][8])*60*60 updatedue = add_timestamp_and_timelag(lastrundate, timeinsec) proceed = compare_timestamps_with_tolerance(updatedue, timenow) if proceed==0 or proceed==-1 : #update needed! write_message("source " + str(repos[0][6]) + " is going to be updated") fromdate = str(repos[0][7]) fromdate = fromdate.split()[0] # get rid of time of the day for the moment res = call_bibharvest(prefix=repos[0][2], baseurl=repos[0][1], harvestpath=harvestpath, fro=fromdate) if res==0 : update_lastrun(repos[0][0]) else : write_message("an error occurred while harvesting from source " + str(repos[0][6])) continue else: write_message("source " + str(repos[0][6]) + " does not need updating") continue elif dateflag != 1 and repos[0][8] == 0: write_message("source " + str(repos[0][6]) + " has frequency set to 'Never' so it will not be updated") continue if postmode=="h-u": res = call_bibupload(convertpath=harvestpath) if res==0 : write_message("material harvested from source " + str(repos[0][6]) + " was successfully uploaded") else : write_message("an error occurred while uploading harvest from " + str(repos[0][6])) continue if postmode=="h-c" or postmode=="h-c-u": convertpath = tmpdir + "/bibconvertrun" + str(os.getpid()) res = call_bibconvert(config=str(repos[0][5]), harvestpath=harvestpath, convertpath=convertpath) if res==0 : write_message("material harvested from source " + str(repos[0][6]) + " was successfully converted") else : write_message("an error occurred while converting from " + str(repos[0][6])) continue if postmode=="h-c-u": res = call_bibupload(convertpath=convertpath) if res==0 : write_message("material harvested from source " + str(repos[0][6]) + " was successfully uploaded") else : write_message("an error occurred while uploading harvest from " + str(repos[0][6])) continue elif postmode not in ["h", "h-c", "h-u", "h-c-u"] : ### this should not happen write_message("invalid postprocess mode: " + postmode + " skipping repository") continue task_update_status("DONE") if options["verbose"]: write_message("Task #%d finished." % task_id) return 1 def add_timestamp_and_timelag(timestamp, timelag): """ Adds a time lag in seconds to a given date (timestamp). Returns the resulting date. 
""" # remove any trailing .00 in timestamp: timestamp = sre.sub(r'\.[0-9]+$', '', timestamp) # first convert timestamp to Unix epoch seconds: timestamp_seconds = calendar.timegm(time.strptime(timestamp, "%Y-%m-%d %H:%M:%S")) # now add them: result_seconds = timestamp_seconds + timelag result = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(result_seconds)) return result def update_lastrun(index): """ A method that updates the lastrun of a repository successfully harvested """ try: today = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) sql = 'UPDATE oaiHARVEST SET lastrun="%s" WHERE id=%s' % (today, index) res = run_sql(sql) return 1 except StandardError, e: return (0,e) def call_bibharvest(prefix, baseurl, harvestpath, fro="", until=""): """ A method that calls bibharvest and writes harvested output to disk """ try: command = '%s/bibharvest -o %s -v ListRecords -p %s ' % (bindir, harvestpath, prefix) if fro!="": command += '-f %s ' % fro if until!="": command += '-u %s ' % until command += baseurl print "Start harvesting" #print command ret = os.system(command) print "Harvesting finished, merging files" harvestdir, filename = os.path.split(harvestpath) #print "get files" files = os.listdir(harvestdir) #print "sort file" files.sort() #print "open dest file" hf = file(harvestpath, 'w') for f in files: if f[:len(filename)] == filename: print "processing file %s"%f rf = file(os.path.join(harvestdir, f), 'r') hf.write(rf.read()) hf.write("\n") rf.close() #os.remove(os.path.join(harvestdir, f)) hf.close() print "Files merged" return 0 except StandardError, e: return (0,e) def call_bibconvert(config, harvestpath, convertpath): """ A method that reads from a file and converts according to a BibConvert Configuration file. Converted output is returned """ command = """%s/bibconvert -c %s < %s > %s """ % (bindir, config, harvestpath, convertpath) stdout = os.popen(command) return 0 def call_bibupload(convertpath): """ A method that uploads a file to the database - calls bibUpload """ command = '%s/bibupload -i %s ' % (bindir, convertpath) p=os.system(command) return p def get_row_from_reposname(reposname): """ Returns all information about a row (OAI source) from the source name """ try: sql = 'select * from oaiHARVEST where name="%s"' % escape_string(reposname) res = run_sql(sql) reposdata = [] for element in res: reposdata.append(element) return reposdata except StandardError, e: return (0,e) def get_all_rows_from_db(): """ This method retrieves the full database of repositories and returns a list containing (in exact order): | id | baseurl | metadataprefix | arguments | comment | bibconvertcfgfile | name | lastrun | frequency | postprocess | """ try: reposlist = [] sql = """select id from oaiHARVEST""" idlist = run_sql(sql) for index in idlist: sql = """select * from oaiHARVEST where id=%s""" % index reposelements = run_sql(sql) repos = [] for element in reposelements: repos.append(element) reposlist.append(repos) return reposlist except StandardError, e: return (0,e) def compare_timestamps_with_tolerance(timestamp1, timestamp2, tolerance=0): """Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form '2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument (in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2 minus TOLERANCE, 0 if they are equal within TOLERANCE limit, and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE. 
""" # remove any trailing .00 in timestamps: timestamp1 = sre.sub(r'\.[0-9]+$', '', timestamp1) timestamp2 = sre.sub(r'\.[0-9]+$', '', timestamp2) # first convert timestamps to Unix epoch seconds: timestamp1_seconds = calendar.timegm(time.strptime(timestamp1, "%Y-%m-%d %H:%M:%S")) timestamp2_seconds = calendar.timegm(time.strptime(timestamp2, "%Y-%m-%d %H:%M:%S")) # now compare them: if timestamp1_seconds < timestamp2_seconds - tolerance: return -1 elif timestamp1_seconds > timestamp2_seconds + tolerance: return 1 else: return 0 def command_line(): global options long_flags =["repository=", "dates=" "user=","sleeptime=","time=", "help", "version", "verbose="] short_flags ="r:d:u:s:t:hVv:" format_string = "%Y-%m-%d %H:%M:%S" repositories = None dates = None sleeptime = "" try: opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags) except getopt.GetoptError, err: write_message(err, sys.stderr) usage(1) if args: usage(1) options={"sleeptime":0, "verbose":1, "repository":0, "dates":0} sched_time = time.strftime(format_string) user = "" # Check for key options try: for opt in opts: if opt == ("-h","") or opt == ("--help",""): usage(1) elif opt == ("-V","") or opt == ("--version",""): print __revision__ sys.exit(1) elif opt[0] in ["--verbose", "-v"]: options["verbose"] = int(opt[1]) elif opt[0] in [ "-r", "--repository" ]: repositories = opt[1] elif opt[0] in [ "-d", "--dates" ]: dates = opt[1] elif opt[0] in [ "-u", "--user"]: user = opt[1] elif opt[0] in [ "-s", "--sleeptime" ]: get_datetime(opt[1]) # see if it is a valid shift sleeptime= opt[1] elif opt[0] in [ "-t", "--time" ]: sched_time= get_datetime(opt[1]) else: usage(1) except StandardError, e: write_message(e, sys.stderr) sys.exit(1) options["repository"]=get_repository_names(repositories) if dates != None: options["dates"]=get_dates(dates) if dates != None and options["dates"]==None: write_message("Date format not valid. Quitting task...") sys.exit(1) user = authenticate(user) if options["verbose"] >= 9: print "" write_message("storing task options %s\n" % options) ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] new_task_id = run_sql("""INSERT INTO schTASK (proc,user,runtime,sleeptime,arguments,status) VALUES ('oaiharvest',%s,%s,%s,%s,'WAITING')""", (user, sched_time, sleeptime, marshal.dumps(options))) ## update task number: options["task"] = new_task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), new_task_id)) print "Task #%d was successfully scheduled for execution." % new_task_id return def get_dates(dates): """ A method to validate and process the dates input by the user at the command line """ twodates = [] if dates: datestring = string.split(dates, ":") if len(datestring)==2: for date in datestring: ### perform some checks on the date format datechunks = string.split(date, "-") if len(datechunks)==3: try: if int(datechunks[0]) and int(datechunks[1]) and int(datechunks[2]): twodates.append(date) except StandardError: write_message("Dates have invalid format, not 'yyyy-mm-dd:yyyy-mm-dd'") twodates=None return twodates else: write_message("Dates have invalid format, not 'yyyy-mm-dd:yyyy-mm-dd'") twodates=None return twodates ## final check.. 
date1 must me smaller than date2 date1 = str(twodates[0]) + " 01:00:00" date2 = str(twodates[1]) + " 01:00:00" if compare_timestamps_with_tolerance(date1, date2)!=-1: write_message("First date must be before second date.") twodates=None return twodates else: write_message("Dates have invalid format, not 'yyyy-mm-dd:yyyy-mm-dd'") twodates=None else: twodates=None return twodates def get_repository_names(repositories): """ A method to validate and process the repository names input by the user at the command line """ repository_names = [] if repositories: names = string.split(repositories, ",") for name in names: ### take into account both single word names and multiple word names (which get wrapped around "" or '') quote = "'" doublequote = '"' if name.find(quote)==0 and name.find(quote)==len(name): name = name.split(quote)[1] if name.find(doublequote)==0 and name.find(doublequote)==len(name): name = name.split(doublequote)[1] repository_names.append(name) else: repository_names=None return repository_names def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame)) write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame)) write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame)) write_message("stopping...") task_update_status("STOPPING") errcode = 0 try: task_sig_stop_commands() write_message("stopped") task_update_status("STOPPED") except StandardError, err: write_message("Error during stopping! %e" % err) task_update_status("STOPPINGFAILED") errcode = 1 sys.exit(errcode) def task_sig_stop_commands(): """Do all the commands necessary to stop the task before quitting. Useful for task_sig_stop() handler. """ write_message("stopping commands started") pass write_message("stopping commands ended") def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame)) write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" # do nothing for unknown signals: write_message("unknown signal %d (frame %s) ignored" % (sig, frame)) def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global options if options["verbose"] >= 9: write_message("Updating task progress to %s." % msg) return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, options["task"])) def task_update_status(val): """Updates state information in the BibSched task table.""" global options if options["verbose"] >= 9: write_message("Updating task status to %s." 
% val) return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, options["task"])) def main(): """Reads arguments and either runs the task, or starts user-interface (command line).""" if len(sys.argv) == 2: try: task_id = int(sys.argv[1]) except StandardError: command_line() sys.exit() res = run_sql("SELECT * FROM schTASK WHERE id='%d'" % (task_id), None, 1) if not res: write_message("Selected task not found.", sys.stderr) sys.exit(1) try: if not task_run(res[0]): write_message("Error occurred. Exiting.", sys.stderr) except StandardError, e: write_message("Unexpected error occurred: %s." % e, sys.stderr) if options["verbose"] >= 9: traceback.print_tb(sys.exc_info()[2]) write_message("Exiting.") task_update_status("ERROR") else: command_line() diff --git a/modules/bibindex/lib/bibindex_engine.py b/modules/bibindex/lib/bibindex_engine.py index dfade3868..f2f42fe79 100644 --- a/modules/bibindex/lib/bibindex_engine.py +++ b/modules/bibindex/lib/bibindex_engine.py @@ -1,1659 +1,1660 @@ # -*- coding: utf-8 -*- ## ## $Id$ ## BibIndxes bibliographic data, reference and fulltext indexing utility. ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndex indexing engine implementation. See bibindex executable for entry point. """ __revision__ = "$Id$" import marshal from zlib import compress,decompress import string import getopt import getpass import os import sre import sys import time import Numeric import urllib2 import signal import tempfile import traceback import cStringIO from invenio.config import \ CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS, \ CFG_BIBINDEX_CHARS_PUNCTUATION, \ CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY, \ CFG_BIBINDEX_MIN_WORD_LENGTH, \ CFG_BIBINDEX_REMOVE_HTML_MARKUP, \ CFG_BIBINDEX_STEMMER_DEFAULT_LANGUAGE, \ CFG_MAX_RECID, \ version, \ weburl from invenio.bibindex_engine_config import * from invenio.search_engine import perform_request_search, strip_accents from invenio.dbquery import run_sql, escape_string, DatabaseError from invenio.access_control_engine import acc_authorize_action from invenio.bibindex_engine_stopwords import is_stopword from invenio.bibindex_engine_stemmer import stem ## import optional modules: try: import psyco psyco.bind(get_words_from_phrase) psyco.bind(WordTable.merge_with_old_recIDs) psyco.bind(serialize_via_numeric_array) psyco.bind(serialize_via_marshal) psyco.bind(deserialize_via_numeric_array) psyco.bind(deserialize_via_marshal) except: pass def write_message(msg, stream=sys.stdout): """Write message and flush output stream (may be sys.stdout or sys.stderr).""" if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. 
[must be sys.stdout or sys.stderr]\n" % stream) return ## precompile some often-used regexp for speed reasons: sre_subfields = sre.compile('\$\$\w'); sre_html = sre.compile("(?s)<[^>]*>|&#?\w+;") sre_block_punctuation_begin = sre.compile(r"^"+CFG_BIBINDEX_CHARS_PUNCTUATION+"+") sre_block_punctuation_end = sre.compile(CFG_BIBINDEX_CHARS_PUNCTUATION+"+$") sre_punctuation = sre.compile(CFG_BIBINDEX_CHARS_PUNCTUATION) sre_separators = sre.compile(CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS) sre_datetime_shift = sre.compile("([-\+]{0,1})([\d]+)([dhms])") nb_char_in_line = 50 # for verbose pretty printing chunksize = 1000 # default size of chunks that the records will be treated by wordTables = [] base_process_size = 4500 # process base size options = {} # will hold task options ## Dictionary merging functions def intersection(dict1, dict2): "Returns intersection of the two dictionaries." int_dict = {} if len(dict1) > len(dict2): for e in dict2: if dict1.has_key(e): int_dict[e] = 1 else: for e in dict1: if dict2.has_key(e): int_dict[e] = 1 return int_dict def union(dict1, dict2): "Returns union of the two dictionaries." union_dict = {} for e in dict1.keys(): union_dict[e] = 1 for e in dict2.keys(): union_dict[e] = 1 return union_dict def diff(dict1, dict2): "Returns dict1 - dict2." diff_dict = {} for e in dict1.keys(): if not dict2.has_key(e): diff_dict[e] = 1 return diff_dict def list_union(list1, list2): "Returns union of the two lists." union_dict = {} for e in list1: union_dict[e] = 1 for e in list2: union_dict[e] = 1 return union_dict.keys() ## safety function for killing slow DB threads: def kill_sleepy_mysql_threads(max_threads=CFG_MAX_MYSQL_THREADS, thread_timeout=CFG_MYSQL_THREAD_TIMEOUT): """Check the number of DB threads and if there are more than MAX_THREADS of them, lill all threads that are in a sleeping state for more than THREAD_TIMEOUT seconds. (This is useful for working around the the max_connection problem that appears during indexation in some not-yet-understood cases.) If some threads are to be killed, write info into the log file. """ res = run_sql("SHOW FULL PROCESSLIST") if len(res) > max_threads: for row in res: r_id,r_user,r_host,r_db,r_command,r_time,r_state,r_info = row if r_command == "Sleep" and int(r_time) > thread_timeout: run_sql("KILL %s", (r_id,)) if options["verbose"] >= 1: write_message("WARNING: too many DB threads, killing thread %s" % r_id) return ## MARC-21 tag/field access functions def get_fieldvalues(recID, tag): """Returns list of values of the MARC-21 'tag' fields for the record 'recID'.""" out = [] bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = "SELECT value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id AND tag LIKE '%s'" \ % (bibXXx, bibrec_bibXXx, recID, tag) res = run_sql(query) for row in res: out.append(row[0]) return out def get_associated_subfield_value(recID, tag, value, associated_subfield_code): """Return list of ASSOCIATED_SUBFIELD_CODE, if exists, for record RECID and TAG of value VALUE. Used by fulltext indexer only. Note: TAG must be 6 characters long (tag+ind1+ind2+sfcode), otherwise en empty string is returned. FIXME: what if many tag values have the same value but different associated_subfield_code? Better use bibrecord library for this. 
""" out = "" if len(tag) != 6: return out bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT bb.field_number, b.tag, b.value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id AND tag LIKE '%s%%'""" % \ (bibXXx, bibrec_bibXXx, recID, tag[:-1]) res = run_sql(query) field_number = -1 for row in res: if row[1] == tag and row[2] == value: field_number = row[0] if field_number > 0: for row in res: if row[0] == field_number and row[1] == tag[:-1] + associated_subfield_code: out = row[2] break return out def get_field_tags(field): """Returns a list of MARC tags for the field code 'field'. Returns empty list in case of error. Example: field='author', output=['100__%','700__%'].""" out = [] query = """SELECT t.value FROM tag AS t, field_tag AS ft, field AS f WHERE f.code='%s' AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" % field res = run_sql(query) for row in res: out.append(row[0]) return out ## Fulltext word extraction functions def get_fulltext_urls_from_html_page(htmlpagebody): """Parses htmlpagebody data looking for url_directs referring to probable fulltexts. Returns an array of (ext,url_direct) to fulltexts. Note: it looks for file format extensions as defined by global 'CONV_PROGRAMS'structure. """ out = [] for ext in CONV_PROGRAMS.keys(): expr = sre.compile( r"\"(http://[\w]+\.+[\w]+[^\"'><]*\." + \ ext + r")\"") match = expr.search(htmlpagebody) if match: out.append([ext,match.group(1)]) else: # FIXME: workaround for getfile, should use bibdoc tables expr_getfile = sre.compile(r"\"(http://.*getfile\.py\?.*format=" + ext + r"&version=.*)\"") match = expr_getfile.search(htmlpagebody) if match: out.append([ext,match.group(1)]) return out def get_words_from_fulltext(url_direct_or_indirect, split=string.split, lower=string.lower, force_file_extension=None): """Returns all the words contained in the document specified by URL_DIRECT_OR_INDIRECT with the words being split by various SRE_SEPARATORS regexp set earlier. If FORCE_FILE_EXTENSION is set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF file. (This is interesting to index Indico for example.) Note also that URL_DIRECT_OR_INDIRECT may be either a direct URL to the fulltext file or an URL to a setlink-like page body that presents the links to be indexed. In the latter case the URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs to fulltext documents, for all knows file extensions as specified by global CONV_PROGRAMS config variable. """ if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY and string.find(url_direct_or_indirect, weburl) < 0: return [] if options["verbose"] >= 2: write_message("... reading fulltext files from %s started" % url_direct_or_indirect) fulltext_urls = None if not force_file_extension: url_direct = None fulltext_urls = None # check for direct link in url url_direct_or_indirect_ext = lower(split(url_direct_or_indirect,".")[-1]) if url_direct_or_indirect_ext in CONV_PROGRAMS.keys(): fulltext_urls = [(url_direct_or_indirect_ext,url_direct_or_indirect)] # Indirect url. Try to fetch the real fulltext(s) if not fulltext_urls: # read "setlink" data try: htmlpagebody = urllib2.urlopen(url_direct_or_indirect).read() except: sys.stderr.write("Error: Cannot read %s.\n" % url_direct_or_indirect) return [] fulltext_urls = get_fulltext_urls_from_html_page(htmlpagebody) if options["verbose"] >= 9: write_message("... 
fulltext_urls = %s" % fulltext_urls) else: fulltext_urls = [[force_file_extension, url_direct_or_indirect]] words = {} # process as many urls as they were found: for (ext,url_direct) in fulltext_urls: if options["verbose"] >= 2: write_message(".... processing %s from %s started" % (ext,url_direct)) # sanity check: if not url_direct: break; # read fulltext file: try: url = urllib2.urlopen(url_direct) except: sys.stderr.write("Error: Cannot read %s.\n" % url_direct) break # try other fulltext files... tmp_fd, tmp_name = tempfile.mkstemp('invenio.tmp') data_chunk = url.read(8*1024) while data_chunk: os.write(tmp_fd, data_chunk) data_chunk = url.read(8*1024) os.close(tmp_fd) # try all available conversion programs according to their order: bingo = 0 for conv_program in CONV_PROGRAMS[ext]: if os.path.exists(conv_program): # intelligence on how to run various conversion programs: cmd = "" # wil keep command to run bingo = 0 # had we success? if os.path.basename(conv_program) == "pdftotext": cmd = "%s -enc UTF-8 %s %s.txt" % (conv_program, tmp_name, tmp_name) elif os.path.basename(conv_program) == "pstotext": if ext == "ps.gz": # is there gzip available? if os.path.exists(CONV_PROGRAMS_HELPERS["gz"]): cmd = "%s -cd %s | %s > %s.txt" \ % (CONV_PROGRAMS_HELPERS["gz"], tmp_name, conv_program, tmp_name) else: cmd = "%s %s > %s.txt" \ % (conv_program, tmp_name, tmp_name) elif os.path.basename(conv_program) == "ps2ascii": if ext == "ps.gz": # is there gzip available? if os.path.exists(CONV_PROGRAMS_HELPERS["gz"]): cmd = "%s -cd %s | %s > %s.txt"\ % (CONV_PROGRAMS_HELPERS["gz"], tmp_name, conv_program, tmp_name) else: cmd = "%s %s %s.txt" \ % (conv_program, tmp_name, tmp_name) elif os.path.basename(conv_program) == "antiword": cmd = "%s %s > %s.txt" % (conv_program, tmp_name, tmp_name) elif os.path.basename(conv_program) == "catdoc": cmd = "%s %s > %s.txt" % (conv_program, tmp_name, tmp_name) elif os.path.basename(conv_program) == "wvText": cmd = "%s %s %s.txt" % (conv_program, tmp_name, tmp_name) elif os.path.basename(conv_program) == "ppthtml": # is there html2text available? if os.path.exists(CONV_PROGRAMS_HELPERS["html"]): cmd = "%s %s | %s > %s.txt"\ % (conv_program, tmp_name, CONV_PROGRAMS_HELPERS["html"], tmp_name) else: cmd = "%s %s > %s.txt" \ % (conv_program, tmp_name, tmp_name) elif os.path.basename(conv_program) == "xlhtml": # is there html2text available? if os.path.exists(CONV_PROGRAMS_HELPERS["html"]): cmd = "%s %s | %s > %s.txt" % \ (conv_program, tmp_name, CONV_PROGRAMS_HELPERS["html"], tmp_name) else: cmd = "%s %s > %s.txt" % \ (conv_program, tmp_name, tmp_name) else: sys.stderr.write("Error: Do not know how to handle %s conversion program.\n" % conv_program) # try to run it: try: if options["verbose"] >= 9: write_message("..... launching %s" % cmd) errcode = os.system(cmd) if errcode == 0 and os.path.exists("%s.txt" % tmp_name): bingo = 1 break # bingo! else: write_message("Error while running %s for %s.\n" % (cmd, url_direct), sys.stderr) except: write_message("Error running %s for %s.\n" % (cmd, url_direct), sys.stderr) # were we successful? 
if bingo: tmp_name_txt_file = open("%s.txt" % tmp_name) for phrase in tmp_name_txt_file.xreadlines(): for word in get_words_from_phrase(phrase): if not words.has_key(word): words[word] = 1; tmp_name_txt_file.close() else: if options["verbose"]: write_message("No conversion success for %s.\n" % (url_direct), sys.stderr) # delete temp files (they might not exist): try: os.unlink(tmp_name) os.unlink(tmp_name + ".txt") except StandardError: write_message("Error: Could not delete file. It didn't exist", sys.stderr) if options["verbose"] >= 2: write_message(".... processing %s from %s ended" % (ext,url_direct)) if options["verbose"] >= 2: write_message("... reading fulltext files from %s ended" % url_direct_or_indirect) return words.keys() # tagToFunctions mapping. It offers an indirection level necesary for # indexing fulltext. The default is get_words_from_phrase tagToWordsFunctions = {'8564_u':get_words_from_fulltext} def get_words_from_phrase(phrase, split=string.split): """Return list of words found in PHRASE. Note that the phrase is split into groups depending on the alphanumeric characters and punctuation characters definition present in the config file. """ words = {} if CFG_BIBINDEX_REMOVE_HTML_MARKUP and string.find(phrase, " -1: phrase = sre_html.sub(' ', phrase) phrase = string.lower(phrase) # 1st split phrase into blocks according to whitespace for block in split(strip_accents(phrase)): # 2nd remove leading/trailing punctuation and add block: block = sre_block_punctuation_begin.sub("", block) block = sre_block_punctuation_end.sub("", block) if block: block = apply_stemming_and_stopwords_and_length_check(block) if block: words[block] = 1 # 3rd break each block into subblocks according to punctuation and add subblocks: for subblock in sre_punctuation.split(block): subblock = apply_stemming_and_stopwords_and_length_check(subblock) if subblock: words[subblock] = 1 # 4th break each subblock into alphanumeric groups and add groups: for alphanumeric_group in sre_separators.split(subblock): alphanumeric_group = apply_stemming_and_stopwords_and_length_check(alphanumeric_group) if alphanumeric_group: words[alphanumeric_group] = 1 return words.keys() def apply_stemming_and_stopwords_and_length_check(word): """Return WORD after applying stemming and stopword and length checks. See the config file in order to influence these. """ # stem word, when configured so: if CFG_BIBINDEX_STEMMER_DEFAULT_LANGUAGE != "": word = stem(word, CFG_BIBINDEX_STEMMER_DEFAULT_LANGUAGE) # now check against stopwords: if is_stopword(word): return "" # finally check the word length: if len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH: return "" return word def remove_subfields(s): "Removes subfields from string, e.g. 'foo $$c bar' becomes 'foo bar'." return sre_subfields.sub(' ', s) def get_index_id(indexname): """Returns the words/phrase index id for INDEXNAME. Returns empty string in case there is no words table for this index. Example: field='author', output=4.""" out = 0 query = """SELECT w.id FROM idxINDEX AS w WHERE w.name='%s' LIMIT 1""" % indexname res = run_sql(query, None, 1) if res: out = res[0][0] return out def get_index_tags(indexname): """Returns the list of tags that are indexed inside INDEXNAME. Returns empty list in case there are no tags indexed in this index. Note: uses get_field_tags() defined before. 
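## A simplified, self-contained sketch of the tokenisation cascade performed
## by get_words_from_phrase() above (whitespace blocks -> punctuation-trimmed
## blocks -> punctuation subblocks -> alphanumeric groups).  Stemming,
## stopwords and the configurable character classes are deliberately omitted.
def _tokenize_sketch(phrase):
    import re
    words = {}
    for block in phrase.lower().split():
        block = re.sub(r"^[^\w]+|[^\w]+$", "", block)           # trim punctuation
        if block:
            words[block] = 1
            for subblock in re.split(r"[^\w/-]+", block):       # punctuation split
                if subblock:
                    words[subblock] = 1
                    for group in re.split(r"[/-]+", subblock):  # separator split
                        if group:
                            words[group] = 1
    return words.keys()
## e.g. _tokenize_sketch("Ellis, J. hep-th/0101001") yields 'ellis', 'j',
## 'hep-th/0101001', 'hep', 'th' and '0101001' (in arbitrary order).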
Example: field='author', output=['100__%', '700__%'].""" out = [] query = """SELECT f.code FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE w.name='%s' AND w.id=wf.id_idxINDEX AND f.id=wf.id_field""" % indexname res = run_sql(query) for row in res: out.extend(get_field_tags(row[0])) return out def get_all_indexes(): """Returns the list of the names of all defined words indexes. Returns empty list in case there are no tags indexed in this index. Example: output=['global', 'author'].""" out = [] query = """SELECT name FROM idxINDEX""" res = run_sql(query) for row in res: out.append(row[0]) return out def usage(code, msg=''): "Prints usage for this module." if msg: sys.stderr.write("Error: %s.\n" % msg) print >> sys.stderr, \ """ Usage: %s [options] Examples: %s -a -i 234-250,293,300-500 -u admin@localhost %s -a -w author,fulltext -M 8192 -v3 %s -d -m +4d -A on --flush=10000 Indexing options: -a, --add add or update words for selected records -d, --del delete words for selected records -i, --id=low[-high] select according to doc recID -m, --modified=from[,to] select according to modification date -c, --collection=c1[,c2] select according to collection Repairing options: -k, --check check consistency for all records in the table(s) -r, --repair try to repair all records in the table(s) Specific options: -w, --windex=w1[,w2] word/phrase indexes to consider (all) -M, --maxmem=XXX maximum memory usage in kB (no limit) -f, --flush=NNN full consistent table flush after NNN records (10000) Scheduling options: -u, --user=USER user name to store task, password needed -s, --sleeptime=SLEEP time after which to repeat tasks (no) e.g.: 1s, 30m, 24h, 7d -t, --time=TIME moment for the task to be active (now) e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26 General options: -h, --help print this help and exit -V, --version print version and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) """ % ((sys.argv[0],) * 4) sys.exit(code) def authenticate(user, header="BibIndex Task Submission", action="runbibindex"): """Authenticate the user against the user database. Check for its password, if it exists. Check for action access rights. Return user name upon authorization success, do system exit upon authorization failure. """ print header print "=" * len(header) if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user ## first check user pw: - res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) + res = run_sql("select id,password from user where email=%s", (user,), 1) + \ + run_sql("select id,password from user where nickname=%s", (user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." 
% user sys.exit(1) ## secondly check authorization for the action: (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def split_ranges(parse_string): recIDs = [] ranges = string.split(parse_string, ",") for arange in ranges: tmp_recIDs = string.split(arange, "-") if len(tmp_recIDs)==1: recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[0])]) else: if int(tmp_recIDs[0]) > int(tmp_recIDs[1]): # sanity check tmp = tmp_recIDs[0] tmp_recIDs[0] = tmp_recIDs[1] tmp_recIDs[1] = tmp recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[1])]) return recIDs def get_word_tables(tables): global wordTables if tables: indexes = string.split(tables, ",") for index in indexes: index_id = get_index_id(index) if index_id: wordTables.append({"idxWORD%02dF" % index_id: \ get_index_tags(index)}) else: write_message("Error: There is no %s words table." % index, sys.stderr) else: for index in get_all_indexes(): index_id = get_index_id(index) wordTables.append({"idxWORD%02dF" % index_id: \ get_index_tags(index)}) return wordTables def get_date_range(var): "Returns the two dates contained as a low,high tuple" limits = string.split(var, ",") if len(limits)==1: low = get_datetime(limits[0]) return low,None if len(limits)==2: low = get_datetime(limits[0]) high = get_datetime(limits[1]) return low,high return None,None def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = sre_datetime_shift.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def create_range_list(res): """Creates a range list from a recID select query result contained in res. The result is expected to have ascending numerical order.""" if not res: return [] row = res[0] if not row: return [] else: range_list = [[row[0],row[0]]] for row in res[1:]: row_id = row[0] if row_id == range_list[-1][1] + 1: range_list[-1][1] = row_id else: range_list.append([row_id,row_id]) return range_list def beautify_range_list(range_list): """Returns a non overlapping, maximal range list""" ret_list = [] for new in range_list: found = 0 for old in ret_list: if new[0] <= old[0] <= new[1] + 1 or new[0] - 1 <= old[1] <= new[1]: old[0] = min(old[0], new[0]) old[1] = max(old[1], new[1]) found = 1 break if not found: ret_list.append(new) return ret_list def serialize_via_numeric_array(arr): """Serialize Numeric array into a compressed string.""" return compress(Numeric.dumps(arr)) def deserialize_via_numeric_array(string): """Decompress and deserialize string into a Numeric array.""" return Numeric.loads(decompress(string)) def serialize_via_marshal(obj): """Serialize Python object via marshal into a compressed string.""" return escape_string(compress(marshal.dumps(obj))) def deserialize_via_marshal(string): """Decompress and deserialize string into a Python object via marshal.""" return marshal.loads(decompress(string)) class WordTable: "A class to hold the words table." def __init__(self, tablename, fields_to_index, separators="[^\s]"): "Creates words table instance." 
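## Quick sanity sketch (illustrative values) of the recID range helpers
## defined above: -i style range strings, DB result rows, and overlapping
## range lists.
def _range_helpers_sketch():
    assert split_ranges("234-250,293,300-500") == [[234, 250], [293, 293],
                                                   [300, 500]]
    assert create_range_list([(1,), (2,), (3,), (7,)]) == [[1, 3], [7, 7]]
    assert beautify_range_list([[1, 5], [4, 10], [12, 12]]) == [[1, 10],
                                                                [12, 12]]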
self.tablename = tablename self.recIDs_in_mem = [] self.fields_to_index = fields_to_index self.separators = separators self.value = {} def get_field(self, recID, tag): """Returns list of values of the MARC-21 'tag' fields for the record 'recID'.""" out = [] bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID, tag); res = run_sql(query) for row in res: out.append(row[0]) return out def clean(self): "Cleans the words table." self.value={} def put_into_db(self, mode="normal"): """Updates the current words table in the corresponding DB idxFOO table. Mode 'normal' means normal execution, mode 'emergency' means words index reverting to old state. """ if options["verbose"]: write_message("%s %s wordtable flush started" % (self.tablename,mode)) write_message('...updating %d words into %s started' % \ (len(self.value), self.tablename)) task_update_progress("%s flushed %d/%d words" % (self.tablename, 0, len(self.value))) self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem) if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='TEMPORARY' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='CURRENT'""" % \ (self.tablename[:-1], group[0], group[1]) if options["verbose"] >= 9: write_message(query) run_sql(query) nb_words_total = len(self.value) nb_words_report = int(nb_words_total/10.0) nb_words_done = 0 for word in self.value.keys(): self.put_word_into_db(word) nb_words_done += 1 if nb_words_report!=0 and ((nb_words_done % nb_words_report) == 0): if options["verbose"]: write_message('......processed %d/%d words' % (nb_words_done, nb_words_total)) task_update_progress("%s flushed %d/%d words" % (self.tablename, nb_words_done, nb_words_total)) if options["verbose"] >= 9: write_message('...updating %d words into %s ended' % \ (nb_words_total, self.tablename)) if options["verbose"]: write_message('...updating reverse table %sR started' % self.tablename[:-1]) if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \ (self.tablename[:-1], group[0], group[1]) if options["verbose"] >= 9: write_message(query) run_sql(query) query = """DELETE FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \ (self.tablename[:-1], group[0], group[1]) if options["verbose"] >= 9: write_message(query) run_sql(query) if options["verbose"] >= 9: write_message('End of updating wordTable into %s' % self.tablename) elif mode == "emergency": for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \ (self.tablename[:-1], group[0], group[1]) if options["verbose"] >= 9: write_message(query) run_sql(query) query = """DELETE FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \ (self.tablename[:-1], group[0], group[1]) if options["verbose"] >= 9: write_message(query) run_sql(query) if options["verbose"] >= 9: write_message('End of emergency flushing wordTable into %s' % self.tablename) if options["verbose"]: write_message('...updating reverse table %sR ended' % self.tablename[:-1]) self.clean() self.recIDs_in_mem = [] if options["verbose"]: write_message("%s %s wordtable flush ended" % (self.tablename, mode)) task_update_progress("%s flush ended" % (self.tablename)) def load_old_recIDs(self,word): """Load existing hitlist for 
the word from the database index files.""" query = "SELECT hitlist FROM %s WHERE term=%%s" % self.tablename res = run_sql(query, (word,)) if res: return deserialize_via_numeric_array(res[0][0]) else: return None def merge_with_old_recIDs(self,word,set): """Merge the system numbers stored in memory (hash of recIDs with value +1 or -1 according to whether to add/delete them) with those stored in the database index and received in set universe of recIDs for the given word. Return 0 in case no change was done to SET, return 1 in case SET was changed. """ set_changed_p = 0 for recID,sign in self.value[word].items(): if sign == -1 and set[recID]==1: # delete recID if existent in set and if marked as to be deleted set[recID] = 0 set_changed_p = 1 elif set[recID] == 0: # add recID if not existent in set and if marked as to be added set[recID] = 1 set_changed_p = 1 return set_changed_p def put_word_into_db(self, word): """Flush a single word to the database and delete it from memory""" set = self.load_old_recIDs(word) if set: # merge the word recIDs found in memory: if self.merge_with_old_recIDs(word,set) == 0: # nothing to update: if options["verbose"] >= 9: write_message("......... unchanged hitlist for ``%s''" % word) pass else: # yes there were some new words: if options["verbose"] >= 9: write_message("......... updating hitlist for ``%s''" % word) run_sql("UPDATE %s SET hitlist=%%s WHERE term=%%s" % self.tablename, (serialize_via_numeric_array(set), word)) else: # the word is new, will create new set: set = Numeric.zeros(CFG_MAX_RECID+1, Numeric.Int0) Numeric.put(set, self.value[word].keys(), 1) if options["verbose"] >= 9: write_message("......... inserting hitlist for ``%s''" % word) run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, %%s)" % self.tablename, (word, serialize_via_numeric_array(set))) if not set: # never store empty words run_sql("DELETE from %s WHERE term=%%s" % self.tablename, (word,)) del self.value[word] def display(self): "Displays the word table." keys = self.value.keys() keys.sort() for k in keys: if options["verbose"]: write_message("%s: %s" % (k, self.value[k])) def count(self): "Returns the number of words in the table." return len(self.value) def info(self): "Prints some information on the words table." if options["verbose"]: write_message("The words table contains %d words." % self.count()) def lookup_words(self, word=""): "Lookup word from the words table." if not word: done = 0 while not done: try: word = raw_input("Enter word: ") done = 1 except (EOFError, KeyboardInterrupt): return if self.value.has_key(word): if options["verbose"]: write_message("The word '%s' is found %d times." \ % (word, len(self.value[word]))) else: if options["verbose"]: write_message("The word '%s' does not exist in the word file."\ % word) def update_last_updated(self, starting_time=None): """Update last_updated column of the index table in the database. Puts starting time there so that if the task was interrupted for record download, the records will be reindexed next time.""" if starting_time is None: return None if options["verbose"] >= 9: write_message("updating last_updated to %s...", starting_time) return run_sql("UPDATE idxINDEX SET last_updated=%s WHERE id=%s", (starting_time, self.tablename[-3:-1],)) def add_recIDs(self, recIDs): """Fetches records which id in the recIDs range list and adds them to the wordTable. The recIDs range list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. 
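## Minimal sketch of the per-word hitlist representation flushed above:
## a Numeric bit-vector indexed by recID, zlib-compressed before storage in
## the forward table (e.g. idxWORD01F).  merge_with_old_recIDs() applies the
## in-memory {recID: +1/-1} signs to such a vector.  The recIDs used here
## are purely illustrative.
def _hitlist_sketch():
    hits = Numeric.zeros(CFG_MAX_RECID + 1, Numeric.Int0)
    Numeric.put(hits, [12, 57, 4001], 1)         # word occurs in these records
    blob = serialize_via_numeric_array(hits)     # value stored for the word
    restored = deserialize_via_numeric_array(blob)
    return restored[12] == 1 and restored[13] == 0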
""" global chunksize flush_count = 0 records_done = 0 records_to_go = 0 for arange in recIDs: records_to_go = records_to_go + arange[1] - arange[0] + 1 time_started = time.time() # will measure profile time for arange in recIDs: i_low = arange[0] chunksize_count = 0 while i_low <= arange[1]: # calculate chunk group of recIDs and treat it: i_high = min(i_low+options["flush"]-flush_count-1,arange[1]) i_high = min(i_low+chunksize-chunksize_count-1, i_high) try: self.chk_recID_range(i_low, i_high) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) if options["verbose"] >= 9: traceback.print_tb(sys.exc_info()[2]) task_update_status("ERROR") task_sig_stop_commands() sys.exit(1) if options["verbose"]: write_message("%s adding records #%d-#%d started" % \ (self.tablename, i_low, i_high)) if CFG_CHECK_MYSQL_THREADS: kill_sleepy_mysql_threads() task_update_progress("%s adding recs %d-%d" % (self.tablename, i_low, i_high)) self.del_recID_range(i_low, i_high) just_processed = self.add_recID_range(i_low, i_high) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + just_processed if options["verbose"]: write_message("%s adding records #%d-#%d ended " % \ (self.tablename, i_low, i_high)) if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= options["flush"]: self.put_into_db() self.clean() if options["verbose"]: write_message("%s backing up" % (self.tablename)) flush_count = 0 self.log_progress(time_started,records_done,records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db() self.log_progress(time_started,records_done,records_to_go) def add_recIDs_by_date(self, dates): """Add records that were modified between DATES[0] and DATES[1]. If DATES is not set, then add records that were modified since the last update of the index. """ if not dates: table_id = self.tablename[-3:-1] query = """SELECT last_updated FROM idxINDEX WHERE id='%s' """ % table_id res = run_sql(query) if not res: return if not res[0][0]: dates = ("0000-00-00", None) else: dates = (res[0][0], None) if dates[1] is None: res = run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s ORDER BY b.id ASC""", (dates[0],)) elif dates[0] is None: res = run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date <= %s ORDER BY b.id ASC""", (dates[1],)) else: res = run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s AND b.modification_date <= %s ORDER BY b.id ASC""", (dates[0], dates[1])) alist = create_range_list(res) if not alist: if options["verbose"]: write_message( "No new records added. %s is up to date" % self.tablename) else: self.add_recIDs(alist) def add_recID_range(self, recID1, recID2): empty_list_string = serialize_via_marshal([]) wlist = {} self.recIDs_in_mem.append([recID1,recID2]) # secondly fetch all needed tags: for tag in self.fields_to_index: if tag in tagToWordsFunctions.keys(): get_words_function = tagToWordsFunctions[tag] else: get_words_function = get_words_from_phrase bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT bb.id_bibrec,b.value FROM %s AS b, %s AS bb WHERE bb.id_bibrec BETWEEN %d AND %d AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID1, recID2, tag) res = run_sql(query) for row in res: recID,phrase = row if not wlist.has_key(recID): wlist[recID] = [] if tag == "8564_u": # Special treatment for fulltext indexing. 
8564 # $$u contains URL, and $$y link name. If $$y is # actually a file name, that is if it ends with # something like .pdf or .ppt, then $$u is treated # as direct URL to the PDF file, and is indexed as # such. This is useful to index Indico files. # FIXME: this is a quick fix only. We should # rather download all 856 $$u files and analyze # content in order to decide how to index them # (directly for Indico, indirectly for Setlink). filename = get_associated_subfield_value(recID,'8564_u', phrase, 'y') filename_extension = string.lower(string.split(filename, ".")[-1]) if filename_extension in CONV_PROGRAMS.keys(): new_words = get_words_function(phrase, force_file_extension=filename_extension) # ,self.separators else: new_words = get_words_function(phrase) # ,self.separators else: new_words = get_words_function(phrase) # ,self.separators wlist[recID] = list_union(new_words,wlist[recID]) # were there some words for these recIDs found? if len(wlist) == 0: return 0 recIDs = wlist.keys() for recID in recIDs: # was this record marked as deleted? if "DELETED" in self.get_field(recID, "980__c"): wlist[recID] = [] if options["verbose"] >= 9: write_message("... record %d was declared deleted, removing its word list" % recID) if options["verbose"] >= 9: write_message("... record %d, termlist: %s" % (recID, wlist[recID])) # Using cStringIO for speed. query_factory = cStringIO.StringIO() qwrite = query_factory.write qwrite( "INSERT INTO %sR (id_bibrec,termlist,type) VALUES" % self.tablename[:-1]) qwrite( "('" ) qwrite( str(recIDs[0]) ) qwrite( "','" ) qwrite( serialize_via_marshal(wlist[recIDs[0]]) ) qwrite( "','FUTURE')" ) for recID in recIDs[1:]: qwrite(",('") qwrite(str(recID)) qwrite("','") qwrite(serialize_via_marshal(wlist[recID])) qwrite("','FUTURE')") query = query_factory.getvalue() query_factory.close() run_sql(query) query_factory = cStringIO.StringIO() qwrite = query_factory.write qwrite("INSERT INTO %sR (id_bibrec,termlist,type) VALUES" % self.tablename[:-1]) qwrite("('") qwrite(str(recIDs[0])) qwrite("','") qwrite(serialize_via_marshal(wlist[recIDs[0]])) qwrite("','CURRENT')") for recID in recIDs[1:]: qwrite( ",('" ) qwrite( str(recID) ) qwrite( "','" ) qwrite( empty_list_string ) qwrite( "','CURRENT')" ) query = query_factory.getvalue() query_factory.close() try: run_sql(query) except DatabaseError: # ok, we tried to add an existent record. No problem pass put = self.put for recID in recIDs: for w in wlist[recID]: put(recID, w, 1) return len(recIDs) def log_progress(self, start, done, todo): """Calculate progress and store it. start: start time, done: records processed, todo: total number of records""" time_elapsed = time.time() - start # consistency check if time_elapsed == 0 or done > todo: return time_recs_per_min = done/(time_elapsed/60.0) if options["verbose"]: write_message("%d records took %.1f seconds to complete.(%1.f recs/min)"\ % (done, time_elapsed, time_recs_per_min)) if time_recs_per_min: if options["verbose"]: write_message("Estimated runtime: %.1f minutes" % \ ((todo-done)/time_recs_per_min)) def put(self, recID, word, sign): "Adds/deletes a word to the word list." try: word = string.lower(word[:50]) if self.value.has_key(word): # the word 'word' exist already: update sign self.value[word][recID] = sign else: self.value[word] = {recID: sign} except: write_message("Error: Cannot put word %s with sign %d for recID %s." % (word, sign, recID)) def del_recIDs(self, recIDs): """Fetches records which id in the recIDs range list and adds them to the wordTable. 
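## Illustrative sketch (values invented) of the two row sets that
## add_recID_range() above writes into the reverse table (e.g. idxWORD01R):
## full termlists under type 'FUTURE', plus placeholder 'CURRENT' rows in
## which only the first recID of the chunk carries its termlist.
def _reverse_rows_sketch():
    future_rows = [(123, serialize_via_marshal(["ellis", "higgs"]), 'FUTURE'),
                   (124, serialize_via_marshal(["muon"]), 'FUTURE')]
    current_rows = [(123, serialize_via_marshal(["ellis", "higgs"]), 'CURRENT'),
                    (124, serialize_via_marshal([]), 'CURRENT')]
    return future_rows, current_rows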
The recIDs range list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. """ count = 0 for arange in recIDs: self.del_recID_range(arange[0],arange[1]) count = count + arange[1] - arange[0] self.put_into_db() def del_recID_range(self, low, high): """Deletes records with 'recID' system number between low and high from memory words index table.""" if options["verbose"] > 2: write_message("%s fetching existing words for records #%d-#%d started" % \ (self.tablename, low, high)) self.recIDs_in_mem.append([low,high]) query = """SELECT id_bibrec,termlist FROM %sR as bb WHERE bb.id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high) recID_rows = run_sql(query) for recID_row in recID_rows: recID = recID_row[0] wlist = deserialize_via_marshal(recID_row[1]) for word in wlist: self.put(recID, word, -1) if options["verbose"] > 2: write_message("%s fetching existing words for records #%d-#%d ended" % \ (self.tablename, low, high)) def report_on_table_consistency(self): """Check reverse words index tables (e.g. idxWORD01R) for interesting states such as 'TEMPORARY' state. Prints small report (no of words, no of bad words). """ # find number of words: query = """SELECT COUNT(*) FROM %s""" % (self.tablename) res = run_sql(query, None, 1) if res: nb_words = res[0][0] else: nb_words = 0 # find number of records: query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR""" % (self.tablename[:-1]) res = run_sql(query, None, 1) if res: nb_records = res[0][0] else: nb_records = 0 # report stats: if options["verbose"]: write_message("%s contains %d words from %d records" % (self.tablename, nb_words, nb_records)) # find possible bad states in reverse tables: query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR WHERE type <> 'CURRENT'""" % (self.tablename[:-1]) res = run_sql(query) if res: nb_bad_records = res[0][0] else: nb_bad_records = 999999999 if nb_bad_records: write_message("EMERGENCY: %s needs to repair %d of %d records" % \ (self.tablename, nb_bad_records, nb_records)) else: if options["verbose"]: write_message("%s is in consistent state" % (self.tablename)) return nb_bad_records def repair(self): """Repair the whole table""" # find possible bad states in reverse tables: query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sR WHERE type <> 'CURRENT'""" % (self.tablename[:-1]) res = run_sql(query, None, 1) if res: nb_bad_records = res[0][0] else: nb_bad_records = 0 if nb_bad_records == 0: return query = """SELECT id_bibrec FROM %sR WHERE type <> 'CURRENT' ORDER BY id_bibrec""" \ % (self.tablename[:-1]) res = run_sql(query) recIDs = create_range_list(res) flush_count = 0 records_done = 0 records_to_go = 0 for arange in recIDs: records_to_go = records_to_go + arange[1] - arange[0] + 1 time_started = time.time() # will measure profile time for arange in recIDs: i_low = arange[0] chunksize_count = 0 while i_low <= arange[1]: # calculate chunk group of recIDs and treat it: i_high = min(i_low+options["flush"]-flush_count-1,arange[1]) i_high = min(i_low+chunksize-chunksize_count-1, i_high) try: self.fix_recID_range(i_low, i_high) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) if options["verbose"] >= 9: traceback.print_tb(sys.exc_info()[2]) task_update_status("ERROR") task_sig_stop_commands() sys.exit(1) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + i_high - i_low + 1 if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= 
options["flush"]: self.put_into_db("emergency") self.clean() flush_count = 0 self.log_progress(time_started,records_done,records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db("emergency") self.log_progress(time_started,records_done,records_to_go) write_message("%s inconsistencies repaired." % self.tablename) def chk_recID_range(self, low, high): """Check if the reverse index table is in proper state""" ## check db query = """SELECT COUNT(*) FROM %sR WHERE type <> 'CURRENT' AND id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high) res = run_sql(query, None, 1) if res[0][0]==0: if options["verbose"]: write_message("%s for %d-%d is in consistent state"%(self.tablename,low,high)) return # okay, words table is consistent ## inconsistency detected! write_message("EMERGENCY: %s inconsistencies detected..." % self.tablename) write_message("""EMERGENCY: Errors found. You should check consistency of the %s - %sR tables.\nRunning 'bibindex --repair' is recommended.""" \ % (self.tablename, self.tablename[:-1])) raise StandardError def fix_recID_range(self, low, high): """Try to fix reverse index database consistency (e.g. table idxWORD01R) in the low,high doc-id range. Possible states for a recID follow: CUR TMP FUT: very bad things have happened: warn! CUR TMP : very bad things have happened: warn! CUR FUT: delete FUT (crash before flushing) CUR : database is ok TMP FUT: add TMP to memory and del FUT from memory flush (revert to old state) TMP : very bad things have happened: warn! FUT: very bad things have happended: warn! """ state = {} query = "SELECT id_bibrec,type FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d'"\ % (self.tablename[:-1], low, high) res = run_sql(query) for row in res: if not state.has_key(row[0]): state[row[0]]=[] state[row[0]].append(row[1]) ok = 1 # will hold info on whether we will be able to repair for recID in state.keys(): if not 'TEMPORARY' in state[recID]: if 'FUTURE' in state[recID]: if 'CURRENT' not in state[recID]: write_message("EMERGENCY: Record %d is in inconsistent state. Can't repair it" % recID) ok = 0 else: write_message("EMERGENCY: Inconsistency in record %d detected" % recID) query = """DELETE FROM %sR WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID) run_sql(query) write_message("EMERGENCY: Inconsistency in record %d repaired." % recID) else: if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]: self.recIDs_in_mem.append([recID,recID]) # Get the words file query = """SELECT type,termlist FROM %sR WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID) if options["verbose"] >= 9: write_message(query) res = run_sql(query) for row in res: wlist = deserialize_via_marshal(row[1]) if options["verbose"] >= 9: write_message("Words are %s " % wlist) if row[0] == 'TEMPORARY': sign = 1 else: sign = -1 for word in wlist: self.put(recID, word, sign) else: write_message("EMERGENCY: %s for %d is in inconsistent state. Couldn't repair it." % (self.tablename, recID)) ok = 0 if not ok: write_message("""EMERGENCY: Unrepairable errors found. You should check consistency of the %s - %sR tables. Deleting affected records is recommended.""" % (self.tablename, self.tablename[:-1])) raise StandardError def task_run(row): """Run the indexing task. The row argument is the BibSched task queue row, containing if, arguments, etc. Return 1 in case of success and 0 in case of failure. 
""" global options, wordTables, stemmer, stopwords # read from SQL row: task_id = row[0] task_proc = row[1] options = marshal.loads(row[6]) task_status = row[7] # sanity check: if task_proc != "bibindex": write_message("The task #%d does not seem to be a BibIndex task." % task_id, sys.stderr) return 0 if task_status != "WAITING": write_message("The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr) return 0 # we can run the task now: if options["verbose"]: write_message("Task #%d started." % task_id) task_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) task_update_status("RUNNING") # install signal handlers signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ## go ahead and treat each table : for table in options["windex"]: wordTable = WordTable(table.keys()[0], table.values()[0]) wordTable.report_on_table_consistency() try: if options["cmd"] == "del": if options["id"]: wordTable.del_recIDs(options["id"]) elif options["collection"]: l_of_colls = string.split(options["collection"], ",") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID,recID]) wordTable.del_recIDs(recIDs_range) else: write_message("Missing IDs of records to delete from index %s." % wordTable.tablename, sys.stderr) raise StandardError elif options["cmd"] == "add": if options["id"]: wordTable.add_recIDs(options["id"]) elif options["collection"]: l_of_colls = string.split(options["collection"], ",") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID,recID]) wordTable.add_recIDs(recIDs_range) else: wordTable.add_recIDs_by_date(options["modified"]) # only update last_updated if run via automatic mode: wordTable.update_last_updated(task_starting_time) elif options["cmd"] == "repair": wordTable.repair() else: write_message("Invalid command found processing %s" % \ wordTable.tablename, sys.stderr) raise StandardError except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) if options["verbose"] >= 9: traceback.print_tb(sys.exc_info()[2]) task_update_status("ERROR") task_sig_stop_commands() sys.exit(1) wordTable.report_on_table_consistency() # We are done. State it in the database, close and quit task_update_status("DONE") if options["verbose"]: write_message("Task #%d finished." 
% task_id) return 1 def command_line(): global options long_flags =["add","del","id=","modified=","collection=", "windex=", "check","repair","maxmem=", "flush=","user=","sleeptime=", "time=","help", "version", "verbose="] short_flags ="adi:m:c:w:krM:f:u:s:t:hVv:" format_string = "%Y-%m-%d %H:%M:%S" tables = None sleeptime = "" try: opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags) except getopt.GetoptError, err: write_message(err, sys.stderr) usage(1) if args: usage(1) options={"cmd":"add", "id":[], "modified":[], "collection":[], "maxmem":0, "flush":10000, "sleeptime":0, "verbose":1 } sched_time = time.strftime(format_string) user = "" # Check for key options try: for opt in opts: if opt == ("-h","") or opt == ("--help",""): usage(1) elif opt == ("-V","") or opt == ("--version",""): print BIBINDEX_ENGINE_VERSION sys.exit(1) elif opt[0] in ["--verbose", "-v"]: options["verbose"] = int(opt[1]) elif opt == ("-a","") or opt == ("--add",""): options["cmd"] = "add" if ("-x","") in opts or ("--del","") in opts: usage(1) elif opt == ("-k","") or opt == ("--check",""): options["cmd"] = "check" elif opt == ("-r","") or opt == ("--repair",""): options["cmd"] = "repair" elif opt == ("-d","") or opt == ("--del",""): options["cmd"]="del" elif opt[0] in [ "-i", "--id" ]: options["id"] = options["id"] + split_ranges(opt[1]) elif opt[0] in [ "-m", "--modified" ]: options["modified"] = get_date_range(opt[1]) elif opt[0] in [ "-c", "--collection" ]: options["collection"] = opt[1] elif opt[0] in [ "-w", "--windex" ]: tables = opt[1] elif opt[0] in [ "-M", "--maxmem"]: options["maxmem"]=int(opt[1]) if options["maxmem"] < base_process_size + 1000: raise StandardError, "Memory usage should be higher than %d kB" % (base_process_size + 1000) elif opt[0] in [ "-f", "--flush"]: options["flush"]=int(opt[1]) elif opt[0] in [ "-u", "--user"]: user = opt[1] elif opt[0] in [ "-s", "--sleeptime" ]: get_datetime(opt[1]) # see if it is a valid shift sleeptime= opt[1] elif opt[0] in [ "-t", "--time" ]: sched_time= get_datetime(opt[1]) else: usage(1) except StandardError, e: write_message(e, sys.stderr) sys.exit(1) options["windex"]=get_word_tables(tables) if options["cmd"] == "check": for table in options["windex"]: wordTable = WordTable(table.keys()[0], table.values()[0]) wordTable.report_on_table_consistency() return user = authenticate(user) if options["verbose"] >= 9: print "" write_message("storing task options %s\n" % options) ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] new_task_id = run_sql("""INSERT INTO schTASK (proc,user,runtime,sleeptime,arguments,status) VALUES ('bibindex',%s,%s,%s,%s,'WAITING')""", (user, sched_time, sleeptime, marshal.dumps(options))) ## update task number: options["task"] = new_task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), new_task_id)) print "Task #%d was successfully scheduled for execution." 
% new_task_id return def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame)) write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame)) write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame)) write_message("stopping...") task_update_status("STOPPING") errcode = 0 try: task_sig_stop_commands() write_message("stopped") task_update_status("STOPPED") except StandardError, err: write_message("Error during stopping! %e" % err) task_update_status("STOPPINGFAILED") errcode = 1 sys.exit(errcode) def task_sig_stop_commands(): """Do all the commands necessary to stop the task before quitting. Useful for task_sig_stop() handler. """ write_message("stopping commands started") for table in wordTables: table.put_into_db() write_message("stopping commands ended") def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame)) write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" # do nothing for unknown signals: write_message("unknown signal %d (frame %s) ignored" % (sig, frame)) def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global options if options["verbose"] >= 9: write_message("Updating task progress to %s." % msg) return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, options["task"])) def task_update_status(val): """Updates state information in the BibSched task table.""" global options if options["verbose"] >= 9: write_message("Updating task status to %s." % val) return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, options["task"])) def test_fulltext_indexing(): """Tests fulltext indexing programs on PDF, PS, DOC, PPT, XLS. Prints list of words and word table on the screen. Does not integrate anything into the database. Useful when debugging problems with fulltext indexing: call this function instead of main(). 
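## Sketch of how BibSched steers a running task through the signal handlers
## above; the signal-to-handler mapping is the one registered in task_run(),
## while the target PID is hypothetical.
def _send_bibsched_signal(task_pid, action):
    sigmap = {"sleep": signal.SIGUSR1,    # task_sig_sleep()   -> SLEEPING
              "wakeup": signal.SIGCONT,   # task_sig_wakeup()  -> CONTINUING
              "stop": signal.SIGTERM,     # task_sig_stop()    -> flush + STOPPED
              "suicide": signal.SIGABRT}  # task_sig_suicide() -> SUICIDED
    os.kill(task_pid, sigmap[action])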
""" global options options["verbose"] = 9 print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=atlnot&categ=Communication&id=com-indet-2002-012") # protected URL print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a00388&id=a00388s2t7") # XLS print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a02883&id=a02883s1t6/transparencies") # PPT print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a99149&id=a99149s1t10/transparencies") # DOC print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=preprint&categ=cern&id=lhc-project-report-601") # PDF sys.exit(0) def test_word_separators(phrase="hep-th/0101001"): """Tests word separating policy on various input.""" print "%s:" % phrase for word in get_words_from_phrase(phrase): print "\t-> %s" % word def main(): """Reads arguments and either runs the task, or starts user-interface (command line).""" if len(sys.argv) == 2: try: task_id = int(sys.argv[1]) except StandardError: command_line() sys.exit() res = run_sql("SELECT * FROM schTASK WHERE id='%d'" % (task_id), None, 1) if not res: write_message("Selected task not found.", sys.stderr) sys.exit(1) try: if not task_run(res[0]): write_message("Error occurred. Exiting.", sys.stderr) except StandardError, e: write_message("Unexpected error occurred: %s." % e, sys.stderr) if options["verbose"] >= 9: traceback.print_tb(sys.exc_info()[2]) write_message("Exiting.") task_update_status("ERROR") else: command_line() diff --git a/modules/bibrank/bin/bibrank.in b/modules/bibrank/bin/bibrank.in index 844b4dae7..60b37bd44 100644 --- a/modules/bibrank/bin/bibrank.in +++ b/modules/bibrank/bin/bibrank.in @@ -1,479 +1,480 @@ #!@PYTHON@ ## -*- mode: python; coding: utf-8; -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibRank ranking daemon. 
Usage: %s [options] Ranking examples: %s -wjif -a --id=0-30000,30001-860000 --verbose=9 %s -wjif -d --modified='2002-10-27 13:57:26' %s -wwrd --rebalance --collection=Articles %s -wwrd -a -i 234-250,293,300-500 -u admin Ranking options: -w, --run=r1[,r2] runs each rank method in the order given -c, --collection=c1[,c2] select according to collection -i, --id=low[-high] select according to doc recID -m, --modified=from[,to] select according to modification date -l, --lastupdate select according to last update -a, --add add or update words for selected records -d, --del delete words for selected records -S, --stat show statistics for a method -R, --recalculate recalculate weigth data, used by word frequency method should be used if ca 1% of the document has been changed since last time -R was used Repairing options: -k, --check check consistency for all records in the table(s) check if update of ranking data is necessary -r, --repair try to repair all records in the table(s) Scheduling options: -u, --user=USER user name to store task, password needed -s, --sleeptime=SLEEP time after which to repeat tasks (no) e.g.: 1s, 30m, 24h, 7d -t, --time=TIME moment for the task to be active (now) e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26 General options: -h, --help print this help and exit -V, --version print version and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) """ __revision__ = "$Id$" try: import marshal from zlib import compress,decompress from string import split,translate,lower,upper import getopt import getpass import string import os import sre import sys import time import urllib import signal import tempfile import traceback import cStringIO import re import copy import types import ConfigParser except ImportError, e: import sys try: from invenio.dbquery import run_sql, escape_string from invenio.bibrank_tag_based_indexer import * from invenio.bibrank_word_indexer import * from invenio.access_control_engine import acc_authorize_action from invenio.search_engine import perform_request_search except ImportError, e: import sys nb_char_in_line = 50 # for verbose pretty printing chunksize = 1000 # default size of chunks that the records will be treated by base_process_size = 4500 # process base size bibrank_options = {} # will hold task options def serialize_via_numeric_array_dumps(arr): return Numeric.dumps(arr) def serialize_via_numeric_array_compr(str): return compress(str) def serialize_via_numeric_array_escape(str): return escape_string(str) def serialize_via_numeric_array(arr): """Serialize Numeric array into a compressed string.""" return serialize_via_numeric_array_escape(serialize_via_numeric_array_compr(serialize_via_numeric_array_dumps(arr))) def deserialize_via_numeric_array(string): """Decompress and deserialize string into a Numeric array.""" return Numeric.loads(decompress(string)) def serialize_via_marshal(obj): """Serialize Python object via marshal into a compressed string.""" return escape_string(compress(marshal.dumps(obj))) def deserialize_via_marshal(string): """Decompress and deserialize string into a Python object via marshal.""" return marshal.loads(decompress(string)) def authenticate(user, header="BibRank Task Submission", action="runbibrank"): print header print "=" * len(header) if user == "": print>> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print>> sys.stdout, "\rUsername:", user - res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) + res = run_sql("select 
id,password from user where email=%s", (user,), 1) + \ + run_sql("select id,password from user where nickname=%s", (user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." % user sys.exit(1) (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def usage(code, msg=''): "Prints usage for this module." if msg: sys.stderr.write("Error: %s.\n" % msg) print >> sys.stderr, \ """ Usage: %s [options] Ranking examples: %s -wjif -a --id=0-30000,30001-860000 --verbose=9 %s -wjif -d --modified='2002-10-27 13:57:26' %s -wjif --rebalance --collection=Articles %s -wsbr -a -i 234-250,293,300-500 -u admin Ranking options: -w, --run=r1[,r2] runs each rank method in the order given -c, --collection=c1[,c2] select according to collection -i, --id=low[-high] select according to doc recID -m, --modified=from[,to] select according to modification date -l, --lastupdate select according to last update -a, --add add or update words for selected records -d, --del delete words for selected records -S, --stat show statistics for a method -R, --recalculate recalculate weigth data, used by word frequency method should be used if ca 1%% of the document has been changed since last time -R was used Repairing options: -k, --check check consistency for all records in the table(s) check if update of ranking data is necessary -r, --repair try to repair all records in the table(s) Scheduling options: -u, --user=USER user name to store task, password needed -s, --sleeptime=SLEEP time after which to repeat tasks (no) e.g.: 1s, 30m, 24h, 7d -t, --time=TIME moment for the task to be active (now) e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26 General options: -h, --help print this help and exit -V, --version print version and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) """ % ((sys.argv[0],) * 5) sys.exit(code) def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re = sre.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" if bibrank_options["verbose"] >= 9: write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame)) write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" if bibrank_options["verbose"] >= 9: write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame)) write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop_commands(): """Do all the commands necessary to stop the task before quitting. Useful for task_sig_stop() handler. 
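## Sketch of the split user lookup used by authenticate() above: two
## parametrised selects whose result tuples are simply concatenated, so a
## match on either email or nickname yields a non-empty result.
def _lookup_user_sketch(user):
    rows = (run_sql("select id,password from user where email=%s",
                    (user,), 1)
            + run_sql("select id,password from user where nickname=%s",
                      (user,), 1))
    if rows:
        return rows[0]          # (uid, password) of the first match
    return None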
""" write_message("stopping commands started") write_message("stopping commands ended") def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" global bibrank_options if bibrank_options["verbose"] >= 9: write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame)) write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" # do nothing for unknown signals: write_message("unknown signal %d (frame %s) ignored" % (sig, frame)) def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global bibrank_options query = "UPDATE schTASK SET progress='%s' where id=%d" % (escape_string(msg), bibrank_options["task"]) if bibrank_options["verbose"]>= 9: write_message(query) run_sql(query) return def task_update_status(val): """Updates state information in the BibSched task table.""" global bibrank_options query = "UPDATE schTASK SET status='%s' where id=%d" % (escape_string(val), bibrank_options["task"]) if bibrank_options["verbose"]>= 9: write_message(query) run_sql(query) return def split_ranges(parse_string): recIDs = [] ranges = string.split(parse_string, ",") for range in ranges: tmp_recIDs = string.split(range, "-") if len(tmp_recIDs)==1: recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[0])]) else: if int(tmp_recIDs[0]) > int(tmp_recIDs[1]): # sanity check tmp = tmp_recIDs[0] tmp_recIDs[0] = tmp_recIDs[1] tmp_recIDs[1] = tmp recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[1])]) return recIDs def get_date_range(var): "Returns the two dates contained as a low,high tuple" limits = string.split(var, ",") if len(limits)==1: low = get_datetime(limits[0]) return low,None if len(limits)==2: low = get_datetime(limits[0]) high = get_datetime(limits[1]) return low,high def command_line(): """Storing the task together with the parameters given.""" global bibrank_options long_flags = ["lastupdate","add","del","repair","maxmem", "flush","stat", "rebalance", "id=", "collection=", "check", "modified=", "update", "run=", "user=", "sleeptime=", "time=", "help", "version", "verbose="] short_flags = "ladSi:m:c:kUrRM:f:w:u:s:t:hVv:" format_string = "%Y-%m-%d %H:%M:%S" sleeptime = "" try: opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags) except getopt.GetoptError, err: write_message(err, sys.stderr) usage(1) if args: usage(1) bibrank_options = {"quick":"yes","cmd":"add","flush":100000,"validset":"", "collection":[], "id":[], "check": "", "stat":"", "modified":"", "last_updated":"last_updated","run":"", "verbose":1} res = run_sql("SELECT name from rnkMETHOD") bibrank_options["run"] = [] for (name,) in res: bibrank_options["run"].append(name) sched_time = time.strftime(format_string) user = "" try: for opt in opts: if opt == ("-h","") or opt == ("--help",""): usage(1) elif opt == ("-V","") or opt == ("--version",""): print __revision__ sys.exit(1) elif opt[0] in ["--verbose", "-v"]: bibrank_options["verbose"] = int(opt[1]) elif opt == ("-a","") or opt == ("--add",""): bibrank_options["cmd"] = "add" if ("-x","") in opts or ("--del","") in opts: usage(1) elif opt[0] in ["--run", "-w"]: bibrank_options["run"] = [] run = split(opt[1],",") for key in range(0,len(run)): bibrank_options["run"].append(run[key]) elif opt == ("-r","") or opt == ("--repair",""): bibrank_options["cmd"] = "repair" elif opt == ("-d","") or opt == 
("--del",""): bibrank_options["cmd"]="del" elif opt[0] in [ "-u", "--user"]: user = opt[1] elif opt[0] in [ "-k", "--check"]: bibrank_options["cmd"]= "check" elif opt[0] in [ "-S", "--stat"]: bibrank_options["cmd"] = "stat" elif opt[0] in [ "-i", "--id" ]: bibrank_options["id"] = bibrank_options["id"] + split_ranges(opt[1]) bibrank_options["last_updated"] = "" elif opt[0] in [ "-c", "--collection" ]: bibrank_options["collection"] = opt[1] elif opt[0] in [ "-R", "--rebalance"]: bibrank_options["quick"] = "no" elif opt[0] in [ "-f", "--flush"]: bibrank_options["flush"]=int(opt[1]) elif opt[0] in [ "-M", "--maxmem"]: bibrank_options["maxmem"]=int(opt[1]) if bibrank_options["maxmem"] < base_process_size + 1000: raise StandardError, "Memory usage should be higher than %d kB" % (base_process_size + 1000) elif opt[0] in [ "-m", "--modified" ]: bibrank_options["modified"] = get_date_range(opt[1]) #2002-10-27 13:57:26 bibrank_options["last_updated"] = "" elif opt[0] in [ "-l", "--lastupdate" ]: bibrank_options["last_updated"] = "last_updated" elif opt[0] in [ "-s", "--sleeptime" ]: get_datetime(opt[1]) # see if it is a valid shift sleeptime=opt[1] elif opt[0] in [ "-t", "--time" ]: sched_time = get_datetime(opt[1]) else: usage(1) except StandardError, e: write_message(e, sys.stderr) sys.exit(1) user = authenticate(user) if bibrank_options["verbose"]>=9: write_message("Storing task options %s" % bibrank_options) ## sanity check: remove eventual "task" option: if bibrank_options.has_key("task"): del bibrank_options["task"] new_task_id = run_sql("""INSERT INTO schTASK (proc,user,runtime,sleeptime,arguments,status) VALUES ('bibrank',%s,%s,%s,%s,'WAITING')""", (user, sched_time, sleeptime, marshal.dumps(bibrank_options))) ## update task number: bibrank_options["task"] = new_task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(bibrank_options), new_task_id)) print "Task #%d was successfully scheduled for execution." % new_task_id return def task_run(row): """Run the indexing task. The row argument is the BibSched task queue row, containing if, arguments, etc. Return 1 in case of success and 0 in case of failure. """ global bibrank_options task_id = row[0] task_proc = row[1] bibrank_options = marshal.loads(row[6]) task_status = row[7] # install signal handlers signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) if task_proc != "bibrank": write_message("-The task #%d does not seem to be a BibRank task." % task_id, sys.stderr) return 0 if task_status != "WAITING": write_message("The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr) return 0 if bibrank_options["verbose"]: write_message("Task #%d started." % task_id) task_update_status("RUNNING") try: bibrank_options = marshal.loads(row[6]) for key in bibrank_options["run"]: write_message("") file = etcdir + "/bibrank/" + key + ".cfg" if bibrank_options["verbose"] >= 9: write_message("Getting configuration from file: %s" % file) config = ConfigParser.ConfigParser() try: config.readfp(open(file)) except StandardError, e: write_message("Cannot find configurationfile: %s. The rankmethod may also not be registered using the BibRank Admin Interface." 
% file, sys.stderr) raise StandardError #Using the function variable to call the function related to the rank method cfg_function = config.get("rank_method", "function") func_object = globals().get(cfg_function) if func_object: func_object(row, key) else: write_message("Cannot run method '%s', no function to call" % key) except StandardError, e: write_message("\nException caught: %s" % e, sys.stderr) traceback.print_tb(sys.exc_info()[2]) task_update_status("ERROR") sys.exit(1) task_update_status("DONE") if bibrank_options["verbose"]: write_message("Task #%d finished." % task_id) return 1 def main(): global bibrank_options if len(sys.argv) == 2: try: id = int(sys.argv[1]) except StandardError, err: command_line() sys.exit() res = run_sql("SELECT * FROM schTASK WHERE id='%d'" % (id), None, 1) if not res: write_message("Selected task not found.", sys.stderr) sys.exit(1) try: if not task_run(res[0]): write_message("Error occurred. Exiting.", sys.stderr) except StandardError, e: write_message("Unexpected error occurred: %s." % e, sys.stderr) write_message("Traceback is:") traceback.print_tb(sys.exc_info()[2]) write_message("Exiting.") task_update_status("ERROR") else: command_line() if __name__ == "__main__": main() diff --git a/modules/bibsched/lib/bibtaskex.py b/modules/bibsched/lib/bibtaskex.py index 340b75fff..f430a6760 100644 --- a/modules/bibsched/lib/bibtaskex.py +++ b/modules/bibsched/lib/bibtaskex.py @@ -1,339 +1,340 @@ # -*- coding: utf-8 -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """CDS Invenio Bibliographic Task Example. Demonstrates BibTask <-> BibSched connectivity, signal handling, error handling, etc. """ __revision__ = "$Id$" import sys from invenio.dbquery import run_sql from invenio.access_control_engine import acc_authorize_action import getopt import getpass import marshal import signal import sre import string import time import traceback options = {} # global variable to hold task options cfg_n_default = 30 # how many Fibonacci numbers to calculate if none submitted? def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re = sre.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def write_message(msg, stream=sys.stdout): """Write message and flush output stream (may be sys.stdout or sys.stderr). 
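    Each message is prefixed with a timestamp, so a call such as
    write_message("Task #123 started.") prints, for example (illustrative
    date):

        2006-06-13 12:00:00 --> Task #123 started.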
Useful for debugging stuff.""" if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream) return def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame)) write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame)) write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame)) write_message("stopping...") task_update_status("STOPPING") write_message("flushing cache or whatever...") time.sleep(3) write_message("closing tables or whatever...") time.sleep(1) write_message("stopped") task_update_status("STOPPED") sys.exit(0) def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame)) write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" # do nothing for unknown signals: write_message("unknown signal %d (frame %s) ignored" % (sig, frame)) def fib(n): """Returns Fibonacci number for 'n'.""" out = 1 if n >= 2: out = fib(n-2) + fib(n-1) return out def authenticate(user, header="BibTaskEx Task Submission", action="runbibtaskex"): """Authenticate the user against the user database. Check for its password, if it exists. Check for action access rights. Return user name upon authorization success, do system exit upon authorization failure. """ print header print "=" * len(header) if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user ## first check user pw: - res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) + res = run_sql("select id,password from user where email=%s", (user,), 1) + \ + run_sql("select id,password from user where nickname=%s", (user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." % user sys.exit(1) ## secondly check authorization for the action: (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def task_submit(): """Submits task to the BibSched task queue. 
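    A sketch of a typical submission (values are illustrative; the option
    names are the ones parsed in main() below):

        $ bibtaskex -n 10 -u admin -t +5m
        Task #42 submitted.

    which stores a WAITING row in schTASK whose arguments column holds
    marshal.dumps({'number': 10, 'user': 'admin', 'verbose': 1,
    'runtime': '2006-06-13 12:05:00', 'sleeptime': '', 'task': 42}).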
This is what people will be invoking via command line.""" global options ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] ## authenticate user: user = authenticate(options.get("user", "")) ## submit task: if options["verbose"] >= 9: print "" write_message("storing task options %s\n" % options) task_id = run_sql("""INSERT INTO schTASK (id,proc,user,runtime,sleeptime,status,arguments) VALUES (NULL,'bibtaskex',%s,%s,%s,'WAITING',%s)""", (user, options["runtime"], options["sleeptime"], marshal.dumps(options))) ## update task number: options["task"] = task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id)) write_message("Task #%d submitted." % task_id) return task_id def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global options if options["verbose"] >= 9: write_message("Updating task progress to %s." % msg) return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, options["task"])) def task_update_status(val): """Updates status information in the BibSched task table.""" global options if options["verbose"] >= 9: write_message("Updating task status to %s." % val) return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, options["task"])) def task_read_status(task_id): """Read status information in the BibSched task table.""" res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1) try: out = res[0][0] except: out = 'UNKNOWN' return out def task_get_options(id): """Returns options for the task 'id' read from the BibSched task queue table.""" out = {} res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibtaskex'", (id,)) try: out = marshal.loads(res[0][0]) except: write_message("Error: BibTaskEx task %d does not seem to exist." % id, sys.stderr) sys.exit(1) return out def task_run(task_id): """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call. The task prints Fibonacci numbers for up to NUM on the stdout, and some messages on stderr. Return 1 in case of success and 0 in case of failure.""" global options options = task_get_options(task_id) # get options from BibSched task table ## check task id: if not options.has_key("task"): write_message("Error: The task #%d does not seem to be a BibTaskEx task." % task_id, sys.stderr) return 0 ## check task status: task_status = task_read_status(task_id) if task_status != "WAITING": write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr) return 0 ## we can run the task now: if options["verbose"]: write_message("Task #%d started." % task_id) task_update_status("RUNNING") ## initialize signal handler: signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ## run the task: if options.has_key("number"): n = options["number"] else: n = cfg_n_default if options["verbose"] >= 9: write_message("Printing %d Fibonacci numbers." % n) for i in range(0, n): if i > 0 and i % 4 == 0: if options["verbose"] >= 3: write_message("Error: water in the CPU. Ignoring and continuing.", sys.stderr) elif i > 0 and i % 5 == 0: if options["verbose"]: write_message("Error: floppy drive dropped on the floor. 
Ignoring and continuing.", sys.stderr) if options["verbose"]: write_message("fib(%d)=%d" % (i, fib(i))) task_update_progress("Done %d out of %d." % (i, n)) time.sleep(1) ## we are done: task_update_progress("Done %d out of %d." % (n, n)) task_update_status("DONE") if options["verbose"]: write_message("Task #%d finished." % task_id) return 1 def usage(exitcode=1, msg=""): """Prints usage info.""" if msg: sys.stderr.write("Error: %s.\n" % msg) sys.stderr.write("Usage: %s [options]\n" % sys.argv[0]) sys.stderr.write("Command options:\n") sys.stderr.write(" -n, --number=NUM\t Print Fibonacci numbers for up to NUM. [default=%d]\n" % cfg_n_default) sys.stderr.write("Scheduling options:\n") sys.stderr.write(" -u, --user=USER \t User name to submit the task as, password needed.\n") sys.stderr.write(" -t, --runtime=TIME \t Time to execute the task (now), e.g.: +15s, 5m, 3h, 2002-10-27 13:57:26\n") sys.stderr.write(" -s, --sleeptime=SLEEP \t Sleeping frequency after which to repeat task (no), e.g.: 30m, 2h, 1d\n") sys.stderr.write("General options:\n") sys.stderr.write(" -h, --help \t\t Print this help.\n") sys.stderr.write(" -V, --version \t\t Print version information.\n") sys.stderr.write(" -v, --verbose=LEVEL \t Verbose level (0=min, 1=default, 9=max).\n") sys.exit(exitcode) def main(): """Main function that analyzes command line input and calls whatever is appropriate. Useful for learning on how to write BibSched tasks.""" global options ## parse command line: if len(sys.argv) == 2 and sys.argv[1].isdigit(): ## A - run the task task_id = int(sys.argv[1]) try: if not task_run(task_id): write_message("Error occurred. Exiting.", sys.stderr) except StandardError, e: write_message("Unexpected error occurred: %s." % e, sys.stderr) write_message("Traceback is:", sys.stderr) traceback.print_tb(sys.exc_info()[2]) write_message("Exiting.", sys.stderr) task_update_status("ERROR") else: ## B - submit the task # set default values: options = {} options["runtime"] = time.strftime("%Y-%m-%d %H:%M:%S") options["sleeptime"] = "" options["verbose"] = 1 # set user-defined options: try: opts, args = getopt.getopt(sys.argv[1:], "hVv:n:u:s:t:", ["help", "version", "verbose=", "number=", "user=", "sleep=", "time="]) except getopt.GetoptError, err: usage(1, err) try: for opt in opts: if opt[0] in ["-h", "--help"]: usage(0) elif opt[0] in ["-V", "--version"]: print __revision__ sys.exit(0) elif opt[0] in [ "-u", "--user"]: options["user"] = opt[1] elif opt[0] in ["-v", "--verbose"]: options["verbose"] = int(opt[1]) elif opt[0] in ["-n", "--number"]: options["number"] = int(opt[1]) elif opt[0] in [ "-s", "--sleeptime" ]: get_datetime(opt[1]) # see if it is a valid shift options["sleeptime"] = opt[1] elif opt[0] in [ "-t", "--runtime" ]: options["runtime"] = get_datetime(opt[1]) else: usage(1) except StandardError, e: usage(e) task_submit() return ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibupload/lib/bibupload.py b/modules/bibupload/lib/bibupload.py index cc18a8bb7..828c4c592 100644 --- a/modules/bibupload/lib/bibupload.py +++ b/modules/bibupload/lib/bibupload.py @@ -1,1301 +1,1302 @@ # -*- coding: utf-8 -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. 
## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable-msg=C0301 """ BibUpload: Receive MARC XML file and update the appropriate database tables according to options. Usage: bibupload [options] input.xml Examples: $ bibupload -i input.xml Options: -a, --append new fields are appended to the existing record -c, --correct fields are replaced by the new ones in the existing record -f, --format takes only the FMT fields into account. Does not update -i, --insert insert the new record in the database -r, --replace the existing record is entirely replaced by the new one -z, --reference update references (update only 999 fields) -s, --stage=STAGE stage to start from in the algorithm (0: always done; 1: FMT tags; 2: FFT tags; 3: BibFmt; 4: Metadata update; 5: time update) -n, --notimechange do not change record last modification date when updating Scheduling options: -u, --user=USER user name to store task, password needed General options: -h, --help print this help and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) -V --version print the script version """ __revision__ = "$Id$" import os import sys import getopt import getpass import signal import string import marshal import time import traceback from zlib import compress import MySQLdb import re from invenio.bibupload_config import * from invenio.access_control_engine import acc_authorize_action from invenio.dbquery import run_sql, \ Error from invenio.bibrecord import create_records, \ create_record, \ record_add_field, \ record_delete_field, \ record_xml_output, \ record_get_field_instances, \ field_get_subfield_values from invenio.dateutils import convert_datestruct_to_datetext from invenio.search_engine import print_record from invenio.config import filedir, \ filedirsize, \ htdocsurl # Global variables options = {} options['mode'] = None options['verbose'] = 1 options['tag'] = None options['file_path'] = None options['notimechange'] = 0 options['stage_to_start_from'] = 1 #Statistic variables stat = {} stat['nb_records_to_upload'] = 0 stat['nb_records_updated'] = 0 stat['nb_records_inserted'] = 0 stat['nb_errors'] = 0 stat['exectime'] = time.localtime() ### bibsched task related functions: def write_message(msg, stream=sys.stdout, verbose=1): """Write message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff. Do not print anything if the global verbose option is lower than VERBOSE. """ if stream == sys.stdout or stream == sys.stderr: if options['verbose'] >= verbose: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. 
[must be sys.stdout or sys.stderr]\n" % stream) return def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame)) write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame)) write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame)) write_message("stopping...") task_update_status("STOPPING") write_message("flushing cache or whatever...") time.sleep(3) write_message("closing tables or whatever...") time.sleep(1) write_message("stopped") task_update_status("STOPPED") sys.exit(0) def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame)) write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" # do nothing for unknown signals: write_message("unknown signal %d (frame %s) ignored" % (sig, frame)) def authenticate(user, header="BibUpload Task Submission", action="runbibupload"): """Authenticate the user against the user database. Check for its password, if it exists. Check for action access rights. Return user name upon authorization success, do system exit upon authorization failure. """ # FIXME: for the time being do not authenticate but always let the # tasks in, because of automated inserts. Maybe we shall design # an internal user here that will always be let in. return user print header print "=" * len(header) if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user ## first check user pw: - res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) + res = run_sql("select id,password from user where email=%s", (user,), 1) + \ + run_sql("select id,password from user where nickname=%s", (user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." % user sys.exit(1) ## secondly check authorization for the action: (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def task_submit(): """Submits task to the BibSched task queue. 
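    For illustration (all values invented), the marshalled arguments column
    for a plain insert run would decode to something like:

        {'mode': 'insert', 'file_path': '/tmp/input.xml', 'tag': None,
         'verbose': 1, 'stage_to_start_from': 1, 'notimechange': 0,
         'runtime': '2006-06-13 12:00:00', 'sleeptime': '', 'user': 'admin',
         'task': 43}

    The row is INSERTed with status WAITING first, then UPDATEd once the
    task id is known, so that the id can be stored inside the arguments
    under the 'task' key.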
This is what people will be invoking via command line.""" global options ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] ## authenticate user: user = authenticate(options.get("user", "")) ## submit task: if options["verbose"] >= 9: print "" write_message("storing task options %s\n" % options) task_id = run_sql("""INSERT INTO schTASK (id,proc,user,runtime,sleeptime,status,arguments) VALUES (NULL,'bibupload',%s,%s,%s,'WAITING',%s)""", (user, options["runtime"], options["sleeptime"], marshal.dumps(options))) ## update task number: options["task"] = task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id)) write_message("Task #%d submitted." % task_id) return task_id def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global options if options["verbose"] >= 9: write_message("Updating task progress to %s." % msg) return run_sql("UPDATE schTASK SET progress=%s WHERE id=%s", (msg, options["task"])) def task_update_status(val): """Updates status information in the BibSched task table.""" global options if options["verbose"] >= 9: write_message("Updating task status to %s." % val) return run_sql("UPDATE schTASK SET status=%s WHERE id=%s", (val, options["task"])) def task_read_status(task_id): """Read status information in the BibSched task table.""" res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1) try: out = res[0][0] except: out = 'UNKNOWN' return out def task_get_options(task_id): """Returns options for the task 'task_id' read from the BibSched task queue table.""" out = {} res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibupload'", (task_id,)) try: out = marshal.loads(res[0][0]) except: write_message("Error: BibUpload task %d does not seem to exist." % task_id, sys.stderr) sys.exit(1) return out def task_run(task_id): """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call. The task reads the MARC XML file given in the task arguments and updates the database tables accordingly. Return 1 in case of success and 0 in case of failure.""" global options, stat options = task_get_options(task_id) # get options from BibSched task table ## check task id: if not options.has_key("task"): write_message("Error: The task #%d does not seem to be a BibUpload task." % task_id, sys.stderr) return 0 ## check task status: task_status = task_read_status(task_id) if task_status != "WAITING": write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr) return 0 ## we can run the task now: if options["verbose"]: write_message("Task #%d started." % task_id) task_update_status("RUNNING") ## initialize signal handler: signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ## run the task: error = 0 write_message("BibUpload Mode "+options['mode']+" has been chosen.", verbose=2) write_message("STAGE 0:", verbose=2) if options['file_path'] != None: recs = xml_marc_to_records(open_marc_file(options['file_path'])) stat['nb_records_to_upload'] = len(recs) write_message(" -Open XML marc: DONE", verbose=2) if recs != None: # We process the records one by one for record in recs: error = bibupload(record) if error[0] == 1: stat['nb_errors'] += 1 task_update_progress("Done %d out of %d."
% (stat['nb_records_inserted'] + stat['nb_records_updated'], stat['nb_records_to_upload'])) else: write_message(" Error bibupload failed: No record found", verbose=1, stream=sys.stderr) if options['verbose'] >= 1: #Print out the statistics print_out_bibupload_statistics() # Check if they were errors if stat['nb_errors'] >= 1: task_update_status("DONE WITH ERRORS") else: ## we are done: task_update_status("DONE") if options["verbose"]: write_message("Task #%d finished." % task_id) return 1 ### bibupload engine functions: def parse_command(): """Analyze the command line and retrieve arguments (xml file, mode, etc) into global options variable. Return 0 in case everything went well, 1 in case of errors, 2 in case only help or version number were asked for. """ # FIXME: add treatment of `time' try: opts, args = getopt.getopt(sys.argv[1:], "i:r:c:a:z:s:f:u:hv:Vn", [ "insert=", "replace=", "correct=", "append=", "reference=", "stage=", "format=", "user=", "help", "verbose", "version", "notimechange", ]) except getopt.GetoptError,erro: usage() write_message("Stage 0 error: %s" % erro, verbose=1, stream=sys.stderr) return 1 # set the proper mode depending on the argument value for opt, opt_value in opts: # Verbose mode option if opt in ["-v", "--verbose"]: try: options['verbose'] = int(opt_value) except ValueError: write_message("Failed: enter a valid number for verbose mode (between 0 and 9).", verbose=1, stream=sys.stderr) return 1 # stage mode option if opt in ["-s", "--stage"]: try: options['stage_to_start_from'] = int(opt_value) except ValueError: write_message("Failed: enter a valid number for the stage to start from(>0).", verbose=1, stream=sys.stderr) return 1 # No time change option if opt in ["-n", "--notimechange"]: options['notimechange'] = 1 # Insert mode option if opt in ["-i", "--insert"]: options['mode'] = 'insert' options['file_path'] = os.path.abspath(opt_value) # Replace mode option if opt in ["-r", "--replace"]: # We check if there is not the mode insert after replace if opt_value == '-i' or opt_value == '--insert': options['replace_insert'] = 1 options['mode'] = 'replace_insert' options['file_path'] = os.path.abspath(args[0]) else: # Creation of the records from the xml Marc in argument options['mode'] = 'replace' options['file_path'] = os.path.abspath(opt_value) # Correct mode option if opt in ["-c", "--correct"]: # We check if there is not just a special tag to correct try: if int(opt_value) > 0 and int(opt_value) < 999: options['mode'] = 'correct' options['tag'] = opt_value options['file_path'] = os.path.abspath(args[0]) except ValueError: if opt_value == 'FMT' or opt_value == 'FFT': options['mode'] = 'correct' options['tag'] = opt_value options['file_path'] = os.path.abspath(args[0]) else: options['mode'] = 'correct' options['file_path'] = os.path.abspath(opt_value) # Append mode option if opt in ["-a", "--append"]: # We check if there is not just a special tag to append try: if int(opt_value) > 0 and int(opt_value) < 999: options['mode'] = 'append' options['tag'] = opt_value options['file_path'] = os.path.abspath(args[0]) except ValueError: if opt_value == 'FMT' or opt_value == 'FFT': options['mode'] = 'append' options['tag'] = opt_value options['file_path'] = os.path.abspath(args[0]) else: options['mode'] = 'append' options['file_path'] = os.path.abspath(opt_value) # Reference mode option if opt in ["-z", "--reference"]: options['mode'] = 'reference' options['file_path'] = os.path.abspath(opt_value) # Format mode option if opt in ["-f", "--format"]: options['mode'] = 
'format' options['file_path'] = os.path.abspath(opt_value) # Detection of user if opt in ["-u", "--user"]: options['user'] = opt_value # Help mode option if opt in ["-h", "--help"]: usage() return 2 # Version mode option if opt in ["-V", "--version"]: write_message(__revision__, verbose=1) return 2 if options['mode'] == None: write_message("Please specify at least one update/insert mode!") return 1 if options['file_path'] == None: write_message("Missing filename! -h for help.") return 1 return 0 def bibupload(record): """Main function: process a record and fit it in the tables bibfmt, bibrec, bibrec_bibxxx, bibxxx with proper record metadata. Return (error_code, recID) of the processed record. """ error = None #If there are special tags to proceed check if it exists in the record if options['tag'] != None and not(record.has_key(options['tag'])): write_message(" Failed: Tag not found, enter a valid tag to update.", verbose=1, stream=sys.stderr) return (1, -1) #Extraction of the Record Id rec_id = retrieve_rec_id(record) if rec_id == -1: return (1, -1) else: write_message(" -Retrieve record Id (found %s): DONE." % rec_id, verbose=2) write_message(" -Check if the xml marc file is already in the database: DONE" , verbose=2) # Reference mode check if there are reference tag if options['mode'] == 'reference': error = extract_tag_from_record(record, CFG_BIBUPLOAD_REFERENCE_TAG) if error == None: write_message(" Failed: No reference tags has been found...", verbose=1, stream=sys.stderr) return (1, -1) else: error = None write_message(" -Check if reference tags exist: DONE", verbose=2) if options['mode'] == 'insert': # Insert the record into the bibrec databases to have a recordId rec_id = create_new_record() write_message(" -Creation of a new record id (%d): DONE" % rec_id, verbose=2) # we add the record Id control field to the record error = record_add_field(record, '001', '', '', rec_id, [], 0) if error == None: write_message(" Failed: " \ "Error during adding the 001 controlfield " \ "to the record", verbose=1, stream=sys.stderr) return (1, rec_id) else: error = None elif options['mode'] != 'insert' and options['mode'] != 'format' and options['stage_to_start_from'] != 5: # Update Mode #Retrieve the old record to update rec_old = create_record(print_record(int(rec_id),'xm'), 2)[0] if rec_old == None: write_message(" Failed during the creation of the old record!", verbose=1, stream=sys.stderr) return (1, rec_id) else: write_message(" -Retrieve the old record to update: DONE", verbose=2) #Delete tags to correct in the record if options['mode'] == 'correct' or options['mode'] == 'reference': delete_tags_to_correct(record, rec_old) write_message(" -Delete the old tags to correct in the old record: DONE", verbose=2) # Append new tag to the old record and update the new record with the old_record modified if options['mode'] == 'append' or options['mode'] == 'correct' or options['mode'] == 'reference': record = append_new_tag_to_old_record(record, rec_old) write_message(" -Append new tags to the old record: DONE", verbose=2) # now we clear all the rows from bibrec_bibxxx from the old record delete_bibrec_bibxxx(rec_old, rec_id) write_message(" -Clean bibrec_bibxxx: DONE", verbose=2) write_message(" -Stage COMPLETED", verbose=2) #Have a look if we have FMT tags write_message("Stage 1: Start (Insert of FMT tags if exist).", verbose=2) if options['stage_to_start_from'] <= 1 and extract_tag_from_record(record, 'FMT') != None: record = insert_fmt_tags(record, rec_id) if record == None: write_message(" Stage 
1 failed: Error while inserting FMT tags", verbose=1, stream=sys.stderr) return (1, rec_id) elif record == 0: #Mode format finished stat['nb_records_updated'] += 1 return (0, rec_id) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) #Have a look if we have FFT tags write_message("Stage 2: Start (Process FFT tags if exist).", verbose=2) if options['stage_to_start_from'] <= 2 and extract_tag_from_record(record, 'FFT') != None: if options['mode'] == 'insert' or options['mode'] == 'append': record = insert_fft_tags(record, rec_id) else: record = update_fft_tag(record, rec_id) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Update of the BibFmt write_message("Stage 3: Start (Update bibfmt).", verbose=2) if options['stage_to_start_from'] <= 3: # format the single record as xml rec_xml_new = record_xml_output(record) #Update bibfmt with the format xm of this record if options['mode'] != 'format': error = update_bibfmt_format(rec_id, rec_xml_new, 'xm') if error == 1: write_message(" Failed: error during update_bibfmt_format", verbose=1, stream=sys.stderr) return (1, rec_id) write_message(" -Stage COMPLETED", verbose=2) # Update the database MetaData write_message("Stage 4: Start (Update the database with the metadata).", verbose=2) if options['stage_to_start_from'] <= 4: update_database_with_metadata(record, rec_id) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Finally we update the bibrec table with the current date write_message("Stage 5: Start (Update bibrec table with current date).", verbose=2) if options['stage_to_start_from'] <= 5 and options['mode'] != 'insert' and options['notimechange'] == 0: now = convert_datestruct_to_datetext(time.localtime()) write_message(" -Retrieve current localtime: DONE", verbose=2) update_bibrec_modif_date(now, rec_id) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) #Increase statistics if options['mode'] == 'insert': stat['nb_records_inserted'] += 1 else: stat['nb_records_updated'] += 1 #Upload of this record finish write_message("Record "+str(rec_id)+" DONE", verbose=1) return (0, rec_id) def usage(): """Print help""" print """Receive MARC XML file and update appropriate database tables according to options. Usage: bibupload [options] input.xml Examples: $ bibupload -i input.xml Options: -a, --append new fields are appended to the existing record -c, --correct fields are replaced by the new ones in the existing record -f, --format takes only the FMT fields into account. Does not update -i, --insert insert the new record in the database -r, --replace the existing record is entirely replaced by the new one -z, --reference update references (update only 999 fields) -s, --stage=STAGE stage to start from in the algorithm (0: always done; 1: FMT tags; 2: FFT tags; 3: BibFmt; 4: Metadata update; 5: time update) -n, --notimechange do not change record last modification date when updating Scheduling options: -u, --user=USER user name to store task, password needed General options: -h, --help print this help and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) -V --version print the script version """ def print_out_bibupload_statistics(): """Print the statistics of the process""" out = "Task stats: %(nb_input)d input records, %(nb_updated)d updated, " \ "%(nb_inserted)d inserted, %(nb_errors)d errors. Time %(nb_sec).2f sec." 
% { \ 'nb_input': stat['nb_records_to_upload'], 'nb_updated': stat['nb_records_updated'], 'nb_inserted': stat['nb_records_inserted'], 'nb_errors': stat['nb_errors'], 'nb_sec': time.time() - time.mktime(stat['exectime']) } write_message(out) def open_marc_file(path): """Open a file and return the data""" try: # open the file containing the marc document marc_file = open(path,'r') marc = marc_file.read() marc_file.close() except IOError, erro: write_message("Error: %s" % erro, verbose=1, stream=sys.stderr) write_message("Exiting.", sys.stderr) task_update_status("ERROR") sys.exit(1) return marc def xml_marc_to_records(xml_marc): """create the records""" # Creation of the records from the xml Marc in argument recs = create_records(xml_marc, 1, 1) if recs == []: write_message("Error: Cannot parse MARCXML file.", verbose=1, stream=sys.stderr) write_message("Exiting.", sys.stderr) task_update_status("ERROR") sys.exit(1) elif recs[0][0] == None: write_message("Error: MARCXML file has wrong format: %s" % recs[0][2], verbose=1, stream=sys.stderr) write_message("Exiting.", sys.stderr) task_update_status("ERROR") sys.exit(1) else: recs = map((lambda x:x[0]), recs) return recs def find_record_bibrec(rec_id): """ receives the record ID and returns if this record exist in bibrec """ query = """SELECT id FROM bibrec WHERE id = %s""" params = (rec_id,) try: res = run_sql(query, params) except Error, error: write_message(" Error during find_record_bibrec function : %s " % error, verbose=1, stream=sys.stderr) if len(res): return res else: return None def find_record_format(rec_id, format): """Look whether record REC_ID is formatted in FORMAT, i.e. whether FORMAT exists in the bibfmt table for this record. Return the number of times it is formatted: 0 if not, 1 if yes, 2 if found more than once (should never occur). 
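    For example (illustrative record id): find_record_format(123, 'hb')
    returns 0 while record 123 has no cached 'hb' format in bibfmt, and 1
    once one has been stored, which is roughly how update_bibfmt_format()
    below decides between an SQL UPDATE and insert_bibfmt():

        if find_record_format(rec_id, 'xm') == 0:
            insert_bibfmt(rec_id, marc, 'xm')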
""" out = 0 query = """SELECT COUNT(id) FROM bibfmt WHERE id_bibrec=%s AND format=%s""" params = (rec_id, format) res = [] try: res = run_sql(query, params) out = res[0][0] except Error, error: write_message(" Error during find_record_format() : %s " % error, verbose=1, stream=sys.stderr) return out def find_record_bibfmt(marc): """ receives the xmlmarc containing a record and returns the id in bibrec if the record exists in bibfmt""" # compress the marc value pickled_marc = MySQLdb.escape_string(compress(marc)) query = """SELECT id_bibrec FROM bibfmt WHERE value = %s""" # format for marc xml is xm params = (pickled_marc,) try: res = run_sql(query, params) except Error, error: write_message(" Error during find_record_bibfmt function : %s " % error, verbose=1, stream=sys.stderr) if len(res): return res else: return None def find_record_from_sysno(sysno): """receive the sysno number and return the record id""" table_name = 'bib'+CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:2]+'x' query = """SELECT DISTINCT id FROM `%s` WHERE value = '%s'""" params = (table_name, sysno) try: res = run_sql(query % params) except Error, error: write_message(" Error during find_record_from_sysno function 1st query : %s " % error, verbose=1, stream=sys.stderr) if len(res): table_name = 'bibrec_bib'+CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:2]+'x' query = """SELECT DISTINCT id_bibrec FROM `%s` WHERE id_bibxxx = '%s'""" params = (table_name, res[0][0]) try: res = run_sql(query % params) except Error, error: write_message(" Error during find_record_from_sysno function 2nd query : %s " % error, verbose=1, stream=sys.stderr) if len(res): return res[0][0] else: return None else: return None def extract_tag_from_record(record, tag_number): """ Extract the tag_number for record.""" # first step verify if the record is not already in the database if record: return record.get(tag_number, None) return None def retrieve_rec_id(record): """Retrieve the record Id from a record by using tag 001 or SYSNO""" rec_id = None tag = None #1st step: we look for the tag 001 tag = extract_tag_from_record(record, '001') if tag != None: #We exctract the record Id from the Tag rec_id = tag[0][3] #if we are in insert mode => error if options['mode'] == 'insert': write_message(" Failed : Error tag 001 found in the xml submitted"\ ", you should use the option replace, correct or append"\ " to replace an existing record. -h for help.", verbose=1, stream=sys.stderr) return -1 #if we found the rec id and we are not in insert mode => continue elif options['mode'] != 'insert': #we try to find the rec_id in the table bibrec if find_record_bibrec(rec_id) != None: return rec_id else: #The record doesn't exist yet. We will try to check the SYSNO id rec_id = None else: write_message(" -Tag 001 not found in the xml marc file.", verbose=9) if rec_id == None: #2nd step we look for the sysno code sysno = extract_tag_from_record(record, CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG) if sysno != None: #retrieve the SYSNO code from the tuple SYSNO sysno = sysno[0][0][0][1] write_message(" Check if the SYSNO id "+sysno+" exist in the database", verbose=9) #Retrieve the rec id from the database rec_id = find_record_from_sysno(sysno) if rec_id != None and options['mode'] == 'insert': write_message(" Failed : Record id found in the database: Please choose another mode than insert. 
-h for help.", verbose=1, stream=sys.stderr) return -1 elif rec_id == None and options['mode'] != 'insert': if options['mode'] != 'replace_insert': write_message(" Failed : Record Id not found even with SYSNO id..."\ "Please insert the file before updating it."\ " -h for help", verbose=1, stream=sys.stderr) return -1 else: options['mode'] = 'insert' else: return rec_id if sysno == None and options['mode'] != 'insert': if options['mode'] != 'replace_insert': write_message(" Failed : SYSNO tag not found in the xml marc file."\ "Please insert the file before updating it."\ " -h for help", verbose=1, stream=sys.stderr) return -1 else: options['mode'] = 'insert' return rec_id ### Insert functions def create_new_record(): """Create new record in the database""" now = convert_datestruct_to_datetext(time.localtime()) query = """INSERT INTO bibrec (creation_date, modification_date) VALUES (%s, %s)""" params = (now, now) try: rec_id = run_sql(query, params) return rec_id except Error, error: write_message(" Error during the creation_new_record function : %s " % error, verbose=1, stream=sys.stderr) return None def insert_bibfmt(id_bibrec, marc, format): """Insert the format in the table bibfmt""" # compress the marc value #pickled_marc = MySQLdb.escape_string(compress(marc)) pickled_marc = compress(marc) # get the current time now = convert_datestruct_to_datetext(time.localtime()) query = """INSERT INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)""" try: row_id = run_sql(query, (id_bibrec, format, now, pickled_marc)) return row_id except Error, error: write_message(" Error during the insert_bibfmt function : %s " % error, verbose=1, stream=sys.stderr) return None def insert_record_bibxxx(tag, value): """Insert the record into bibxxx""" #determine into which table one should insert the record table_name = 'bib'+tag[0:2]+'x' # check if the tag, value combination exists in the table query = """SELECT id FROM %s """ % table_name query += """ WHERE tag=%s AND value=%s""" params = (tag, value) try: res = run_sql(query, params) except Error, error: write_message(" Error during the insert_record_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) if len(res): # get the id of the row, if it exists row_id = res[0][0] return (table_name, row_id) else: # necessary to insert the tag, value into bibxxx table query = """INSERT INTO %s """ % table_name query += """ (tag, value) values (%s , %s)""" params = (tag, value) try: row_id = run_sql(query, params) except Error, error: write_message(" Error during the insert_record_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) return (table_name, row_id) def insert_record_bibrec_bibxxx(table_name, id_bibxxx, field_number, id_bibrec): """Insert the record into bibrec_bibxxx""" #determine into which table one should insert the record full_table_name = 'bibrec_'+ table_name # insert the proper row into the table query = """INSERT INTO %s """ % full_table_name query += """(id_bibrec,id_bibxxx, field_number) values (%s , %s, %s)""" params = (id_bibrec, id_bibxxx, field_number) try: res = run_sql(query, params) except Error, error: write_message(" Error during the insert_record_bibrec_bibxxx function 2nd query : %s " % error, verbose=1, stream=sys.stderr) return res def insert_fft_tags(record, rec_id): """Process and insert FFT tags""" tuple_list = None tuple_list = extract_tag_from_record(record, 'FFT') #If there is a FFT TAG :) if tuple_list != None: for single_tuple in tuple_list: # Get the inside of the FFT file docpath = 
single_tuple[0][0][1] docname = re.sub("\..*", "", os.path.basename(docpath)) extension = re.sub("^[^\.]*.", "", os.path.basename(docpath)).lower() #Create a new docId try: bib_doc_id = run_sql("insert into bibdoc (docname,creation_date,modification_date) values(%s,NOW(),NOW())", (docname,)) write_message(" -Insert of the file %s into bibdoc : DONE" % docname, verbose=2) except Error, error: write_message(" Error during the insert_fft_tags function : %s " % error, verbose=1, stream=sys.stderr) if bib_doc_id != None: #we link the document to the record if a rec_id was specified if rec_id != "": #TO FIX doc_type : main or additional, fron where the information come from? doc_type = "" try: res = run_sql("insert into bibrec_bibdoc values(%s,%s,%s)", (rec_id, bib_doc_id, doc_type)) if res == None: write_message(" Failed during creation of link between doc Id and rec Id).", verbose=1, stream=sys.stderr) else: write_message(" -Insert of the link bibrec bibdoc for %s : DONE" % docname, verbose=2) except Error, error: write_message(" Error during the insert_fft_tags function : %s " % error, verbose=1, stream=sys.stderr) else: write_message(" Failed during creation of the new doc Id.", verbose=1, stream=sys.stderr) #Move the file to the correct place # Variables from the config file archivepath = filedir archivesize = filedirsize url_path = None group = "g"+str(int(int(bib_doc_id)/archivesize)) basedir = "%s/%s/%s" % (archivepath, group, bib_doc_id) # we create the corresponding storage directory if not os.path.exists(basedir): try: os.makedirs(basedir) write_message(" -Create a new directory %s : DONE" % basedir, verbose=2) except OSError, error: write_message(" Error making the directory : %s " % error, verbose=1, stream=sys.stderr) # and save the father record id if it exists if rec_id != "": try: filep = open("%s/.recid" % basedir, "w") filep.write(str(bib_doc_id)) filep.close() except IOError, error: write_message(" Error writing the file : %s " % error, verbose=1, stream=sys.stderr) #Move the file to the good directory try: os.system("mv %s %s" % (docpath, basedir)) write_message(" -Move the file %s : DONE" % docname, verbose=2) except OSError, error: write_message(" Error moving the file : %s " % error, verbose=1, stream=sys.stderr) #Create the Url Path url_path = htdocsurl+"/record/"+str(rec_id)+"/files/"+docname+"."+extension #add tag 856 to the xml marc to proceed subfield_list = [('u', url_path), ('z', 'Access to Fulltext')] newfield_number = record_add_field(record, "856", "4", "", "", subfield_list) if newfield_number == None: write_message(" Error when adding the field"+ single_tuple, verbose=1, stream=sys.stderr) else: write_message(" -Add the new tag 856 to the record for %s : DONE" % docname, verbose=2) #Delete FFT tag :) record_delete_field(record, 'FFT', '', '') write_message(" -Delete FFT tag from source : DONE", verbose=2) return record def insert_fmt_tags(record, rec_id): """Process and insert FMT tags""" fmt_fields = record_get_field_instances(record, 'FMT') if fmt_fields: for fmt_field in fmt_fields: # Get the f, g subfields of the FMT tag try: f_value = field_get_subfield_values(fmt_field, "f")[0] except IndexError: f_value = "" try: g_value = field_get_subfield_values(fmt_field, "g")[0] except IndexError: g_value = "" # Update the format res = update_bibfmt_format(rec_id, g_value, f_value) if res == 1: write_message(" Failed: Error during update_bibfmt", verbose=1, stream=sys.stderr) # If we are in format mode, we only care about the FMT tag if options['mode'] == 'format': 
return 0 # We delete the FMT tag of the record record_delete_field(record, 'FMT') write_message(" -Delete field FMT from record : DONE", verbose=2) return record elif options['mode'] == 'format': write_message(" Failed: Format update failed: no FMT tag found", verbose=1, stream=sys.stderr) return None else: return record ### Update functions def update_bibrec_modif_date(now, bibrec_id): """Update the date of the record in the bibrec table""" query = """UPDATE bibrec SET modification_date=%s WHERE id=%s""" params = (now, bibrec_id) try: res = run_sql(query, params) if res != 1: write_message(" Failed: SQL error during the update of the bibrec modification_date", verbose=1, stream=sys.stderr) else: write_message(" -Update record modification date : DONE", verbose=2) except Error, error: write_message(" Error during update_bibrec_modif_date function : %s" % error, verbose=1, stream=sys.stderr) def update_bibfmt_format(id_bibrec, format_value, format_name): """Update the format in the table bibfmt""" # We check if the format is already in bibfmt nb_found = find_record_format(id_bibrec, format_name) if nb_found == 1: # Update the format # get the current time now = convert_datestruct_to_datetext(time.localtime()) # compress the format_value pickled_format_value = compress(format_value) query = """UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s""" params = (now, pickled_format_value, id_bibrec, format_name) try: row_id = run_sql(query, params) if row_id == None: write_message(" Failed: Error during update_bibfmt_format function", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Update the format %s in bibfmt : DONE" % format_name, verbose=2) return 0 except Error, error: write_message(" Error during the update_bibfmt_format function : %s " % error, verbose=1, stream=sys.stderr) elif nb_found > 1: write_message(" Failed: Same format %s found several times in bibfmt for the same record."
% format_name, verbose=1, stream=sys.stderr) return 1 else: # Insert the format information in BibFMT res = insert_bibfmt(id_bibrec, format_value, format_name) if res == None: write_message(" Failed: Error during insert_bibfmt", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Insert the format %s in bibfmt : DONE" % format_name , verbose=2) return 0 def update_database_with_metadata(record, rec_id): """Update the database tables with the record and the record id given in parameter""" for tag in record.keys(): # check if tag is not a control field : tag not in CFG_BIBUPLOAD_CONTROLFIELD_TAGS and if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: # for each tag there is a list of tuples representing datafields tuple_list = record[tag] # this list should contain the elements of a full tag [tag, ind1, ind2, subfield_code] tag_list = [] tag_list.append(tag) for single_tuple in tuple_list: # these are the contents of a single tuple subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # append the ind's to the full tag if (ind1 == ''): tag_list.append('_') else: tag_list.append(ind1) if (ind2 == ''): tag_list.append('_') else: tag_list.append(ind2) datafield_number = single_tuple[4] if tag in CFG_BIBUPLOAD_SPECIAL_TAGS: # nothing to do for special tags (FFT, FMT) pass elif tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS and tag != "001": value = single_tuple[3] # get the full tag full_tag = ''.join(tag_list) # Others modes : update all the tag if options['mode'] == 'insert' or options['mode'] == 'replace' or options['mode'] == 'append': write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value) #print 'tname, bibrow', table_name, bibxxx_row_id; if table_name == None or bibxxx_row_id == None: write_message(" Failed : during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id) if res == None: write_message(" Failed : during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) else: # get the tag and value from the content of each subfield for subfield in subfield_list: subtag = subfield[0] value = subfield[1] tag_list.append(subtag) # get the full tag full_tag = ''.join(tag_list) # Others modes : update all the tag if options['mode'] == 'insert' or options['mode'] == 'replace' or options['mode'] == 'append': write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value) if table_name == None or bibxxx_row_id == None: write_message(" Failed : during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id) if res == None: write_message(" Failed : during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) # remove the subtag from the list tag_list.pop() tag_list.pop() tag_list.pop() tag_list.pop() write_message(" -Update the database with metadata : DONE", verbose=2) def append_new_tag_to_old_record(record, rec_old): """Append new tags to a old record""" if options['tag'] != None: tag = options['tag'] if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: if tag == '001': pass else: # if it is a controlfield,just access the value 
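# (Illustrative shape only, not taken from a real record: each controlfield
# instance here is a tuple of the form (subfields, ind1, ind2, value, field_number),
# e.g. ([], '', '', '20060613120000.0', 3) for a 005 tag, so the value we
# need sits in position 3.)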
for single_tuple in record[tag]: controlfield_value = single_tuple[3] # add the field to the old record newfield_number = record_add_field(rec_old, tag, "", "", controlfield_value) if newfield_number == None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: # For each tag there is a list of tuples representing datafields for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # We add the datafield to the old record if options['verbose'] == 9: print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list) if newfield_number == None: write_message("Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: # Go through each tag in the appended record for tag in record.keys(): # Reference mode append only reference tag if options['mode'] == 'reference': if tag == CFG_BIBUPLOAD_REFERENCE_TAG: for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # We add the datafield to the old record if options['verbose'] == 9: print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list) if newfield_number == None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: if tag == '001': pass else: # if it is a controlfield,just access the value for single_tuple in record[tag]: controlfield_value = single_tuple[3] # add the field to the old record newfield_number = record_add_field(rec_old, tag, "", "", controlfield_value) if newfield_number == None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: # For each tag there is a list of tuples representing datafields for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # We add the datafield to the old record if options['verbose'] == 9: print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list) if newfield_number == None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) return rec_old def update_fft_tag(record, rec_id): """Process and Update FFT tags""" #TO IMPROVE: SELECT THE BIBDOC ID TO DELETE AND FIRST INSERT THE NEW FFT TAGS BEFORE DELETING THE OLD ONE # We delete the bibdoc corresponding to this record delete_bibdoc(rec_id) # We delete the links between bibrec and bibdoc delete_bibrec_bibdoc(rec_id) # We delete the tag 856 from the record record_delete_field(record, '856', '4') # We add the new fft tags record = insert_fft_tags(record, rec_id) return record ###Delete function def delete_tags_to_correct(record, rec_old): """Delete the tags which are existing in both records""" # Browse through all the tags from the marc file for tag in record.keys(): #Do we have to delete only a special tag or all? 
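# (Sketch of the behaviour: with --correct and no --tag, every tag of the
# incoming record that also exists in the stored record, except 001, is
# deleted from the stored record before the new fields are appended; with
# e.g. --correct=100 only the 100 fields are touched.)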
if options['tag'] == None: # See if these tags exist in the old record if rec_old.has_key(tag) and tag != '001': # Delete the tag found write_message(" Delete tag: "+tag+" ind1= "+rec_old[tag][0][1]+" ind2= "+rec_old[tag][0][2], verbose=9) record_delete_field(rec_old, tag, rec_old[tag][0][1], rec_old[tag][0][2]) else: if rec_old.has_key(tag) and options['tag'] == tag: # Delete the tag found write_message(" Delete tag: "+tag+" ind1= "+rec_old[tag][0][1]+" ind2= "+rec_old[tag][0][2], verbose=9) record_delete_field(rec_old, tag, rec_old[tag][0][1], rec_old[tag][0][2]) def delete_bibrec_bibxxx(record, id_bibrec): """Delete the database record from the table bibxxx given in parameters""" # we clear all the rows from bibrec_bibxxx from the old record for tag in record.keys(): if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: # for each name construct the bibrec_bibxxx table name table_name = 'bibrec_bib'+tag[0:2]+'x' # delete all the records with proper id_bibrec query = """DELETE FROM `%s` where id_bibrec = %s""" params = (table_name, id_bibrec) try: run_sql(query % params) except Error, error: write_message(" Error during the delete_bibrec_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) def delete_bibdoc(id_bibrec): """Delete document from bibdoc which correspond to the bibrec id given in parameter""" query = """UPDATE bibdoc SET status='deleted' WHERE id IN (SELECT id_bibdoc FROM bibrec_bibdoc WHERE id_bibrec=%s)""" params = (id_bibrec,) try: run_sql(query, params) except Error, error: write_message(" Error during the delete_bibdoc function : %s " % error, verbose=1, stream=sys.stderr) def delete_bibrec_bibdoc(id_bibrec): """Delete the bibrec record from the table bibrec_bibdoc given in parameter""" # delete all the records with proper id_bibrec query = """DELETE FROM bibrec_bibdoc WHERE id_bibrec=%s""" params = (id_bibrec,) try: run_sql(query, params) except Error, error: write_message(" Error during the delete_bibrec_bibdoc function : %s " % error, verbose=1, stream=sys.stderr) def main(): """main entry point for bibupload""" global options ## parse command line: if len(sys.argv) == 2 and sys.argv[1].isdigit(): ## A - run the task task_id = int(sys.argv[1]) try: if not task_run(task_id): write_message("Error occurred. Exiting.", sys.stderr) except StandardError, erro: write_message("Unexpected error occurred: %s." % erro, sys.stderr) write_message("Traceback is:", sys.stderr) traceback.print_tb(sys.exc_info()[2]) write_message("Exiting.", sys.stderr) task_update_status("ERROR") else: ## B - submit the task # set default values: options["runtime"] = time.strftime("%Y-%m-%d %H:%M:%S") options["sleeptime"] = "" # set user-defined options: error = parse_command() if error == 0: task_submit() else: sys.exit(1) return if __name__ == "__main__": main() diff --git a/modules/websearch/lib/websearch_webcoll.py b/modules/websearch/lib/websearch_webcoll.py index 74789b498..e0dc5d388 100644 --- a/modules/websearch/lib/websearch_webcoll.py +++ b/modules/websearch/lib/websearch_webcoll.py @@ -1,1087 +1,1088 @@ ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. 
## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Creates CDS Invenio collection specific pages, using WML and MySQL configuration tables.""" __revision__ = "$Id$" import calendar import copy import getopt import getpass import marshal import signal import sys import cgi import sre import os import string import zlib import Numeric import time import traceback from invenio.config import \ CFG_CERN_SITE, \ CFG_WEBSEARCH_INSTANT_BROWSE, \ CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, \ cachedir, \ cdslang, \ cdsname, \ weburl from invenio.messages import gettext_set_language, language_list_long from invenio.search_engine import HitSet, search_pattern, get_creation_date, get_field_i18nname from invenio.dbquery import run_sql, escape_string, Error, get_table_update_time from invenio.access_control_engine import acc_authorize_action from invenio.bibrank_record_sorter import get_bibrank_methods from invenio.dateutils import convert_datestruct_to_dategui from invenio.websearch_external_collections import \ external_collection_load_states, \ dico_collection_external_searches, \ external_collection_sort_engine_by_name import invenio.template websearch_templates = invenio.template.load('websearch') ## global vars collection_house = {} # will hold collections we treat in this run of the program; a dict of {collname2, collobject1}, ... options = {} # will hold task options # cfg_cache_last_updated_timestamp_tolerance -- cache timestamp # tolerance (in seconds), to account for the fact that an admin might # accidentally happen to edit the collection definitions at exactly # the same second when some webcoll process was about to be started. # In order to be safe, let's put an exaggerated timestamp tolerance # value such as 20 seconds: cfg_cache_last_updated_timestamp_tolerance = 20 # cfg_cache_last_updated_timestamp_file -- location of the cache # timestamp file: cfg_cache_last_updated_timestamp_file = "%s/collections/last_updated" % cachedir def get_collection(colname): """Return collection object from the collection house for given colname. If does not exist, then create it.""" if not collection_house.has_key(colname): colobject = Collection(colname) collection_house[colname] = colobject return collection_house[colname] ## auxiliary functions: def mymkdir(newdir, mode=0777): """works the way a good mkdir should :) - already exists, silently complete - regular file in the way, raise an exception - parent directory(ies) does not exist, make them as well """ if os.path.isdir(newdir): pass elif os.path.isfile(newdir): raise OSError("a file with the same name as the desired " \ "dir, '%s', already exists." % newdir) else: head, tail = os.path.split(newdir) if head and not os.path.isdir(head): mymkdir(head, mode) if tail: os.umask(022) os.mkdir(newdir, mode) def is_selected(var, fld): "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes." if var == fld: return " selected" else: return "" def write_message(msg, stream=sys.stdout): """Write message and flush output stream (may be sys.stdout or sys.stderr). 
Useful for debugging stuff.""" if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream) return def get_field(recID, tag): "Gets list of field 'tag' for the record with 'recID' system number." out = [] digit = tag[0:2] bx = "bib%sx" % digit bibx = "bibrec_bib%sx" % digit query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec='%s' AND bx.id=bibx.id_bibxxx AND bx.tag='%s'" \ % (bx, bibx, recID, tag) res = run_sql(query) for row in res: out.append(row[0]) return out def print_record(recID, format='hb', ln=cdslang): "Prints record 'recID' formatted accoding to 'format'." out = "" # HTML brief format by default query = "SELECT value FROM bibfmt WHERE id_bibrec='%s' AND format='%s'" % (recID, format) res = run_sql(query, None, 1) if res: # record 'recID' is formatted in 'format', so print it out += "%s" % zlib.decompress(res[0][0]) else: # record 'recID' does not exist in format 'format', so print some default format: # firstly, title: titles = get_field(recID, "245__a") # secondly, authors: authors = get_field(recID, "100__a") + get_field(recID, "700__a") # thirdly, date of creation: dates = get_field(recID, "260__c") # thirdly bis, report numbers: rns = get_field(recID, "037__a") + get_field(recID, "088__a") # fourthly, beginning of abstract: abstracts = get_field(recID, "520__a") # fifthly, fulltext link: urls_z = get_field(recID, "8564_z") urls_u = get_field(recID, "8564_u") out += websearch_templates.tmpl_record_body( weburl = weburl, titles = titles, authors = authors, dates = dates, rns = rns, abstracts = abstracts, urls_u = urls_u, urls_z = urls_z, ln=ln) # at the end of HTML mode, print "Detailed record" and "Mark record" functions: out += websearch_templates.tmpl_record_links( weburl = weburl, recid = recID, ln = ln ) return out class Collection: "Holds the information on collections (id,name,dbquery)." def __init__(self, name=""): "Creates collection instance by querying the DB configuration database about 'name'." self.calculate_reclist_run_already = 0 # to speed things up wihtout much refactoring self.update_reclist_run_already = 0 # to speed things up wihtout much refactoring self.reclist_with_nonpublic_subcolls = HitSet() if not name: self.name = cdsname # by default we are working on the home page self.id = 1 self.dbquery = None self.nbrecs = None self.reclist = HitSet() else: self.name = name query = "SELECT id,name,dbquery,nbrecs,reclist FROM collection WHERE name='%s'" % escape_string(name) try: res = run_sql(query, None, 1) if res: self.id = res[0][0] self.name = res[0][1] self.dbquery = res[0][2] self.nbrecs = res[0][3] try: self.reclist = HitSet(Numeric.loads(zlib.decompress(res[0][5]))) except: self.reclist = HitSet() else: # collection does not exist! self.id = None self.dbquery = None self.nbrecs = None self.reclist = HitSet() except Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) sys.exit(1) def get_name(self, ln=cdslang, name_type="ln", prolog="", epilog="", prolog_suffix=" ", epilog_suffix=""): """Return nicely formatted collection name for language LN. 
The NAME_TYPE may be 'ln' (=long name), 'sn' (=short name), etc.""" out = prolog i18name = "" res = run_sql("SELECT value FROM collectionname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, name_type)) try: i18name += res[0][0] except IndexError: pass if i18name: out += i18name else: out += self.name out += epilog return out def get_ancestors(self): "Returns list of ancestors of the current collection." ancestors = [] id_son = self.id while 1: query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son) res = run_sql(query, None, 1) if res: col_ancestor = get_collection(res[0][1]) ancestors.append(col_ancestor) id_son = res[0][0] else: break ancestors.reverse() return ancestors def restricted_p(self): """Predicate to test if the collection is restricted or not. Return the contect of the `restrited' column of the collection table (typically Apache group). Otherwise return None if the collection is public.""" out = None query = "SELECT restricted FROM collection WHERE id=%d" % self.id res = run_sql(query, None, 1) try: out = res[0][0] except: pass return out def get_sons(self, type='r'): "Returns list of direct sons of type 'type' for the current collection." sons = [] id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC, c.name ASC" % (int(id_dad), type) res = run_sql(query) for row in res: sons.append(get_collection(row[1])) return sons def get_descendants(self, type='r'): "Returns list of all descendants of type 'type' for the current collection." descendants = [] id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC" % (int(id_dad), type) res = run_sql(query) for row in res: col_desc = get_collection(row[1]) descendants.append(col_desc) descendants += col_desc.get_descendants() return descendants def write_cache_file(self, filename='', filebody=''): "Write a file inside collection cache." # open file: dirname = "%s/collections/%d" % (cachedir, self.id) mymkdir(dirname) fullfilename = dirname + "/%s.html" % filename try: os.umask(022) f = open(fullfilename, "w") except IOError, v: try: (code, message) = v except: code = 0 message = v print "I/O Error: " + str(message) + " (" + str(code) + ")" sys.exit(1) # print user info: if options["verbose"] >= 6: write_message("... 
creating %s" % fullfilename) sys.stdout.flush() # print page body: f.write(filebody) # close file: f.close() def update_webpage_cache(self): """Create collection page header, navtrail, body (including left and right stripes) and footer, and call write_cache_file() afterwards to update the collection webpage cache.""" ## do this for each language: for lang, lang_fullname in language_list_long(): # load the right message language _ = gettext_set_language(lang) ## first, update navtrail: for as in range(0, 2): self.write_cache_file("navtrail-as=%s-ln=%s" % (as, lang), self.create_navtrail_links(as, lang)) ## second, update page body: for as in range(0, 2): # do both simple search and advanced search pages: body = websearch_templates.tmpl_webcoll_body( ln=lang, collection=self.name, te_portalbox = self.create_portalbox(lang, 'te'), searchfor = self.create_searchfor(as, lang), np_portalbox = self.create_portalbox(lang, 'np'), narrowsearch = self.create_narrowsearch(as, lang, 'r'), focuson = self.create_narrowsearch(as, lang, "v") + self.create_external_collections_box(), instantbrowse = self.create_instant_browse(as=as, ln=lang), ne_portalbox = self.create_portalbox(lang, 'ne') ) self.write_cache_file("body-as=%s-ln=%s" % (as, lang), body) ## third, write portalboxes: self.write_cache_file("portalbox-tp-ln=%s" % lang, self.create_portalbox(lang, "tp")) self.write_cache_file("portalbox-te-ln=%s" % lang, self.create_portalbox(lang, "te")) self.write_cache_file("portalbox-lt-ln=%s" % lang, self.create_portalbox(lang, "lt")) self.write_cache_file("portalbox-rt-ln=%s" % lang, self.create_portalbox(lang, "rt")) ## fourth, write 'last updated' information: self.write_cache_file("last-updated-ln=%s" % lang, convert_datestruct_to_dategui(time.localtime(), ln=lang)) return def create_navtrail_links(self, as=0, ln=cdslang): """Creates navigation trail links, i.e. links to collection ancestors (except Home collection). If as==1, then links to Advanced Search interfaces; otherwise Simple Search. """ dads = [] for dad in self.get_ancestors(): if dad.name != cdsname: # exclude Home collection dads.append((dad.name, dad.get_name(ln))) return websearch_templates.tmpl_navtrail_links( as=as, ln=ln, dads=dads) def create_portalbox(self, lang=cdslang, position="rt"): """Creates portalboxes of language CDSLANG of the position POSITION by consulting DB configuration database. The position may be: 'lt'='left top', 'rt'='right top', etc.""" out = "" query = "SELECT p.title,p.body FROM portalbox AS p, collection_portalbox AS cp "\ " WHERE cp.id_collection=%d AND p.id=cp.id_portalbox AND cp.ln='%s' AND cp.position='%s' "\ " ORDER BY cp.score DESC" % (self.id, lang, position) res = run_sql(query) for row in res: title, body = row[0], row[1] if title: out += websearch_templates.tmpl_portalbox(title = title, body = body) else: # no title specified, so print body ``as is'' only: out += body return out def create_narrowsearch(self, as=0, ln=cdslang, type="r"): """Creates list of collection descendants of type 'type' under title 'title'. If as==1, then links to Advanced Search interfaces; otherwise Simple Search. 
Suitable for 'Narrow search' and 'Focus on' boxes.""" # get list of sons and analyse it sons = self.get_sons(type) if not sons: return '' # get descendents descendants = self.get_descendants(type) grandsons = [] if CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS: # load grandsons for each son for son in sons: grandsons.append(son.get_sons()) # return "" return websearch_templates.tmpl_narrowsearch( as = as, ln = ln, type = type, father = self, has_grandchildren = len(descendants)>len(sons), sons = sons, display_grandsons = CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, grandsons = grandsons ) def create_external_collections_box(self, ln=cdslang): external_collection_load_states() if not dico_collection_external_searches.has_key(self.id): return "" engines_list = external_collection_sort_engine_by_name(dico_collection_external_searches[self.id]) return websearch_templates.tmpl_searchalso(ln, engines_list, self.id) def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, as=0, ln=cdslang): "Searches database and produces list of last 'rg' records." if self.restricted_p(): return websearch_templates.tmpl_box_restricted_content(ln = ln) else: if self.nbrecs and self.reclist: # firstly, get last 'rg' records: recIDs = Numeric.nonzero(self.reclist._set) passIDs = [] total = len(recIDs) to_display = min(rg, total) for idx in range(total-1, total-to_display-1, -1): recid = recIDs[idx] passIDs.append({'id': recid, 'body': print_record(recid, ln=ln), 'date': get_creation_date(recid, fmt="%Y-%m-%d
%H:%i")}) if self.nbrecs > rg: url = websearch_templates.build_search_url( cc=self.name, jrec=rg+1, ln=ln, as=as) else: url = "" return websearch_templates.tmpl_instant_browse( as=as, ln=ln, recids=passIDs, more_link=url) return websearch_templates.tmpl_box_no_records(ln=ln) def create_searchoptions(self): "Produces 'Search options' portal box." box = "" query = """SELECT DISTINCT(cff.id_field),f.code,f.name FROM collection_field_fieldvalue AS cff, field AS f WHERE cff.id_collection=%d AND cff.id_fieldvalue IS NOT NULL AND cff.id_field=f.id ORDER BY cff.score DESC""" % self.id res = run_sql(query) if res: for row in res: field_id = row[0] field_code = row[1] field_name = row[2] query_bis = """SELECT fv.value,fv.name FROM fieldvalue AS fv, collection_field_fieldvalue AS cff WHERE cff.id_collection=%d AND cff.type='seo' AND cff.id_field=%d AND fv.id=cff.id_fieldvalue ORDER BY cff.score_fieldvalue DESC, cff.score DESC, fv.name ASC""" % (self.id, field_id) res_bis = run_sql(query_bis) if res_bis: values = [{'value' : '', 'text' : 'any' + field_name}] # FIXME: internationalisation of "any" for row_bis in res_bis: values.append({'value' : cgi.escape(row_bis[0], 1), 'text' : row_bis[1]}) box += websearch_templates.tmpl_select( fieldname = field_code, values = values ) return box def create_sortoptions(self, ln=cdslang): "Produces 'Sort options' portal box." # load the right message language _ = gettext_set_language(ln) box = "" query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE id_collection=%d AND cff.type='soo' AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id values = [{'value' : '', 'text': "- %s -" % _("latest first")}] res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': row[1]}) else: for tmp in ('title', 'author', 'report number', 'year'): values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) box = websearch_templates.tmpl_select( fieldname = 'sf', css_class = 'address', values = values ) box += websearch_templates.tmpl_select( fieldname = 'so', css_class = 'address', values = [ {'value' : 'a' , 'text' : _("asc.")}, {'value' : 'd' , 'text' : _("desc.")} ] ) return box def create_rankoptions(self, ln=cdslang): "Produces 'Rank options' portal box." # load the right message language _ = gettext_set_language(ln) values = [{'value' : '', 'text': "- %s %s -" % (string.lower(_("OR")), _("rank by"))}] for (code, name) in get_bibrank_methods(self.id, ln): values.append({'value' : code, 'text': name}) box = websearch_templates.tmpl_select( fieldname = 'sf', css_class = 'address', values = values ) return box def create_displayoptions(self, ln=cdslang): "Produces 'Display options' portal box." # load the right message language _ = gettext_set_language(ln) values = [] for i in ['10', '25', '50', '100', '250', '500']: values.append({'value' : i, 'text' : i + ' ' + _("results")}) box = websearch_templates.tmpl_select( fieldname = 'rg', css_class = 'address', values = values ) if self.get_sons(): box += websearch_templates.tmpl_select( fieldname = 'sc', css_class = 'address', values = [ {'value' : '1' , 'text' : _("split by collection")}, {'value' : '0' , 'text' : _("single list")} ] ) return box def create_formatoptions(self, ln=cdslang): "Produces 'Output format options' portal box." 
# load the right message language _ = gettext_set_language(ln) box = "" values = [] query = """SELECT f.code,f.name FROM format AS f, collection_format AS cf WHERE cf.id_collection=%d AND cf.id_format=f.id ORDER BY cf.score DESC, f.name ASC""" % self.id res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': row[1]}) else: values.append({'value' : 'hb', 'text' : "HTML %s" % _("brief")}) box = websearch_templates.tmpl_select( fieldname = 'of', css_class = 'address', values = values ) return box def create_searchwithin_selection_box(self, fieldname='f', value='', ln='en'): "Produces 'search within' selection box for the current collection." # get values query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='sew' AND cff.id_collection=%d AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id res = run_sql(query) values = [{'value' : '', 'text' : get_field_i18nname("any field", ln)}] if res: for row in res: values.append({'value' : row[0], 'text' : row[1]}) else: if CFG_CERN_SITE: for tmp in ['title', 'author', 'abstract', 'report number', 'year']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) else: for tmp in ['title', 'author', 'abstract', 'keyword', 'report number', 'year', 'fulltext', 'reference']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) return websearch_templates.tmpl_searchwithin_select( fieldname = fieldname, ln = ln, selected = value, values = values ) def create_searchexample(self): "Produces search example(s) for the current collection." out = "$collSearchExamples = getSearchExample(%d, $se);" % self.id return out def create_searchfor(self, as=0, ln=cdslang): "Produces either Simple or Advanced 'Search for' box for the current collection." if as == 1: return self.create_searchfor_advanced(ln) else: return self.create_searchfor_simple(ln) def create_searchfor_simple(self, ln=cdslang): "Produces simple 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_simple( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option = self.create_searchwithin_selection_box(ln=ln), ) def create_searchfor_advanced(self, ln=cdslang): "Produces advanced 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_advanced( ln = ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option_1 = self.create_searchwithin_selection_box('f1', ln=ln), middle_option_2 = self.create_searchwithin_selection_box('f2', ln=ln), middle_option_3 = self.create_searchwithin_selection_box('f3', ln=ln), searchoptions = self.create_searchoptions(), sortoptions = self.create_sortoptions(ln), rankoptions = self.create_rankoptions(ln), displayoptions = self.create_displayoptions(ln), formatoptions = self.create_formatoptions(ln) ) def calculate_reclist(self): """Calculate, set and return the (reclist, reclist_with_nonpublic_subcolls) tuple for given collection.""" if self.calculate_reclist_run_already: # do we have to recalculate? return (self.reclist, self.reclist_with_nonpublic_subcolls) if options["verbose"] >= 6: write_message("... 
calculating reclist of %s" % self.name) reclist = HitSet() # will hold results for public sons only; good for storing into DB reclist_with_nonpublic_subcolls = HitSet() # will hold results for both public and nonpublic sons; good for deducing total # number of documents if not self.dbquery: # A - collection does not have dbquery, so query recursively all its sons # that are either non-restricted or that have the same restriction rules for coll in self.get_sons(): coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist() if ((coll.restricted_p() is None) or (coll.restricted_p() == self.restricted_p())): # add this reclist ``for real'' only if it is public reclist.union(coll_reclist) reclist_with_nonpublic_subcolls.union(coll_reclist_with_nonpublic_subcolls) else: # B - collection does have dbquery, so compute it: reclist = search_pattern(None, self.dbquery) reclist_with_nonpublic_subcolls = copy.deepcopy(reclist) # deduce the number of records: reclist.calculate_nbhits() reclist_with_nonpublic_subcolls.calculate_nbhits() # store the results: self.nbrecs = reclist_with_nonpublic_subcolls._nbhits self.reclist = reclist self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls # last but not least, update the speed-up flag: self.calculate_reclist_run_already = 1 # return the two sets: return (self.reclist, self.reclist_with_nonpublic_subcolls) def update_reclist(self): "Update the record universe for given collection; nbrecs, reclist of the collection table." if self.update_reclist_run_already: # do we have to reupdate? return 0 if options["verbose"] >= 6: write_message("... updating reclist of %s (%s recs)" % (self.name, self.nbrecs)) sys.stdout.flush() try: query = "UPDATE collection SET nbrecs=%d, reclist='%s' WHERE id=%d" % \ (self.nbrecs, escape_string(zlib.compress(Numeric.dumps(self.reclist._set))), self.id) run_sql(query) self.reclist_updated_since_start = 1 except Error, e: print "Database Query Error %d: %s." % (e.args[0], e.args[1]) sys.exit(1) # last but not least, update the speed-up flag: self.update_reclist_run_already = 1 return 0 def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re = sre.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def get_current_time_timestamp(): """Return timestamp corresponding to the current time.""" return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) def compare_timestamps_with_tolerance(timestamp1, timestamp2, tolerance=0): """Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form '2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument (in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2 minus TOLERANCE, 0 if they are equal within TOLERANCE limit, and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE. 
""" # remove any trailing .00 in timestamps: timestamp1 = sre.sub(r'\.[0-9]+$', '', timestamp1) timestamp2 = sre.sub(r'\.[0-9]+$', '', timestamp2) # first convert timestamps to Unix epoch seconds: timestamp1_seconds = calendar.timegm(time.strptime(timestamp1, "%Y-%m-%d %H:%M:%S")) timestamp2_seconds = calendar.timegm(time.strptime(timestamp2, "%Y-%m-%d %H:%M:%S")) # now compare them: if timestamp1_seconds < timestamp2_seconds - tolerance: return -1 elif timestamp1_seconds > timestamp2_seconds + tolerance: return 1 else: return 0 def get_database_last_updated_timestamp(): """Return last updated timestamp for collection-related and record-related database tables. """ database_tables_timestamps = [] database_tables_timestamps.append(get_table_update_time('bibrec')) database_tables_timestamps.append(get_table_update_time('bibfmt')) database_tables_timestamps.append(get_table_update_time('idxWORD%')) database_tables_timestamps.append(get_table_update_time('collection%')) database_tables_timestamps.append(get_table_update_time('portalbox')) database_tables_timestamps.append(get_table_update_time('field%')) database_tables_timestamps.append(get_table_update_time('format%')) database_tables_timestamps.append(get_table_update_time('rnkMETHODNAME')) return max(database_tables_timestamps) def get_cache_last_updated_timestamp(): """Return last updated cache timestamp.""" try: f = open(cfg_cache_last_updated_timestamp_file, "r") except: return "1970-01-01 00:00:00" timestamp = f.read() f.close() return timestamp def set_cache_last_updated_timestamp(timestamp): """Set last updated cache timestamp to TIMESTAMP.""" try: f = open(cfg_cache_last_updated_timestamp_file, "w") except: pass f.write(timestamp) f.close() return timestamp def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_sleep(), got signal %s frame %s" % (sig, frame)) write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_wakeup(), got signal %s frame %s" % (sig, frame)) write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_stop(), got signal %s frame %s" % (sig, frame)) write_message("stopping...") task_update_status("STOPPING") pass # FIXME: is there anything to be done? task_update_status("STOPPED") sys.exit(0) def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" if options["verbose"] >= 9: write_message("task_sig_suicide(), got signal %s frame %s" % (sig, frame)) write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" # do nothing for unknown signals: write_message("unknown signal %d (frame %s) ignored" % (sig, frame)) def authenticate(user, header="WebColl Task Submission", action="runwebcoll"): """Authenticate the user against the user database. Check for its password, if it exists. Check for action access rights. Return user name upon authorization success, do system exit upon authorization failure. 
""" print header print "=" * len(header) if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user ## first check user pw: - res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) + res = run_sql("select id,password from user where email=%s", (user,), 1) + \ + run_sql("select id,password from user where nickname=%s", (user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." % user sys.exit(1) ## secondly check authorization for the action: (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def task_submit(): """Submits task to the BibSched task queue. This is what people will be invoking via command line.""" global options ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] ## authenticate user: user = authenticate(options.get("user", "")) ## submit task: if options["verbose"] >= 9: print "" write_message("storing task options %s\n" % options) task_id = run_sql("""INSERT INTO schTASK (id,proc,user,runtime,sleeptime,status,arguments) VALUES (NULL,'webcoll',%s,%s,%s,'WAITING',%s)""", (user, options["runtime"], options["sleeptime"], marshal.dumps(options))) ## update task number: options["task"] = task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id)) write_message("Task #%d submitted." % task_id) return task_id def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global options return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, options["task"])) def task_update_status(val): """Updates status information in the BibSched task table.""" global options return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, options["task"])) def task_read_status(task_id): """Read status information in the BibSched task table.""" res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1) try: out = res[0][0] except: out = 'UNKNOWN' return out def task_get_options(id): """Returns options for the task 'id' read from the BibSched task queue table.""" out = {} res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='webcoll'", (id,)) try: out = marshal.loads(res[0][0]) except: write_message("Error: WebColl task %d does not seem to exist." % id) sys.exit(1) return out def task_run(task_id): """Run the WebColl task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call. The task will update collection reclist cache and collection web pages for given collection. (default is all). Arguments described in usage() function. Return 1 in case of success and 0 in case of failure.""" global options task_run_start_timestamp = get_current_time_timestamp() options = task_get_options(task_id) # get options from BibSched task table ## check task id: if not options.has_key("task"): write_message("Error: The task #%d does not seem to be a WebColl task." % task_id) return 0 ## check task status: task_status = task_read_status(task_id) if task_status != "WAITING": write_message("Error: The task #%d is %s. I expected WAITING." 
% (task_id, task_status)) return 0 ## we can run the task now: if options["verbose"]: write_message("Task #%d started." % task_id) task_update_status("RUNNING") ## initialize signal handler: signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) colls = [] # decide whether we need to run or not, by comparing last updated timestamps: if options["verbose"] >= 3: write_message("Database timestamp is %s." % get_database_last_updated_timestamp()) write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp()) if options.has_key("force") or \ compare_timestamps_with_tolerance(get_database_last_updated_timestamp(), get_cache_last_updated_timestamp(), cfg_cache_last_updated_timestamp_tolerance) >= 0: ## either forced update was requested or cache is not up to date, so recreate it: # firstly, decide which collections to do: if options.has_key("collection"): coll = get_collection(options["collection"]) if coll.id == None: usage(1, 'Collection %s does not exist' % coll.name) colls.append(coll) else: res = run_sql("SELECT name FROM collection ORDER BY id") for row in res: colls.append(get_collection(row[0])) # secondly, update collection reclist cache: i = 0 for coll in colls: i += 1 if options["verbose"]: write_message("%s / reclist cache update" % coll.name) coll.calculate_reclist() coll.update_reclist() task_update_progress("Part 1/2: done %d/%d" % (i, len(colls))) # thirdly, update collection webpage cache: i = 0 for coll in colls: i += 1 if options["verbose"]: write_message("%s / web cache update" % coll.name) coll.update_webpage_cache() task_update_progress("Part 2/2: done %d/%d" % (i, len(colls))) # finally update the cache last updated timestamp: # (but only when all collections were updated, not when only # some of them were forced-updated as per admin's demand) if not options.has_key("collection"): set_cache_last_updated_timestamp(task_run_start_timestamp) if options["verbose"] >= 3: write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp()) else: ## cache up to date, we don't have to run if options["verbose"]: write_message("Collection cache is up to date, no need to run.") pass ## we are done: task_update_progress("Done.") task_update_status("DONE") if options["verbose"]: write_message("Task #%d finished." % task_id) return 1 def usage(exitcode=1, msg=""): """Prints usage info.""" if msg: sys.stderr.write("Error: %s.\n" % msg) sys.stderr.write("Usage: %s [options]\n" % sys.argv[0]) sys.stderr.write("Command options:\n") sys.stderr.write(" -c, --collection\t Update cache for the given collection only. [all]\n") sys.stderr.write(" -f, --force\t Force update even if cache is up to date. 
[no]\n") sys.stderr.write("Scheduling options:\n") sys.stderr.write(" -u, --user=USER \t User name to submit the task as, password needed.\n") sys.stderr.write(" -t, --runtime=TIME \t Time to execute the task (now), e.g.: +15s, 5m, 3h, 2002-10-27 13:57:26\n") sys.stderr.write(" -s, --sleeptime=SLEEP \t Sleeping frequency after which to repeat task (no), e.g.: 30m, 2h, 1d\n") sys.stderr.write("General options:\n") sys.stderr.write(" -h, --help \t\t Print this help.\n") sys.stderr.write(" -V, --version \t\t Print version information.\n") sys.stderr.write(" -v, --verbose=LEVEL \t Verbose level (from 0 to 9, default 1).\n") sys.stderr.write("""Description: %s updates the collection cache (record universe for a given collection plus web page elements) based on WML and DB configuration parameters. If the collection name is passed as the second argument, it'll update this collection only. If the collection name is immediately followed by a plus sign, it will also update all its desdendants. The top-level collection name may be entered as the void string.\n""" % sys.argv[0]) sys.exit(exitcode) def main(): """Main function that analyzes command line input and calls whatever is appropriate. Useful for learning on how to write BibSched tasks.""" global options ## parse command line: if len(sys.argv) == 2 and sys.argv[1].isdigit(): ## A - run the task task_id = int(sys.argv[1]) try: if not task_run(task_id): write_message("Error occurred. Exiting.", sys.stderr) except StandardError, e: write_message("Unexpected error occurred: %s." % e, sys.stderr) write_message("Traceback is:", sys.stderr) traceback.print_tb(sys.exc_info()[2]) write_message("Exiting.", sys.stderr) task_update_status("ERROR") else: ## B - submit the task # set default values: options["runtime"] = time.strftime("%Y-%m-%d %H:%M:%S") options["verbose"] = 1 options["sleeptime"] = "" # set user-defined options: try: opts, args = getopt.getopt(sys.argv[1:], "hVv:u:s:t:c:f", ["help", "version", "verbose=","user=","sleep=","time=","collection=","force"]) except getopt.GetoptError, err: usage(1, err) try: for opt in opts: if opt[0] in ["-h", "--help"]: usage(0) elif opt[0] in ["-V", "--version"]: print __revision__ sys.exit(0) elif opt[0] in [ "-u", "--user"]: options["user"] = opt[1] elif opt[0] in ["-v", "--verbose"]: options["verbose"] = int(opt[1]) elif opt[0] in [ "-s", "--sleeptime" ]: get_datetime(opt[1]) # see if it is a valid shift options["sleeptime"] = opt[1] elif opt[0] in [ "-t", "--runtime" ]: options["runtime"] = get_datetime(opt[1]) elif opt[0] in [ "-c", "--collection"]: options["collection"] = opt[1] elif opt[0] in [ "-f", "--force"]: options["force"] = 1 else: usage(1) except StandardError, e: usage(e) task_submit() return ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/websession/lib/webuser.py b/modules/websession/lib/webuser.py index f5bf04183..ee36a0dc5 100644 --- a/modules/websession/lib/webuser.py +++ b/modules/websession/lib/webuser.py @@ -1,738 +1,739 @@ # -*- coding: utf-8 -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. 
## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ This file implements all methods necessary for working with users and sessions in CDS Invenio. Contains methods for logging/registration when a user log/register into the system, checking if it is a guest user or not. At the same time this presents all the stuff it could need with sessions managements, working with websession. It also contains Apache-related user authentication stuff. """ __revision__ = "$Id$" from marshal import loads, dumps from zlib import compress, decompress import time import os import crypt import string import smtplib import sre from invenio.config import \ CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS, \ CFG_ACCESS_CONTROL_LEVEL_GUESTS, \ CFG_ACCESS_CONTROL_LEVEL_SITE, \ CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN, \ CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS, \ CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT, \ CFG_APACHE_GROUP_FILE, \ CFG_APACHE_PASSWORD_FILE, \ adminemail, \ cdslang, \ cdsname, \ supportemail, \ sweburl, \ tmpdir, \ version, \ weburl from invenio import session, websession from invenio.dbquery import run_sql, escape_string, OperationalError from invenio.websession import pSession, pSessionMapping from invenio.session import SessionError from invenio.access_control_config import * from invenio.access_control_engine import acc_authorize_action from invenio.access_control_admin import acc_findUserRoleActions from invenio.messages import gettext_set_language import invenio.template tmpl = invenio.template.load('websession') sre_invalid_nickname = sre.compile(""".*[,'@]+.*""") # pylint: disable-msg=C0301 def createGuestUser(): """Create a guest user , insert into user null values in all fields createGuestUser() -> GuestUserID """ if CFG_ACCESS_CONTROL_LEVEL_GUESTS == 0: try: return run_sql("insert into user (email, note) values ('', '1')") except OperationalError: return None elif CFG_ACCESS_CONTROL_LEVEL_GUESTS >= 1: try: return run_sql("insert into user (email, note) values ('', '0')") except OperationalError: return None def page_not_authorized(req, referer='', uid='', text='', navtrail='', ln=cdslang): """Show error message when account is not activated""" from invenio.webpage import page _ = gettext_set_language(ln) if not CFG_ACCESS_CONTROL_LEVEL_SITE: title = CFG_WEBACCESS_MSGS[5] if not uid: uid = getUid(req) try: res = run_sql("SELECT email FROM user WHERE id=%s" % uid) if res and res[0][0]: if text: body = text else: body = "%s %s" % (CFG_WEBACCESS_WARNING_MSGS[9] % res[0][0], ("%s %s" % (CFG_WEBACCESS_MSGS[0] % referer, CFG_WEBACCESS_MSGS[1]))) else: if text: body = text else: if CFG_ACCESS_CONTROL_LEVEL_GUESTS == 1: body = CFG_WEBACCESS_MSGS[3] else: body = CFG_WEBACCESS_WARNING_MSGS[4] + CFG_WEBACCESS_MSGS[2] except OperationalError, e: body = _("Database problem") + ': ' + str(e) elif CFG_ACCESS_CONTROL_LEVEL_SITE == 1: title = CFG_WEBACCESS_MSGS[8] body = "%s %s" % (CFG_WEBACCESS_MSGS[7], CFG_WEBACCESS_MSGS[2]) elif CFG_ACCESS_CONTROL_LEVEL_SITE == 2: title = CFG_WEBACCESS_MSGS[6] body = "%s %s" % (CFG_WEBACCESS_MSGS[4], CFG_WEBACCESS_MSGS[2]) return page(title=title, 
uid=getUid(req), body=body, navtrail=navtrail, req=req) def getUid (req): """Return user ID taking it from the cookie of the request. Includes control mechanism for the guest users, inserting in the database table when need be, raising the cookie back to the client. User ID is set to 0 when client refuses cookie or we are in the read-only site operation mode. User ID is set to -1 when we are in the permission denied site operation mode. getUid(req) -> userId """ if CFG_ACCESS_CONTROL_LEVEL_SITE == 1: return 0 if CFG_ACCESS_CONTROL_LEVEL_SITE == 2: return -1 guest = 0 sm = session.MPSessionManager(pSession, pSessionMapping()) try: s = sm.get_session(req) except SessionError: sm.revoke_session_cookie (req) s = sm.get_session(req) userId = s.getUid() if userId == -1: # first time, so create a guest user s.setUid(createGuestUser()) userId = s.getUid() guest = 1 sm.maintain_session(req, s) if guest == 0: guest = isGuestUser(userId) if guest: if CFG_ACCESS_CONTROL_LEVEL_GUESTS == 0: return userId elif CFG_ACCESS_CONTROL_LEVEL_GUESTS >= 1: return -1 else: res = run_sql("SELECT note FROM user WHERE id=%s" % userId) if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 0: return userId elif CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS >= 1 and res and res[0][0] in [1, "1"]: return userId else: return -1 def setUid(req, uid): """It sets the userId into the session, and raise the cookie to the client. """ sm = session.MPSessionManager(pSession, pSessionMapping()) try: s = sm.get_session(req) except SessionError: sm.revoke_session_cookie(req) s = sm.get_session(req) s.setUid(uid) sm.maintain_session(req, s) return uid def get_user_info(uid, ln=cdslang): """Get infos for a given user. @param uid: user id (int) @return tuple: (uid, nickname, display_name) """ _ = gettext_set_language(ln) query = """SELECT id, nickname FROM user WHERE id=%i""" res = run_sql(query%uid) if res: if res[0]: user = list(res[0]) if user[1]: user.append(user[1]) else: user[1] = str(user[0]) user.append(_("user") + ' #' + str(user[0])) return tuple(user) return (uid, '', _("N/A")) def isGuestUser(uid): """It Checks if the userId corresponds to a guestUser or not isGuestUser(uid) -> boolean """ out = 1 try: res = run_sql("select email from user where id=%s", (uid,)) if res: if res[0][0]: out = 0 except OperationalError: pass return out def isUserSubmitter(uid): u_email = get_email(uid) res = run_sql("select * from sbmSUBMISSIONS where email=%s", (u_email,)) if len(res) > 0: return 1 else: return 0 def isUserReferee(uid): res = run_sql("select sdocname from sbmDOCTYPE") for row in res: doctype = row[0] categ = "*" (auth_code, auth_message) = acc_authorize_action(uid, "referee", doctype=doctype, categ=categ) if auth_code == 0: return 1 res2 = run_sql("select sname from sbmCATEGORIES where doctype=%s", (doctype,)) for row2 in res2: categ = row2[0] (auth_code, auth_message) = acc_authorize_action(uid, "referee", doctype=doctype, categ=categ) if auth_code == 0: return 1 return 0 def isUserAdmin(uid): "Return 1 if the user UID has some admin rights; 0 otherwise." out = 0 if acc_findUserRoleActions(uid): out = 1 return out def nickname_valid_p(nickname): """Check whether wanted NICKNAME supplied by the user is valid. At the moment we just check whether it is not empty, does not contain blanks or @, is not equal to `guest', etc. This check relies on sre_invalid_nickname regexp (see above) Return 1 if nickname is okay, return 0 if it is not. 
""" if nickname and \ not(nickname.startswith(' ') or nickname.endswith(' ')) and \ nickname.lower() != 'guest': if not sre_invalid_nickname.match(nickname): return 1 return 0 def email_valid_p(email): """Check whether wanted EMAIL address supplied by the user is valid. At the moment we just check whether it contains '@' and whether it doesn't contain blanks. We also check the email domain if CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN is set. Return 1 if email is okay, return 0 if it is not. """ if (string.find(email, "@") <= 0) or (string.find(email, " ") > 0): return 0 elif CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN: if not email.endswith(CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN): return 0 return 1 def registerUser(req, email, passw, nickname, register_without_nickname=False): """Register user with the desired values of NICKNAME, EMAIL and PASSW. If REGISTER_WITHOUT_NICKNAME is set to True, then ignore desired NICKNAME and do not set any. This is suitable for external authentications so that people can login without having to register an internal account first. Return 0 if the registration is successful, 1 if email is not valid, 2 if nickname is not valid, 3 if email is already in the database, 4 if nickname is already in the database, 5 when users cannot register themselves because of the site policy. """ # is email valid? if not email_valid_p(email): return 1 # is email already taken? res = run_sql("SELECT * FROM user WHERE email=%s", (email,)) if len(res) > 0: return 3 if register_without_nickname: # ignore desired nick and use default empty string one: nickname = "" else: # is nickname valid? if not nickname_valid_p(nickname): return 2 # is nickname already taken? res = run_sql("SELECT * FROM user WHERE nickname=%s", (nickname,)) if len(res) > 0: return 4 # okay, go on and register the user: if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 0: activated = 1 elif CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 1: activated = 0 elif CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS >= 2: return 5 user_preference = get_default_user_preferences() setUid(req, run_sql("INSERT INTO user (nickname, email, password, note, settings) VALUES (%s,%s,%s,%s,%s)", (nickname, email, passw, activated, serialize_via_marshal(user_preference),))) if CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT: sendNewUserAccountWarning(email, email, passw) if CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS: sendNewAdminAccountWarning(email, adminemail) return 0 def updateDataUser(uid, email, password, nickname): """Update user data. Used when a user changed his email or password or nickname. """ if email == 'guest': return 0 if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS >= 2: run_sql("update user set password=%s where id=%s", (password, uid)) else: run_sql("update user set email=%s,password=%s where id=%s", (email, password, uid)) if nickname and nickname != '': run_sql("update user set nickname=%s where id=%s", (nickname, uid)) return 1 def loginUser(req, p_un, p_pw, login_method): """It is a first simple version for the authentication of user. 
It returns the id of the user, for checking afterwards if the login is correct """ # p_un passed may be an email or a nickname: p_email = get_email_from_username(p_un) # go on with the old stuff based on p_email: user_prefs = get_user_preferences(emailUnique(p_email)) if user_prefs and login_method != user_prefs["login_method"]: if CFG_EXTERNAL_AUTHENTICATION.has_key(user_prefs["login_method"]): return ([], p_email, p_pw, 11) if not CFG_EXTERNAL_AUTHENTICATION.has_key(login_method): return ([], p_email, p_pw, 12) if CFG_EXTERNAL_AUTHENTICATION[login_method][0]: p_email = CFG_EXTERNAL_AUTHENTICATION[login_method][0].auth_user(p_email, p_pw) if p_email: p_pw = givePassword(p_email) if not p_pw or p_pw < 0: import random p_pw = int(random.random() * 1000000) if registerUser(req, p_email, p_pw, "", register_without_nickname=True) != 0: return ([], p_email, p_pw, 13) else: query_result = run_sql("SELECT id from user where email=%s and password=%s", (p_email, p_pw,)) user_prefs = get_user_preferences(query_result[0][0]) user_prefs["login_method"] = login_method set_user_preferences(query_result[0][0], user_prefs) else: return ([], p_email, p_pw, 10) query_result = run_sql("SELECT id from user where email=%s and password=%s", (p_email, p_pw,)) if query_result: prefered_login_method = get_user_preferences(query_result[0][0])['login_method'] else: return ([], p_email, p_pw, 14) if login_method != prefered_login_method: if CFG_EXTERNAL_AUTHENTICATION.has_key(prefered_login_method): return ([], p_email, p_pw, 11) return (query_result, p_email, p_pw, 0) def logoutUser(req): """It logout the user of the system, creating a guest user. """ getUid(req) sm = session.MPSessionManager(pSession, pSessionMapping()) try: s = sm.get_session(req) except SessionError: sm.revoke_session_cookie(req) s = sm.get_session(req) id1 = createGuestUser() s.setUid(id1) sm.maintain_session(req, s) return id1 def username_exists_p(username): """Check if USERNAME exists in the system. Username may be either nickname or email. Return 1 if it does exist, 0 if it does not. """ if username == "": # return not exists if asked for guest users return 0 - res = run_sql("SELECT email FROM user WHERE nickname=%s OR email=%s", - (username, username,)) + res = run_sql("SELECT email FROM user WHERE email=%s", (username,)) + \ + run_sql("SELECT email FROM user WHERE nickname=%s", (username,)) if len(res) > 0: return 1 return 0 def emailUnique(p_email): """Check if the email address only exists once. If yes, return userid, if not, -1 """ query_result = run_sql("select id, email from user where email=%s", (p_email,)) if len(query_result) == 1: return query_result[0][0] elif len(query_result) == 0: return 0 return -1 def nicknameUnique(p_nickname): """Check if the nickname only exists once. If yes, return userid, if not, -1 """ query_result = run_sql("select id, nickname from user where nickname=%s", (p_nickname,)) if len(query_result) == 1: return query_result[0][0] elif len(query_result) == 0: return 0 return -1 def update_Uid(req, p_email, p_pw): """It updates the userId of the session. It is used when a guest user is logged in succesfully in the system with a given email and password """ query_ID = int(run_sql("select id from user where email=%s and password=%s", (p_email, p_pw))[0][0]) setUid(req, query_ID) return query_ID def givePassword(email): """ It checks in the database the password for a given email. 
It is used to send the password to the email of the user.It returns the password if the user exists, otherwise it returns -999 """ query_pass = run_sql("select password from user where email =%s", (email,)) if len(query_pass)>0: return query_pass[0][0] return -999 def sendNewAdminAccountWarning(newAccountEmail, sendTo, ln=cdslang): """Send an email to the address given by sendTo about the new account newAccountEmail.""" _ = gettext_set_language(ln) fromaddr = "From: %s" % supportemail toaddrs = "To: %s" % sendTo to = toaddrs + "\n" sub = "Subject: New account on '%s'" % cdsname if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 1: sub += " - PLEASE ACTIVATE" sub += "\n\n" body = "A new account has been created on '%s'" % cdsname if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS == 1: body += " and is awaiting activation" body += ":\n\n" body += " Username/Email: %s\n\n" % newAccountEmail body += "You can approve or reject this account request at: %s/admin/webaccess/webaccessadmin.py/manageaccounts\n" % weburl body += "\n---------------------------------" body += "\n%s" % cdsname body += "\nContact: %s" % supportemail msg = to + sub + body server = smtplib.SMTP('localhost') server.set_debuglevel(1) try: server.sendmail(fromaddr, toaddrs, msg) except smtplib.SMTPRecipientsRefused: return 0 server.quit() return 1 def sendNewUserAccountWarning(newAccountEmail, sendTo, password, ln=cdslang): """Send an email to the address given by sendTo about the new account newAccountEmail.""" _ = gettext_set_language(ln) fromaddr = "From: %s" % supportemail toaddrs = "To: %s" % sendTo to = toaddrs + "\n" sub = "Subject: Your account created on '%s'\n\n" % cdsname body = "You have created a new account on '%s':\n\n" % cdsname body += " Username/Email: %s\n" % newAccountEmail body += " Password: %s\n\n" % ("*" * len(password)) if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS >= 1: body += "This account is awaiting approval by the site administrators and therefore cannot be used as of yet.\nYou will receive an email notification as soon as your account request has been processed.\n" body += "\n---------------------------------" body += "\n%s" % cdsname body += "\nContact: %s" % supportemail msg = to + sub + body server = smtplib.SMTP('localhost') server.set_debuglevel(1) try: server.sendmail(fromaddr, toaddrs, msg) except smtplib.SMTPRecipientsRefused: return 0 server.quit() return 1 def get_email(uid): """Return email address of the user uid. Return string 'guest' in case the user is not found.""" out = "guest" res = run_sql("SELECT email FROM user WHERE id=%s", (uid,), 1) if res and res[0][0]: out = res[0][0] return out def get_email_from_username(username): """Return email address of the user corresponding to USERNAME. The username may be either nickname or email. Return USERNAME untouched if not found in the database or if found several matching entries. """ out = username - res = run_sql("SELECT email FROM user WHERE email=%s OR nickname=%s", (username, username,), 1) + res = run_sql("SELECT email FROM user WHERE email=%s", (username,), 1) + \ + run_sql("SELECT email FROM user WHERE nickname=%s", (username,), 1) if res and len(res) == 1: out = res[0][0] return out def get_password(uid): """Return password of the user uid. Return None in case the user is not found.""" out = None res = run_sql("SELECT password FROM user WHERE id=%s", (uid,), 1) if res and res[0][0]: out = res[0][0] return out def get_nickname(uid): """Return nickname of the user uid. 
def get_nickname(uid):
    """Return the nickname of the user uid.  Return None in case the user is
    not found."""
    out = None
    res = run_sql("SELECT nickname FROM user WHERE id=%s", (uid,), 1)
    if res and res[0][0]:
        out = res[0][0]
    return out

def get_nickname_or_email(uid):
    """Return the nickname (preferred) or the email address of the user uid.
    Return string 'guest' in case the user is not found."""
    out = "guest"
    res = run_sql("SELECT nickname, email FROM user WHERE id=%s", (uid,), 1)
    if res and res[0]:
        if res[0][0]:
            out = res[0][0]
        elif res[0][1]:
            out = res[0][1]
    return out

def create_userinfobox_body(req, uid, language="en"):
    """Create user info box body for user UID in language LANGUAGE."""
    if req:
        if req.subprocess_env.has_key('HTTPS') \
           and req.subprocess_env['HTTPS'] == 'on':
            url_referer = sweburl + req.unparsed_uri
        else:
            url_referer = weburl + req.unparsed_uri
    else:
        url_referer = weburl

    try:
        return tmpl.tmpl_create_userinfobox(ln=language,
                                            url_referer=url_referer,
                                            guest=isGuestUser(uid),
                                            username=get_nickname_or_email(uid),
                                            submitter=isUserSubmitter(uid),
                                            referee=isUserReferee(uid),
                                            admin=isUserAdmin(uid),
                                            )
    except OperationalError:
        return ""

def list_registered_users():
    """List all registered users."""
    return run_sql("SELECT id,email FROM user where email!=''")

def list_users_in_role(role):
    """List all users of a given role (see table accROLE).
       @param role: role of user (string)
       @return list of uids
    """
    query = """SELECT uacc.id_user
               FROM user_accROLE uacc JOIN accROLE acc
                    ON uacc.id_accROLE=acc.id
               WHERE acc.name='%s'"""
    res = run_sql(query % escape_string(role))
    if res:
        return map(lambda x: int(x[0]), res)
    return []

def list_users_in_roles(role_list):
    """List all users of the given roles (see table accROLE).
       @param role_list: list of roles [string]
       @return list of uids
    """
    if not (type(role_list) is list or type(role_list) is tuple):
        role_list = [role_list]
    params = ''
    query = """SELECT distinct(uacc.id_user)
               FROM user_accROLE uacc JOIN accROLE acc
                    ON uacc.id_accROLE=acc.id
               %s"""
    if len(role_list) > 0:
        params = 'WHERE '
        for role in role_list[:-1]:
            params += "acc.name='%s' OR " % escape_string(role)
        params += "acc.name='%s'" % escape_string(role_list[-1])
    res = run_sql(query % params)
    if res:
        return map(lambda x: int(x[0]), res)
    return []
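## Illustrative sketch (not part of this patch): list_users_in_roles() above
## assembles its WHERE clause by escaping each role name and joining the
## "acc.name='...'" tests with OR.  The hypothetical helper below reproduces
## that string assembly with a trivial stand-in for escape_string(), so the
## resulting clause can be inspected without a database.
def _demo_roles_where_clause(role_list):
    """Build the WHERE fragment the way list_users_in_roles() does."""
    fake_escape = lambda s: s.replace("'", "\\'")   # stand-in for escape_string()
    if not role_list:
        return ''
    params = 'WHERE '
    for role in role_list[:-1]:
        params += "acc.name='%s' OR " % fake_escape(role)
    params += "acc.name='%s'" % fake_escape(role_list[-1])
    return params
## e.g. _demo_roles_where_clause(['referee', 'admin']) returns
## "WHERE acc.name='referee' OR acc.name='admin'".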
## --- follow some functions for Apache user/group authentication

def auth_apache_user_p(user, password, apache_password_file=CFG_APACHE_PASSWORD_FILE):
    """Check whether the user-supplied credentials correspond to a valid entry
    in the Apache password data file.
    Return 0 in case of failure, 1 in case of success."""
    try:
        if not apache_password_file.startswith("/"):
            apache_password_file = tmpdir + "/" + apache_password_file
        dummy, pipe_output = os.popen2(["grep", "^" + user + ":", apache_password_file], 'r')
        line = pipe_output.readlines()[0]
        password_apache = string.split(string.strip(line), ":")[1]
    except:
        # no pw found, so return not-allowed status
        return 0
    salt = password_apache[:2]
    if crypt.crypt(password, salt) == password_apache:
        return 1
    else:
        return 0

def auth_apache_user_in_groups(user, apache_group_file=CFG_APACHE_GROUP_FILE):
    """Return the list of Apache groups to which the Apache user belongs."""
    out = []
    try:
        if not apache_group_file.startswith("/"):
            apache_group_file = tmpdir + "/" + apache_group_file
        dummy, pipe_output = os.popen2(["grep", user, apache_group_file], 'r')
        for line in pipe_output.readlines():
            out.append(string.split(string.strip(line), ":")[0])
    except:
        # no groups found, so return empty list
        pass
    return out

def auth_apache_user_collection_p(user, password, coll):
    """Check whether the user-supplied credentials correspond to a valid entry
    in the Apache password data file, and whether this user is authorized to
    see the given collection.
    Return 0 in case of failure, 1 in case of success."""
    from invenio.search_engine import coll_restricted_p, coll_restricted_group
    if not auth_apache_user_p(user, password):
        return 0
    if not coll_restricted_p(coll):
        return 1
    if coll_restricted_group(coll) in auth_apache_user_in_groups(user):
        return 1
    else:
        return 0

def get_user_preferences(uid):
    pref = run_sql("SELECT id, settings FROM user WHERE id=%s", (uid,))
    if pref:
        try:
            return deserialize_via_marshal(pref[0][1])
        except:
            return get_default_user_preferences()
    return None

def set_user_preferences(uid, pref):
    run_sql("UPDATE user SET settings='%s' WHERE id=%s" % (serialize_via_marshal(pref), uid))

def get_default_user_preferences():
    user_preference = {
        'login_method': ''}

    for system in CFG_EXTERNAL_AUTHENTICATION.keys():
        if CFG_EXTERNAL_AUTHENTICATION[system][1]:
            user_preference['login_method'] = system
            break
    return user_preference

def serialize_via_marshal(obj):
    """Serialize Python object via marshal into a compressed string."""
    return escape_string(compress(dumps(obj)))

def deserialize_via_marshal(string):
    """Decompress and deserialize string into a Python object via marshal."""
    return loads(decompress(string))
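## Illustrative sketch (not part of this patch): get_user_preferences() and
## set_user_preferences() above persist the settings dictionary via
## serialize_via_marshal()/deserialize_via_marshal(), i.e. marshal.dumps()
## plus zlib compression; escape_string() appears to protect the blob only
## while it is embedded in the SQL statement.  The hypothetical round trip
## below checks the marshal/zlib part without touching the database.
def _demo_preferences_roundtrip():
    """Serialize and deserialize a sample preferences dictionary."""
    from marshal import dumps, loads
    from zlib import compress, decompress
    prefs = {'login_method': ''}              # shape of get_default_user_preferences()
    blob = compress(dumps(prefs))             # what gets stored (before SQL escaping)
    return loads(decompress(blob)) == prefs   # True if the round trip is lossless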