diff --git a/modules/bibformat/bin/bibreformat.in b/modules/bibformat/bin/bibreformat.in index 03a327d2f..7d86144be 100644 --- a/modules/bibformat/bin/bibreformat.in +++ b/modules/bibformat/bin/bibreformat.in @@ -1,694 +1,694 @@ ## $Id$ ## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## read config variables: #include "config.wml" #include "configbis.wml" ## start Python: #! ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. ## import interesting modules: pylibdir = "/python" ## okay, rest of the Python code goes below ####### ## version number: __version__ = "$Id$" ## import interesting modules: try: import sys sys.path.append('%s' % pylibdir) from cdsware.dbquery import run_sql from cdsware.config import * from cdsware.search_engine import perform_request_search from cdsware.search_engine import print_record import getopt import marshal import signal import string import sys import os import re import time import MySQLdb except ImportError, e: print "Error: %s" % e import sys sys.exit(1) sql_queries = [] # holds SQL queries to be executed cds_query = {} # holds CDS query parameters (fields, collection, pattern) process_format = 0 # flag, process records without created format process = 1 # flag, process records (unless count only) fmt = "hb" # default format to be processed sleeptime = "" # default sleeptime format_string = "%Y-%m-%d %H:%M:%S" # date/time format sched_time = time.strftime(format_string) # scheduled execution time in the date/time format ### sql commands to be executed during the script run ### sql = { "all" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt, "last": "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt, "q1" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'", "q2" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt } ### run the bibreformat task bibsched scheduled ### def bibreformat_task(sql_queries, cds_query, process_format): global process, fmt t1 = os.times()[4] ### Query the database ### if process_format: print "Querying database for records with missing format ..." without_format = without_fmt() recIDs = [] if cds_query['field'] != "" or \ cds_query['collection'] != "" or \ cds_query['pattern'] != "": print "Querying database for records with old format (CDS query)..." - res = perform_request_search(req=None, of='id', c=cds_query['collection'], p=cds_query['pattern'], f=cds_query['field'])#.tolist() + res = perform_request_search(req=None, of='id', c=cds_query['collection'], p=cds_query['pattern'], f=cds_query['field']) for item in res: recIDs.append(item) for sql_query in sql_queries: print "Querying database for records with old format (SQL query) ..." res = run_sql(sql_query) for item in res: recIDs.append(item[0]) ### list of corresponding record IDs was retrieved ### bibformat the records selected if process_format: print "Records to be processed: %d" % (len(recIDs)+len(without_format)) print "Out of it records without created format: %d" % len(without_format) else: print "Records to be processed: %d" % (len(recIDs)) ### Initialize main loop total_rec = 0 # Total number of records xml_content = '' # hold the contents tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call ### Iterate over all records prepared in lists I (option) if process: iterate_over(recIDs, weburl, fmt) ### Iterate over all records prepared in list II (no_format) if process_format and process: iterate_over(without_format, weburl, fmt) ### Final statistics t2 = os.times()[4] elapsed = t2 - t1 message = "total records processed: %d" % total_rec print message message = "total processing time: %2f sec" % elapsed print message message = "Time spent on external call (os.system):" print message message = " bibformat: %2f sec" % tbibformat print message message = " bibupload: %2f sec" % tbibupload print message ### Result set operations ### def lhdiff(l1, l2): "Does list difference via intermediate hash." d = {} ld = [] for e in l2: d[e]=1 for e in l1: if not d.has_key(e): ld.append(e) return ld ### Result set operations ### def ldiff(l1, l2): "Returns l1 - l2." ld = [] for e in l1: if not e in l2: ld.append(e) return ld ### Identify recIDs of records with missing format ### def without_fmt(): "List of record IDs to be reformated, not having the specified format yet" global fmt xm1, xm2, format1, format2 = [],[],[],[] q1 = sql['q1'] # "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'" q2 = sql['q2'] # "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt ## get complete recID list of xm formatted records xm1 = run_sql(q1) for item in xm1: xm2.append(item[0]) ## get complete recID list of formatted records format1 = run_sql(q2) for item in format1: format2.append(item[0]) return lhdiff(xm2,format2) ### Bibreformat all selected records ### def iterate_over(list, weburl, fmt): "Iterate odver list of IDs" n_rec = 0 n_max = 10000 total_rec = 0 # Total number of records xml_content = '' # hold the contents tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call for record in list: n_rec = n_rec + 1 total_rec = total_rec + 1 message = "Processing record: %d" % (record) print message query = "id=%d&of=xm" % (record) count = 0 contents = print_record(record, 'xm') while (contents == "") and (count < 10): contents = print_record(record, 'xm') count = count + 1 time.sleep(10) if count == 10: sys.stderr.write("Failed to download %s from %s after 10 attempts... terminating" % (query, weburl)) sys.exit(0) xml_content = xml_content + contents if xml_content: if n_rec >= n_max: finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S')) filename = "%s/bibreformat.xml" % tmpdir filehandle = open(filename ,"w") filehandle.write(xml_content) filehandle.close() ### bibformat external call ### t11 = os.times()[4] message = "START bibformat external call" print message command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message tbibformat = tbibformat + (t22 - t11) ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message command = "%s/bibupload -f %s" % (bindir,finalfilename) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message tbibupload = tbibupload + (t22- t11) n_rec = 0 xml_content = '' ### Process the last re-formated chunk ### if n_rec > 0: print "Processing last record set (%d)" % n_rec finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S')) filename = "%s/bibreformat.xml" % tmpdir filehandle = open(filename ,"w") filehandle.write(xml_content) filehandle.close() ### bibformat external call ### t11 = os.times()[4] message = "START bibformat external call" print message command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message tbibformat = tbibformat + (t22 - t11) ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message command = "%s/bibupload -f %s" % (bindir,finalfilename) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message tbibupload = tbibupload + (t22- t11) return ### Bibshed compatibility procedures ### def get_date(var, format_string = "%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re=re.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) print value date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def write_message(msg, stream=sys.stdout): """Prints message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff.""" if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream) def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" write_message("stopping...") task_update_status("STOPPING") write_message("flushing cache or whatever...") time.sleep(3) write_message("closing tables or whatever...") time.sleep(1) write_message("stopped") task_update_status("STOPPED") sys.exit(0) def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" write_message("unknown signal %d ignored" % sig) # do nothing for other signals def getpass(prompt = "Password: "): """Prompts for a password without echoing it back to the screen""" import termios, sys fd = sys.stdin.fileno() old = termios.tcgetattr(fd) new = termios.tcgetattr(fd) new[3] = new[3] & ~termios.ECHO # lflags passwd = "" try: termios.tcsetattr(fd, termios.TCSADRAIN, new) passwd = raw_input(prompt) print finally: termios.tcsetattr(fd, termios.TCSADRAIN, old) return passwd def authenticate(user): """Authenticates a user against the user database. NOTE: Access might be more complex in the future""" print "BibReformat Task Submission" print "=========================" if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user res = run_sql("select password from user where email=%s", (user,), 1) if res: row = res[0] password_db = row[0] if password_db != '': # authentication needed password_entered = getpass() if password_db == password_entered: return user else: print "Sorry, you seem to be unauthorized user. Exiting." sys.exit(1) else: # no authentication needed return user else: print "Sorry, %s seems to be unauthorized user. Exiting." % user sys.exit(1) def task_submit(options): """Submits task to the BibSched task queue. This is what people will be invoking via command line.""" global sched_time, sleep_time ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] ## authenticate user: user = authenticate(options.get("user", "")) ## submit task: task_id = run_sql("""INSERT INTO schTASK (id,proc,user,status,arguments,sleeptime,runtime) VALUES (NULL,'bibreformat',%s,'WAITING',%s,%s,%s)""", (user, marshal.dumps(options),sleeptime,MySQLdb.escape_string(sched_time))) ## update task number: options["task"] = task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options),task_id)) write_message("Task #%d submitted." % task_id) return task_id def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global task_id return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, task_id)) def task_update_status(val): """Updates status information in the BibSched task table.""" global task_id return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, task_id)) def task_read_status(task_id): """Read status information in the BibSched task table.""" res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1) try: out = res[0][0] except: out = 'UNKNOWN' return out def task_get_options(id): """Returns options for the task 'id' read from the BibSched task queue table.""" out = {} res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibreformat'", (id,)) try: out = marshal.loads(res[0][0]) except: write_message("Error: BibReformat task %d does not seem to exist." % id) sys.exit(1) return out def task_run(process_format): """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call.""" global task_id, process, fmt, sched_time options = task_get_options(task_id) # get options from BibSched task table ## check task id: if not options.has_key("task"): write_message("Error: The task #%d does not seem to be a BibReformat task." % task_id) return ## initialize parameters if options.has_key("all"): # sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt) sql_queries.append(sql['all']) if options.has_key("without"): process_format = 1 if options.has_key("noprocess"): process = 0 if options.has_key("last"): # sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt) sql_queries.append(sql['last']) if options.has_key("collection"): cds_query['collection'] = options['collection'] else: cds_query['collection'] = "" if options.has_key("field"): cds_query['field'] = options['field'] else: cds_query['field'] = "" if options.has_key("pattern"): cds_query['pattern'] = options['pattern'] else: cds_query['pattern'] = "" if options.has_key("format"): fmt = options["format"] ## check task status: task_status = task_read_status(task_id) if task_status != "WAITING": write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status)) return ## update task status: task_update_status("RUNNING") ## initialize signal handler: signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ## run the task: bibreformat_task(sql_queries, cds_query, process_format) ## we are done: task_update_status("DONE") return def usage(exitcode=1, msg=""): """Prints usage info.""" if msg: sys.stderr.write("Error: %s.\n" % msg) sys.stderr.write("Usage: %s [options]\n" % sys.argv[0]) sys.stderr.write(" -u, --user=USER \t\t User name to submit the task as, password needed.\n") sys.stderr.write(" -h, --help \t\t Print this help.\n") sys.stderr.write(" -V, --version \t\t Print version information.\n") sys.stderr.write(" -d, --debug \t\t Print debugging information.\n") sys.stderr.write(" -s, --sleeptime=SLEEP\t\t Time after which to repeat tasks (no)\n") sys.stderr.write(" -t, --time=DATE \t\t Moment for the task to be active (now).\n") sys.stderr.write(" -a, --all \t\t All records\n") sys.stderr.write(" -c, --collection \t\t Select records by collection\n") sys.stderr.write(" -f, --field \t\t Select records by field.\n") sys.stderr.write(" -p, --pattern \t\t Select records by pattern.\n") sys.stderr.write(" -o, --format \t\t Specify output format to be (re-)created. (default HB)\n") sys.stderr.write(" -n, --noprocess \t\t Count records to be processed only (no processing done)\n") sys.stderr.write("\n") sys.stderr.write(" Example: bibreformat -n Show how many records are to be bibreformated.") sys.stderr.write("\n") sys.exit(exitcode) def main(): """Main function that analyzes command line input and calls whatever is appropriate. Useful for learning on how to write BibSched tasks.""" global task_id, sched_time, sleeptime ## parse command line: if len(sys.argv) == 2 and sys.argv[1].isdigit(): ## A - run the task task_id = int(sys.argv[1]) process_format = 0 task_run(process_format) else: ## B - submit the task process_format = 0 options = {} # will hold command-line options try: opts, args = getopt.getopt(sys.argv[1:], "hVdu:ac:f:s:p:lo:nt:wl", ["help", "version", "debug","user=","all","collection=","field=","sleeptime=","pattern=","format=","noprocess","time=","without","last"]) except getopt.GetoptError, err: usage(1, err) clp = 0 # default parameters flag try: for opt in opts: if opt[0] in ["-h", "--help"]: usage(0) elif opt[0] in ["-V", "--version"]: print __version__ sys.exit(0) elif opt[0] in [ "-u", "--user"]: options["user"] = opt[1] elif opt[0] in ["-d", "--debug"]: options["debug"] = 1 elif opt[0] in ["-a", "--all"]: options["all"] = 1 options["without"] = 1 clp = 1 elif opt[0] in ["-c", "--collection"]: options["collection"]=opt[1] clp = 1 elif opt[0] in ["-n", "--noprocess"]: options["noprocess"] = 1 elif opt[0] in ["-f", "--field"]: options["field"] = opt[1] clp = 1 elif opt[0] in ["-p","--pattern"]: options["pattern"] = opt[1] clp = 1 elif opt[0] in ["-o","--format"]: options["format"] = opt[1] elif opt[0] in ["-s", "--sleeptime" ]: get_date(opt[1]) # see if it is a valid shift sleeptime = opt[1] elif opt[0] in [ "-t", "--time" ]: sched_time = get_date(opt[1]) if clp == 0: # default options["without"] = 1 options["last"] = 1 except StandardError, e: usage(e) task_submit(options) return ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibformat/bin/bibreformat.wml b/modules/bibformat/bin/bibreformat.wml index 03a327d2f..7d86144be 100644 --- a/modules/bibformat/bin/bibreformat.wml +++ b/modules/bibformat/bin/bibreformat.wml @@ -1,694 +1,694 @@ ## $Id$ ## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## read config variables: #include "config.wml" #include "configbis.wml" ## start Python: #! ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. ## import interesting modules: pylibdir = "/python" ## okay, rest of the Python code goes below ####### ## version number: __version__ = "$Id$" ## import interesting modules: try: import sys sys.path.append('%s' % pylibdir) from cdsware.dbquery import run_sql from cdsware.config import * from cdsware.search_engine import perform_request_search from cdsware.search_engine import print_record import getopt import marshal import signal import string import sys import os import re import time import MySQLdb except ImportError, e: print "Error: %s" % e import sys sys.exit(1) sql_queries = [] # holds SQL queries to be executed cds_query = {} # holds CDS query parameters (fields, collection, pattern) process_format = 0 # flag, process records without created format process = 1 # flag, process records (unless count only) fmt = "hb" # default format to be processed sleeptime = "" # default sleeptime format_string = "%Y-%m-%d %H:%M:%S" # date/time format sched_time = time.strftime(format_string) # scheduled execution time in the date/time format ### sql commands to be executed during the script run ### sql = { "all" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt, "last": "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt, "q1" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'", "q2" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt } ### run the bibreformat task bibsched scheduled ### def bibreformat_task(sql_queries, cds_query, process_format): global process, fmt t1 = os.times()[4] ### Query the database ### if process_format: print "Querying database for records with missing format ..." without_format = without_fmt() recIDs = [] if cds_query['field'] != "" or \ cds_query['collection'] != "" or \ cds_query['pattern'] != "": print "Querying database for records with old format (CDS query)..." - res = perform_request_search(req=None, of='id', c=cds_query['collection'], p=cds_query['pattern'], f=cds_query['field'])#.tolist() + res = perform_request_search(req=None, of='id', c=cds_query['collection'], p=cds_query['pattern'], f=cds_query['field']) for item in res: recIDs.append(item) for sql_query in sql_queries: print "Querying database for records with old format (SQL query) ..." res = run_sql(sql_query) for item in res: recIDs.append(item[0]) ### list of corresponding record IDs was retrieved ### bibformat the records selected if process_format: print "Records to be processed: %d" % (len(recIDs)+len(without_format)) print "Out of it records without created format: %d" % len(without_format) else: print "Records to be processed: %d" % (len(recIDs)) ### Initialize main loop total_rec = 0 # Total number of records xml_content = '' # hold the contents tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call ### Iterate over all records prepared in lists I (option) if process: iterate_over(recIDs, weburl, fmt) ### Iterate over all records prepared in list II (no_format) if process_format and process: iterate_over(without_format, weburl, fmt) ### Final statistics t2 = os.times()[4] elapsed = t2 - t1 message = "total records processed: %d" % total_rec print message message = "total processing time: %2f sec" % elapsed print message message = "Time spent on external call (os.system):" print message message = " bibformat: %2f sec" % tbibformat print message message = " bibupload: %2f sec" % tbibupload print message ### Result set operations ### def lhdiff(l1, l2): "Does list difference via intermediate hash." d = {} ld = [] for e in l2: d[e]=1 for e in l1: if not d.has_key(e): ld.append(e) return ld ### Result set operations ### def ldiff(l1, l2): "Returns l1 - l2." ld = [] for e in l1: if not e in l2: ld.append(e) return ld ### Identify recIDs of records with missing format ### def without_fmt(): "List of record IDs to be reformated, not having the specified format yet" global fmt xm1, xm2, format1, format2 = [],[],[],[] q1 = sql['q1'] # "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'" q2 = sql['q2'] # "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt ## get complete recID list of xm formatted records xm1 = run_sql(q1) for item in xm1: xm2.append(item[0]) ## get complete recID list of formatted records format1 = run_sql(q2) for item in format1: format2.append(item[0]) return lhdiff(xm2,format2) ### Bibreformat all selected records ### def iterate_over(list, weburl, fmt): "Iterate odver list of IDs" n_rec = 0 n_max = 10000 total_rec = 0 # Total number of records xml_content = '' # hold the contents tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call for record in list: n_rec = n_rec + 1 total_rec = total_rec + 1 message = "Processing record: %d" % (record) print message query = "id=%d&of=xm" % (record) count = 0 contents = print_record(record, 'xm') while (contents == "") and (count < 10): contents = print_record(record, 'xm') count = count + 1 time.sleep(10) if count == 10: sys.stderr.write("Failed to download %s from %s after 10 attempts... terminating" % (query, weburl)) sys.exit(0) xml_content = xml_content + contents if xml_content: if n_rec >= n_max: finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S')) filename = "%s/bibreformat.xml" % tmpdir filehandle = open(filename ,"w") filehandle.write(xml_content) filehandle.close() ### bibformat external call ### t11 = os.times()[4] message = "START bibformat external call" print message command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message tbibformat = tbibformat + (t22 - t11) ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message command = "%s/bibupload -f %s" % (bindir,finalfilename) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message tbibupload = tbibupload + (t22- t11) n_rec = 0 xml_content = '' ### Process the last re-formated chunk ### if n_rec > 0: print "Processing last record set (%d)" % n_rec finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S')) filename = "%s/bibreformat.xml" % tmpdir filehandle = open(filename ,"w") filehandle.write(xml_content) filehandle.close() ### bibformat external call ### t11 = os.times()[4] message = "START bibformat external call" print message command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message tbibformat = tbibformat + (t22 - t11) ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message command = "%s/bibupload -f %s" % (bindir,finalfilename) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message tbibupload = tbibupload + (t22- t11) return ### Bibshed compatibility procedures ### def get_date(var, format_string = "%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re=re.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) print value date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def write_message(msg, stream=sys.stdout): """Prints message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff.""" if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream) def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" write_message("stopping...") task_update_status("STOPPING") write_message("flushing cache or whatever...") time.sleep(3) write_message("closing tables or whatever...") time.sleep(1) write_message("stopped") task_update_status("STOPPED") sys.exit(0) def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" write_message("unknown signal %d ignored" % sig) # do nothing for other signals def getpass(prompt = "Password: "): """Prompts for a password without echoing it back to the screen""" import termios, sys fd = sys.stdin.fileno() old = termios.tcgetattr(fd) new = termios.tcgetattr(fd) new[3] = new[3] & ~termios.ECHO # lflags passwd = "" try: termios.tcsetattr(fd, termios.TCSADRAIN, new) passwd = raw_input(prompt) print finally: termios.tcsetattr(fd, termios.TCSADRAIN, old) return passwd def authenticate(user): """Authenticates a user against the user database. NOTE: Access might be more complex in the future""" print "BibReformat Task Submission" print "=========================" if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user res = run_sql("select password from user where email=%s", (user,), 1) if res: row = res[0] password_db = row[0] if password_db != '': # authentication needed password_entered = getpass() if password_db == password_entered: return user else: print "Sorry, you seem to be unauthorized user. Exiting." sys.exit(1) else: # no authentication needed return user else: print "Sorry, %s seems to be unauthorized user. Exiting." % user sys.exit(1) def task_submit(options): """Submits task to the BibSched task queue. This is what people will be invoking via command line.""" global sched_time, sleep_time ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] ## authenticate user: user = authenticate(options.get("user", "")) ## submit task: task_id = run_sql("""INSERT INTO schTASK (id,proc,user,status,arguments,sleeptime,runtime) VALUES (NULL,'bibreformat',%s,'WAITING',%s,%s,%s)""", (user, marshal.dumps(options),sleeptime,MySQLdb.escape_string(sched_time))) ## update task number: options["task"] = task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options),task_id)) write_message("Task #%d submitted." % task_id) return task_id def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global task_id return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, task_id)) def task_update_status(val): """Updates status information in the BibSched task table.""" global task_id return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, task_id)) def task_read_status(task_id): """Read status information in the BibSched task table.""" res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1) try: out = res[0][0] except: out = 'UNKNOWN' return out def task_get_options(id): """Returns options for the task 'id' read from the BibSched task queue table.""" out = {} res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibreformat'", (id,)) try: out = marshal.loads(res[0][0]) except: write_message("Error: BibReformat task %d does not seem to exist." % id) sys.exit(1) return out def task_run(process_format): """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call.""" global task_id, process, fmt, sched_time options = task_get_options(task_id) # get options from BibSched task table ## check task id: if not options.has_key("task"): write_message("Error: The task #%d does not seem to be a BibReformat task." % task_id) return ## initialize parameters if options.has_key("all"): # sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt) sql_queries.append(sql['all']) if options.has_key("without"): process_format = 1 if options.has_key("noprocess"): process = 0 if options.has_key("last"): # sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt) sql_queries.append(sql['last']) if options.has_key("collection"): cds_query['collection'] = options['collection'] else: cds_query['collection'] = "" if options.has_key("field"): cds_query['field'] = options['field'] else: cds_query['field'] = "" if options.has_key("pattern"): cds_query['pattern'] = options['pattern'] else: cds_query['pattern'] = "" if options.has_key("format"): fmt = options["format"] ## check task status: task_status = task_read_status(task_id) if task_status != "WAITING": write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status)) return ## update task status: task_update_status("RUNNING") ## initialize signal handler: signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ## run the task: bibreformat_task(sql_queries, cds_query, process_format) ## we are done: task_update_status("DONE") return def usage(exitcode=1, msg=""): """Prints usage info.""" if msg: sys.stderr.write("Error: %s.\n" % msg) sys.stderr.write("Usage: %s [options]\n" % sys.argv[0]) sys.stderr.write(" -u, --user=USER \t\t User name to submit the task as, password needed.\n") sys.stderr.write(" -h, --help \t\t Print this help.\n") sys.stderr.write(" -V, --version \t\t Print version information.\n") sys.stderr.write(" -d, --debug \t\t Print debugging information.\n") sys.stderr.write(" -s, --sleeptime=SLEEP\t\t Time after which to repeat tasks (no)\n") sys.stderr.write(" -t, --time=DATE \t\t Moment for the task to be active (now).\n") sys.stderr.write(" -a, --all \t\t All records\n") sys.stderr.write(" -c, --collection \t\t Select records by collection\n") sys.stderr.write(" -f, --field \t\t Select records by field.\n") sys.stderr.write(" -p, --pattern \t\t Select records by pattern.\n") sys.stderr.write(" -o, --format \t\t Specify output format to be (re-)created. (default HB)\n") sys.stderr.write(" -n, --noprocess \t\t Count records to be processed only (no processing done)\n") sys.stderr.write("\n") sys.stderr.write(" Example: bibreformat -n Show how many records are to be bibreformated.") sys.stderr.write("\n") sys.exit(exitcode) def main(): """Main function that analyzes command line input and calls whatever is appropriate. Useful for learning on how to write BibSched tasks.""" global task_id, sched_time, sleeptime ## parse command line: if len(sys.argv) == 2 and sys.argv[1].isdigit(): ## A - run the task task_id = int(sys.argv[1]) process_format = 0 task_run(process_format) else: ## B - submit the task process_format = 0 options = {} # will hold command-line options try: opts, args = getopt.getopt(sys.argv[1:], "hVdu:ac:f:s:p:lo:nt:wl", ["help", "version", "debug","user=","all","collection=","field=","sleeptime=","pattern=","format=","noprocess","time=","without","last"]) except getopt.GetoptError, err: usage(1, err) clp = 0 # default parameters flag try: for opt in opts: if opt[0] in ["-h", "--help"]: usage(0) elif opt[0] in ["-V", "--version"]: print __version__ sys.exit(0) elif opt[0] in [ "-u", "--user"]: options["user"] = opt[1] elif opt[0] in ["-d", "--debug"]: options["debug"] = 1 elif opt[0] in ["-a", "--all"]: options["all"] = 1 options["without"] = 1 clp = 1 elif opt[0] in ["-c", "--collection"]: options["collection"]=opt[1] clp = 1 elif opt[0] in ["-n", "--noprocess"]: options["noprocess"] = 1 elif opt[0] in ["-f", "--field"]: options["field"] = opt[1] clp = 1 elif opt[0] in ["-p","--pattern"]: options["pattern"] = opt[1] clp = 1 elif opt[0] in ["-o","--format"]: options["format"] = opt[1] elif opt[0] in ["-s", "--sleeptime" ]: get_date(opt[1]) # see if it is a valid shift sleeptime = opt[1] elif opt[0] in [ "-t", "--time" ]: sched_time = get_date(opt[1]) if clp == 0: # default options["without"] = 1 options["last"] = 1 except StandardError, e: usage(e) task_submit(options) return ### okay, here we go: if __name__ == '__main__': main()