Page MenuHomec4science

bibreformat.wml
No OneTemporary

File Metadata

Created
Fri, Nov 15, 07:47

bibreformat.wml

## $Id$
## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
## directory of the CDSware Python libraries; it is appended to sys.path
## before the cdsware modules are imported further down.
## NOTE(review): <LIBDIR> looks like a WML placeholder expanded when this
## file is generated -- confirm against config.wml.
pylibdir = "<LIBDIR>/python"
## okay, rest of the Python code goes below
#######
## version number (the $Id$ keyword is reported by the -V/--version option):
__version__ = "$Id$"
## import interesting modules:
try:
import sys
sys.path.append('%s' % pylibdir)
from cdsware.dbquery import run_sql
from cdsware.config import *
from cdsware.search_engine import perform_request_search
from cdsware.search_engine import print_record
from cdsware.access_control_engine import acc_authorize_action
import getopt
import getpass
import marshal
import signal
import string
import sys
import os
import re
import time
import MySQLdb
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
## Record selection state; task_run() fills it in from the task options.
sql_queries = []    # holds SQL queries to be executed
cds_query = {}      # holds CDS query parameters (fields, collection, pattern)

## Processing flags and defaults.
process_format = 0  # flag, process records without created format
process = 1         # flag, process records (unless count only)
fmt = "hb"          # default format to be processed
sleeptime = ""      # default sleeptime

## Scheduling.
format_string = "%Y-%m-%d %H:%M:%S"        # date/time format
sched_time = time.strftime(format_string)  # scheduled execution time in the date/time format

### sql commands to be executed during the script run
###
### NOTE(review): the format placeholders are filled in right here, i.e.
### with the value `fmt` has when the module is loaded ("hb"); a later
### change of `fmt` does not alter these strings.
_select_ids_by_format = ("select br.id from bibrec as br, bibfmt as bf "
                         "where bf.id_bibrec=br.id and bf.format")
sql = {}
sql["all"] = _select_ids_by_format + " ='%s'" % fmt
sql["last"] = _select_ids_by_format + "='%s' and bf.last_updated < br.modification_date" % fmt
sql["q1"] = _select_ids_by_format + " ='xm'"
sql["q2"] = _select_ids_by_format + " ='%s'" % fmt
### run the bibreformat task bibsched scheduled
###
def bibreformat_task(sql_queries, cds_query, process_format):
    """Select the records to be reformatted, process them and print statistics.

    Parameters:
      sql_queries    - list of SQL statements; the first column of every
                       row they return is taken as a record ID.
      cds_query      - dictionary with the 'field', 'collection' and
                       'pattern' keys; a search is run as soon as one of
                       them is not empty.
      process_format - when true, the records returned by without_fmt()
                       are selected as well.

    The module-level flag `process` decides whether the selected records
    are handed to iterate_over() or merely counted; the module-level
    `fmt` is the format passed on to iterate_over().
    """
    global process, fmt
    t1 = os.times()[4]
    ### Query the database
    ###
    without_format = []
    if process_format:
        print("Querying database for records with missing format ...")
        without_format = without_fmt()
    recIDs = []
    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":
        print("Querying database for records with old format (CDS query)...")
        res = perform_request_search(req=None, of='id', c=cds_query['collection'], p=cds_query['pattern'], f=cds_query['field'])
        recIDs.extend(res)
    for sql_query in sql_queries:
        print("Querying database for records with old format (SQL query) ...")
        res = run_sql(sql_query)
        recIDs.extend([row[0] for row in res])
    ### list of corresponding record IDs was retrieved
    ### bibformat the records selected
    if process_format:
        print("Records to be processed: %d" % (len(recIDs)+len(without_format)))
        print("Out of it records without created format: %d" % len(without_format))
    else:
        print("Records to be processed: %d" % (len(recIDs)))
    ### Initialize main loop
    total_rec = 0    # Total number of records
    tbibformat = 0   # time taken up by external call
    tbibupload = 0   # time taken up by external call
    ### Lists to iterate over: I (option) and, if requested, II (no_format)
    batches = []
    if process:
        batches.append(recIDs)
        if process_format:
            batches.append(without_format)
    for batch in batches:
        stats = iterate_over(batch, weburl, fmt)
        # iterate_over() either handles every record it is given or
        # terminates the program, so the batch size is the number of
        # records processed.
        total_rec = total_rec + len(batch)
        # Timing figures can only be accumulated when iterate_over()
        # hands back a (records, bibformat secs, bibupload secs) tuple.
        if isinstance(stats, tuple) and len(stats) == 3:
            tbibformat = tbibformat + stats[1]
            tbibupload = tbibupload + stats[2]
    ### Final statistics
    t2 = os.times()[4]
    elapsed = t2 - t1
    print("total records processed: %d" % total_rec)
    print("total processing time: %2f sec" % elapsed)
    print("Time spent on external call (os.system):")
    print(" bibformat: %2f sec" % tbibformat)
    print(" bibupload: %2f sec" % tbibupload)
### Result set operations
###
def lhdiff(l1, l2):
    """Return the elements of l1 that do not occur in l2, in l1's order.

    The elements of l2 are first stored as keys of a dictionary and the
    elements of l1 are then looked up in it, so the elements of both
    lists have to be hashable.
    """
    seen = {}
    for item in l2:
        seen[item] = 1
    return [item for item in l1 if item not in seen]
### Result set operations
###
def ldiff(l1, l2):
    """Return l1 - l2: the elements of l1 not contained in l2, in l1's order.

    Every element is checked with a plain `in` test against l2, so the
    elements only need to support comparison for equality.
    """
    return [item for item in l1 if item not in l2]
### Identify recIDs of records with missing format
###
def without_fmt():
    """Return the IDs of records that have the 'xm' format but not `fmt` yet.

    The format looked for is the current value of the module-level `fmt`.
    It is passed to run_sql() as a bound parameter instead of using the
    pre-built sql['q2'] string, because that string was filled in with
    the default format when the module was loaded and therefore ignores
    a format selected later on.

    Returns a list of record IDs in the order the 'xm' query returned them.
    """
    global fmt
    ## get complete recID list of xm formatted records
    xm_ids = [row[0] for row in run_sql(sql['q1'])]
    ## get complete recID list of records already having the wanted format
    formatted_rows = run_sql("select br.id from bibrec as br, bibfmt as bf "
                             "where bf.id_bibrec=br.id and bf.format=%s", (fmt,))
    formatted_ids = [row[0] for row in formatted_rows]
    return lhdiff(xm_ids, formatted_ids)
### Bibreformat all selected records
###
def _bibreformat_chunk(xml_content, fmt):
    """Run one chunk of XML through bibformat and bibupload.

    The XML is written to <tmpdir>/bibreformat.xml, formatted into a
    time-stamped <tmpdir>/rec_fmt_*.xml file by the bibformat program and
    that file is then handed to bibupload.

    Returns the tuple (bibformat seconds, bibupload seconds), measured
    with os.times()[4].
    """
    finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir, time.strftime('%Y%m%d_%H%M%S'))
    filename = "%s/bibreformat.xml" % tmpdir
    filehandle = open(filename, "w")
    try:
        filehandle.write(xml_content)
    finally:
        # close the file even if the write fails
        filehandle.close()
    ### bibformat external call
    ###
    ### NOTE(review): fmt ends up inside a shell command line; confirm it
    ### can never carry shell metacharacters.
    t11 = os.times()[4]
    print("START bibformat external call")
    command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir, fmt.upper(), tmpdir, finalfilename, tmpdir)
    os.system(command)
    t22 = os.times()[4]
    print("END bibformat external call (time elapsed:%2f)" % (t22-t11))
    tbibformat = t22 - t11
    ### bibupload external call
    ###
    t11 = os.times()[4]
    print("START bibupload external call")
    command = "%s/bibupload -f %s" % (bindir, finalfilename)
    os.system(command)
    t22 = os.times()[4]
    print("END bibupload external call (time elapsed:%2f)" % (t22-t11))
    tbibupload = t22 - t11
    return (tbibformat, tbibupload)

def iterate_over(list, weburl, fmt):
    """Iterate over a list of record IDs and reformat the records in chunks.

    Parameters:
      list   - record IDs to process.
      weburl - only used in the error message printed when a record
               cannot be fetched.
      fmt    - output format to create; upper-cased for bibformat.

    For every record its 'xm' output is fetched with print_record(); an
    empty answer is retried up to 10 times with a 10 second pause before
    each retry.  If the record is still empty afterwards, a message is
    written to stderr and the program exits with status 1.  The fetched
    XML is collected and processed after every 10000 records and once
    more for the remainder.

    Returns the tuple (records processed, bibformat seconds,
    bibupload seconds).
    """
    n_rec = 0        # records collected in the current chunk
    n_max = 10000    # chunk size
    total_rec = 0    # Total number of records
    chunk_parts = [] # XML of the records in the current chunk
    tbibformat = 0   # time taken up by external call
    tbibupload = 0   # time taken up by external call
    for record in list:
        n_rec = n_rec + 1
        total_rec = total_rec + 1
        print("Processing record: %d" % (record))
        query = "id=%d&of=xm" % (record)
        contents = print_record(record, 'xm')
        retries = 0
        while contents == "" and retries < 10:
            # pause before asking again, not after a successful answer
            time.sleep(10)
            contents = print_record(record, 'xm')
            retries = retries + 1
        if contents == "":
            # give up only when the record is still empty; testing the
            # retry counter instead would also abort after a successful
            # last retry
            sys.stderr.write("Failed to download %s from %s after 10 attempts... terminating" % (query, weburl))
            sys.exit(1)
        chunk_parts.append(contents)
        if n_rec >= n_max:
            spent = _bibreformat_chunk("".join(chunk_parts), fmt)
            tbibformat = tbibformat + spent[0]
            tbibupload = tbibupload + spent[1]
            n_rec = 0
            chunk_parts = []
    ### Process the last re-formated chunk
    ###
    if n_rec > 0:
        print("Processing last record set (%d)" % n_rec)
        spent = _bibreformat_chunk("".join(chunk_parts), fmt)
        tbibformat = tbibformat + spent[0]
        tbibupload = tbibupload + spent[1]
    return (total_rec, tbibformat, tbibupload)
### BibSched compatibility procedures
###
def get_date(var, format_string = "%Y-%m-%d %H:%M:%S"):
    """Return a date string according to the format string.

    `var` is either a shift with respect to now -- an optional sign, a
    number and one of the units d, h, m, s (e.g. "+2h", "-1d", "30m") --
    or a date written in `format_string`.  Only the beginning of `var`
    is matched against the shift pattern.

    A shift is applied to the current local time; a plain date is parsed
    and formatted back, which validates it.  time.strptime() raises
    ValueError when `var` is neither a shift nor a date in the given
    format.
    """
    shift_re = re.compile(r"([-\+]{0,1})([\d]+)([dhms])")
    factors = {"d": 24*3600, "h": 3600, "m": 60, "s": 1}
    m = shift_re.match(var)
    if m:
        sign_text, amount, unit = m.groups()
        sign = 1
        if sign_text == "-":
            sign = -1
        shifted = time.time() + sign * factors[unit] * float(amount)
        return time.strftime(format_string, time.localtime(shifted))
    parsed = time.strptime(var, format_string)
    return time.strftime(format_string, parsed)
def write_message(msg, stream=sys.stdout):
    """Write a time-stamped message to sys.stdout or sys.stderr and flush it.

    The line written is "<YYYY-MM-DD HH:MM:SS> --> <msg>".  Any other
    stream is rejected with a complaint on sys.stderr and the message is
    not written.
    """
    if stream not in (sys.stdout, sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(timestamp)
    stream.write("%s\n" % msg)
    stream.flush()
def task_sig_sleep(sig, frame):
    """Handle the 'sleep' signal sent by BibSched.

    Reports the transition, records the SLEEPING status and then blocks
    in signal.pause() until the next signal is delivered.
    """
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # wait for wake-up signal
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle the 'wakeup' signal sent by BibSched.

    Reports that the task goes on and records the CONTINUING status.
    """
    note = "continuing..."
    write_message(note)
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
    """Handle the 'stop' signal sent by BibSched.

    Records the STOPPING status, goes through two placeholder clean-up
    steps (a message followed by a short pause each), records the
    STOPPED status and exits the program with status 0.
    """
    write_message("stopping...")
    task_update_status("STOPPING")
    cleanup_steps = (
        ("flushing cache or whatever...", 3),
        ("closing tables or whatever...", 1),
    )
    for note, pause in cleanup_steps:
        write_message(note)
        time.sleep(pause)
    write_message("stopped")
    task_update_status("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Handle the 'suicide' signal sent by BibSched.

    Records the SUICIDING and then the SUICIDED status, announcing each
    step, and exits the program with status 0.
    """
    announcements = (
        ("suiciding myself now...", "SUICIDING"),
        ("suicided", "SUICIDED"),
    )
    for note, status in announcements:
        write_message(note)
        task_update_status(status)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handle any other signal sent by the shell or the user.

    The signal number is reported and nothing else is done.
    """
    note = "unknown signal %d ignored" % sig
    write_message(note)
def authenticate(user, header="BibReformat Task Submission", action="runbibformat"):
    """Authenticate the user against the user database.

    Parameters:
      user   - e-mail address of the user; when empty it is read from
               standard input (lower-cased and stripped).
      header - title printed, underlined, before the prompt.
      action - name of the action the user has to be authorized for.

    The password is asked for only when the user has one stored, and the
    typed value is compared with the stored one directly.  Afterwards
    acc_authorize_action() is consulted for `action`.

    Returns the user name upon success; prints an explanation and exits
    the program with status 1 when the user is unknown, the password is
    wrong or the action is not permitted.
    """
    print(header)
    print("=" * len(header))
    if user == "":
        sys.stdout.write("\rUsername: ")
        sys.stdout.flush()  # make sure the prompt shows before reading
        user = sys.stdin.readline().lower().strip()
    else:
        sys.stdout.write("\rUsername:  %s\n" % user)
    ## first check user pw:
    res = run_sql("select id,password from user where email=%s", (user,), 1)
    if not res:
        print("Sorry, %s does not exist." % user)
        sys.exit(1)
    (uid_db, password_db) = res[0]
    if password_db:
        # getpass is imported as a module, so the prompt function has to
        # be reached through it; calling the module itself raises TypeError
        password_entered = getpass.getpass()
        if password_db != password_entered:
            print("Sorry, wrong credentials for %s." % user)
            sys.exit(1)
    ## secondly check authorization for the action:
    if not acc_authorize_action(uid_db, action):
        print("Sorry, %s has no right to %s." % (user, action))
        sys.exit(1)
    return user
def task_submit(options):
    """Submit the task to the BibSched task queue.

    This is what people will be invoking via command line.

    `options` is the dictionary of command-line options.  It is modified
    in place: a stale "task" entry is removed and, once the task row has
    been inserted, "task" is set to the value run_sql() returned for the
    insert, which is used as the task ID.  The options are stored
    marshalled in the `arguments` column, together with the module-level
    `sleeptime` and `sched_time`.

    The user named in options["user"] (or asked for interactively) is
    authenticated first; authenticate() exits the program on failure.

    Returns the task ID.
    """
    global sched_time, sleeptime
    ## sanity check: remove eventual "task" option:
    if "task" in options:
        del options["task"]
    ## authenticate user:
    user = authenticate(options.get("user", ""))
    ## submit task:
    task_id = run_sql("""INSERT INTO schTASK (id,proc,user,status,arguments,sleeptime,runtime) VALUES (NULL,'bibreformat',%s,'WAITING',%s,%s,%s)""",
                      (user, marshal.dumps(options), sleeptime, MySQLdb.escape_string(sched_time)))
    ## update task number:
    options["task"] = task_id
    run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id))
    write_message("Task #%d submitted." % task_id)
    return task_id
def task_update_progress(msg):
    """Store `msg` as the progress of the current task in the BibSched task table.

    The task is identified by the module-level `task_id`; the result of
    run_sql() is returned as is.
    """
    global task_id
    query = "UPDATE schTASK SET progress=%s where id=%s"
    return run_sql(query, (msg, task_id))
def task_update_status(val):
    """Store `val` as the status of the current task in the BibSched task table.

    The task is identified by the module-level `task_id`; the result of
    run_sql() is returned as is.
    """
    global task_id
    query = "UPDATE schTASK SET status=%s where id=%s"
    return run_sql(query, (val, task_id))
def task_read_status(task_id):
    """Read the status of task `task_id` from the BibSched task table.

    Returns the value of the `status` column, or 'UNKNOWN' when the
    query result holds no such row.
    """
    res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1)
    try:
        return res[0][0]
    except (IndexError, TypeError):
        # empty (or missing) result: no such task.  Only lookup failures
        # are handled, so interrupts and exits are not swallowed.
        return 'UNKNOWN'
def task_get_options(id):
    """Return the options of task `id` read from the BibSched task queue table.

    The options are unmarshalled from the `arguments` column of the
    'bibreformat' row with the given ID.  When there is no such row, or
    its content cannot be unmarshalled, an error message is written and
    the program exits with status 1.

    NOTE(review): marshal.loads() is applied to whatever the table
    holds; this presumes the table content is trusted.
    """
    res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibreformat'", (id,))
    try:
        return marshal.loads(res[0][0])
    except (IndexError, TypeError, ValueError, EOFError):
        # no row found, or data marshal cannot decode; anything else
        # (e.g. an interrupt) is left to propagate
        write_message("Error: BibReformat task %d does not seem to exist." % id)
        sys.exit(1)
def task_run(process_format):
    """Run the task by fetching its arguments from the BibSched task queue.

    This is what BibSched will be invoking via daemon call.

    The options stored for the module-level `task_id` are translated
    into the module-level selection state (`sql_queries`, `cds_query`,
    `process`, `fmt`).  `process_format` is the initial value of the
    "records without the format" flag; the "without" option switches it
    on.  The "format" option is applied before the selection queries are
    built so that they look for the requested format rather than for the
    default one.

    Nothing is run when the options carry no "task" entry or the task is
    not in the WAITING state.  Otherwise the status is set to RUNNING,
    the signal handlers are installed, bibreformat_task() is executed
    and the status is set to DONE.
    """
    global task_id, process, fmt, sched_time
    options = task_get_options(task_id) # get options from BibSched task table
    ## check task id:
    if "task" not in options:
        write_message("Error: The task #%d does not seem to be a BibReformat task." % task_id)
        return
    ## initialize parameters; the format comes first as the queries depend on it
    if "format" in options:
        fmt = options["format"]
    # the queries are executed as plain strings, hence the escaping
    fmt_sql = MySQLdb.escape_string(fmt)
    if "all" in options:
        sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt_sql)
    if "without" in options:
        process_format = 1
    if "noprocess" in options:
        process = 0
    if "last" in options:
        sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt_sql)
    for key in ("collection", "field", "pattern"):
        cds_query[key] = options.get(key, "")
    ## check task status:
    task_status = task_read_status(task_id)
    if task_status != "WAITING":
        write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status))
        return
    ## update task status:
    task_update_status("RUNNING")
    ## initialize signal handler:
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    ## run the task:
    bibreformat_task(sql_queries, cds_query, process_format)
    ## we are done:
    task_update_status("DONE")
    return
def usage(exitcode=1, msg=""):
    """Write the usage information to sys.stderr and exit.

    When `msg` is given it is shown first, as "Error: <msg>.".  The
    program then exits with status `exitcode`.
    """
    write = sys.stderr.write
    if msg:
        write("Error: %s.\n" % msg)
    write("Usage: %s [options]\n" % sys.argv[0])
    help_text = (
        " -u, --user=USER \t\t User name to submit the task as, password needed.\n",
        " -h, --help \t\t Print this help.\n",
        " -V, --version \t\t Print version information.\n",
        " -v, --verbose=LEVEL \t\t Verbose level (0=min,1=normal,9=max).\n",
        " -s, --sleeptime=SLEEP\t\t Time after which to repeat tasks (no)\n",
        " -t, --time=DATE \t\t Moment for the task to be active (now).\n",
        " -a, --all \t\t All records\n",
        " -c, --collection \t\t Select records by collection\n",
        " -f, --field \t\t Select records by field.\n",
        " -p, --pattern \t\t Select records by pattern.\n",
        " -o, --format \t\t Specify output format to be (re-)created. (default HB)\n",
        " -n, --noprocess \t\t Count records to be processed only (no processing done)\n",
        "\n",
        " Example: bibreformat -n Show how many records are to be bibreformated.",
        "\n",
    )
    for piece in help_text:
        write(piece)
    sys.exit(exitcode)
def main():
    """Analyze the command line and either run or submit a task.

    Useful for learning on how to write BibSched tasks.

    A single, purely numeric argument is taken as the ID of a task to
    run (this sets the module-level `task_id` and calls task_run()).
    Anything else is parsed as options describing a new task, which is
    then submitted with task_submit().  When none of the record
    selection options (-a, -c, -f, -p) is given, the task defaults to
    the "without" and "last" selections.

    Invalid options or option values end in usage() with exit status 1.

    NOTE(review): -w/--without and -l/--last are accepted by getopt but
    have no handler below, so giving them has no effect of its own.
    """
    global task_id, sched_time, sleeptime
    ## parse command line:
    if len(sys.argv) == 2 and sys.argv[1].isdigit():
        ## A - run the task
        task_id = int(sys.argv[1])
        process_format = 0
        task_run(process_format)
        return
    ## B - submit the task
    options = {}  # will hold command-line options
    options["verbose"] = 1
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hVv:u:ac:f:s:p:lo:nt:w", ["help", "version", "verbose=","user=","all","collection=","field=","sleeptime=","pattern=","format=","noprocess","time=","without","last"])
    except getopt.GetoptError:
        usage(1, sys.exc_info()[1])
    clp = 0  # set once a record selection option was given
    try:
        for name, value in opts:
            if name in ["-h", "--help"]:
                usage(0)
            elif name in ["-V", "--version"]:
                print(__version__)
                sys.exit(0)
            elif name in ["-u", "--user"]:
                options["user"] = value
            elif name in ["-v", "--verbose"]:
                options["verbose"] = int(value)
            elif name in ["-a", "--all"]:
                options["all"] = 1
                options["without"] = 1
                clp = 1
            elif name in ["-c", "--collection"]:
                options["collection"] = value
                clp = 1
            elif name in ["-n", "--noprocess"]:
                options["noprocess"] = 1
            elif name in ["-f", "--field"]:
                options["field"] = value
                clp = 1
            elif name in ["-p", "--pattern"]:
                options["pattern"] = value
                clp = 1
            elif name in ["-o", "--format"]:
                options["format"] = value
            elif name in ["-s", "--sleeptime"]:
                get_date(value)  # see if it is a valid shift
                sleeptime = value
            elif name in ["-t", "--time"]:
                sched_time = get_date(value)
        if clp == 0:  # default
            options["without"] = 1
            options["last"] = 1
    except StandardError:
        # the exception is the message, not the exit code
        usage(1, sys.exc_info()[1])
    task_submit(options)
    return

### okay, here we go:
if __name__ == '__main__':
    main()

Event Timeline