Page MenuHomec4science

bibreformat.in
No OneTemporary

File Metadata

Created
Sun, Nov 17, 05:57

bibreformat.in

## $Id$
## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
## import interesting modules:
pylibdir = "<LIBDIR>/python"
## okay, rest of the Python code goes below
#######
## version number:
__version__ = "$Id$"
## import interesting modules:
try:
import sys
sys.path.append('%s' % pylibdir)
from cdsware.dbquery import run_sql
from cdsware.config import *
from cdsware.search_engine import perform_request_search
import getopt
import urllib
import marshal
import signal
import string
import sys
import os
import re
import time
import MySQLdb
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
## Module-level state shared by the functions below.
sql_queries = [] # SQL statements selecting the record IDs to be reformatted
cds_query = {} # holds CDS query parameters (fields, collection, pattern)
process_hb = 0 # flag, process records without created format
process = 1 # flag, process records (unless count only)
fmt = "hb" # default format to be processed
sleeptime = "" # default sleeptime
format_string = "%Y-%m-%d %H:%M:%S" # timestamp layout used for sched_time
sched_time = time.strftime(format_string) # current local time, taken once when this line runs
## Canned SQL statements, keyed by a short name.
## NOTE(review): the '%s' placeholders in "all", "last" and "q2" are
## interpolated right here, with the value fmt has at this point ("hb").
## Assigning a different value to fmt later does not change these strings.
sql = {
"all" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt,
"last": "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt,
"q1" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'",
"q2" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt
}
def bibreformat_task(sql_queries, cds_query, process_hb):
    """Select the records to reformat, reformat them and print statistics.

    Parameters:
      sql_queries - list of SQL statements; the first column of every
                    returned row is taken as a record ID.
      cds_query   - dictionary with the keys 'field', 'collection' and
                    'pattern'; a search is run when any of them is non-empty.
      process_hb  - when true, the records returned by withouthb() are
                    processed as well.

    The module-level flag 'process' decides whether records are actually
    reformatted or only counted, and 'fmt' is the format handed to
    iterate_over().  Nothing is returned; progress goes to stdout.
    """
    global process, fmt
    t1 = os.times()[4]
    ### Query the database
    ###
    without_hb = []
    if process_hb:
        print("Querying database for records with missing format ...")
        without_hb = withouthb()
    recIDs = []
    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":
        print("Querying database for records with old format (CDS query)...")
        res = perform_request_search(req=None, of='id', c=cds_query['collection'], p=cds_query['pattern'], f=cds_query['field']).tolist()
        for item in res:
            recIDs.append(item)
    for sql_query in sql_queries:
        print("Querying database for records with old format (SQL query) ...")
        res = run_sql(sql_query)
        for item in res:
            recIDs.append(item[0])
    ### list of corresponding record IDs was retrieved
    ### bibformat the records selected
    if process_hb:
        print("Records to be processed: %d" % (len(recIDs)+len(without_hb)))
        print("Out of it records without created format: %d" % len(without_hb))
    else:
        print("Records to be processed: %d" % (len(recIDs)))
    ### Initialize main loop
    total_rec = 0    # Total number of records
    tbibformat = 0   # time taken up by external call
    tbibupload = 0   # time taken up by external call
    ### Iterate over list I (selected by the queries) and, if requested,
    ### over list II (records without the format)
    if process:
        batches = [recIDs]
        if process_hb:
            batches.append(without_hb)
        for batch in batches:
            stats = iterate_over(batch, weburl, fmt)
            # The counters used to be printed without ever being updated.
            # Accumulate them from iterate_over()'s result; a None result
            # (nothing reported) leaves the counters untouched.
            if stats is not None:
                total_rec = total_rec + stats[0]
                tbibformat = tbibformat + stats[1]
                tbibupload = tbibupload + stats[2]
    ### Final statistics
    t2 = os.times()[4]
    elapsed = t2 - t1
    print("total records processed: %d" % total_rec)
    print("total processing time: %2f sec" % elapsed)
    print("Time spent on external call (os.system):")
    print(" bibformat: %2f sec" % tbibformat)
    print(" bibupload: %2f sec" % tbibupload)
### Fetch record XML through the web search interface
###
def read_xml_input(weburl, query):
    """Fetch '<weburl>/search.py?<query>' over HTTP and return the response body."""
    target = "%s/search.py?%s" % (weburl, query)
    response = urllib.urlopen(target)
    return response.read()
### Result set operations
###
def lhdiff(l1, l2):
    """Return the elements of l1 that are not in l2, keeping l1's order.

    Membership is tested through a dictionary built from l2, so the
    elements of both lists have to be hashable.
    """
    excluded = {}
    for item in l2:
        excluded[item] = 1
    return [item for item in l1 if item not in excluded]
### Result set operations
###
def ldiff(l1, l2):
    """Return l1 - l2: the elements of l1 that do not occur in l2, in l1's order.

    Each element is looked up in l2 directly, so unhashable elements work too.
    """
    return [item for item in l1 if item not in l2]
### Identify recIDs of records with missing hb
###
def withouthb():
    """Return the IDs of records that have an 'xm' format row but no row
    for the format currently selected in the global 'fmt'.

    The second query is built from the current value of 'fmt' (passed as
    a bound parameter) rather than from the string in sql['q2'], which
    was interpolated with the default format when the module was loaded
    and therefore ignored any other format.
    """
    global fmt
    ## get complete recID list of xm formatted records
    xm_ids = []
    for row in run_sql(sql['q1']):
        xm_ids.append(row[0])
    ## get complete recID list of records already having the fmt format
    fmt_ids = []
    for row in run_sql("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format=%s", (fmt,)):
        fmt_ids.append(row[0])
    return lhdiff(xm_ids, fmt_ids)
### Bibreformat all selected records
###
def _bibformat_and_upload(xml_content, fmt):
    """Write xml_content to the scratch file, then run bibformat and
    bibupload on it.  Returns (bibformat_seconds, bibupload_seconds)."""
    finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir,time.strftime('%Y%m%d_%H%M%S'))
    filename = "%s/bibreformat.xml" % tmpdir
    filehandle = open(filename ,"w")
    try:
        filehandle.write(xml_content)
    finally:
        # close the handle even when the write fails
        filehandle.close()
    ### bibformat external call
    ###
    # NOTE(review): fmt is placed inside a shell command line; it should be
    # validated by whoever accepts it from the user.
    t11 = os.times()[4]
    print("START bibformat external call")
    command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,finalfilename,tmpdir)
    os.system(command)
    t22 = os.times()[4]
    print("END bibformat external call (time elapsed:%2f)" % (t22-t11))
    tbibformat = t22 - t11
    ### bibupload external call
    ###
    t11 = os.times()[4]
    print("START bibupload external call")
    command = "%s/bibupload -f %s" % (bindir,finalfilename)
    os.system(command)
    t22 = os.times()[4]
    print("END bibupload external call (time elapsed:%2f)" % (t22-t11))
    tbibupload = t22 - t11
    return tbibformat, tbibupload

def iterate_over(list, weburl, fmt):
    """Reformat the records whose IDs are given in 'list'.

    The XML of each record is fetched with read_xml_input() and
    accumulated; once n_max records have been collected (and the
    accumulated text is non-empty) the chunk is run through bibformat
    and bibupload.  Whatever is left after the loop is processed as a
    last chunk.

    Parameters:
      list   - sequence of integer record IDs (the name shadows the
               builtin; it is kept because it is part of the signature)
      weburl - base URL handed to read_xml_input()
      fmt    - output format; upper-cased and passed to bibformat

    Returns the tuple (records seen, seconds spent in bibformat, seconds
    spent in bibupload).  Previously these counters were computed and
    then thrown away.
    """
    n_rec = 0        # records in the chunk being assembled
    n_max = 1000     # chunk size
    total_rec = 0    # Total number of records
    xml_content = '' # hold the contents
    tbibformat = 0   # time taken up by external call
    tbibupload = 0   # time taken up by external call
    for record in list:
        n_rec = n_rec + 1
        total_rec = total_rec + 1
        print("Processing record: %d" % (record))
        query = "id=%d&of=xm" % (record)
        xml_content = xml_content + read_xml_input(weburl, query)
        if xml_content and n_rec >= n_max:
            spent = _bibformat_and_upload(xml_content, fmt)
            tbibformat = tbibformat + spent[0]
            tbibupload = tbibupload + spent[1]
            n_rec = 0
            xml_content = ''
    ### Process the last re-formatted chunk
    ###
    if n_rec > 0:
        print("Processing last record set (%d)" % n_rec)
        spent = _bibformat_and_upload(xml_content, fmt)
        tbibformat = tbibformat + spent[0]
        tbibupload = tbibupload + spent[1]
    return total_rec, tbibformat, tbibupload
### BibSched compatibility procedures
###
def get_date(var, format_string = "%Y-%m-%d %H:%M:%S"):
    """Return a date string laid out according to format_string.

    'var' is either
      - a shift relative to now: an optional sign, digits and one of the
        units d, h, m, s (e.g. "+2h", "-30m", "5d").  Only the beginning
        of the string is matched, so trailing characters are ignored; or
      - an absolute date already written in format_string, which is
        parsed and re-emitted.

    Raises ValueError (from time.strptime) when 'var' is neither.

    The stray debugging print of the parsed number has been removed, so
    the function no longer writes to stdout.
    """
    shift_re = re.compile(r"([-\+]{0,1})([\d]+)([dhms])")
    factors = {"d":24*3600, "h":3600, "m":60, "s":1}
    m = shift_re.match(var)
    if m:
        sign_text, digits, unit = m.groups()
        if sign_text == "-":
            sign = -1
        else:
            sign = 1
        shifted = time.time() + sign * factors[unit] * float(digits)
        date = time.localtime(shifted)
    else:
        date = time.strptime(var, format_string)
    return time.strftime(format_string, date)
def write_message(msg, stream=sys.stdout):
    """Write a timestamped line holding msg to stream and flush it.

    Only sys.stdout and sys.stderr are accepted as stream; for anything
    else a complaint is written to sys.stderr and msg is not printed.
    """
    if not (stream == sys.stdout or stream == sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    stamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(stamp)
    stream.write("%s\n" % msg)
    stream.flush()
def task_sig_sleep(sig, frame):
    """Handler for the 'sleep' signal sent by BibSched.

    Logs a message, records the state SLEEPING and then blocks in
    signal.pause() until another signal arrives.
    """
    write_message("sleeping...")
    task_update_state("SLEEPING")
    # block here until the next signal is delivered
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handler for the 'wakeup' signal sent by BibSched.

    Logs a message and records the state CONTINUING.
    """
    write_message("continuing...")
    task_update_state("CONTINUING")
def task_sig_stop(sig, frame):
    """Handler for the 'stop' signal sent by BibSched.

    Records STOPPING, goes through two placeholder shutdown steps (a
    message followed by a pause of 3 and 1 seconds), records STOPPED and
    exits with status 0.
    """
    write_message("stopping...")
    task_update_state("STOPPING")
    shutdown_steps = (
        ("flushing cache or whatever...", 3),
        ("closing tables or whatever...", 1),
    )
    for note, pause in shutdown_steps:
        write_message(note)
        time.sleep(pause)
    write_message("stopped")
    task_update_state("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Handler for the 'suicide' signal sent by BibSched.

    Logs and records the states SUICIDING and SUICIDED one after the
    other, then exits with status 0.
    """
    for note, state in (("suiciding myself now...", "SUICIDING"),
                        ("suicided", "SUICIDED")):
        write_message(note)
        task_update_state(state)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handler for any other signal: log its number and otherwise ignore it."""
    note = "unknown signal %d ignored" % sig
    write_message(note)
def getpass(prompt = "Password: "):
    """Ask for a password on the terminal with echo switched off.

    The ECHO bit of the terminal's local flags is cleared while the
    line is read, and the previous terminal attributes are put back
    afterwards, also when reading fails.  Returns the text entered.
    """
    import termios, sys
    fd = sys.stdin.fileno()
    saved_attrs = termios.tcgetattr(fd)
    silent_attrs = termios.tcgetattr(fd)
    # index 3 holds the local flags (lflags)
    silent_attrs[3] = silent_attrs[3] & ~termios.ECHO
    passwd = ""
    try:
        termios.tcsetattr(fd, termios.TCSADRAIN, silent_attrs)
        passwd = raw_input(prompt)
        # the typed newline was not echoed, so emit one
        print
    finally:
        termios.tcsetattr(fd, termios.TCSADRAIN, saved_attrs)
    return passwd
def authenticate(user):
    """Check 'user' against the user table and return the user name.

    An empty 'user' is read from stdin (lower-cased and stripped).  The
    stored password is looked up by e-mail; when it is non-empty, a
    password is asked for with getpass() and compared with it.  The
    process exits with status 1 when the user is not found or the
    passwords differ.
    NOTE: Access might be more complex in the future.
    """
    print("BibReformat Task Submission")
    print("=========================")
    if user == "":
        print >> sys.stdout, "\rUsername: ",
        user = sys.stdin.readline().lower().strip()
    else:
        print >> sys.stdout, "\rUsername:", user
    res = run_sql("select password from user where email=%s", (user,), 1)
    if not res:
        print("Sorry, %s seems to be unauthorized user. Exiting." % user)
        sys.exit(1)
    password_db = res[0][0]
    if password_db == '':
        # no authentication needed
        return user
    # authentication needed
    password_entered = getpass()
    if password_entered == password_db:
        return user
    print("Sorry, you seem to be unauthorized user. Exiting.")
    sys.exit(1)
def task_submit(options):
    """Submit a task to the BibSched task queue and return its id.
    This is what people will be invoking via command line.

    'options' is the dictionary of command-line options.  It is modified
    in place: an existing "task" key is dropped and, after the row has
    been inserted, "task" is set to the value run_sql() returned for the
    INSERT, which is then stored back into the row's arguments.

    The task is queued with the module-level 'sleeptime' and
    'sched_time'.  The global statement used to name 'sleep_time', a
    variable that does not exist; it now names the one that is read.
    """
    global sched_time, sleeptime
    ## sanity check: remove eventual "task" option:
    if "task" in options:
        del options["task"]
    ## authenticate user:
    user = authenticate(options.get("user", ""))
    ## submit task:
    # NOTE(review): sched_time is escaped here and also passed as a bound
    # parameter - confirm run_sql does not escape it a second time.
    task_id = run_sql("""INSERT INTO schTASKS (id,proc,user,state,arguments,sleeptime,date) VALUES (NULL,'bibreformat',%s,'WAITING',%s,%s,%s)""",
                      (user, marshal.dumps(options),sleeptime,MySQLdb.escape_string(sched_time)))
    ## update task number:
    options["task"] = task_id
    run_sql("""UPDATE schTASKS SET arguments=%s WHERE id=%s""", (marshal.dumps(options),task_id))
    write_message("Task #%d submitted." % task_id)
    return task_id
def task_update_progress(msg):
    """Store msg in the progress column of the current task's schTASKS row.

    The row is the one whose id equals the global task_id; the result of
    run_sql() is returned.
    """
    global task_id
    statement = "UPDATE schTASKS SET progress=%s where id=%s"
    return run_sql(statement, (msg, task_id))
def task_update_state(val):
    """Store val in the state column of the current task's schTASKS row.

    The row is the one whose id equals the global task_id; the result of
    run_sql() is returned.
    """
    global task_id
    statement = "UPDATE schTASKS SET state=%s where id=%s"
    return run_sql(statement, (val, task_id))
def task_read_state(task_id):
    """Return the state stored in schTASKS for task_id.

    'UNKNOWN' is returned when the query result has no first row or
    column to read.  Only the lookup errors are caught: the former bare
    'except' would also have swallowed SystemExit or KeyboardInterrupt
    raised while this function was running.
    """
    res = run_sql("SELECT state FROM schTASKS where id=%s", (task_id,), 1)
    try:
        out = res[0][0]
    except (IndexError, TypeError):
        out = 'UNKNOWN'
    return out
def task_get_options(id):
    """Return the options of BibReformat task 'id' from the BibSched queue.

    The arguments column of the matching schTASKS row is decoded with
    marshal.  When that fails for any reason an error is logged and the
    process exits with status 1.
    """
    rows = run_sql("SELECT arguments FROM schTASKS WHERE id=%s AND proc='bibreformat'", (id,))
    try:
        return marshal.loads(rows[0][0])
    except:
        write_message("Error: BibReformat task %d does not seem to exist." % id)
        sys.exit(1)
def task_run(process_hb):
    """Run the task whose id is in the global task_id.
    This is what BibSched will be invoking via daemon call.

    The options stored for the task are read and turned into the
    selection passed to bibreformat_task(): the module-level
    'sql_queries' list and 'cds_query' dictionary, the 'process' flag
    and the format 'fmt'.  The task only runs when its state is WAITING;
    the state is set to RUNNING before and DONE after the run.

    The "format" option is now applied before the SQL selections are
    built, and the selections are built from that format.  Before, the
    queries came from strings interpolated with the default format when
    the module was loaded, so the option had no effect on them.
    """
    global task_id, process, fmt, sched_time
    options = task_get_options(task_id) # get options from BibSched task table
    ## check task id:
    if "task" not in options:
        write_message("Error: The task #%d does not seem to be a BibReformat task." % task_id)
        return
    ## initialize parameters
    if "format" in options:
        fmt = options["format"]
    # the format ends up inside SQL text, so escape it first
    fmt_sql = MySQLdb.escape_string(fmt)
    if "all" in options:
        sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt_sql)
    if "without" in options:
        process_hb = 1
    if "noprocess" in options:
        process = 0
    if "last" in options:
        sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt_sql)
    for key in ("collection", "field", "pattern"):
        cds_query[key] = options.get(key, "")
    ## check task state:
    task_state = task_read_state(task_id)
    if task_state != "WAITING":
        write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_state))
        return
    ## update task state:
    task_update_state("RUNNING")
    ## initialize signal handler:
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    ## run the task:
    bibreformat_task(sql_queries, cds_query, process_hb)
    ## we are done:
    task_update_state("DONE")
    return
def usage(exitcode=1, msg=""):
    """Write the usage text to stderr and exit with 'exitcode'.

    When 'msg' is true, an "Error: <msg>." line is written first.
    """
    emit = sys.stderr.write
    if msg:
        emit("Error: %s.\n" % msg)
    emit("Usage: %s [options]\n" % sys.argv[0])
    help_lines = (
        " -u, --user=USER \t\t User name to submit the task as, password needed.\n",
        " -h, --help \t\t Print this help.\n",
        " -V, --version \t\t Print version information.\n",
        " -d, --debug \t\t Print debugging information.\n",
        " -s, --sleeptime=SLEEP\t\t Time after which to repeat tasks (no)\n",
        " -t, --time=DATE \t\t Moment for the task to be active (now).\n",
        " -a, --all \t\t All records\n",
        " -c, --collection \t\t Select records by collection\n",
        " -f, --field \t\t Select records by field.\n",
        " -p, --pattern \t\t Select records by pattern.\n",
        " -o, --format \t\t Specify output format to be (re-)created. (default HB)\n",
        " -n, --noprocess \t\t Count records to be processed only (no processing done)\n",
        "\n",
        " Example: bibreformat -n Show how many records are to be bibreformated.",
        "\n",
    )
    for line in help_lines:
        emit(line)
    sys.exit(exitcode)
def main():
    """Analyze the command line and either run or submit a task.
    Useful for learning on how to write BibSched tasks.

    A single all-digit argument is taken as a task id and the task is
    run with task_run().  Anything else is parsed as options, collected
    into a dictionary and submitted with task_submit().  With no
    arguments at all, or with -n/--noprocess as the only argument, the
    selection defaults to "without" plus "last".

    Changes compared with the previous version:
      - an error raised while handling an option is now reported through
        usage(1, error); the error used to be passed as the exit code,
        so no "Error:" line was printed;
      - -w/--without and -l/--last, which getopt already accepted, are
        now acted upon instead of being silently dropped;
      - the short option 'l' is listed only once.
    """
    global task_id, sched_time, sleeptime
    ## parse command line:
    if len(sys.argv) == 2 and sys.argv[1].isdigit():
        ## A - run the task
        task_id = int(sys.argv[1])
        process_hb = 0
        task_run(process_hb)
    else:
        ## B - submit the task
        options = {} # will hold command-line options
        # sys.exc_info() is used to get at the exception so that this block
        # parses under both Python 2 and Python 3
        try:
            opts, args = getopt.getopt(sys.argv[1:], "hVdu:ac:f:s:p:lo:nt:w", ["help", "version", "debug","user=","all","collection=","field=","sleeptime=","pattern=","format=","noprocess","time=","without","last"])
        except getopt.GetoptError:
            usage(1, sys.exc_info()[1])
        if len(sys.argv) == 1: # default
            options["without"] = 1
            options["last"] = 1
        try:
            for opt in opts:
                if opt[0] in ["-h", "--help"]:
                    usage(0)
                elif opt[0] in ["-V", "--version"]:
                    print(__version__)
                    sys.exit(0)
                elif opt[0] in [ "-u", "--user"]:
                    options["user"] = opt[1]
                elif opt[0] in ["-d", "--debug"]:
                    options["debug"] = 1
                elif opt[0] in ["-a", "--all"]:
                    options["all"] = 1
                    options["without"] = 1
                elif opt[0] in ["-w", "--without"]:
                    options["without"] = 1
                elif opt[0] in ["-l", "--last"]:
                    options["last"] = 1
                elif opt[0] in ["-c", "--collection"]:
                    options["collection"]=opt[1]
                elif opt[0] in ["-n", "--noprocess"]:
                    options["noprocess"] = 1
                    if len(sys.argv) == 2:
                        # -n alone: count what the default selection would process
                        options["without"] = 1
                        options["last"] = 1
                elif opt[0] in ["-f", "--field"]:
                    options["field"] = opt[1]
                elif opt[0] in ["-p","--pattern"]:
                    options["pattern"] = opt[1]
                elif opt[0] in ["-o","--format"]:
                    options["format"] = opt[1]
                elif opt[0] in ["-s", "--sleeptime" ]:
                    get_date(opt[1]) # see if it is a valid shift
                    sleeptime = opt[1]
                elif opt[0] in [ "-t", "--time" ]:
                    sched_time = get_date(opt[1])
        except StandardError:
            usage(1, sys.exc_info()[1])
        task_submit(options)
    return
### okay, here we go:
if __name__ == '__main__':
main()

Event Timeline