Page MenuHomec4science

bibreformat.in
No OneTemporary

File Metadata

Created
Fri, May 10, 00:02

bibreformat.in

## $Id$
## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
## import interesting modules:
pylibdir = "<LIBDIR>/python"
## okay, rest of the Python code goes below
#######
## version number:
__version__ = "$Id$"
## import interesting modules:
try:
import sys
sys.path.append('%s' % pylibdir)
from cdsware.dbquery import run_sql
from cdsware.config import *
from cdsware.search_engine import perform_request_search
import getopt
import urllib
import marshal
import signal
import string
import sys
import os
import time
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
## Module-level state shared by the functions below.
# raw SQL statements; task_run() appends to this list and
# bibreformat_task() runs each one through run_sql()
sql_queries = []
# [collection, field, phrase] values appended by task_run() and handed to
# perform_request_search() in bibreformat_task(); "" means "not given"
mysql_queries = []
# module-level default; task_run() and main() work on a local of the same name
process_hb = 0
# task_run() sets this to 0 when the "noprocess" option is present;
# bibreformat_task() only calls iterate_over() while it is true
process = 1
fmt = "hb" # default processing format
def bibreformat_task(sql_queries, mysql_queries, process_hb):
global process
t1 = os.times()[4]
### Options, parameters
###
if len(sql_queries)+len(mysql_queries) == 0:
print_info()
sys.exit()
if process_hb:
without_hb = withouthb()
recIDs = []
if (mysql_queries[0] != "") or (mysql_queries[1] != "") or (mysql_queries[2] != ""):
res = perform_request_search(req=None, of='id', c=mysql_queries[0], p=mysql_queries[2], f=mysql_queries[1]).tolist()
for item in res:
recIDs.append(item)
for sql_query in sql_queries:
res = run_sql(sql_query)
for item in res:
recIDs.append(item[0])
### list of corresponding record IDs was retrieved
### bibformat the records selected
if process_hb:
print "Records to be processed: %d" % (len(recIDs)+len(without_hb))
else:
print "Records to be processed: %d" % (len(recIDs))
### Initialize main loop
total_rec = 0 # Total number of records
xml_content = '' # hold the contents
tbibformat = 0 # time taken up by external call
tbibupload = 0 # time taken up by external call
weburl = "<WEBURL>"
### Iterate over all records prepared in lists I (option)
if process:
iterate_over(recIDs, weburl)
### Iterate over all records prepared in list II (no_hb)
if process_hb and process:
iterate_over(without_hb, weburl)
### Final statistics
t2 = os.times()[4]
elapsed = t2 - t1
message = "total records processed: %d" % total_rec
print message
message = "total processing time: %2f sec" % elapsed
print message
message = "Time spent on external call (os.system):"
print message
message = " bibformat: %2f sec" % tbibformat
print message
message = " bibupload: %2f sec" % tbibupload
print message
### Fetch record XML by recID (via HTTP)
###
def read_xml_input(weburl, query):
    """Fetch '<weburl>/search.py?<query>' over HTTP and return the body.

    weburl -- base URL of the web interface
    query  -- already formatted query string (without the leading '?')
    """
    response = urllib.urlopen("%s/search.py?%s" % (weburl, query))
    return response.read()
### Result set operations
###
def lhdiff(l1, l2):
    """Return the items of l1 that do not occur in l2, in l1's order.

    l2 is loaded into a dictionary first, so every membership test is a
    hash lookup; the items of both lists therefore have to be hashable.
    Items repeated in l1 are repeated in the result.
    """
    excluded = {}
    for item in l2:
        excluded[item] = 1
    return [item for item in l1 if item not in excluded]
### Result set operations
###
def ldiff(l1, l2):
    """Return l1 - l2: the items of l1 not found in l2, in l1's order.

    Membership is tested directly against l2, so the items do not have
    to be hashable.
    """
    return [item for item in l1 if item not in l2]
### Identify recIDs of records with missing hb
###
def withouthb():
    """Return the IDs of records stored as 'xm' but not yet in format 'fmt'.

    'fmt' is the module-level processing format.  Both ID lists are read
    from the bibrec/bibfmt tables and subtracted with lhdiff().
    """
    xm_query = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'"
    fmt_query = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt
    ## complete recID list of xm formatted records
    xm_ids = [row[0] for row in run_sql(xm_query)]
    ## complete recID list of records already having the target format
    fmt_ids = [row[0] for row in run_sql(fmt_query)]
    return lhdiff(xm_ids, fmt_ids)
### Bibreformat all selected records
###
def iterate_over(list, weburl):
"Iterate odver list of IDs"
n_rec = 0
n_max = 1000
total_rec = 0 # Total number of records
xml_content = '' # hold the contents
tbibformat = 0 # time taken up by external call
tbibupload = 0 # time taken up by external call
tmpdir = "<TMPDIR>"
bindir = "<BINDIR>"
for record in list:
n_rec = n_rec + 1
total_rec = total_rec + 1
message = "Processing record: %d" % (record)
print message
query = "id=%d&of=xm" % (record)
xml_content = xml_content + read_xml_input(weburl, query)
if xml_content:
if n_rec >= n_max:
filename = "%s/bibreformat.xml" % tmpdir
filehandle = open(filename ,"w")
filehandle.write(xml_content)
filehandle.close()
### bibformat external call
###
t11 = os.times()[4]
message = "START bibformat external call"
print message
command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s/rec_fmt.xml 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,tmpdir,tmpdir)
os.system(command)
t22 = os.times()[4]
message = "END bibformat external call (time elapsed:%2f)" % (t22-t11)
print message
tbibformat = tbibformat + (t22 - t11)
### bibupload external call
###
t11 = os.times()[4]
message = "START bibupload external call"
print message
command = "%s/bibupload -f %s/rec_fmt.xml" % (bindir,tmpdir)
os.system(command)
t22 = os.times()[4]
message = "END bibupload external call (time elapsed:%2f)" % (t22-t11)
print message
tbibupload = tbibupload + (t22- t11)
n_rec = 0
xml_content = ''
### Process the last re-formated chunk
###
if n_rec > 0:
print "Processing last record set (%d)" % n_rec
filename = "%s/bibreformat.xml" % tmpdir
filehandle = open(filename ,"w")
filehandle.write(xml_content)
filehandle.close()
### bibformat external call
###
t11 = os.times()[4]
message = "START bibformat external call"
print message
command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s/rec_fmt.xml 2> %s/bibreformat.err" % (bindir,string.upper(fmt),tmpdir,tmpdir,tmpdir)
os.system(command)
t22 = os.times()[4]
message = "END bibformat external call (time elapsed:%2f)" % (t22-t11)
print message
tbibformat = tbibformat + (t22 - t11)
### bibupload external call
###
t11 = os.times()[4]
message = "START bibupload external call"
print message
command = "%s/bibupload -f %s/rec_fmt.xml" % (bindir,tmpdir)
os.system(command)
t22 = os.times()[4]
message = "END bibupload external call (time elapsed:%2f)" % (t22-t11)
print message
tbibupload = tbibupload + (t22- t11)
return
### BibSched compatibility procedures
###
def write_message(msg, stream=sys.stdout):
    """Write a timestamped 'msg' line to 'stream' and flush it.

    Only sys.stdout and sys.stderr are accepted; for any other stream a
    complaint is written to sys.stderr and 'msg' itself is dropped.
    """
    if not (stream == sys.stdout or stream == sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(timestamp)
    stream.write("%s\n" % msg)
    stream.flush()
def task_sig_sleep(sig, frame):
    """Handle the BibSched 'sleep' signal: record the SLEEPING state and
    suspend the process until another signal arrives."""
    note, state = "sleeping...", "SLEEPING"
    write_message(note)
    task_update_state(state)
    # blocks until the next signal is received
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle the BibSched 'wakeup' signal: log it and record the
    CONTINUING state in the task table."""
    note, state = "continuing...", "CONTINUING"
    write_message(note)
    task_update_state(state)
def task_sig_stop(sig, frame):
    """Handle the BibSched 'stop' signal: mark the task STOPPING, go
    through the (placeholder) shutdown steps, mark it STOPPED and exit
    with status 0."""
    write_message("stopping...")
    task_update_state("STOPPING")
    # each step is only announced and followed by a fixed pause
    shutdown_steps = (
        ("flushing cache or whatever...", 3),
        ("closing tables or whatever...", 1),
    )
    for note, pause in shutdown_steps:
        write_message(note)
        time.sleep(pause)
    write_message("stopped")
    task_update_state("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Handle the BibSched 'suicide' signal: record SUICIDING and then
    SUICIDED in the task table, logging each step, and exit with status 0."""
    transitions = (
        ("suiciding myself now...", "SUICIDING"),
        ("suicided", "SUICIDED"),
    )
    for note, state in transitions:
        write_message(note)
        task_update_state(state)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handle any other signal: log its number and otherwise ignore it."""
    note = "unknown signal %d ignored" % sig
    write_message(note)
def getpass(prompt = "Password: "):
    """Read a line from the terminal with echo switched off and return it.

    The terminal attributes of stdin are put back to their previous
    values afterwards, even when reading fails.
    """
    import termios, sys
    stdin_fd = sys.stdin.fileno()
    saved_attrs = termios.tcgetattr(stdin_fd)
    silent_attrs = termios.tcgetattr(stdin_fd)
    # index 3 holds the local-mode flags; clear the ECHO bit
    silent_attrs[3] &= ~termios.ECHO
    passwd = ""
    try:
        termios.tcsetattr(stdin_fd, termios.TCSADRAIN, silent_attrs)
        passwd = raw_input(prompt)
        # the user's Enter was not echoed, so emit the newline ourselves
        print
    finally:
        termios.tcsetattr(stdin_fd, termios.TCSADRAIN, saved_attrs)
    return passwd
def authenticate(user):
"""Authenticates a user against the user database.
NOTE: Access might be more complex in the future"""
print "BibReformat Task Submission"
print "========================="
if user == "":
print >> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print >> sys.stdout, "\rUsername:", user
res = run_sql("select password from user where email=%s", (user,), 1)
if res:
row = res[0]
password_db = row[0]
if password_db != '':
# authentication needed
password_entered = getpass()
if password_db == password_entered:
return user
else:
print "Sorry, you seem to be unauthorized user. Exiting."
sys.exit(1)
else:
# no authentication needed
return user
else:
print "Sorry, %s seems to be unauthorized user. Exiting." % user
sys.exit(1)
def task_submit(options):
    """Insert a new WAITING 'bibreformat' row into the BibSched task queue.

    options -- dictionary of command-line options; it is stored marshalled
               in the row and gets its "task" entry set to the new task ID

    The user named in options["user"] (or asked for interactively) is
    authenticated first.  Returns the ID of the submitted task.
    """
    ## sanity check: a stale "task" entry must not be stored
    try:
        del options["task"]
    except KeyError:
        pass
    ## authenticate user:
    user = authenticate(options.get("user", ""))
    ## submit task:
    insert_args = (user, marshal.dumps(options))
    task_id = run_sql("""INSERT INTO schTASKS (id,proc,user,state,date,arguments) VALUES (NULL,'bibreformat',%s,'WAITING',NOW(),%s)""",
                      insert_args)
    ## store the options again, now including the task number:
    options["task"] = task_id
    update_args = (marshal.dumps(options), task_id)
    run_sql("""UPDATE schTASKS SET arguments=%s WHERE id=%s""", update_args)
    write_message("Task #%d submitted." % task_id)
    return task_id
def task_update_progress(msg):
    """Store 'msg' in the progress column of the current task's row
    (module-level task_id) and return what run_sql() returns."""
    query = "UPDATE schTASKS SET progress=%s where id=%s"
    return run_sql(query, (msg, task_id))
def task_update_state(val):
    """Store 'val' in the state column of the current task's row
    (module-level task_id) and return what run_sql() returns."""
    query = "UPDATE schTASKS SET state=%s where id=%s"
    return run_sql(query, (val, task_id))
def task_read_state(task_id):
    """Return the state stored for task 'task_id' in the BibSched task
    table, or 'UNKNOWN' when the query yields no usable row."""
    res = run_sql("SELECT state FROM schTASKS where id=%s", (task_id,), 1)
    try:
        out = res[0][0]
    except (IndexError, TypeError):
        # empty or missing result; a bare except would also have
        # swallowed SystemExit and KeyboardInterrupt
        out = 'UNKNOWN'
    return out
def task_get_options(id):
    """Return the options dictionary of BibReformat task 'id'.

    The marshalled arguments are read from the BibSched task queue table.
    When no such 'bibreformat' row exists, or its arguments cannot be
    unmarshalled, an error is written and the process exits with status 1.
    """
    res = run_sql("SELECT arguments FROM schTASKS WHERE id=%s AND proc='bibreformat'", (id,))
    try:
        out = marshal.loads(res[0][0])
    except (IndexError, TypeError, ValueError, EOFError):
        # IndexError/TypeError: no row came back;
        # ValueError/EOFError/TypeError: marshal.loads() rejected the data
        write_message("Error: BibReformat task %d does not seem to exist." % id)
        sys.exit(1)
    return out
def task_run(process_hb, fmt):
    """Run the task whose ID is in the module-level task_id.

    This is what BibSched will be invoking via daemon call.

    process_hb -- initial value of the "also process records lacking the
                  format" flag; the "new" option switches it on
    fmt        -- default output format; the "format" option overrides it

    The options are read from the BibSched task table, turned into record
    selections, and bibreformat_task() is called.  Returns nothing; the
    function returns early, after writing an error, when the row is not a
    BibReformat task or the task is not in state WAITING.
    """
    global task_id, process
    options = task_get_options(task_id) # get options from BibSched task table
    ## check task id:
    if not options.has_key("task"):
        write_message("Error: The task #%d does not seem to be a BibReformat task." % task_id)
        return
    ## the output format has to be settled before any query embedding it
    ## is built below
    if options.has_key("format"):
        fmt = options["format"]
    # withouthb() and iterate_over() read the module-level 'fmt'; it cannot
    # be declared global here because it is also a parameter name
    globals()["fmt"] = fmt
    ## initialize parameters
    # NOTE(review): fmt is interpolated into the SQL text unescaped
    if options.has_key("all"):
        sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt)
    if options.has_key("new"):
        process_hb = 1
    if options.has_key("noprocess"):
        process = 0
    if options.has_key("last"):
        sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt)
    # always three entries, in this order; "" stands for "not given"
    for key in ("collection", "field", "phrase"):
        mysql_queries.append(options.get(key, ""))
    ## check task state:
    task_state = task_read_state(task_id)
    if task_state != "WAITING":
        write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_state))
        return
    ## update task state:
    task_update_state("RUNNING")
    ## initialize signal handler:
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    ## run the task:
    bibreformat_task(sql_queries, mysql_queries, process_hb)
    ## we are done:
    task_update_state("DONE")
    return
def usage(exitcode=1, msg=""):
    """Write the usage text to stderr and exit.

    exitcode -- status handed to sys.exit()
    msg      -- optional error text, printed first as "Error: <msg>."

    Never returns: sys.exit() is always called.
    """
    if msg:
        sys.stderr.write("Error: %s.\n" % msg)
    help_lines = (
        "Usage: %s [options]\n" % sys.argv[0],
        " -u, --user=USER \t\t User name to submit the task as, password needed.\n",
        " -h, --help \t\t Print this help.\n",
        " -V, --version \t\t Print version information.\n",
        " -d, --debug \t\t Print debugging information.\n",
        " -a, --all \t\t All records\n",
        " -c, --collection\t\t Select records by collection\n",
        " -f, --field \t\t Select records by field.\n",
        " -p, --phrase \t\t Select records by phrase.\n",
        " -o, --format \t\t Specify output format to be (re-)created. (default HB)\n",
        " -n, --noprocess \t\t Count records to be processed only (no processing done)\n",
        "\n",
        # the final newline keeps the shell prompt off the help text
        " Example: bibreformat -l -b Standard synchronization bibreformat run.\n",
    )
    for line in help_lines:
        sys.stderr.write(line)
    sys.exit(exitcode)
def main():
"""Main function that analyzes command line input and calls whatever is appropriate.
Useful for learning on how to write BibSched tasks."""
global task_id
## parse command line:
if len(sys.argv) == 2 and sys.argv[1].isdigit():
## A - run the task
task_id = int(sys.argv[1])
process_hb = 0
fmt = "hb"
task_run(process_hb, fmt)
else:
## B - submit the task
process_hb = 0
options = {} # will hold command-line options
try:
opts, args = getopt.getopt(sys.argv[1:], "hVdu:ac:f:p:lo:n", ["help", "version", "debug","user=","all","collection=","field=","phrase=","format=","noprocess"])
except getopt.GetoptError, err:
usage(1, err)
if len(sys.argv) == 1:
options["new"] = 1
options["last"] = 1
try:
for opt in opts:
if opt[0] in ["-h", "--help"]:
usage(0)
elif opt[0] in ["-V", "--version"]:
print __version__
sys.exit(0)
elif opt[0] in [ "-u", "--user"]:
options["user"] = opt[1]
elif opt[0] in ["-d", "--debug"]:
options["debug"] = 1
elif opt[0] in ["-a", "--all"]:
options["all"] = 1
options["new"] = 1
elif opt[0] in ["-c", "--collection"]:
options["collection"]=opt[1]
elif opt[0] in ["-n", "--noprocess"]:
options["noprocess"] = 1
elif opt[0] in ["-f", "--field"]:
options["field"] = opt[1]
elif opt[0] in ["-p","--phrase"]:
options["phrase"] = opt[1]
elif opt[0] in ["-o","--format"]:
options["format"] = opt[1]
elif opt[0] in ["-n","--noprocess"]:
options["noproces"] = 1
except StandardError, e:
usage(e)
task_submit(options)
return
### okay, here we go:
# run main() only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Event Timeline