Page MenuHomec4science

bibreformat.in
No OneTemporary

File Metadata

Created
Sat, May 11, 16:49

bibreformat.in

#!@PYTHON@
## -*- mode: python; coding: utf-8; -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Call BibFormat engine and create HTML brief (and other) formats for bibliographic records.
Upload formats via BibUpload."""
__version__ = "$Id$"
## import interesting modules:
try:
import sys
pylibdir = "@prefix@/lib/python"
sys.path.append('%s' % pylibdir)
from invenio.dbquery import run_sql, escape_string
from invenio.config import *
from invenio.search_engine import perform_request_search
from invenio.search_engine import print_record, encode_for_xml
from invenio.access_control_engine import acc_authorize_action
from invenio.bibformat import format_record
from invenio.bibformat_utils import encode_for_xml
from invenio.bibformat_config import use_old_bibformat
from invenio.bibrecord import create_records
import getopt
import getpass
import marshal
import signal
import string
import sys
import os
import re
import time
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
# Module-level state shared by the task functions in this file.
options = {} # task options; replaced by main() and task_run()
sql_queries = [] # SQL queries selecting record IDs; appended to by task_run()
cds_query = {} # CDS query parameters (keys: field, collection, pattern)
process_format = 0 # flag, process records without created format
process = 1 # flag, process records (task_run() sets it to 0 for count only)
fmt = "hb" # default format to be processed
sleeptime = "" # default sleeptime (main() overrides it from -s/--sleeptime)
format_string = "%Y-%m-%d %H:%M:%S" # date/time format
sched_time = time.strftime(format_string) # scheduled execution time; defaults to the moment this module is loaded
### Run the bibreformat task scheduled by BibSched
###
def bibreformat_task(sql, sql_queries, cds_query, process_format):
    """Select the records to (re)format, format them and print statistics.

    Parameters:
      sql            - dictionary of SQL statements; passed to without_fmt()
                       when process_format is set
      sql_queries    - list of SQL statements whose first result column is
                       a record ID
      cds_query      - dictionary with 'field', 'collection' and 'pattern'
                       keys; a search is run when any of them is non-empty
      process_format - true value to also handle records that do not have
                       the format yet

    Reads the module-level 'process' and 'fmt' settings.  Nothing is
    returned; progress and the final statistics are printed.
    """
    t1 = os.times()[4]
    ### Query the database
    ###
    # Always bound, so later code never depends on process_format being set.
    without_format = []
    if process_format:
        print("Querying database for records with missing format ...")
        without_format = without_fmt(sql)
    recIDs = []
    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":
        print("Querying database for records with old format (CDS query)...")
        res = perform_request_search(req=None, of='id',
                                     c=cds_query['collection'],
                                     p=cds_query['pattern'],
                                     f=cds_query['field'])
        recIDs.extend(res)
    for sql_query in sql_queries:
        print("Querying database for records with old format (SQL query) ...")
        for item in run_sql(sql_query):
            recIDs.append(item[0])
    ### list of corresponding record IDs was retrieved
    ### bibformat the records selected
    if process_format:
        print("Records to be processed: %d" % (len(recIDs)+len(without_format)))
        print("Out of it records without created format: %d" % len(without_format))
    else:
        print("Records to be processed: %d" % (len(recIDs)))
    ### Initialize statistics
    total_rec = 0  # Total number of records handed to the formatter
    tbibformat = 0 # time taken up by external call
    tbibupload = 0 # time taken up by external call
    if process:
        # FIXME: remove this when migration from php to python bibformat is done
        if use_old_bibformat:
            iterate = iterate_over_old
        else:
            iterate = iterate_over_new
        ### list I: records selected by the queries;
        ### list II: records without the format (only when requested)
        batches = [recIDs]
        if process_format:
            batches.append(without_format)
        for batch in batches:
            stats = iterate(batch, weburl, fmt)
            total_rec += len(batch)
            # An iteration helper may report
            # (records, bibformat seconds, bibupload seconds); one that
            # returns None simply contributes no timing information.
            if stats:
                tbibformat += stats[1]
                tbibupload += stats[2]
    ### Final statistics
    t2 = os.times()[4]
    elapsed = t2 - t1
    print("total records processed: %d" % total_rec)
    print("total processing time: %2f sec" % elapsed)
    print("Time spent on external call (os.system):")
    print(" bibformat: %2f sec" % tbibformat)
    print(" bibupload: %2f sec" % tbibupload)
### Result set operations
###
def lhdiff(l1, l2):
    """Return the elements of l1 that are not in l2, keeping l1's order.

    The membership test goes through an intermediate dictionary built
    from l2, so the elements must be hashable.  Duplicates in l1 are kept.
    """
    seen = {}
    for e in l2:
        seen[e] = 1
    # 'in' replaces the deprecated dict.has_key() with the same semantics.
    return [e for e in l1 if e not in seen]
### Result set operations
###
def ldiff(l1, l2):
    """Return the elements of l1 that do not occur in l2 (l1 - l2).

    The order of l1 and its duplicates are preserved; membership is
    tested directly against l2.
    """
    return [item for item in l1 if item not in l2]
### Identify recIDs of records with missing format
###
def without_fmt(sql):
    """Return the IDs of records that still lack the requested format.

    sql['q1'] is run to obtain the record IDs having the 'xm' format and
    sql['q2'] to obtain the record IDs already formatted; the result is
    the first list minus the second (see lhdiff).  Only the first column
    of each result row is used.
    """
    ## complete recID list of xm formatted records
    xm_ids = [row[0] for row in run_sql(sql['q1'])]
    ## complete recID list of formatted records
    formatted_ids = [row[0] for row in run_sql(sql['q2'])]
    return lhdiff(xm_ids, formatted_ids)
### Bibreformat all selected records (using new python bibformat)
### (see iterate_over_old further down)
def iterate_over_new(list, weburl, fmt):
    """Format the given records with the Python BibFormat and upload them.

    Parameters:
      list   - sequence of record IDs to format
      weburl - unused; kept for signature compatibility with
               iterate_over_old
      fmt    - output format code passed to format_record()

    Formatted records are wrapped in the <record> envelope bibupload
    expects, written to a file in tmpdir and uploaded with
    'bibupload -f' every n_it_max records and after the last record.

    Returns a tuple (number of records formatted, seconds spent in
    format_record, seconds spent in bibupload).
    """
    n_it_max = 10000        # Number of max records in one upload batch
    total_rec = len(list)   # Total number of records
    n_rec = 0               # Number of formatted records
    tbibformat = 0          # time taken up by formatting
    tbibupload = 0          # time taken up by bibupload
    formatted_records = []  # formatted records of the current batch
    # Envelope that bibupload understands
    prologue = '''
<record>
<controlfield tag="001">%s</controlfield>
<datafield tag="FMT" ind1="" ind2="">
<subfield code="f">%s</subfield>
<subfield code="g">'''
    epilogue = '''
</subfield>
</datafield>
</record>'''
    for recID in list:
        n_rec += 1
        print("Processing record %d with format %s (New BibFormat)" % (recID, fmt))
        ### bibformat call
        ###
        t11 = os.times()[4]
        print("START bibformat external call")
        formatted_record = format_record(recID, fmt)
        t22 = os.times()[4]
        print("END bibformat external call (time elapsed:%2f)" % (t22-t11))
        tbibformat += t22 - t11
        formatted_records.append(prologue % (recID, fmt)
                                 + encode_for_xml(formatted_record)
                                 + epilogue)
        # every n_it_max records, upload all formatted records.
        # also upload if recID is the last one
        if len(formatted_records) >= n_it_max or n_rec == total_rec:
            # Save formatted records to disk for bibupload
            finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir, time.strftime('%Y%m%d_%H%M%S'))
            filehandle = open(finalfilename, "w")
            try:
                filehandle.write(''.join(formatted_records))
            finally:
                filehandle.close()
            ### bibupload external call
            ###
            t11 = os.times()[4]
            print("START bibupload external call")
            command = "%s/bibupload -f %s" % (bindir, finalfilename)
            os.system(command)
            t22 = os.times()[4]
            print("END bibupload external call (time elapsed:%2f)" % (t22-t11))
            tbibupload += t22 - t11
            # Start the next batch empty; otherwise every later upload
            # would contain all previously uploaded records again.
            formatted_records = []
    return (n_rec, tbibformat, tbibupload)
def _reformat_and_upload_chunk(xml_content, fmt):
    """Run the external bibformat and bibupload commands on one XML chunk.

    The chunk is written to <tmpdir>/bibreformat.xml, converted by the
    bibformat executable into a time-stamped rec_fmt_*.xml file and that
    file is handed to 'bibupload -f'.

    Returns a tuple (seconds spent in bibformat, seconds spent in bibupload).
    """
    finalfilename = "%s/rec_fmt_%s.xml" % (tmpdir, time.strftime('%Y%m%d_%H%M%S'))
    filename = "%s/bibreformat.xml" % tmpdir
    filehandle = open(filename, "w")
    try:
        filehandle.write(xml_content)
    finally:
        filehandle.close()
    ### bibformat external call
    ###
    t11 = os.times()[4]
    print("START bibformat external call")
    # NOTE(review): fmt is placed into a shell command line unescaped;
    # confirm that it can only come from trusted task options.
    command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (bindir, fmt.upper(), tmpdir, finalfilename, tmpdir)
    os.system(command)
    t22 = os.times()[4]
    print("END bibformat external call (time elapsed:%2f)" % (t22-t11))
    tbibformat = t22 - t11
    ### bibupload external call
    ###
    t11 = os.times()[4]
    print("START bibupload external call")
    command = "%s/bibupload -f %s" % (bindir, finalfilename)
    os.system(command)
    t22 = os.times()[4]
    print("END bibupload external call (time elapsed:%2f)" % (t22-t11))
    return (tbibformat, t22 - t11)

def iterate_over_old(list, weburl, fmt):
    """Format the given records with the old bibformat executable.

    Parameters:
      list   - sequence of record IDs to format
      weburl - only used in the error message printed when a record
               cannot be retrieved
      fmt    - output format code; upper-cased for the bibformat command

    The 'xm' output of every record is collected and processed in
    chunks of n_max records, plus a final chunk for the remainder.
    A record whose output is empty is retried up to 10 times, 10 seconds
    apart; if it is still empty the program exits with status 1.

    Returns a tuple (number of records processed, seconds spent in
    bibformat, seconds spent in bibupload).
    """
    n_max = 10000
    max_retries = 10
    n_rec = 0       # records collected in the current chunk
    total_rec = 0   # Total number of records
    chunk = []      # hold the contents of the current chunk
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call
    for record in list:
        n_rec += 1
        total_rec += 1
        print("Processing record: %d" % (record))
        query = "id=%d&of=xm" % (record)
        contents = print_record(record, 'xm')
        count = 0
        # Wait *before* each retry, and stop as soon as a retry succeeds.
        while contents == "" and count < max_retries:
            time.sleep(10)
            contents = print_record(record, 'xm')
            count += 1
        # Give up only if the record is still empty; a successful last
        # retry must not abort the run.
        if contents == "":
            sys.stderr.write("Failed to download %s from %s after 10 attempts... terminating\n" % (query, weburl))
            sys.exit(1)
        chunk.append(contents)
        if n_rec >= n_max:
            (tformat, tupload) = _reformat_and_upload_chunk(''.join(chunk), fmt)
            tbibformat += tformat
            tbibupload += tupload
            n_rec = 0
            chunk = []
    ### Process the last re-formated chunk
    ###
    if n_rec > 0:
        print("Processing last record set (%d)" % n_rec)
        (tformat, tupload) = _reformat_and_upload_chunk(''.join(chunk), fmt)
        tbibformat += tformat
        tbibupload += tupload
    return (total_rec, tbibformat, tbibupload)
### BibSched compatibility procedures
###
def get_date(var, format_string = "%Y-%m-%d %H:%M:%S"):
    """Return a date string formatted according to format_string.

    var is either a shift relative to now - an optional sign, a number
    and one of the units d, h, m, s (for example "+2h" or "-30m") - or
    a date string written in format_string.

    Raises ValueError (from time.strptime) when var is neither.
    """
    shift_re = re.compile(r"([-\+]{0,1})([\d]+)([dhms])")
    factors = {"d": 24*3600, "h": 3600, "m": 60, "s": 1}
    m = shift_re.match(var)
    if m:
        (sign_text, amount, unit) = m.groups()
        if sign_text == "-":
            sign = -1
        else:
            sign = 1
        shift = sign * factors[unit] * float(amount)
        date = time.localtime(time.time() + shift)
    else:
        # Parsing and re-formatting validates the date and normalises it.
        date = time.strptime(var, format_string)
    return time.strftime(format_string, date)
def write_message(msg, stream=sys.stdout):
    """Write a time-stamped message to stream and flush it.

    stream must compare equal to sys.stdout or sys.stderr; for any other
    value a complaint is written to sys.stderr and msg is not output.
    """
    if not (stream == sys.stdout or stream == sys.stderr):
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
        return
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())
    stream.write(timestamp)
    stream.write("%s\n" % msg)
    stream.flush()
def task_sig_sleep(sig, frame):
    """Handle the 'sleep' signal sent by BibSched.

    Marks the task as SLEEPING and blocks in signal.pause() until
    another signal arrives.
    """
    debugging = options["verbose"] >= 9
    if debugging:
        details = (sig, frame)
        write_message("task_sig_sleep(), got signal %s frame %s" % details)
    write_message("sleeping...")
    task_update_status("SLEEPING")
    # wait for wake-up signal
    signal.pause()
def task_sig_wakeup(sig, frame):
    """Handle the 'wakeup' signal sent by BibSched.

    Marks the task as CONTINUING.
    """
    debugging = options["verbose"] >= 9
    if debugging:
        details = (sig, frame)
        write_message("task_sig_wakeup(), got signal %s frame %s" % details)
    write_message("continuing...")
    task_update_status("CONTINUING")
def task_sig_stop(sig, frame):
    """Handle the 'stop' signal sent by BibSched.

    Marks the task STOPPING, reports two shutdown steps (each followed
    by a short sleep), marks the task STOPPED and exits with status 0.
    """
    debugging = options["verbose"] >= 9
    if debugging:
        details = (sig, frame)
        write_message("task_sig_stop(), got signal %s frame %s" % details)
    write_message("stopping...")
    task_update_status("STOPPING")
    shutdown_steps = (("flushing cache or whatever...", 3),
                      ("closing tables or whatever...", 1))
    for (note, pause) in shutdown_steps:
        write_message(note)
        time.sleep(pause)
    write_message("stopped")
    task_update_status("STOPPED")
    sys.exit(0)
def task_sig_suicide(sig, frame):
    """Handle the 'suicide' signal sent by BibSched.

    Marks the task SUICIDING, then SUICIDED, and exits with status 0.
    """
    debugging = options["verbose"] >= 9
    if debugging:
        details = (sig, frame)
        write_message("task_sig_suicide(), got signal %s frame %s" % details)
    for (note, status) in (("suiciding myself now...", "SUICIDING"),
                           ("suicided", "SUICIDED")):
        write_message(note)
        task_update_status(status)
    sys.exit(0)
def task_sig_unknown(sig, frame):
    """Handle any other signal sent by the shell or the user.

    The signal is only reported through write_message(); nothing else
    is done.
    """
    notice = "unknown signal %d (frame %s) ignored" % (sig, frame)
    write_message(notice)
def authenticate(user, header="BibReformat Task Submission", action="runbibformat"):
    """Authenticate the user and check the right to perform action.

    When user is empty the name is read from standard input.  The user
    is looked up by email or nickname; if a password is stored for the
    account it is asked for and compared with the stored value.  Finally
    acc_authorize_action() is consulted for the given action.

    Returns the user name on success; exits with status 1 when the user
    is unknown, the password is wrong or the action is not authorized.
    """
    print(header)
    print("=" * len(header))
    # The two redirected print statements are kept in their original
    # form so that the spacing of the prompt output does not change.
    if user != "":
        print >> sys.stdout, "\rUsername:", user
    else:
        print >> sys.stdout, "\rUsername: ",
        user = sys.stdin.readline().lower().strip()
    ## first check user pw:
    rows = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1)
    if not rows:
        print("Sorry, %s does not exist." % user)
        sys.exit(1)
    (uid_db, password_db) = rows[0]
    if password_db:
        password_entered = getpass.getpass()
        if password_db != password_entered:
            print("Sorry, wrong credentials for %s." % user)
            sys.exit(1)
    ## secondly check authorization for the action:
    (auth_code, auth_message) = acc_authorize_action(uid_db, action)
    if auth_code != 0:
        print(auth_message)
        sys.exit(1)
    return user
def task_submit():
    """Submit the task to the BibSched task queue.

    This is what people invoke via the command line.  The user named in
    options["user"] (or asked for interactively) is authenticated, a
    WAITING 'bibreformat' row is inserted into schTASK with the
    marshalled options, the module-level sleeptime and sched_time, and
    the stored arguments are then updated to include the new task id.

    Returns the task id, i.e. the value run_sql() returned for the INSERT.
    """
    global options
    ## sanity check: remove eventual "task" option:
    if "task" in options:
        del options["task"]
    ## authenticate user:
    user = authenticate(options.get("user", ""))
    ## submit task:
    # sched_time is passed as a bound parameter like the other values,
    # so it is not escaped by hand as well.
    task_id = run_sql("""INSERT INTO schTASK (id,proc,user,status,arguments,sleeptime,runtime) VALUES (NULL,'bibreformat',%s,'WAITING',%s,%s,%s)""",
                      (user, marshal.dumps(options), sleeptime, sched_time))
    ## update task number:
    options["task"] = task_id
    run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id))
    write_message("Task #%d submitted." % task_id)
    return task_id
def task_update_progress(msg):
    """Store msg as the progress of the current task in the schTASK table.

    The task is identified by options["task"]; the value returned by
    run_sql() is passed back to the caller.
    """
    query = "UPDATE schTASK SET progress=%s where id=%s"
    params = (msg, options["task"])
    return run_sql(query, params)
def task_update_status(val):
    """Store val as the status of the current task in the schTASK table.

    The task is identified by options["task"]; the value returned by
    run_sql() is passed back to the caller.
    """
    query = "UPDATE schTASK SET status=%s where id=%s"
    params = (val, options["task"])
    return run_sql(query, params)
def task_read_status(task_id):
    """Return the status of task task_id from the schTASK table.

    Returns 'UNKNOWN' when the query yields no usable row.
    """
    res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1)
    try:
        out = res[0][0]
    except (IndexError, TypeError):
        # No row (or no indexable result); anything else, such as a
        # keyboard interrupt, is allowed to propagate.
        out = 'UNKNOWN'
    return out
def task_get_options(id):
    """Return the options of BibReformat task 'id' from the schTASK table.

    The 'arguments' column of the matching 'bibreformat' row is
    unmarshalled and returned.  When no such row exists or the stored
    value cannot be unmarshalled, an error is reported and the program
    exits with status 1.
    """
    res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibreformat'", (id,))
    try:
        out = marshal.loads(res[0][0])
    except (IndexError, TypeError, ValueError, EOFError):
        # Missing row, or data that marshal cannot decode.  %s is used so
        # that reporting the problem cannot fail on a non-integer id.
        write_message("Error: BibReformat task %s does not seem to exist." % id)
        sys.exit(1)
    return out
def task_run(task_id, process_format):
    """Run the task whose arguments are stored in the BibSched task queue.

    This is what BibSched invokes via a daemon call.

    Parameters:
      task_id        - id of the task in the schTASK table
      process_format - initial value of the 'process records without
                       format' flag; forced to 1 by the "without" option

    The task options are loaded into the module-level 'options',
    'process' and 'fmt'; the module-level 'sql_queries' and 'cds_query'
    are filled in.  The task must be in WAITING status; it is set to
    RUNNING, the signal handlers are installed, bibreformat_task() is
    run and the status is finally set to DONE.
    """
    global options, process, fmt
    options = task_get_options(task_id) # get options from BibSched task table
    ## check task id:
    if "task" not in options:
        write_message("Error: The task #%d does not seem to be a BibReformat task." % task_id)
        return
    ## initialize parameters
    fmt = options.get("format", "hb")
    ### sql commands to be executed during the script run
    ###
    # The format code comes from the task options, so it is escaped
    # before being placed into the SQL text.
    sql_fmt = escape_string(fmt)
    sql = {
        "all" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % sql_fmt,
        "last": "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % sql_fmt,
        "q1" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'",
        "q2" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % sql_fmt
        }
    if "all" in options:
        sql_queries.append(sql['all'])
    if "without" in options:
        process_format = 1
    if "noprocess" in options:
        process = 0
    if "last" in options:
        sql_queries.append(sql['last'])
    # Missing search parameters default to the empty string.
    for key in ("collection", "field", "pattern"):
        cds_query[key] = options.get(key, "")
    ## check task status:
    task_status = task_read_status(task_id)
    if task_status != "WAITING":
        write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status))
        return
    ## update task status:
    task_update_status("RUNNING")
    ## initialize signal handler:
    signal.signal(signal.SIGUSR1, task_sig_sleep)
    signal.signal(signal.SIGTERM, task_sig_stop)
    signal.signal(signal.SIGABRT, task_sig_suicide)
    signal.signal(signal.SIGCONT, task_sig_wakeup)
    signal.signal(signal.SIGINT, task_sig_unknown)
    ## run the task:
    bibreformat_task(sql, sql_queries, cds_query, process_format)
    ## we are done:
    task_update_status("DONE")
    return
def usage(exitcode=1, msg=""):
    """Write the usage text to standard error and exit.

    When msg is given it is printed first as an error line.  The
    program then exits with status exitcode.
    """
    pieces = []
    if msg:
        pieces.append("Error: %s.\n" % msg)
    pieces.append("Usage: %s [options]\n" % sys.argv[0])
    pieces.extend([
        " -u, --user=USER \t\t User name to submit the task as, password needed.\n",
        " -h, --help \t\t Print this help.\n",
        " -V, --version \t\t Print version information.\n",
        " -v, --verbose=LEVEL \t\t Verbose level (0=min,1=normal,9=max).\n",
        " -s, --sleeptime=SLEEP\t\t Time after which to repeat tasks (no)\n",
        " -t, --time=DATE \t\t Moment for the task to be active (now).\n",
        " -a, --all \t\t All records\n",
        " -c, --collection \t\t Select records by collection\n",
        " -f, --field \t\t Select records by field.\n",
        " -p, --pattern \t\t Select records by pattern.\n",
        " -o, --format \t\t Specify output format to be (re-)created. (default HB)\n",
        " -n, --noprocess \t\t Count records to be processed only (no processing done)\n",
        "\n",
        " Example: bibreformat -n Show how many records are to be bibreformated.",
        "\n",
        ])
    sys.stderr.write("".join(pieces))
    sys.exit(exitcode)
def main():
"""Main function that analyzes command line input and calls whatever is appropriate.
Useful for learning on how to write BibSched tasks."""
global options, sched_time, sleeptime
## parse command line:
if len(sys.argv) == 2 and sys.argv[1].isdigit():
## A - run the task
task_id = int(sys.argv[1])
process_format = 0
task_run(task_id, process_format)
else:
## B - submit the task
process_format = 0
options = {} # will hold command-line options
options["verbose"] = 1
try:
opts, args = getopt.getopt(sys.argv[1:], "hVv:u:ac:f:s:p:lo:nt:wl", ["help", "version", "verbose=","user=","all","collection=","field=","sleeptime=","pattern=","format=","noprocess","time=","without","last"])
except getopt.GetoptError, err:
usage(1, err)
clp = 0 # default parameters flag
try:
for opt in opts:
if opt[0] in ["-h", "--help"]:
usage(0)
elif opt[0] in ["-V", "--version"]:
print __version__
sys.exit(0)
elif opt[0] in [ "-u", "--user"]:
options["user"] = opt[1]
elif opt[0] in ["-v", "--verbose"]:
options["verbose"] = int(opt[1])
elif opt[0] in ["-a", "--all"]:
options["all"] = 1
options["without"] = 1
clp = 1
elif opt[0] in ["-c", "--collection"]:
options["collection"]=opt[1]
clp = 1
elif opt[0] in ["-n", "--noprocess"]:
options["noprocess"] = 1
elif opt[0] in ["-f", "--field"]:
options["field"] = opt[1]
clp = 1
elif opt[0] in ["-p","--pattern"]:
options["pattern"] = opt[1]
clp = 1
elif opt[0] in ["-o","--format"]:
options["format"] = opt[1]
elif opt[0] in ["-s", "--sleeptime" ]:
get_date(opt[1]) # see if it is a valid shift
sleeptime = opt[1]
elif opt[0] in [ "-t", "--time" ]:
sched_time = get_date(opt[1])
if clp == 0: # default
options["without"] = 1
options["last"] = 1
except StandardError, e:
usage(e)
task_submit()
return
### okay, here we go: call main() only when this file is executed as a
### script, not when it is imported as a module
if __name__ == '__main__':
    main()

Event Timeline