Page MenuHomec4science

bibreformat.in
No OneTemporary

File Metadata

Created
Sun, May 12, 15:51

bibreformat.in

## $Id$
## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
"""
BibReformat -- to reformat HTML brief (and other) formats for bibliographic records
bibreformat -l : reformat only modified records (i.e. "bibfmt.last_updated < bibrec.modification_date"), format: HB
bibreformat -a : reformat all records that already have an HB format, format: HB
"""
## fill config variables:
pylibdir = "<LIBDIR>/python"
## okay, rest of the Python code goes below
#######
## version number:
__version__ = "$Id$"
## import interesting modules:
try:
import sys
import os
import getopt
import string
import zlib
import time
import MySQLdb
import signal
import urllib
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
try:
sys.path.append('%s' % pylibdir)
from cdsware.config import *
from cdsware.dbquery import run_sql
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
### SQL query for recIDs detection, XXX for command line parameter
###
## Table of canned record-ID selection queries, keyed by a short code.
## 'XXX' is the placeholder for a value taken from the command line.
## Each value is one SQL string; it is merely split over several source
## lines via implicit string literal concatenation.
sql_query_for_recIDs = {
    ## records that have an 'hb' row in bibfmt
    'a': ("select br.id from bibrec as br, bibfmt as bf"
          " where bf.id_bibrec=br.id and bf.format ='hb'"),
    ## records that have an 'xm' row in bibfmt
    'b1': ("select br.id from bibrec as br, bibfmt as bf"
           " where bf.id_bibrec=br.id and bf.format ='xm'"),
    ## same query text as 'a'
    'b2': ("select br.id from bibrec as br, bibfmt as bf"
           " where bf.id_bibrec=br.id and bf.format ='hb'"),
    ## records whose 980__a value equals XXX
    'c1': ("select a.id_bibrec from bibrec_bib98x as a"
           " left join bib98x as b on a.id_bibxxx=b.id"
           " where b.tag='980__a' and b.value='XXX'"),
    ## records whose 980__b value equals XXX
    'c2': ("select a.id_bibrec from bibrec_bib98x as a"
           " left join bib98x as b on a.id_bibxxx=b.id"
           " where b.tag='980__b' and b.value='XXX'"),
    ## records whose 'hb' format is older than the record itself
    'l': ("select br.id from bibrec as br, bibfmt as bf"
          " where bf.id_bibrec=br.id and bf.format='hb'"
          " and bf.last_updated < br.modification_date"),
    ## records having a bib03x value equal to XXX
    'e': ("select br.id from bibrec as br"
          " left join bibrec_bib03x as br3 on br.id = br3.id_bibrec"
          " left join bib03x as b3 on br3.id_bibxxx = b3.id"
          " where b3.value='XXX'"),
    ## the single record whose id equals XXX
    'r': "select id from bibrec where id='XXX'",
    ## records having a bib90x value equal to XXX
    's': ("select a.id_bibrec from bib90x as b"
          " left join bibrec_bib90x as a on a.id_bibxxx=b.id"
          " where b.value='XXX'"),
}
### Fetch a record in 'xm' output format over HTTP, by recID
###
def read_xml_input(query):
    """Read a record to preprocess via HTTP.

    Builds the URL "<weburl>/search.py?id=<query>&of=xm" and returns the
    complete response body as a string.

    query -- value placed in the 'id' URL parameter; it is interpolated
             as-is (no URL quoting), so callers are expected to pass a
             plain record ID.
    """
    url = "%s/search.py?id=%s&of=xm" % (weburl, query)
    connection = urllib.urlopen(url)
    try:
        return connection.read()
    finally:
        ## release the connection explicitly instead of relying on
        ## garbage collection; this function is called once per record
        connection.close()
### Result set operations
###
def lhdiff(l1, l2):
    """Return the items of l1 that do not occur in l2 (l1 - l2).

    The difference is computed via an intermediate hash built from l2,
    so membership tests are constant time; the items of l2 must
    therefore be hashable.  The order of l1 is preserved and repeated
    items of l1 are kept.
    """
    excluded = {}
    for item in l2:
        excluded[item] = 1
    ## 'in' replaces the deprecated dict.has_key()
    return [item for item in l1 if item not in excluded]
### Result set operations
###
def ldiff(l1, l2):
    """Return l1 - l2: the items of l1 that are not contained in l2.

    Membership is tested directly against l2 (a linear scan per item),
    so the items need not be hashable.  Order and repetitions of l1 are
    kept.
    """
    return [candidate for candidate in l1 if candidate not in l2]
### Identify recIDs of records with missing hb
###
def withouthb():
    """Return the record IDs that have an 'xm' format but no 'hb' format yet.

    Two queries are run through run_sql(): one for the records having an
    'xm' row in bibfmt and one for those having an 'hb' row.  The first
    column of every result row is taken as the record ID and the
    difference (xm minus hb) is returned as a list.
    """
    xm_query = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'"
    hb_query = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'"
    ## complete recID list of xm formatted records
    xm_ids = [row[0] for row in run_sql(xm_query)]
    ## complete recID list of hb formatted records
    hb_ids = [row[0] for row in run_sql(hb_query)]
    return lhdiff(xm_ids, hb_ids)
### Print out info
###
def print_info():
    """Print the command line usage summary to standard output.

    Every entry of the list below is printed on its own line; the "\n"
    characters embedded in some entries produce the blank separator
    lines.  The -q entry documents an option that the command line
    parser accepts but that was missing from the help text.
    """
    usage_lines = [
        "\n Usage: bibreformat -[options]\n",
        "\n Options:",
        " -a All records with existing 'hb' format",
        " -b Process in addition records without HB",
        " -c<collectionID> Individual set specified by collection identifier (dbcollid)",
        " -l Records modified since last bibreformat only",
        " -q<query> Records returned by a custom SQL query selecting record IDs",
        " -r<recID> Individual record specified by record identifier (recID)",
        " -s<sysno> Individual record specified by ancient Aleph300 system number\n",
        "\n Example:",
        " bibreformat -l -b Standard synchronization bibreformat run.\n",
    ]
    for line in usage_lines:
        print(line)
### Bibreformat all selected records
###
def _reformat_chunk(xml_content):
    """Run bibformat and then bibupload over one chunk of record XML.

    xml_content is written to <tmpdir>/bibreformat.xml.  That file is fed
    to <bindir>/bibformat (output to <tmpdir>/rec_fmt.xml, stderr appended
    to <tmpdir>/bibreformat.err) and the result is then passed to
    <bindir>/bibupload -f.  The exit status of the external commands is
    not inspected.

    Returns a (bibformat_time, bibupload_time) tuple holding the elapsed
    time of each external call, measured with os.times()[4].
    """
    filename = "%s/bibreformat.xml" % tmpdir
    filehandle = open(filename, "w")
    try:
        filehandle.write(xml_content)
    finally:
        ## make sure the file is flushed and closed before bibformat reads it
        filehandle.close()
    ### bibformat external call
    ###
    t11 = os.times()[4]
    print("START bibformat external call")
    command = "%s/bibformat otype='HB' < %s/bibreformat.xml > %s/rec_fmt.xml 2>> %s/bibreformat.err" % (bindir, tmpdir, tmpdir, tmpdir)
    os.system(command)
    t22 = os.times()[4]
    print("END bibformat external call (time elapsed:%2f)" % (t22 - t11))
    bibformat_time = t22 - t11
    ### bibupload external call
    ###
    t11 = os.times()[4]
    print("START bibupload external call")
    command = "%s/bibupload -f %s/rec_fmt.xml" % (bindir, tmpdir)
    os.system(command)
    t22 = os.times()[4]
    print("END bibupload external call (time elapsed:%2f)" % (t22 - t11))
    bibupload_time = t22 - t11
    return bibformat_time, bibupload_time


def iterate_over(list):
    """Iterate over a list of record IDs and reformat the records in chunks.

    For every ID the record is fetched with read_xml_input() and added to
    the current chunk.  Once a chunk holds n_max (1000) records and some
    content was fetched, it is handed to bibformat/bibupload; whatever
    remains after the loop is processed as a final, smaller chunk.

    list -- sequence of integer record IDs (the parameter name shadows
            the builtin; it is kept unchanged for callers).

    Returns a (total_rec, tbibformat, tbibupload) tuple: the number of
    records iterated over and the accumulated elapsed time of the
    bibformat and bibupload external calls.  Previously these figures
    were computed and then thrown away.
    """
    n_rec = 0          # records collected in the current chunk
    n_max = 1000       # chunk size
    total_rec = 0      # total number of records
    xml_parts = []     # fetched XML of the current chunk, joined on flush
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call
    for record in list:
        n_rec = n_rec + 1
        total_rec = total_rec + 1
        print("Processing record: %d" % (record))
        fetched = read_xml_input(record)
        if fetched:
            xml_parts.append(fetched)
        ## a full chunk is only flushed if at least one fetch returned content
        if xml_parts and n_rec >= n_max:
            spent_format, spent_upload = _reformat_chunk("".join(xml_parts))
            tbibformat = tbibformat + spent_format
            tbibupload = tbibupload + spent_upload
            n_rec = 0
            xml_parts = []
    ### Process the last reformatted chunk
    ###
    if n_rec > 0:
        print("Processing last record set (%d)" % n_rec)
        spent_format, spent_upload = _reformat_chunk("".join(xml_parts))
        tbibformat = tbibformat + spent_format
        tbibupload = tbibupload + spent_upload
    return total_rec, tbibformat, tbibupload
###
###
def main():
    """Command line entry point: select record IDs and reformat them.

    The options given in sys.argv are translated into SQL queries that
    select record IDs.  When no selecting option is given, the usage
    text is printed and the program exits.  With -b/--without-hb the
    records returned by withouthb() are processed in addition.  Final
    statistics are printed at the end.

    An unknown or malformed option prints the error and the usage text
    and exits with status 1.
    """
    ### Options, parameters
    ###
    process_hb = 0
    sql_queries = []
    try:
        options, arguments = getopt.getopt(sys.argv[1:], 'alq:r:bc:s:', ["all", "last", "query=", "record=", "without-hb", "collection=", "sysno="])
    except getopt.GetoptError:
        ## sys.exc_info() avoids version-specific "except ..., e" syntax
        print("Error: %s" % sys.exc_info()[1])
        print_info()
        sys.exit(1)
    t1 = os.times()[4]
    ## NOTE(review): option values are interpolated straight into the SQL
    ## text below.  The operator can already run arbitrary SQL through
    ## -q, but values containing a quote character will break the query.
    for option, value in options:
        if option in ("-a", "--all"):
            sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'")
        elif option in ("-l", "--last"):
            sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='hb' and bf.last_updated < br.modification_date")
        elif option in ("-q", "--query"):
            sql_queries.append(value)
        elif option in ("-b", "--without-hb"):
            process_hb = 1
        elif option in ("-r", "--record"):
            sql_queries.append("select id from bibrec where id='%s'" % (value))
        elif option in ("-c", "--collection"):
            sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value = '%s'" % value)
            sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value = '%s'" % value)
        elif option in ("-s", "--sysno"):
            sql_queries.append("select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='%s'" % value)
    ## -b on its own selects nothing here, so it also ends in the usage text
    if not sql_queries:
        print_info()
        sys.exit()
    ### Query database for record IDs
    ###
    without_hb = []
    if process_hb:
        without_hb = withouthb()
        print("Records without hb: %s" % len(without_hb))
    recIDs = []
    for sql_query in sql_queries:
        for item in run_sql(sql_query):
            recIDs.append(item[0])
    ### list of corresponding record IDs was retrieved
    ### bibformat the records selected
    print("Records to be processed: %s" % len(recIDs))
    ### Initialize main loop
    total_rec = 0   # Total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call
    ### Iterate over all records prepared in lists I (option)
    ### and, if requested, list II (no_hb)
    id_lists = [recIDs]
    if process_hb:
        id_lists.append(without_hb)
    for id_list in id_lists:
        stats = iterate_over(id_list)
        ## every ID handed to iterate_over() counts as one processed record
        total_rec = total_rec + len(id_list)
        ## iterate_over() may hand back a (total, bibformat_time,
        ## bibupload_time) tuple; when it returns nothing the external
        ## call timings simply stay at zero
        if stats is not None:
            tbibformat = tbibformat + stats[1]
            tbibupload = tbibupload + stats[2]
    ### Final statistics
    t2 = os.times()[4]
    elapsed = t2 - t1
    print("total records processed: %d" % total_rec)
    print("total processing time: %2f sec" % elapsed)
    ## guard against a zero interval (e.g. nothing to process)
    if elapsed > 0:
        avg = total_rec / elapsed
    else:
        avg = 0.0
    print("records per second average: %2f rec/sec" % avg)
    print("Time spent on external call (os.system):")
    print(" bibformat: %2f sec" % tbibformat)
    print(" bibupload: %2f sec" % tbibupload)
### main
## run main() only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Event Timeline