diff --git a/modules/bibformat/bin/bibreformat.in b/modules/bibformat/bin/bibreformat.in index f2b7e16fc..0a961566b 100644 --- a/modules/bibformat/bin/bibreformat.in +++ b/modules/bibformat/bin/bibreformat.in @@ -1,386 +1,388 @@ ## $Id$ ## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## read config variables: #include "config.wml" #include "configbis.wml" ## start Python: <protect>#!</protect><PYTHON> <protect>## $Id$ <protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect> """ BibReformat -- to reformat HTML brief (and other) formats for bibliographic records bibreformat : reformat only modified records (i.e. 
"bibformat.last_updated < bibrec.modification_date"), format: HB bibreformat -a : reformat all records in database, format: HB """ ## fill config variables: pylibdir = "<LIBDIR>/python" ## okay, rest of the Python code goes below ####### ## version number: __version__ = "$Id$" ## import interesting modules: try: import sys import os import getopt import string import zlib import time import MySQLdb import signal import urllib except ImportError, e: print "Error: %s" % e import sys sys.exit(1) try: sys.path.append('%s' % pylibdir) from cdsware.config import * from cdsware.dbquery import run_sql except ImportError, e: print "Error: %s" % e import sys sys.exit(1) ### SQL query for recIDs detection, XXX for command line parameter ### sql_query_for_recIDs = { 'a' : "select id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'", 'b1': "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'", 'b2': "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'", 'c1': "select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value='XXX'", 'c2': "select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value='XXX'", 'l' : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='hb' and bf.last_updated < br.modification_date", 'e' : "select br.id from bibrec as br left join bibrec_bib03x as br3 on br.id = br3.id_bibrec left join bib03x as b3 on br3.id_bibxxx = b3.id where b3.value='XXX'", 'r' : "select id from bibrec where id='XXX'", 's' : "select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='XXX'" } ### MySQL Search by recID ### def read_xml_input(query): "Read records to preprocess via http" url = "%s/search.py?id=%s&of=xm" % (weburl, query) return urllib.urlopen(url).read() ### Result set operations 
###

def lhdiff(l1, l2):
    """Return the elements of l1 that are absent from l2 (hash-based).

    The elements of l2 are loaded into a dictionary first, so every
    membership test is a dictionary lookup instead of a list scan.
    Order and duplicates of l1 are preserved.  Elements must be hashable.
    """
    seen = {}
    for e in l2:
        seen[e] = 1
    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    return [e for e in l1 if e not in seen]

### Result set operations ###

def ldiff(l1, l2):
    """Return l1 - l2: the elements of l1 that are not in l2.

    Every element is looked up with a scan of l2, so unhashable elements
    are accepted; use lhdiff() when the elements are hashable.
    """
    return [e for e in l1 if e not in l2]

### Identify recIDs of records with missing hb ###

def withouthb():
    """Return the IDs of records that have an 'xm' format but no 'hb' format.

    Both ID lists are read from the bibrec/bibfmt join via run_sql() and
    the 'hb' IDs are subtracted from the 'xm' IDs.
    """
    q1 = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'"
    q2 = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'"
    ## complete recID list of xm formatted records
    xm_ids = [row[0] for row in run_sql(q1)]
    ## complete recID list of hb formatted records
    hb_ids = [row[0] for row in run_sql(q2)]
    return lhdiff(xm_ids, hb_ids)

### Print out info ###

def print_info():
    """Print the command line usage summary to standard output."""
    print("\n Usage: bibreformat -[options]\n")
    print("\n Options:")
    print(" -a All records with existing 'hb' format")
    print(" -b Process in addition records without HB")
    print(" -c<collectionID> Individual set specified by collection identifier (dbcollid)")
    print(" -l Records modified since last bibreformat only")
    print(" -r<recID> Individual record specified by record identifier (recID)")
    print(" -s<sysno> Individual record specified by ancient Aleph300 system number\n")
    print("\n Example:")
    print(" bibreformat -l -b Standard synchronization bibreformat run.\n")

### Bibreformat all selected records ###

def _run_external(label, command):
    """Run `command` with os.system(), log START/END for `label`, return elapsed seconds."""
    t11 = os.times()[4]   # index 4 of os.times() is elapsed real time
    print("START %s external call" % label)
    status = os.system(command)
    t22 = os.times()[4]
    print("END %s external call (time elapsed:%2f)" % (label, t22 - t11))
    if status != 0:
        # The run is best-effort, so report the failure and carry on
        # instead of dropping the exit status silently.
        print("Warning: %s exited with status %s" % (label, status))
    return t22 - t11

def _reformat_chunk(xml_content):
    """Write one chunk to tmpdir, run bibformat then bibupload; return both timings."""
    filename = "%s/bibreformat.xml" % tmpdir
    filehandle = open(filename, "w")
    try:
        filehandle.write(xml_content)
    finally:
        filehandle.close()
    ### bibformat external call ###
    command = "%s/bibformat otype='HB' < %s/bibreformat.xml > %s/rec_fmt.xml 2>> %s/bibreformat.err" % (bindir, tmpdir, tmpdir, tmpdir)
    tformat = _run_external("bibformat", command)
    ### bibupload external call ###
    command = "%s/bibupload -f %s/rec_fmt.xml" % (bindir, tmpdir)
    tupload = _run_external("bibupload", command)
    return tformat, tupload

def iterate_over(list):
    """Reformat the records whose IDs are given, in chunks of 1000 records.

    For every ID the record is fetched with read_xml_input() and appended
    to the current chunk.  A chunk is flushed (written to tmpdir, run
    through bibformat, then bibupload) once it holds 1000 records and is
    not empty; whatever is left after the loop is flushed as the last set.

    Parameters:
      list -- sequence of numeric record IDs (each one is printed with %d).
              The name shadows the builtin; it is kept so that existing
              callers keep working.

    Returns a tuple (total_rec, tbibformat, tbibupload): the number of IDs
    seen and the elapsed seconds spent in each external program.  Earlier
    versions returned nothing, which left the statistics printed by main()
    at zero.
    """
    n_max = 1000
    n_rec = 0          # records in the current chunk
    total_rec = 0      # Total number of records
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call
    chunks = []        # fetched records of the current chunk
    for record in list:
        n_rec = n_rec + 1
        total_rec = total_rec + 1
        print("Processing record: %d" % (record))
        chunks.append(read_xml_input(record))
        if n_rec >= n_max:
            xml_content = "".join(chunks)
            if xml_content:
                tformat, tupload = _reformat_chunk(xml_content)
                tbibformat = tbibformat + tformat
                tbibupload = tbibupload + tupload
                n_rec = 0
                chunks = []
    ### Process the last re-formated chunk ###
    if n_rec > 0:
        print("Processing last record set (%d)" % n_rec)
        tformat, tupload = _reformat_chunk("".join(chunks))
        tbibformat = tbibformat + tformat
        tbibupload = tbibupload + tupload
    return total_rec, tbibformat, tbibupload

### ###

def main():
    """Command line entry point: select record IDs and reformat them.

    Each selecting option (-a, -l, -q, -r, -c, -s) contributes one or two
    SQL queries; -b additionally processes the records returned by
    withouthb().  Without any selecting option, or with an option getopt
    does not know, the usage text is printed and the program exits.
    """
    ### Options, parameters ###
    process_hb = 0
    sql_queries = []
    try:
        options, arguments = getopt.getopt(sys.argv[1:], 'alq:r:bc:s:', ["all","last","query=","record=","without-hb","collection=", "sysno="])
    except getopt.GetoptError:
        # Unknown option or missing argument: show usage, not a traceback.
        print_info()
        sys.exit(1)
    t1 = os.times()[4]
    # NOTE(review): option values are pasted into the SQL text unescaped.
    # This is a local admin tool that also accepts raw SQL through -q, but
    # a value containing a quote character will break the query.
    for option, value in options:
        if option in ("-a","--all"):
            sql_queries.append("select id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'")
        if option in ("-l","--last"):
            sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='hb' and bf.last_updated < br.modification_date")
        if option in ("-q","--query"):
            sql_queries.append(value)
        if option in ("-b","--without-hb"):
            process_hb = 1
        if option in ("-r","--record"):
            sql_queries.append("select id from bibrec where id='%s'" % (value))
        if option in ("-c","--collection"):
            sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value = '%s'" % value)
            sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value = '%s'" % value)
        if option in ("-s","--sysno"):
            sql_queries.append("select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='%s'" % value)
    if len(sql_queries) == 0:
        print_info()
        sys.exit()
    ### Query database for record IDs ###
    without_hb = []
    if process_hb:
        without_hb = withouthb()
        print("Records without hb: %s" % len(without_hb))
    recIDs = []
    seen = {}
    for sql_query in sql_queries:
        for item in run_sql(sql_query):
            # Several queries can return the same record (for instance
            # -a together with -l); reformat each record only once.
            if item[0] not in seen:
                seen[item[0]] = 1
                recIDs.append(item[0])
    ### list of corresponding record IDs was retrieved
    ### bibformat the records selected
    print("Records to be processed: %s" % len(recIDs))
    ### Initialize main loop
    total_rec = 0    # Total number of records
    tbibformat = 0   # time taken up by external call
    tbibupload = 0   # time taken up by external call
    ### Iterate over all records prepared in lists I (option)
    n_done, tformat, tupload = iterate_over(recIDs)
    total_rec = total_rec + n_done
    tbibformat = tbibformat + tformat
    tbibupload = tbibupload + tupload
    ### Iterate over all records prepared in list II (no_hb)
    if process_hb:
        # Skip records already handled through list I.
        n_done, tformat, tupload = iterate_over(lhdiff(without_hb, recIDs))
        total_rec = total_rec + n_done
        tbibformat = tbibformat + tformat
        tbibupload = tbibupload + tupload
    ### Final statistics
    t2 = os.times()[4]
    elapsed = t2 - t1
    print("total records processed: %d" % total_rec)
    print("total processing time: %2f sec" % elapsed)
    if elapsed > 0:
        avg = total_rec / elapsed
    else:
        # The clock may not advance on a very short run; avoid dividing by 0.
        avg = 0.0
    print("records per second average: %2f rec/sec" % avg)
    print("Time spent on external call (os.system):")
    print(" bibformat: %2f sec" % tbibformat)
    print(" bibupload: %2f sec" % tbibupload)

### main

if __name__ == '__main__':
    main()

## The lines below open the next file entry of the patch (bibreformat.wml).
## Its metadata lines are kept as comments so that this section still parses.
# diff --git a/modules/bibformat/bin/bibreformat.wml b/modules/bibformat/bin/bibreformat.wml
# index f2b7e16fc..0a961566b 100644
# --- a/modules/bibformat/bin/bibreformat.wml
# +++ b/modules/bibformat/bin/bibreformat.wml
# @@ -1,386 +1,388 @@
## $Id$
## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## read config variables: #include "config.wml" #include "configbis.wml" ## start Python: <protect>#!</protect><PYTHON> <protect>## $Id$ <protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect> """ BibReformat -- to reformat HTML brief (and other) formats for bibliographic records bibreformat : reformat only modified records (i.e. "bibformat.last_updated < bibrec.modification_date"), format: HB bibreformat -a : reformat all records in database, format: HB """ ## fill config variables: pylibdir = "<LIBDIR>/python" ## okay, rest of the Python code goes below ####### ## version number: __version__ = "$Id$" ## import interesting modules: try: import sys import os import getopt import string import zlib import time import MySQLdb import signal import urllib except ImportError, e: print "Error: %s" % e import sys sys.exit(1) try: sys.path.append('%s' % pylibdir) from cdsware.config import * from cdsware.dbquery import run_sql except ImportError, e: print "Error: %s" % e import sys sys.exit(1) ### SQL query for recIDs detection, XXX for command line parameter ### sql_query_for_recIDs = { 'a' : "select id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'", 'b1': "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'", 'b2': "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'", 'c1': "select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value='XXX'", 'c2': "select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value='XXX'", 'l' : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and 
bf.format='hb' and bf.last_updated < br.modification_date", 'e' : "select br.id from bibrec as br left join bibrec_bib03x as br3 on br.id = br3.id_bibrec left join bib03x as b3 on br3.id_bibxxx = b3.id where b3.value='XXX'", 'r' : "select id from bibrec where id='XXX'", 's' : "select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='XXX'" } ### MySQL Search by recID ### def read_xml_input(query): "Read records to preprocess via http" url = "%s/search.py?id=%s&of=xm" % (weburl, query) return urllib.urlopen(url).read() ### Result set operations ### def lhdiff(l1, l2): "Does list difference via intermediate hash." d = {} ld = [] for e in l2: d[e]=1 for e in l1: if not d.has_key(e): ld.append(e) return ld ### Result set operations ### def ldiff(l1, l2): "Returns l1 - l2." ld = [] for e in l1: if not e in l2: ld.append(e) return ld ### Identify recIDs of records with missing hb ### def withouthb(): "List of record IDs to be reformated, not having the hb format yet" xm1, xm2, hb1, hb2 = [],[],[],[] q1 = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'" q2 = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'" ## get complete recID list of xm formatted records xm1 = run_sql(q1) for item in xm1: xm2.append(item[0]) ## get complete recID list of hb formatted records hb1 = run_sql(q2) for item in hb1: hb2.append(item[0]) return lhdiff(xm2,hb2) ### Print out info ### def print_info(): "Print script info" print "\n Usage: bibreformat -[options]\n" print "\n Options:" print " -a All records with existing 'hb' format" print " -b Process in addition records without HB" print " -c<collectionID> Individual set specified by collection identifier (dbcollid)" print " -l Records modified since last bibreformat only" print " -r<recID> Individual record specified by record identifier (recID)" print " -s<sysno> Individual record specified by ancient Aleph300 system 
number\n" print "\n Example:" print " bibreformat -l -b Standard synchronization bibreformat run.\n" ### Bibreformat all selected records ### def iterate_over(list): "Iterate odver list of IDs" n_rec = 0 n_max = 1000 total_rec = 0 # Total number of records xml_content = '' # hold the contents tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call for record in list: n_rec = n_rec + 1 total_rec = total_rec + 1 message = "Processing record: %d" % (record) print message xml_content = xml_content + read_xml_input(record) if xml_content: if n_rec >= n_max: - filehandle = open("bibreformat.xml" ,"w") + filename = "%s/bibreformat.xml" % tmpdir + filehandle = open(filename ,"w") filehandle.write(xml_content) filehandle.close() ### bibformat external call ### t11 = os.times()[4] message = "START bibformat external call" print message - command = "%s/bibformat otype='HB' < bibreformat.xml > rec_fmt.xml" % bindir + command = "%s/bibformat otype='HB' < %s/bibreformat.xml > %s/rec_fmt.xml 2>> %s/bibreformat.err" % (bindir,tmpdir,tmpdir,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message tbibformat = tbibformat + (t22 - t11) ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message - command = "%s/bibupload -f rec_fmt.xml" % bindir + command = "%s/bibupload -f %s/rec_fmt.xml" % (bindir,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message tbibupload = tbibupload + (t22- t11) n_rec = 0 xml_content = '' ### Process the last re-formated chunk ### if n_rec > 0: print "Processing last record set (%d)" % n_rec - filehandle = open("bibreformat.xml" ,"w") + filename = "%s/bibreformat.xml" % tmpdir + filehandle = open(filename ,"w") filehandle.write(xml_content) filehandle.close() ### bibformat external call ### t11 = os.times()[4] message = 
"START bibformat external call" print message - command = "%s/bibformat otype='HB' < bibreformat.xml > rec_fmt.xml 2> bibreformat.err" % bindir + command = "%s/bibformat otype='HB' < %s/bibreformat.xml > %s/rec_fmt.xml 2>> %s/bibreformat.err" % (bindir,tmpdir,tmpdir,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibformat external call (time elapsed:%2f)" % (t22-t11) print message tbibformat = tbibformat + (t22 - t11) ### bibupload external call ### t11 = os.times()[4] message = "START bibupload external call" print message - command = "%s/bibupload -f rec_fmt.xml" % bindir + command = "%s/bibupload -f %s/rec_fmt.xml" % (bindir,tmpdir) os.system(command) t22 = os.times()[4] message = "END bibupload external call (time elapsed:%2f)" % (t22-t11) print message tbibupload = tbibupload + (t22- t11) return ### ### def main(): ### Options, parameters ### process_hb = 0 mode = "" sql_queries = [] res = [] options, arguments = getopt.getopt(sys.argv[1:],'alq:r:bc:s:',["all","last","query=","record=","without-hb","collection=", "sysno="]) t1 = os.times()[4] for option, value in options: if option in ("-a","--all"): sql_queries.append("select id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'") if option in ("-l","--last"): sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='hb' and bf.last_updated < br.modification_date") if option in ("-q","--query"): sql_queries.append(value) if option in ("-b","--without-hb"): process_hb = 1 if option in ("-r","--record"): sql_queries.append("select id from bibrec where id='%s'" % (value)) if option in ("-c","--collection"): sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value = '%s'" % value) sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value = '%s'" % value) if option in 
("-s","--sysno"): sql_queries.append("select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='%s'" % value) if len(sql_queries) == 0: print_info() sys.exit() ### Query database for record IDs ### if process_hb: without_hb = withouthb() print "Records without hb: %s" % len(without_hb) recIDs = [] for sql_query in sql_queries: res = run_sql(sql_query) for item in res: recIDs.append(item[0]) ### list of corresponding record IDs was retrieved ### bibformat the records selected print "Records to be processed: %s" % len(recIDs) ### Initialize main loop total_rec = 0 # Total number of records xml_content = '' # hold the contents tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call ### Iterate over all records prepared in lists I (option) iterate_over(recIDs) ### Iterate over all records prepared in list II (no_hb) if process_hb: iterate_over(without_hb) ### Final statistics t2 = os.times()[4] elapsed = t2 - t1 message = "total records processed: %d" % total_rec print message message = "total processing time: %2f sec" % elapsed print message avg = total_rec / elapsed message = "records per second average: %2f rec/sec" % avg print message message = "Time spent on external call (os.system):" print message message = " bibformat: %2f sec" % tbibformat print message message = " bibupload: %2f sec" % tbibupload print message ### main if __name__ == '__main__': main()