diff --git a/modules/bibformat/bin/.cvsignore b/modules/bibformat/bin/.cvsignore index b72a82217..5445d22c1 100644 --- a/modules/bibformat/bin/.cvsignore +++ b/modules/bibformat/bin/.cvsignore @@ -1,6 +1,7 @@ Makefile Makefile.in z_* *.O *~ -bibformat \ No newline at end of file +bibformat +bibreformat \ No newline at end of file diff --git a/modules/bibformat/bin/Makefile.am b/modules/bibformat/bin/Makefile.am index 587412936..dfa776452 100644 --- a/modules/bibformat/bin/Makefile.am +++ b/modules/bibformat/bin/Makefile.am @@ -1,28 +1,28 @@ ## $Id$ ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -bin_SCRIPTS=bibformat +bin_SCRIPTS=bibformat bibreformat EXTRA_DIST = $(wildcard *.wml) CLEANFILES = $(bin_SCRIPTS) *~ *.tmp %: %.wml ../../../config/config.wml ../../../config/configbis.wml $(WML) -o $@ $< chmod u+x $@ diff --git a/modules/bibformat/bin/bibreformat.in b/modules/bibformat/bin/bibreformat.in new file mode 100644 index 000000000..f2b7e16fc --- /dev/null +++ b/modules/bibformat/bin/bibreformat.in @@ -0,0 +1,386 @@ +## $Id$ +## BibReformat -- to reformat HTML brief (and other) formats for bibliographic records + +## This file is part of the CERN Document Server Software (CDSware). +## Copyright (C) 2002 CERN. 
### SQL query templates for recIDs detection; 'XXX' stands for the
### command line parameter.
### NOTE(review): nothing in the visible code reads this table (main()
### spells the same queries out inline) -- kept for backward compatibility.

sql_query_for_recIDs = {

    'a' : "select id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'",
    'b1': "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'",
    'b2': "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'",
    'c1': "select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value='XXX'",
    'c2': "select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value='XXX'",
    'l' : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='hb' and bf.last_updated < br.modification_date",
    'e' : "select br.id from bibrec as br left join bibrec_bib03x as br3 on br.id = br3.id_bibrec left join bib03x as b3 on br3.id_bibxxx = b3.id where b3.value='XXX'",
    'r' : "select id from bibrec where id='XXX'",
    's' : "select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='XXX'"

}

### Fetch one record by recID
###

def read_xml_input(query):
    """Read one record to preprocess via HTTP.

    Requests "<weburl>/search.py?id=<query>&of=xm" and returns the raw
    response body.  `weburl` is not defined in this block; presumably it
    comes from `from cdsware.config import *` -- TODO confirm.
    """
    url = "%s/search.py?id=%s&of=xm" % (weburl, query)
    handle = urllib.urlopen(url)
    try:
        return handle.read()
    finally:
        # release the connection even when read() fails
        handle.close()

### Result set operations
###

def lhdiff(l1, l2):
    """Return the elements of l1 that are not in l2, keeping l1's order.

    Uses an intermediate hash of l2, so the elements must be hashable.
    """
    seen = {}
    for e in l2:
        seen[e] = 1
    return [e for e in l1 if e not in seen]

### Result set operations
###

def ldiff(l1, l2):
    """Return l1 - l2 (order of l1 kept) using plain list membership tests."""
    return [e for e in l1 if e not in l2]

### Identify recIDs of records with missing hb
###

def withouthb():
    """Return the list of record IDs that have the 'xm' format but no 'hb'.

    Runs two queries through run_sql() and subtracts the 'hb' ID list from
    the 'xm' ID list.
    """
    q1 = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'"
    q2 = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'"

    ## complete recID list of xm formatted records
    xm_ids = [row[0] for row in run_sql(q1)]

    ## complete recID list of hb formatted records
    hb_ids = [row[0] for row in run_sql(q2)]

    return lhdiff(xm_ids, hb_ids)

### Print out info
###

def print_info():
    """Print the command line usage summary to standard output."""
    print("\n Usage: bibreformat -[options]\n")
    print("\n Options:")
    print(" -a All records with existing 'hb' format")
    print(" -b Process in addition records without HB")
    print(" -c Individual set specified by collection identifier (dbcollid)")
    print(" -l Records modified since last bibreformat only")
    print(" -r Individual record specified by record identifier (recID)")
    print(" -s Individual record specified by ancient Aleph300 system number\n")
    print("\n Example:")
    print(" bibreformat -l -b Standard synchronization bibreformat run.\n")

### External call helpers
###

def _timed_system(label, command):
    """Run `command` via os.system() and return the elapsed time.

    Prints the START/END messages around the call and a warning when the
    command reports a non-zero status (the status used to be ignored).
    """
    started = os.times()[4]
    print("START %s external call" % label)

    status = os.system(command)

    spent = os.times()[4] - started
    print("END %s external call (time elapsed:%2f)" % (label, spent))
    if status != 0:
        print("WARNING: %s external call returned status %s" % (label, status))
    return spent

def _format_and_upload(xml_content, log_errors):
    """Write one chunk to bibreformat.xml, then run bibformat and bibupload.

    When `log_errors` is true, bibformat's stderr is redirected to
    bibreformat.err (the original code did this for the last chunk only).
    Returns the tuple (bibformat time, bibupload time).  `bindir` is not
    defined in this block; presumably it comes from cdsware.config --
    TODO confirm.
    """
    filehandle = open("bibreformat.xml", "w")
    try:
        filehandle.write(xml_content)
    finally:
        filehandle.close()

    command = "%s/bibformat otype='HB' < bibreformat.xml > rec_fmt.xml" % bindir
    if log_errors:
        command = command + " 2> bibreformat.err"
    t_format = _timed_system("bibformat", command)

    t_upload = _timed_system("bibupload", "%s/bibupload -f rec_fmt.xml" % bindir)

    return t_format, t_upload

### Bibreformat all selected records
###

def iterate_over(list):
    """Reformat every record whose ID is in `list`, in chunks of 1000.

    Each record is fetched with read_xml_input(); once 1000 records are
    collected (and once more for the remainder) the XML is written to
    bibreformat.xml and passed to the external bibformat and bibupload
    programs.

    Returns the tuple (records processed, bibformat time, bibupload time)
    so that the caller can report statistics; previously these counters
    were local and lost when the function returned.
    """
    n_max = 1000        # chunk size
    n_rec = 0           # records collected in the current chunk
    total_rec = 0       # total number of records
    chunk = []          # XML of the current chunk, joined before writing
    tbibformat = 0      # time taken up by external call
    tbibupload = 0      # time taken up by external call

    for record in list:

        n_rec = n_rec + 1
        total_rec = total_rec + 1

        print("Processing record: %d" % (record))

        chunk.append(read_xml_input(record))

        if n_rec >= n_max:
            xml_content = "".join(chunk)
            # as before, a chunk with no content is not flushed: it keeps
            # growing until some record yields XML
            if xml_content:
                t_format, t_upload = _format_and_upload(xml_content, 0)
                tbibformat = tbibformat + t_format
                tbibupload = tbibupload + t_upload
                n_rec = 0
                chunk = []

    ### Process the last re-formatted chunk
    if n_rec > 0:
        xml_content = "".join(chunk)
        # skip the external calls when nothing was fetched at all
        if xml_content:
            print("Processing last record set (%d)" % n_rec)
            t_format, t_upload = _format_and_upload(xml_content, 1)
            tbibformat = tbibformat + t_format
            tbibupload = tbibupload + t_upload

    return total_rec, tbibformat, tbibupload

###
###

def main():
    """Parse the command line, select record IDs and reformat them.

    Prints the usage text and exits when the options are invalid or when
    no record-selecting option was given.  Finishes by printing the
    processing statistics.
    """

    ### Options, parameters

    process_hb = 0
    sql_queries = []

    try:
        options, arguments = getopt.getopt(sys.argv[1:], 'alq:r:bc:s:', ["all", "last", "query=", "record=", "without-hb", "collection=", "sysno="])
    except getopt.GetoptError:
        # unknown option or missing argument: show usage, not a traceback
        print("Error: %s" % sys.exc_info()[1])
        print_info()
        sys.exit(1)

    t1 = os.times()[4]

    # NOTE(review): option values are interpolated straight into the SQL
    # text; acceptable only while the command line is trusted -- confirm
    # whether run_sql() supports bound parameters.
    for option, value in options:
        if option in ("-a", "--all"):
            sql_queries.append("select id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'")

        elif option in ("-l", "--last"):
            sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='hb' and bf.last_updated < br.modification_date")

        elif option in ("-q", "--query"):
            sql_queries.append(value)

        elif option in ("-b", "--without-hb"):
            process_hb = 1

        elif option in ("-r", "--record"):
            sql_queries.append("select id from bibrec where id='%s'" % (value))

        elif option in ("-c", "--collection"):
            sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value = '%s'" % value)
            sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value = '%s'" % value)

        elif option in ("-s", "--sysno"):
            sql_queries.append("select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='%s'" % value)

    if len(sql_queries) == 0:
        print_info()
        sys.exit()

    ### Query database for record IDs

    if process_hb:
        without_hb = withouthb()
        print("Records without hb: %s" % len(without_hb))

    recIDs = []

    for sql_query in sql_queries:
        for item in run_sql(sql_query):
            recIDs.append(item[0])

    ### list of corresponding record IDs was retrieved
    ### bibformat the records selected

    print("Records to be processed: %s" % len(recIDs))

    ### Iterate over all records prepared in lists I (option)
    total_rec, tbibformat, tbibupload = iterate_over(recIDs)

    ### Iterate over all records prepared in list II (no_hb)
    if process_hb:
        more_rec, more_format, more_upload = iterate_over(without_hb)
        total_rec = total_rec + more_rec
        tbibformat = tbibformat + more_format
        tbibupload = tbibupload + more_upload

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    print("total records processed: %d" % total_rec)

    print("total processing time: %2f sec" % elapsed)

    # avoid ZeroDivisionError when the run took no measurable time
    if elapsed > 0:
        avg = total_rec / elapsed
    else:
        avg = 0
    print("records per second average: %2f rec/sec" % avg)

    print("Time spent on external call (os.system):")

    print(" bibformat: %2f sec" % tbibformat)

    print(" bibupload: %2f sec" % tbibupload)

### main
if __name__ == '__main__':
    main()
### SQL query templates for recIDs detection; 'XXX' stands for the
### command line parameter.
### NOTE(review): nothing in the visible code reads this table (main()
### spells the same queries out inline) -- kept for backward compatibility.

sql_query_for_recIDs = {

    'a' : "select id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'",
    'b1': "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'",
    'b2': "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'",
    'c1': "select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value='XXX'",
    'c2': "select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value='XXX'",
    'l' : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='hb' and bf.last_updated < br.modification_date",
    'e' : "select br.id from bibrec as br left join bibrec_bib03x as br3 on br.id = br3.id_bibrec left join bib03x as b3 on br3.id_bibxxx = b3.id where b3.value='XXX'",
    'r' : "select id from bibrec where id='XXX'",
    's' : "select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='XXX'"

}

### Fetch one record by recID
###

def read_xml_input(query):
    """Read one record to preprocess via HTTP.

    Requests "<weburl>/search.py?id=<query>&of=xm" and returns the raw
    response body.  `weburl` is not defined in this block; presumably it
    comes from `from cdsware.config import *` -- TODO confirm.
    """
    url = "%s/search.py?id=%s&of=xm" % (weburl, query)
    handle = urllib.urlopen(url)
    try:
        return handle.read()
    finally:
        # release the connection even when read() fails
        handle.close()

### Result set operations
###

def lhdiff(l1, l2):
    """Return the elements of l1 that are not in l2, keeping l1's order.

    Uses an intermediate hash of l2, so the elements must be hashable.
    """
    seen = {}
    for e in l2:
        seen[e] = 1
    return [e for e in l1 if e not in seen]

### Result set operations
###

def ldiff(l1, l2):
    """Return l1 - l2 (order of l1 kept) using plain list membership tests."""
    return [e for e in l1 if e not in l2]

### Identify recIDs of records with missing hb
###

def withouthb():
    """Return the list of record IDs that have the 'xm' format but no 'hb'.

    Runs two queries through run_sql() and subtracts the 'hb' ID list from
    the 'xm' ID list.
    """
    q1 = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='xm'"
    q2 = "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'"

    ## complete recID list of xm formatted records
    xm_ids = [row[0] for row in run_sql(q1)]

    ## complete recID list of hb formatted records
    hb_ids = [row[0] for row in run_sql(q2)]

    return lhdiff(xm_ids, hb_ids)

### Print out info
###

def print_info():
    """Print the command line usage summary to standard output."""
    print("\n Usage: bibreformat -[options]\n")
    print("\n Options:")
    print(" -a All records with existing 'hb' format")
    print(" -b Process in addition records without HB")
    print(" -c Individual set specified by collection identifier (dbcollid)")
    print(" -l Records modified since last bibreformat only")
    print(" -r Individual record specified by record identifier (recID)")
    print(" -s Individual record specified by ancient Aleph300 system number\n")
    print("\n Example:")
    print(" bibreformat -l -b Standard synchronization bibreformat run.\n")

### External call helpers
###

def _timed_system(label, command):
    """Run `command` via os.system() and return the elapsed time.

    Prints the START/END messages around the call and a warning when the
    command reports a non-zero status (the status used to be ignored).
    """
    started = os.times()[4]
    print("START %s external call" % label)

    status = os.system(command)

    spent = os.times()[4] - started
    print("END %s external call (time elapsed:%2f)" % (label, spent))
    if status != 0:
        print("WARNING: %s external call returned status %s" % (label, status))
    return spent

def _format_and_upload(xml_content, log_errors):
    """Write one chunk to bibreformat.xml, then run bibformat and bibupload.

    When `log_errors` is true, bibformat's stderr is redirected to
    bibreformat.err (the original code did this for the last chunk only).
    Returns the tuple (bibformat time, bibupload time).  `bindir` is not
    defined in this block; presumably it comes from cdsware.config --
    TODO confirm.
    """
    filehandle = open("bibreformat.xml", "w")
    try:
        filehandle.write(xml_content)
    finally:
        filehandle.close()

    command = "%s/bibformat otype='HB' < bibreformat.xml > rec_fmt.xml" % bindir
    if log_errors:
        command = command + " 2> bibreformat.err"
    t_format = _timed_system("bibformat", command)

    t_upload = _timed_system("bibupload", "%s/bibupload -f rec_fmt.xml" % bindir)

    return t_format, t_upload

### Bibreformat all selected records
###

def iterate_over(list):
    """Reformat every record whose ID is in `list`, in chunks of 1000.

    Each record is fetched with read_xml_input(); once 1000 records are
    collected (and once more for the remainder) the XML is written to
    bibreformat.xml and passed to the external bibformat and bibupload
    programs.

    Returns the tuple (records processed, bibformat time, bibupload time)
    so that the caller can report statistics; previously these counters
    were local and lost when the function returned.
    """
    n_max = 1000        # chunk size
    n_rec = 0           # records collected in the current chunk
    total_rec = 0       # total number of records
    chunk = []          # XML of the current chunk, joined before writing
    tbibformat = 0      # time taken up by external call
    tbibupload = 0      # time taken up by external call

    for record in list:

        n_rec = n_rec + 1
        total_rec = total_rec + 1

        print("Processing record: %d" % (record))

        chunk.append(read_xml_input(record))

        if n_rec >= n_max:
            xml_content = "".join(chunk)
            # as before, a chunk with no content is not flushed: it keeps
            # growing until some record yields XML
            if xml_content:
                t_format, t_upload = _format_and_upload(xml_content, 0)
                tbibformat = tbibformat + t_format
                tbibupload = tbibupload + t_upload
                n_rec = 0
                chunk = []

    ### Process the last re-formatted chunk
    if n_rec > 0:
        xml_content = "".join(chunk)
        # skip the external calls when nothing was fetched at all
        if xml_content:
            print("Processing last record set (%d)" % n_rec)
            t_format, t_upload = _format_and_upload(xml_content, 1)
            tbibformat = tbibformat + t_format
            tbibupload = tbibupload + t_upload

    return total_rec, tbibformat, tbibupload

###
###

def main():
    """Parse the command line, select record IDs and reformat them.

    Prints the usage text and exits when the options are invalid or when
    no record-selecting option was given.  Finishes by printing the
    processing statistics.
    """

    ### Options, parameters

    process_hb = 0
    sql_queries = []

    try:
        options, arguments = getopt.getopt(sys.argv[1:], 'alq:r:bc:s:', ["all", "last", "query=", "record=", "without-hb", "collection=", "sysno="])
    except getopt.GetoptError:
        # unknown option or missing argument: show usage, not a traceback
        print("Error: %s" % sys.exc_info()[1])
        print_info()
        sys.exit(1)

    t1 = os.times()[4]

    # NOTE(review): option values are interpolated straight into the SQL
    # text; acceptable only while the command line is trusted -- confirm
    # whether run_sql() supports bound parameters.
    for option, value in options:
        if option in ("-a", "--all"):
            sql_queries.append("select id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='hb'")

        elif option in ("-l", "--last"):
            sql_queries.append("select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='hb' and bf.last_updated < br.modification_date")

        elif option in ("-q", "--query"):
            sql_queries.append(value)

        elif option in ("-b", "--without-hb"):
            process_hb = 1

        elif option in ("-r", "--record"):
            sql_queries.append("select id from bibrec where id='%s'" % (value))

        elif option in ("-c", "--collection"):
            sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__a' and b.value = '%s'" % value)
            sql_queries.append("select a.id_bibrec from bibrec_bib98x as a left join bib98x as b on a.id_bibxxx=b.id where b.tag='980__b' and b.value = '%s'" % value)

        elif option in ("-s", "--sysno"):
            sql_queries.append("select a.id_bibrec from bib90x as b left join bibrec_bib90x as a on a.id_bibxxx=b.id where b.value='%s'" % value)

    if len(sql_queries) == 0:
        print_info()
        sys.exit()

    ### Query database for record IDs

    if process_hb:
        without_hb = withouthb()
        print("Records without hb: %s" % len(without_hb))

    recIDs = []

    for sql_query in sql_queries:
        for item in run_sql(sql_query):
            recIDs.append(item[0])

    ### list of corresponding record IDs was retrieved
    ### bibformat the records selected

    print("Records to be processed: %s" % len(recIDs))

    ### Iterate over all records prepared in lists I (option)
    total_rec, tbibformat, tbibupload = iterate_over(recIDs)

    ### Iterate over all records prepared in list II (no_hb)
    if process_hb:
        more_rec, more_format, more_upload = iterate_over(without_hb)
        total_rec = total_rec + more_rec
        tbibformat = tbibformat + more_format
        tbibupload = tbibupload + more_upload

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    print("total records processed: %d" % total_rec)

    print("total processing time: %2f sec" % elapsed)

    # avoid ZeroDivisionError when the run took no measurable time
    if elapsed > 0:
        avg = total_rec / elapsed
    else:
        avg = 0
    print("records per second average: %2f rec/sec" % avg)

    print("Time spent on external call (os.system):")

    print(" bibformat: %2f sec" % tbibformat)

    print(" bibupload: %2f sec" % tbibupload)

### main
if __name__ == '__main__':
    main()