diff --git a/modules/bibupload/lib/bibupload.py b/modules/bibupload/lib/bibupload.py index 803f210c7..ce8fee09a 100644 --- a/modules/bibupload/lib/bibupload.py +++ b/modules/bibupload/lib/bibupload.py @@ -1,1313 +1,1317 @@ -#!@PYTHON@ # -*- coding: utf-8 -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibUpload : Receive a MARC XML file and update the appropriate database tables according to options Usage: bibupload [options] input.xml Examples: $ bibupload -i input.xml Options: Input: -a --append : new fields are appended to the existing record -c --correct : fields are replaced by the new ones in the existing record -f --format : takes only the FMT fields into account. Does not update -i --insert : insert the new record in the database -r --replace : the existing record is entirely replaced by the new one -z --reference : update references (update only 999 fields) -s --stage : stage to start from in the algorithme ( 0 : always done ; 1: FMT tags ; 2:FFT tags; 3: BibFmt; 4: Metadata update; 5 : time update) Scheduling options: -u, --user=USER user name to store task, password needed General options: -h, --help print this help and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) -n --notimechange : do not change record last modification date when updating -V --version : print the script version """ __version__ = "$Id$" import os import sys import getopt import getpass import signal import string import marshal import time import traceback -from time import localtime from zlib import compress import MySQLdb import re from invenio.bibupload_config import * from invenio.access_control_engine import acc_authorize_action from invenio.dbquery import run_sql, \ Error from invenio.bibrecord import create_records, \ create_record, \ record_add_field, \ record_delete_field, \ record_xml_output from invenio.dateutils import convert_datestruct_to_datetext from invenio.search_engine import print_record from invenio.config import filedir, \ filedirsize, \ htdocsurl # Global variables options = {} options['mode'] = None options['verbose'] = 1 options['tag'] = None options['file_path'] = None options['notimechange'] = 0 options['stage_to_start_from'] = 1 #Statistic variables stat = {} stat['nb_records_to_upload'] = 0 stat['nb_records_updated'] = 0 stat['nb_records_inserted'] = 0 stat['nb_errors'] = 0 stat['exectime'] = time.localtime() ### bibsched task related functions: def write_message(msg, stream=sys.stdout, verbose=1): """Write message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff. Do not print anything if the global verbose option is lower than VERBOSE. """ if stream == sys.stdout or stream == sys.stderr: if options['verbose'] >= verbose: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) stream.write("%s\n" % msg) stream.flush() else: sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream) return def task_sig_sleep(sig, frame): """Signal handler for the 'sleep' signal sent by BibSched.""" write_message("sleeping...") task_update_status("SLEEPING") signal.pause() # wait for wake-up signal def task_sig_wakeup(sig, frame): """Signal handler for the 'wakeup' signal sent by BibSched.""" write_message("continuing...") task_update_status("CONTINUING") def task_sig_stop(sig, frame): """Signal handler for the 'stop' signal sent by BibSched.""" write_message("stopping...") task_update_status("STOPPING") write_message("flushing cache or whatever...") time.sleep(3) write_message("closing tables or whatever...") time.sleep(1) write_message("stopped") task_update_status("STOPPED") sys.exit(0) def task_sig_suicide(sig, frame): """Signal handler for the 'suicide' signal sent by BibSched.""" write_message("suiciding myself now...") task_update_status("SUICIDING") write_message("suicided") task_update_status("SUICIDED") sys.exit(0) def task_sig_unknown(sig, frame): """Signal handler for the other unknown signals sent by shell or user.""" write_message("unknown signal %d ignored" % sig) # do nothing for other signals def authenticate(user, header="BibUpload Task Submission", action="runbibupload"): """Authenticate the user against the user database. Check for its password, if it exists. Check for action access rights. Return user name upon authorization success, do system exit upon authorization failure. """ # FIXME: for the time being do not authenticate but always let the # tasks in, because of automated inserts. Maybe we shall design # an internal user here that will always be let in. return user print header print "=" * len(header) if user == "": print >> sys.stdout, "\rUsername: ", user = string.strip(string.lower(sys.stdin.readline())) else: print >> sys.stdout, "\rUsername:", user ## first check user pw: res = run_sql("select id,password from user where email=%s or nickname=%s", (user, user,), 1) if not res: print "Sorry, %s does not exist." % user sys.exit(1) else: (uid_db, password_db) = res[0] if password_db: password_entered = getpass.getpass() if password_db == password_entered: pass else: print "Sorry, wrong credentials for %s." % user sys.exit(1) ## secondly check authorization for the action: (auth_code, auth_message) = acc_authorize_action(uid_db, action) if auth_code != 0: print auth_message sys.exit(1) return user def task_submit(options): """Submits task to the BibSched task queue. This is what people will be invoking via command line.""" + global task_id ## sanity check: remove eventual "task" option: if options.has_key("task"): del options["task"] ## authenticate user: user = authenticate(options.get("user", "")) ## submit task: if options["verbose"] >= 9: print "" write_message("storing task options %s\n" % options) task_id = run_sql("""INSERT INTO schTASK (id,proc,user,runtime,sleeptime,status,arguments) VALUES (NULL,'bibupload',%s,%s,%s,'WAITING',%s)""", (user, options["runtime"], options["sleeptime"], marshal.dumps(options))) ## update task number: options["task"] = task_id run_sql("""UPDATE schTASK SET arguments=%s WHERE id=%s""", (marshal.dumps(options), task_id)) write_message("Task #%d submitted." % task_id) return task_id def task_update_progress(msg): """Updates progress information in the BibSched task table.""" global task_id, options if options["verbose"] >= 9: write_message("Updating task progress to %s." % msg) return run_sql("UPDATE schTASK SET progress=%s where id=%s", (msg, task_id)) def task_update_status(val): """Updates status information in the BibSched task table.""" global task_id, options if options["verbose"] >= 9: write_message("Updating task status to %s." % val) return run_sql("UPDATE schTASK SET status=%s where id=%s", (val, task_id)) def task_read_status(task_id): """Read status information in the BibSched task table.""" res = run_sql("SELECT status FROM schTASK where id=%s", (task_id,), 1) try: out = res[0][0] except: out = 'UNKNOWN' return out def task_get_options(id): """Returns options for the task 'id' read from the BibSched task queue table.""" out = {} res = run_sql("SELECT arguments FROM schTASK WHERE id=%s AND proc='bibupload'", (id,)) try: out = marshal.loads(res[0][0]) except: write_message("Error: BibUpload task %d does not seem to exist." % id, sys.stderr) sys.exit(1) return out def task_run(): """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call. The task prints Fibinacci numbers for up to NUM on the stdout, and some messages on stderr. Return 1 in case of success and 0 in case of failure.""" global task_id, options, stat options = task_get_options(task_id) # get options from BibSched task table ## check task id: if not options.has_key("task"): write_message("Error: The task #%d does not seem to be a BibUpload task." % task_id, sys.stderr) return 0 ## check task status: task_status = task_read_status(task_id) if task_status != "WAITING": write_message("Error: The task #%d is %s. I expected WAITING." % (task_id, task_status), sys.stderr) return 0 ## we can run the task now: if options["verbose"]: write_message("Task #%d started." % task_id) task_update_status("RUNNING") ## initialize signal handler: signal.signal(signal.SIGUSR1, task_sig_sleep) signal.signal(signal.SIGTERM, task_sig_stop) signal.signal(signal.SIGABRT, task_sig_suicide) signal.signal(signal.SIGCONT, task_sig_wakeup) signal.signal(signal.SIGINT, task_sig_unknown) ## run the task: error = 0 write_message("BibUpload Mode "+options['mode']+" has been choosen.", verbose=2) write_message("STAGE 0:", verbose=2) if options['file_path'] != None: recs = xml_marc_to_records(open_marc_file(options['file_path'])) stat['nb_records_to_upload'] = len(recs) write_message(" -Open XML marc : DONE", verbose=2) if recs != None: #We proceed each record by record for record in recs: error = bibupload(record) - if error == 1: + if error[0] == 1: stat['nb_errors'] += 1 task_update_progress("Done %d out of %d." % (stat['nb_records_inserted'] + stat['nb_records_updated'], stat['nb_records_to_upload'])) else: write_message(" Error bibupload failed : No record found", verbose=1, stream=sys.stderr) if options['verbose'] >= 1: #Print out the statistics print_out_bibupload_statistics() # Check if they were errors if stat['nb_errors'] >= 1: task_update_status("DONE WITH ERRORS") else: ## we are done: task_update_status("DONE") if options["verbose"]: write_message("Task #%d finished." % task_id) return 1 ### bibupload engine functions: def parse_command(): """Analyze the command line and retrieve arguments (xml file, mode, etc) into global options variable. Return 0 in case everything went well, 1 in case of errors, 2 in case only help or version number were asked for. """ # FIXME: add treatment of `time' try: opts, args = getopt.getopt(sys.argv[1:], "i:r:c:a:z:s:f:u:hv:Vn", [ "insert=", "replace=", "correct=", "append=", "reference=", "stage=", "format=", "user=", "help", "verbose", "version", "notimechange", ]) except getopt.GetoptError,erro: usage() write_message("Stage 0 error: %s" % erro, verbose=1, stream=sys.stderr) return 1 # set the proper mode depending on the argument value for opt, opt_value in opts: # Verbose mode option if opt in ["-v", "--verbose"]: try: options['verbose'] = int(opt_value) except ValueError: write_message("Failed: enter a valid number for verbose mode (between 0 and 9).", verbose=1, stream=sys.stderr) return 1 # stage mode option if opt in ["-s", "--stage"]: try: options['stage_to_start_from'] = int(opt_value) except ValueError: write_message("Failed: enter a valid number for the stage to start from(>0).", verbose=1, stream=sys.stderr) return 1 # No time change option if opt in ["-n", "--notimechange"]: options['notimechange'] = 1 # Insert mode option if opt in ["-i", "--insert"]: options['mode'] = 'insert' options['file_path'] = os.path.abspath(opt_value) # Replace mode option if opt in ["-r", "--replace"]: # We check if there is not the mode insert after replace if opt_value == '-i' or opt_value == '--insert': options['replace_insert'] = 1 options['mode'] = 'replace_insert' options['file_path'] = os.path.abspath(args[0]) else: # Creation of the records from the xml Marc in argument options['mode'] = 'replace' options['file_path'] = os.path.abspath(opt_value) # Correct mode option if opt in ["-c", "--correct"]: # We check if there is not just a special tag to correct try: if int(opt_value) > 0 and int(opt_value) < 999: options['mode'] = 'correct' options['tag'] = opt_value options['file_path'] = os.path.abspath(args[0]) except ValueError: if opt_value == 'FMT' or opt_value == 'FFT': options['mode'] = 'correct' options['tag'] = opt_value options['file_path'] = os.path.abspath(args[0]) else: options['mode'] = 'correct' options['file_path'] = os.path.abspath(opt_value) # Append mode option if opt in ["-a", "--append"]: # We check if there is not just a special tag to correct try: if int(opt_value) > 0 and int(opt_value) < 999: options['mode'] = 'append' options['tag'] = opt_value options['file_path'] = os.path.abspath(args[0]) except ValueError: if opt_value == 'FMT' or opt_value == 'FFT': options['mode'] = 'append' options['tag'] = opt_value options['file_path'] = os.path.abspath(args[0]) else: options['mode'] = 'append' options['file_path'] = os.path.abspath(opt_value) # Reference mode option if opt in ["-z", "--reference"]: options['mode'] = 'reference' options['file_path'] = os.path.abspath(opt_value) # Format mode option if opt in ["-f", "--format"]: options['mode'] = 'format' options['file_path'] = os.path.abspath(opt_value) # Detection of user if opt in ["-u", "--user"]: options['user'] = opt_value # Help mode option if opt in ["-h", "--help"]: usage() return 2 # Version mode option if opt in ["-V", "--version"]: write_message( __version__, verbose=1) return 2 if options['mode'] == None: write_message("Please specify at least one update/insert mode!") return 1 if options['file_path'] == None: write_message("Missing filename! -h for help.") return 1 return 0 def bibupload(record): - """ Main function: proceed a record and fit it in the tables bibfmt, bibrec, bibrec_bibxxx, bibxxx with proper record metadata """ + """Main function: process a record and fit it in the tables + bibfmt, bibrec, bibrec_bibxxx, bibxxx with proper record + metadata. + Return (error_code, recID) of the processed record. + """ error = None #If there are special tags to proceed check if it exists in the record if options['tag'] != None and not(record.has_key(options['tag'])): write_message(" Failed: Tag not found, enter a valid tag to update.", verbose=1, stream=sys.stderr) - return 1 + return (1, -1) #Extraction of the Record Id rec_id = retrieve_rec_id(record) if rec_id == -1: - return 1 + return (1, -1) else: write_message(" -Retrieve record Id (found %s) : DONE." % rec_id, verbose=2) write_message(" -Check if the xml marc file is already in the database : DONE" , verbose=2) # Reference mode check if there are reference tag if options['mode'] == 'reference': error = extract_tag_from_record(record, cfg_bibupload_reference_tag) if error == None: write_message(" Failed : No reference tags has been found...", verbose=1, stream=sys.stderr) - return 1 + return (1, -1) else: error = None write_message(" -Check if reference tags exist : DONE", verbose=2) if options['mode'] == 'insert': # Insert the record into the bibrec databases to have a recordId rec_id = create_new_record() write_message(" -Creation of a new record id (%d) : DONE" % rec_id, verbose=2) # we add the record Id control field to the record error = record_add_field(record, '001', '', '', rec_id) if error == None: write_message(" Failed: " \ "Error during adding the 001 controlfield " \ "to the record", verbose=1, stream=sys.stderr) - return 1 + return (1, rec_id) else: error = None elif options['mode'] != 'insert' and options['mode'] != 'format' and options['stage_to_start_from'] != 5: # Update Mode #Retrieve the old record to update rec_old = create_record(print_record(int(rec_id),'xm'), 2)[0] if rec_old == None: write_message(" Failed during the creation of the old record!", verbose=1, stream=sys.stderr) - return 1 + return (1, rec_id) else: write_message(" -Retrieve the old record to update : DONE", verbose=2) #Delete tags to correct in the record if options['mode'] == 'correct' or options['mode'] == 'reference': delete_tags_to_correct(record, rec_old) write_message(" -Delete the old tags to correct in the old record: DONE", verbose=2) # Append new tag to the old record and update the new record with the old_record modified if options['mode'] == 'append' or options['mode'] == 'correct' or options['mode'] == 'reference': record = append_new_tag_to_old_record(record, rec_old) write_message(" -Append new tags to the old record: DONE", verbose=2) # now we clear all the rows from bibrec_bibxxx from the old record delete_bibrec_bibxxx(rec_old, rec_id) write_message(" -Clean bibrec_bibxxx: DONE", verbose=2) write_message(" -Stage COMPLETED", verbose=2) #Have a look if we have FMT tags write_message("Stage 1: Start (Insert of FMT tags if exist).", verbose=2) if options['stage_to_start_from'] <= 1 and extract_tag_from_record(record,'FMT') != None: record = insert_fmt_tags(record, rec_id) if record == None: write_message(" Stage 1 failed: Error while inserting FMT tags", verbose=1, stream=sys.stderr) - return 1 + return (1, rec_id) elif record == 0: #Mode format finished stat['nb_records_updated'] += 1 - return 0 + return (0, rec_id) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) #Have a look if we have FFT tags write_message("Stage 2: Start (Process FFT tags if exist).", verbose=2) if options['stage_to_start_from'] <= 2 and extract_tag_from_record(record,'FFT') != None: if options['mode'] == 'insert' or options['mode'] == 'append': record = insert_fft_tags(record, rec_id) else: record = update_fft_tag(record, rec_id) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Update of the BibFmt write_message("Stage 3 : Start (Update bibfmt).", verbose=2) if options['stage_to_start_from'] <= 3: # format the single record as xml rec_xml_new = record_xml_output(record) #Update bibfmt with the format xm of this record error = update_bibfmt_format(rec_id, rec_xml_new, 'xm') if error == 1: write_message(" Failed: error during update_bibfmt_format", verbose=1, stream=sys.stderr) - return 1 + return (1, rec_id) write_message(" -Stage COMPLETED", verbose=2) # Update the database MetaData write_message("Stage 4 : Start (Update the database with the metadata).", verbose=2) if options['stage_to_start_from'] <= 4: update_database_with_metadata(record, rec_id) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Finally we update the bibrec table with the current date write_message("Stage 5 : Start (Update bibrec table with current date).", verbose=2) if options['stage_to_start_from'] <= 5 and options['mode'] != 'insert' and options['notimechange'] == 0: - now = convert_datestruct_to_datetext(localtime()) + now = convert_datestruct_to_datetext(time.localtime()) write_message(" -Retrieve current localtime : DONE", verbose=2) update_bibrec_modif_date(now, rec_id) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) #Increase statistics if options['mode'] == 'insert': stat['nb_records_inserted'] += 1 else: stat['nb_records_updated'] += 1 #Upload of this record finish write_message("Record "+str(rec_id)+" DONE", verbose=1) - return 0 + return (0, rec_id) def usage(): """Print help""" print """Receive a MARC XML file and update the appropriate database tables according to options Usage: bibupload [options] input.xml Examples: $ bibupload -i input.xml Options: Input: -a --append : new fields are appended to the existing record -c --correct : fields are replaced by the new ones in the existing record -f --format : takes only the FMT fields into account. Does not update -i --insert : insert the new record in the database -r --replace : the existing record is entirely replaced by the new one -z --reference : update references (update only 999 fields) -s --stage : stage to start from in the algorithme (0: always done; 1: FMT tags; 2:FFT tags; 3: BibFmt; 4: Metadata update; 5: time update) Scheduling options: -u, --user=USER user name to store task, password needed General options: -h, --help print this help and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) -n --notimechange : do not change record last modification date when updating -V --version : print the script version """ def print_out_bibupload_statistics(): """Print the statistics of the process""" print "BibUpload stat:" print "Input Records: ", stat['nb_records_to_upload'] print "Updated: ", stat['nb_records_updated'] print "Inserted: ", stat['nb_records_inserted'] print "Errors: ", stat['nb_errors'] print "Execution time: ", time.time() - time.mktime(stat['exectime']) def open_marc_file(path): """Open a file and return the data""" try: # open the file containing the marc document marc_file = open(path,'r') marc = marc_file.read() marc_file.close() except IOError, erro: write_message("Error: %s" % erro, verbose=1, stream=sys.stderr) write_message("Exiting.", sys.stderr) task_update_status("ERROR") sys.exit(1) return marc def xml_marc_to_records(xml_marc): """create the records""" # Creation of the records from the xml Marc in argument recs = create_records(xml_marc, 1, 1) if recs[0][0] == None: write_message("Error: Xml Marc has a wrong format: %s" % recs[0][2], verbose=1, stream=sys.stderr) write_message("Exiting.", sys.stderr) task_update_status("ERROR") sys.exit(1) else: recs = map((lambda x:x[0]), recs) return recs def find_record_bibrec(rec_id): """ receives the record ID and returns if this record exist in bibrec """ query = """SELECT id FROM bibrec where id = %s""" params = (rec_id,) try: res = run_sql(query, params) except Error, error: write_message(" Error during find_record_bibrec function : %s " % error, verbose=1, stream=sys.stderr) if len(res): return res else: return None def find_record_format(rec_id, format): """ receives the record ID and returns if this record exist in bibrec """ query = """SELECT count(id) FROM bibfmt where id_bibrec = %s and format = %s""" params = (rec_id, format) try: res = run_sql(query, params) except Error, error: write_message(" Error during find_record_format() : %s " % error, verbose=1, stream=sys.stderr) if len(res): return res[0] else: return None def find_record_bibfmt(marc): """ receives the xmlmarc containing a record and returns the id in bibrec if the record exists in bibfmt""" # compress the marc value pickled_marc = MySQLdb.escape_string(compress(marc)) query = """SELECT id_bibrec FROM bibfmt where value = %s""" # format for marc xml is xm params = (pickled_marc,) try: res = run_sql(query, params) except Error, error: write_message(" Error during find_record_bibfmt function : %s " % error, verbose=1, stream=sys.stderr) if len(res): return res else: return None def find_record_from_sysno(sysno): """receive the sysno number and return the record id""" table_name = 'bib'+cfg_bibupload_external_sysno_tag[0:2]+'x' query = """SELECT DISTINCT id FROM `%s` where value = '%s'""" params = (table_name, sysno) try: res = run_sql(query % params) except Error, error: write_message(" Error during find_record_from_sysno function 1st query : %s " % error, verbose=1, stream=sys.stderr) if len(res): table_name = 'bibrec_bib'+cfg_bibupload_external_sysno_tag[0:2]+'x' query = """SELECT DISTINCT id_bibrec FROM `%s` where id_bibxxx = '%s'""" params = (table_name, res[0][0]) try: res = run_sql(query % params) except Error, error: write_message(" Error during find_record_from_sysno function 2nd query : %s " % error, verbose=1, stream=sys.stderr) if len(res): return res[0][0] else: return None else: return None def extract_tag_from_record(record, tag_number): """ Extract the recordId """ # first step verify if the record is not already in the database if record: for tag in record.keys(): #Check the recordId if tag == tag_number: return record[tag] return None def retrieve_rec_id(record): """Retrieve the record Id from a record by using tag 001 or SYSNO""" rec_id = None tag = None #1st step: we look for the tag 001 tag = extract_tag_from_record(record, '001') if tag != None: #We exctract the record Id from the Tag rec_id = tag[0][3] #if we are in insert mode => error if options['mode'] == 'insert': write_message(" Failed : Error tag 001 found in the xml submitted"\ ", you should use the option replace, correct or append"\ " to replace an existing record. -h for help.", verbose=1, stream=sys.stderr) return -1 #if we found the rec id and we are not in insert mode => continue elif options['mode'] != 'insert': #we try to find the rec_id in the table bibrec if find_record_bibrec(rec_id) != None: return rec_id else: #The record doesn't exist yet. We will try to check the SYSNO id rec_id = None else: write_message(" -Tag 001 not found in the xml marc file.", verbose=9) if rec_id == None: #2nd step we look for the sysno code sysno = extract_tag_from_record(record, cfg_bibupload_external_sysno_tag) if sysno != None: #retrieve the SYSNO code from the tuple SYSNO sysno = sysno[0][0][0][1] write_message(" Check if the SYSNO id "+sysno+" exist in the database", verbose=9) #Retrieve the rec id from the database rec_id = find_record_from_sysno(sysno) if rec_id != None and options['mode'] == 'insert': write_message(" Failed : Record id found in the database: Please choose another mode than insert. -h for help.", verbose=1, stream=sys.stderr) return -1 elif rec_id == None and options['mode'] != 'insert': if options['mode'] != 'replace_insert': write_message(" Failed : Record Id not found even with SYSNO id..."\ "Please insert the file before updating it."\ " -h for help", verbose=1, stream=sys.stderr) return -1 else: options['mode'] = 'insert' else: return rec_id if sysno == None and options['mode'] != 'insert': if options['mode'] != 'replace_insert': write_message(" Failed : SYSNO tag not found in the xml marc file."\ "Please insert the file before updating it."\ " -h for help", verbose=1, stream=sys.stderr) return -1 else: options['mode'] = 'insert' - + return rec_id ### Insert functions def create_new_record(): """Create new record in the database""" - now = convert_datestruct_to_datetext(localtime()) + now = convert_datestruct_to_datetext(time.localtime()) query = """INSERT INTO bibrec (creation_date, modification_date) VALUES (%s, %s)""" params = (now, now) try: rec_id = run_sql(query, params) return rec_id except Error, error: write_message(" Error during the creation_new_record function : %s " % error, verbose=1, stream=sys.stderr) - + return None def insert_bibfmt(id_bibrec, marc, format): """Insert the format in the table bibfmt""" # compress the marc value #pickled_marc = MySQLdb.escape_string(compress(marc)) pickled_marc = compress(marc) # get the current time - now = convert_datestruct_to_datetext(localtime()) + now = convert_datestruct_to_datetext(time.localtime()) query = """INSERT INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s )""" try: row_id = run_sql(query, (id_bibrec, format, now, pickled_marc)) return row_id except Error, error: write_message(" Error during the insert_bibfmt function : %s " % error, verbose=1, stream=sys.stderr) + return None def insert_record_bibxxx(tag, value): """Insert the record into bibxxx""" #determine into which table one should insert the record table_name = 'bib'+tag[0:2]+'x' #check if the table exists in the database query = """SHOW TABLES like %s""" params = (table_name,) try: res = run_sql(query, params) except Error, error: write_message(" Error during the insert_record_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) # if the table doesn't exist, please create it if len(res): pass else: if options['verbose'] >= 1: print 'create table' # check if the tag, value combination exists in the table query = """SELECT id FROM %s """ % table_name query += """ WHERE tag = %s and value = %s""" params = (tag, value) try: res = run_sql(query, params) except Error, error: write_message(" Error during the insert_record_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) if len(res): # get the id of the row, if it exists row_id = res[0][0] return (table_name, row_id) else: # necessary to insert the tag, value into bibxxx table query = """INSERT INTO %s """ % table_name query += """ (tag, value) values (%s , %s)""" params = (tag, value) try: row_id = run_sql(query, params) except Error, error: write_message(" Error during the insert_record_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) return (table_name, row_id) def insert_record_bibrec_bibxxx(table_name, id_bibxxx, field_number, id_bibrec): """Insert the record into bibrec_bibxxx""" #determine into which table one should insert the record full_table_name = 'bibrec_'+ table_name #check if the table exists in the database query = """SHOW TABLES like '%s'""" params = (full_table_name) try: res = run_sql(query % params) except Error, error: write_message(" Error during the insert_record_bibrec_bibxxx function 1st query: %s " % error, verbose=1, stream=sys.stderr) # if the table doesn't exist, create it if len(res): pass else: print 'create table', full_table_name # insert the proper row into the table query = """INSERT INTO %s """ % full_table_name query += """(id_bibrec,id_bibxxx, field_number) values (%s , %s, %s)""" params = (id_bibrec, id_bibxxx, field_number) try: res = run_sql(query, params) except Error, error: write_message(" Error during the insert_record_bibrec_bibxxx function 2nd query : %s " % error, verbose=1, stream=sys.stderr) return res def insert_fft_tags(record, rec_id): """Process and insert FFT tags""" tuple_list = None tuple_list = extract_tag_from_record(record,'FFT') #If there is a FFT TAG :) if tuple_list != None: for single_tuple in tuple_list: # Get the inside of the FFT file docpath = single_tuple[0][0][1] docname = re.sub("\..*", "", os.path.basename(docpath)) extension = re.sub("^[^\.]*.", "", os.path.basename(docpath)).lower() #Create a new docId try: bib_doc_id = run_sql("insert into bibdoc (docname,creation_date,modification_date) values(%s,NOW(),NOW())", (docname,)) write_message(" -Insert of the file %s into bibdoc : DONE" % docname, verbose=2) except Error, error: write_message(" Error during the insert_fft_tags function : %s " % error, verbose=1, stream=sys.stderr) if bib_doc_id != None: #we link the document to the record if a rec_id was specified if rec_id != "": #TO FIX doc_type : main or additional, fron where the information come from? doc_type = "" try: res = run_sql("insert into bibrec_bibdoc values(%s,%s,%s)", (rec_id, bib_doc_id, doc_type)) if res == None: write_message(" Failed during creation of link between doc Id and rec Id).", verbose=1, stream=sys.stderr) else: write_message(" -Insert of the link bibrec bibdoc for %s : DONE" % docname, verbose=2) except Error, error: write_message(" Error during the insert_fft_tags function : %s " % error, verbose=1, stream=sys.stderr) else: write_message(" Failed during creation of the new doc Id.", verbose=1, stream=sys.stderr) #Move the file to the correct place # Variables from the config file archivepath = filedir archivesize = filedirsize url_path = None group = "g"+str(int(int(bib_doc_id)/archivesize)) basedir = "%s/%s/%s" % (archivepath, group, bib_doc_id) # we create the corresponding storage directory if not os.path.exists(basedir): try: os.makedirs(basedir) write_message(" -Create a new directory %s : DONE" % basedir, verbose=2) except OSError, error: write_message(" Error making the directory : %s " % error, verbose=1, stream=sys.stderr) # and save the father record id if it exists if rec_id != "": try: filep = open("%s/.recid" % basedir, "w") filep.write(str(bib_doc_id)) filep.close() except IOError, error: write_message(" Error writing the file : %s " % error, verbose=1, stream=sys.stderr) #Move the file to the good directory try: os.system("mv %s %s" % (docpath, basedir)) write_message(" -Move the file %s : DONE" % docname, verbose=2) except OSError, error: write_message(" Error moving the file : %s " % error, verbose=1, stream=sys.stderr) #Create the Url Path url_path = htdocsurl+"/record/"+str(rec_id)+"/files/"+docname+"."+extension #add tag 856 to the xml marc to proceed subfield_list = [('u', url_path), ('z', 'Access to Fulltext')] newfield_number = record_add_field(record, "856", "4", "", "", subfield_list) if newfield_number == None: write_message(" Error when adding the field"+ single_tuple, verbose=1, stream=sys.stderr) else: write_message(" -Add the new tag 856 to the record for %s : DONE" % docname, verbose=2) #Delete FFT tag :) record_delete_field(record, 'FFT', '', '') write_message(" -Delete FFT tag from source : DONE", verbose=2) return record def insert_fmt_tags(record, rec_id): """Process and insert FMT tags""" tuple_list = None tuple_list = extract_tag_from_record(record, 'FMT') #If there is a FMT TAG :) if tuple_list != None: for single_tuple in tuple_list: # Get the inside of the FMT file f_value = single_tuple[0][0][1] g_value = single_tuple[0][1][1] #Update the format res = update_bibfmt_format(rec_id, g_value, f_value) if res == 1: write_message(" Failed: Error during update_bibfmt", verbose=1, stream=sys.stderr) #If we are in mode format, we only care about the FMT tag if options['mode'] == 'format': return 0 # We delete the FMT Tag of the record record_delete_field(record,'FMT') write_message(" -Delete field FMT from record : DONE", verbose=2) return record elif options['mode'] == 'format': write_message(" Failed: Format updated failed : No tag FMT found", verbose=1, stream=sys.stderr) return None else: return record ### Update functions def update_bibrec_modif_date(now, bibrec_id): """Update the date of the record in bibrec table """ query = """UPDATE bibrec SET modification_date = %s WHERE id = %s""" params = (now, bibrec_id) try: res = run_sql(query, params) - if res != 0: + if res != 1: write_message(" Failed : Sql error during the update of the bibrec modification_date", verbose=1, stream=sys.stderr) else: write_message(" -Update record modification date : DONE" , verbose=2) except Error, error: write_message(" Error during update_bibrec_modif_date function : %s" % error, verbose=1, stream=sys.stderr) def update_bibfmt_format(id_bibrec, marc, format): """Update the format in the table bibfmt""" # We check if the format is already in bibFmt found = find_record_format(id_bibrec, format) if int(found[0]) == 1: # Update the format # get the current time - now = convert_datestruct_to_datetext(localtime()) + now = convert_datestruct_to_datetext(time.localtime()) # compress the marc value pickled_marc = compress(marc) query = """UPDATE bibfmt SET format = %s, last_updated = %s, value = %s WHERE id_bibrec = %s """ params = (format, now, pickled_marc, id_bibrec) try: row_id = run_sql(query, params) if row_id == None: write_message(" Failed: Error during update_bibfmt_format function", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Update the format %s in bibfmt : DONE" % format , verbose=2) return 0 except Error, error: write_message(" Error during the update_bibfmt_format function : %s " % error, verbose=1, stream=sys.stderr) elif int(found[0]) > 1: write_message(" Failed: Same format %s found several time in bibfmt for the same record." % format, verbose=1, stream=sys.stderr) return 1 else: # Insert the format information in BibFMT res = insert_bibfmt(id_bibrec, marc, format) if res == None: write_message(" Failed: Error during insert_bibfmt", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Insert the format %s in bibfmt : DONE" % format , verbose=2) return 0 def update_database_with_metadata(record, rec_id): """Update the database tables with the record and the record id given in parameter""" for tag in record.keys(): # check if tag is not a control field : tag not in cfg_bibupload_controlfield_tags and if tag not in cfg_bibupload_special_tags: # for each tag there is a list of tuples representing datafields tuple_list = record[tag] # this list should contain the elements of a full tag [tag, ind1, ind2, subfield_code] tag_list = [] tag_list.append(tag) for single_tuple in tuple_list: # these are the contents of a single tuple subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # append the ind's to the full tag if (ind1 == ''): tag_list.append('_') else: tag_list.append(ind1) if (ind2 == ''): tag_list.append('_') else: tag_list.append(ind2) datafield_number = single_tuple[4] if tag in cfg_bibupload_controlfield_tags and tag != "001": value = single_tuple[3] # get the full tag full_tag = ''.join(tag_list) # Others modes : update all the tag if options['mode'] == 'insert' or options['mode'] == 'replace' or options['mode'] == 'append': write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value) #print 'tname, bibrow', table_name, bibxxx_row_id; if table_name == None or bibxxx_row_id == None: write_message(" Failed : during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id) if res == None: write_message(" Failed : during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) else: # get the tag and value from the content of each subfield for subfield in subfield_list: subtag = subfield[0] value = subfield[1] tag_list.append(subtag) # get the full tag full_tag = ''.join(tag_list) # Others modes : update all the tag if options['mode'] == 'insert' or options['mode'] == 'replace' or options['mode'] == 'append': write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value) if table_name == None or bibxxx_row_id == None: write_message(" Failed : during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id) if res == None: write_message(" Failed : during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) # remove the subtag from the list tag_list.pop() tag_list.pop() tag_list.pop() tag_list.pop() write_message(" -Update the database with metadata : DONE", verbose=2) def append_new_tag_to_old_record(record, rec_old): """Append new tags to a old record""" if options['tag'] != None: tag = options['tag'] if tag in cfg_bibupload_controlfield_tags: if tag == '001': pass else: # if it is a controlfield,just access the value for single_tuple in record[tag]: controlfield_value = single_tuple[3] # add the field to the old record newfield_number = record_add_field(rec_old, tag, "", "", controlfield_value) if newfield_number == None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: # For each tag there is a list of tuples representing datafields for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # We add the datafield to the old record if options['verbose'] == 9: print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list) if newfield_number == None: write_message("Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: # Go through each tag in the appended record for tag in record.keys(): # Reference mode append only reference tag if options['mode'] == 'reference': if tag == cfg_bibupload_reference_tag: for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # We add the datafield to the old record if options['verbose'] == 9: print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list) if newfield_number == None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: if tag in cfg_bibupload_controlfield_tags: if tag == '001': pass else: # if it is a controlfield,just access the value for single_tuple in record[tag]: controlfield_value = single_tuple[3] # add the field to the old record newfield_number = record_add_field(rec_old, tag, "", "", controlfield_value) if newfield_number == None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: # For each tag there is a list of tuples representing datafields for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # We add the datafield to the old record if options['verbose'] == 9: print " Adding tag: ", tag, " ind1=", ind1, " ind2=", ind2, " code=", subfield_list newfield_number = record_add_field(rec_old, tag, ind1, ind2, "", subfield_list) if newfield_number == None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) return rec_old def update_fft_tag(record, rec_id): """Process and Update FFT tags""" #TO IMPROVE: SELECT THE BIBDOC ID TO DELETE AND FIRST INSERT THE NEW FFT TAGS BEFORE DELETING THE OLD ONE # We delete the bibdoc corresponding to this record delete_bibdoc(rec_id) # We delete the links between bibrec and bibdoc delete_bibrec_bibdoc(rec_id) # We delete the tag 856 from the record record_delete_field(record, '856', '4') # We add the new fft tags record = insert_fft_tags(record, rec_id) return record ###Delete function def delete_tags_to_correct(record, rec_old): """Delete the tags which are existing in both records""" # Browse through all the tags from the marc file for tag in record.keys(): #Do we have to delete only a special tag or all? if options['tag'] == None: # See if these tags exist in the old record - if rec_old.has_key(tag): + if rec_old.has_key(tag) and tag != '001': # Delete the tag found write_message(" Delete tag: "+tag+" ind1= "+rec_old[tag][0][1]+" ind2= "+rec_old[tag][0][2], verbose=9) record_delete_field(rec_old, tag, rec_old[tag][0][1], rec_old[tag][0][2]) else: if rec_old.has_key(tag) and options['tag'] == tag: # Delete the tag found write_message(" Delete tag: "+tag+" ind1= "+rec_old[tag][0][1]+" ind2= "+rec_old[tag][0][2], verbose=9) record_delete_field(rec_old, tag, rec_old[tag][0][1], rec_old[tag][0][2]) def delete_bibrec_bibxxx(record, id_bibrec): """Delete the database record from the table bibxxx given in parameters""" # we clear all the rows from bibrec_bibxxx from the old record for tag in record.keys(): # for each name construct the bibrec_bibxxx table name table_name = 'bibrec_bib'+tag[0:2]+'x' # delete all the records with proper id_bibrec query = """DELETE FROM `%s` where id_bibrec = %s""" params = (table_name, id_bibrec) try: run_sql(query % params) except Error, error: write_message(" Error during the delete_bibrec_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) def delete_bibdoc(id_bibrec): """Delete document from bibdoc which correspond to the bibrec id given in parameter""" query = """UPDATE bibdoc SET status='deleted' WHERE id IN (SELECT id_bibdoc FROM bibrec_bibdoc where id_bibrec = %s)""" params = (id_bibrec,) try: run_sql(query, params) except Error, error: write_message(" Error during the delete_bibdoc function : %s " % error, verbose=1, stream=sys.stderr) def delete_bibrec_bibdoc(id_bibrec): """Delete the bibrec record from the table bibrec_bibdoc given in parameter""" # delete all the records with proper id_bibrec query = """DELETE FROM bibrec_bibdoc where id_bibrec = %s""" params = (id_bibrec,) try: run_sql(query, params) except Error, error: write_message(" Error during the delete_bibrec_bibdoc function : %s " % error, verbose=1, stream=sys.stderr) def main(): """main entry point for bibupload""" global task_id, options ## parse command line: if len(sys.argv) == 2 and sys.argv[1].isdigit(): ## A - run the task task_id = int(sys.argv[1]) try: if not task_run(): write_message("Error occurred. Exiting.", sys.stderr) except StandardError, erro: write_message("Unexpected error occurred: %s." % erro, sys.stderr) write_message("Traceback is:", sys.stderr) traceback.print_tb(sys.exc_info()[2]) write_message("Exiting.", sys.stderr) task_update_status("ERROR") else: ## B - submit the task # set default values: options["runtime"] = time.strftime("%Y-%m-%d %H:%M:%S") options["sleeptime"] = "" # set user-defined options: error = parse_command() if error == 0: task_submit(options) else: sys.exit(1) return if __name__ == "__main__": main()