diff --git a/modules/bibedit/lib/textmarc2xmlmarc.py b/modules/bibedit/lib/textmarc2xmlmarc.py index cdc5089e2..506bee8a8 100644 --- a/modules/bibedit/lib/textmarc2xmlmarc.py +++ b/modules/bibedit/lib/textmarc2xmlmarc.py @@ -1,462 +1,462 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ textmarc2xmlmarc utility. """ __version__ = "" import fileinput import getopt import string import sys import re import os.path class Field: "A class to hold information on bibliographic field and its value." def __init__(self, value_as_string=""): "Creates field instance from a string of the form '$$aElis$2editor'." self.value = {} if value_as_string: s = value_as_string if s[0:2] != '$$': # if does not start by subfield, add artificial beginning subfield s = '$$ '+ s for chunk in string.split(s, '$$'): if chunk: aleph_spread = alephsplit(chunk[1:]) if len(aleph_spread) > 1: self.add('0',aleph_spread[1]) self.add(chunk[0], aleph_spread[0]) # add subfield # self.add(chunk[0], chunk[1:]) def has_subfield(self, subfield_code): "Does the field contain this subfield?" if self.value.has_key(subfield_code): return 1 else: return 0 def get_subfield(self, subfield_code=''): "Returns the value of a given subfield." if self.value.has_key(subfield_code): return self.value[subfield_code][0] # return first subfield else: return "" def add(self, subfield_code, subfield_value): "Adds subfield to the field." c = string.strip(subfield_code) v = string.strip(subfield_value) if v: # let us disregard empty subfields if self.value.has_key(c): self.value[c].append(v) else: self.value[c] = [v] def display(self,field_type = "datafield"): "Displays field in xml format." keys = self.value.keys() keys.sort() out = "" for subfield_code in keys: for subfield_value in self.value[subfield_code]: subfield_value = encode_for_xml(subfield_value) if subfield_value: if out != "": out = out + "\n" if field_type == "datafield": out = out + " <subfield code=\"%s\">%s</subfield>" % (subfield_code, subfield_value) else: out = out + "%s" % subfield_value return out class Record: "A class to hold information on bibliographic record." def __init__(self, sysno="0"): "Creates record instance." self.sysno = string.strip(sysno) self.field = {} def add(self, field_tag, field_value): "Adds the field to the record." t = string.strip(field_tag) if self.field.has_key(t): self.field[t].append(field_value) else: self.field[t] = [field_value] def has_basenb(self, bases): "Tests whether one of record's BASE values is among one of the values passed in argument tuple." if self.field.has_key('BASE'): for f in self.field['BASE']: try: b = int(f.get_subfield()) # get BASE number if b in bases: return 1 except: pass if self.field.has_key('BA'): for f in self.field['BA']: try: b = int(f.get_subfield()) # get BASE number if b in bases: return 1 except: pass if self.field.has_key('BAS'): for f in self.field['BAS']: try: b = int(f.get_subfield('a')) # get BASE number if b in bases: return 1 except: pass if self.field.has_key('960'): for f in self.field['960']: try: b = int(f.get_subfield('a')) # get BASE number if b in bases: return 1 except: pass return 0 def display(self, filehandle): "Displays record in the xml format." ## display record header print '<record>' print ' <controlfield tag="001">%d</controlfield>' % int(self.sysno) ## display record body keys = self.field.keys() keys.sort() for field_tag in keys: for field_instance in self.field[field_tag]: if field_tag[0] >= '0' and field_tag[0] <= '9': # are we using numbers for field tag name? tag = field_tag[0:3] # yes, so the first three chars constitute MARC-21 tag name i1 = field_tag[3:4] # next char is 1st indicator if i1 == "_" or i1 == " ": i1 = " " i2 = field_tag[4:5] # next char is 2nd indicator if i2 == "_" or i2 == " ": i2 = " " else: tag = field_tag i1 = " " i2 = " " if tag[:2] == "00": field_type = "controlfield" else: field_type = "datafield" instance_to_print = field_instance.display(field_type) if instance_to_print: if tag[:2] != "00": print " <datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\">" % (tag, i1, i2) print instance_to_print print " </datafield>" else: if not (tag == "001" and int(self.sysno) == int(instance_to_print)): print " <controlfield tag=\"%s\">%s</controlfield>" % (tag,instance_to_print) ## display record footer print "</record>" def log_on_exec(command): "Execute command and create record in log file" return os.system(command) def alephsplit(value): "splits value on <<foo=bar>> and returns list with two elements (foo and bar)" out = [] y = re.split("(<<.*?>>)",value) if y!=None: outf1 = "" outf0 = "" count = 0 for item in y: if (re.match("<<.*?>>",item) == None): outf1 = outf1 + item outf0 = outf0 + item else: if re.search("=",item): z = re.search("<<.*?=",item) if z!= None: outf1 = outf1 + z.group()[2:-1] z = re.search("=.*?>>",item) if z!= None: outf0 = outf0 + z.group()[1:-2] count = count + 1 else: z = re.search("<<.+>>",item) if z!=None: outf1 = outf1 + z.group()[2:-2] out.append(outf1) if count > 0: out.append(outf0) else: out.append("") return out def clean(x,values): "Empty when already in list of values" for v in values: if (v == x): x = "" return x def transform_record(rec, errors): "Transforms record from MARC-21 format to XML format." out = Record(rec.sysno) is_deleted = 0 original_collids = [] for tag in rec.field.keys(): if tag == "BAS": for field_instance in rec.field[tag]: out.add("960", field_instance) elif tag == "DEL": is_deleted = 1 elif tag == "591": pass # we drop 591 field that is of secret internal note nature (TS 20070123) elif tag == "CAT" or tag == "961": first_CAT = "" last_CAT = "" for field_instance in rec.field[tag]: if first_CAT == "": first_CAT = field_instance last_CAT = field_instance outf = Field() if first_CAT.has_subfield('x'): outf.add('x',string.replace(first_CAT.get_subfield('x'),'-','')) elif first_CAT.has_subfield('c'): outf.add('x',string.replace(first_CAT.get_subfield('c'),'-','')) if last_CAT.has_subfield('c'): outf.add('c',string.replace(last_CAT.get_subfield('c'),'-','')) if last_CAT.has_subfield('l'): outf.add('l',last_CAT.get_subfield('l')) if last_CAT.has_subfield('h'): outf.add('h',last_CAT.get_subfield('h')) out.add('961', outf) elif tag == "LKR": for field_instance in rec.field[tag]: out.add("962", field_instance) elif tag == "OWN": for field_instance in rec.field[tag]: out.add("963", field_instance) elif (tag == "520" or tag == "590"): topfield = 1000 outfa = Field() outfb = Field() listvaluesa = {} listvaluesb = {} listordersa = [] listordersb = [] valuea = '' valueb = '' for field_instance in rec.field[tag]: if field_instance.has_subfield('b'): partial = field_instance.get_subfield('b') if field_instance.has_subfield('9') and field_instance.get_subfield('9') != "": order = int(field_instance.get_subfield('9')) else: order = topfield topfield = topfield + 1 listordersb.append(order) listvaluesb[order]=partial elif field_instance.has_subfield('a'): partial = field_instance.get_subfield('a') if field_instance.has_subfield('9') and field_instance.get_subfield('9').isdigit(): order = int(field_instance.get_subfield('9')) else: order = topfield topfield = topfield + 1 listordersa.append(order) listvaluesa[order]=partial else: out.add(tag, field_instance) listordersa.sort() for order in listordersa: valuea = valuea + " " + listvaluesa[order] if valuea != '': outfa.add('a',valuea) out.add(tag, outfa) listordersb.sort() for order in listordersb: valueb = valueb + " " + listvaluesb[order] if valueb != '': outfb.add('b',valueb) out.add(tag, outfb) elif tag == "980": for field_instance in rec.field[tag]: original_collids.append(field_instance) elif tag!="FMT" and tag!="LDR" and tag!="008" and tag!="OWN" and tag!="0248" and tag!="---" and tag[0] in string.digits and tag[1] in string.digits and tag[2] in string.digits: for field_instance in rec.field[tag]: out.add(tag, field_instance) #deleted collection field if is_deleted: outf = Field() outf.add('c','DELETED') out.add('980',outf) return out def transform_file(filename): "Reads ALEPH 500 sequential data file and transforms them into XML format." record_no = 0 filehandle = "" errors = {} # dict that holds 'bad' fields as keys and list of sysnos for which they occurred as values record_current = Record() # will hold current bibliographic record as we read through input file sysno_old, field_old, value_old = None, None, None # will hold values from previous line ## go trough all the input file for line in fileinput.input(filename): if re.sub("\s","",line) != "": # parse the input line with MARC sequential format sysno, field, value = line[0:9], line[10:15], line[16:] if field[0] == " " or field[1] == " " or field[2] == " ": text = "\nRecord %s: Error in field definition %s\n" % (sysno,field) if field[0] == " ": field = string.replace(field," ","0",1) if field[1] == " ": field = string.replace(field," ","0",1) if field[2] == " ": field = string.replace(field," ","0",1) sys.stderr.write(text) raise sysno, field, value = string.strip(sysno), string.strip(field), string.strip(value) if sysno == record_current.sysno: # we are in the same bibliographic record record_current.add(field_old, Field(value_old)) field_old, value_old = field, value else: # end of current record found, so transform it record_no = record_no + 1 # count records if field_old and value_old: # add previous line record_current.add(field_old, Field(value_old)) record_tmp = transform_record(record_current, errors) if record_tmp.sysno != "0": record_tmp.display(filehandle) record_current = Record(sysno) # set up a new current record field_old, value_old = field, value ## after all the input lines have been read, display last record record_current.add(field_old, Field(value_old)) record_tmp = transform_record(record_current, errors) if record_tmp.sysno != "0": record_tmp.display(filehandle) ## display eventual errors errors_keys = errors.keys(); errors_keys.sort() for t in errors_keys: sys.stderr.write("\n\nUnknown tag %s occurred for the following SYSNOs:\n " % t) nbchars = 5 for s in errors[t]: sys.stderr.write("%s " % s) nbchars = nbchars + len(s) if nbchars >= 72: sys.stderr.write("\n ") nbchars = 5 - sys.stderr.write("\n") + sys.stderr.write("\n") def usage(code, msg=''): "Prints usage for this module." sys.stderr.write("%s\n" % __version__) if msg: sys.stderr.write("Error: %s.\n" % msg) sys.stderr.write("Usage: %s file.seq ...].\n" % sys.argv[0]) sys.stderr.write("Options: \n") sys.stderr.write(" -h, --help print this help\n") sys.exit(code) def encode_for_xml(s): "Encode special chars in string so that it would be XML-compliant." s = string.replace(s, '&', '&') s = string.replace(s, '<', '<') s = re.sub("[\x00-\x19\x7F\x1C\x1D]","",s) # remove ctrl characters # s = unicode(s,'latin1','ignore').encode('utf-8','replace') return s def main(): "Main function that does conversion from ALEPH 500 into XML." ## read command-line options try: opts, dummy_args = getopt.getopt(sys.argv[1:], "h", ['help']) except getopt.error, msg: usage(1, msg) help_p = 0 # default is not to print help ## guess about desired output format for opt, arg in opts: if opt in ('-h', '--help'): help_p = 1 ## guess about possible input files files = [] for arg in sys.argv[1:]: if arg[0] != "-" and len(arg)>3: files.append(arg) ## process all the input, finally if help_p == 1: usage(0) else: print '<?xml version="1.0" encoding="UTF-8"?>' print '<collection xmlns="http://www.loc.gov/MARC21/slim">' if files: for afile in files: transform_file(afile) else: transform_file("-") print '</collection>' sys.stderr.close() ### okay, here we go: if __name__ == '__main__': main()