diff --git a/modules/bibconvert/bin/bibconvert.in b/modules/bibconvert/bin/bibconvert.in index 5dee91eb9..742989f4b 100644 --- a/modules/bibconvert/bin/bibconvert.in +++ b/modules/bibconvert/bin/bibconvert.in @@ -1,1575 +1,1587 @@ ## $Id$ ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## read config variables: #include "config.wml" #include "configbis.wml" #include "cdswmllib.wml" ## start Python: #! ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
def get_pars(fn):
    """Split a function call string into its name and parameter list.

    The string is cut on every '(' and ')'.  The text before the first
    parenthesis is the function name; the text between the first pair of
    parentheses is split on ',' to give the parameters.

    in  - fn, e.g. "REP(a,b)" or "SHAPE()"
    out - [ name , [ par1, par2, ... ] ]
          e.g. ['REP', ['a', 'b']] or ['SHAPE', ['']]

    A string without any parentheses yields the whole string as the name
    and a single empty parameter, exactly as if "()" had been appended.
    """
    # Raw string: '\(' and '\)' are regex escapes, not string escapes.
    pieces = re.split(r'\(|\)', fn)
    if len(pieces) > 1:
        params = pieces[1].split(',')
    else:
        # No parentheses at all: behave like "NAME()" instead of raising
        # IndexError.
        params = ['']
    return [pieces[0], params]
def check_split_on(data_item_split, sep, tpl_f):
    """
    bibconvert conditional split
    ============================
    Decide whether the split of a data item on `sep` is acceptable and, if
    not, move the split point to the next occurrence of `sep`.

    in  - data_item_split  [ left part , right part ] (modified in place)
          sep              separator the item was split on
          tpl_f            condition, one of
                             NEXT(N,TYPE,SIDE)
                             PREV(N,TYPE,SIDE)
    out - data_item_split, possibly with the separator and the following
          chunk moved from the right part to the left part

    The N characters next to the split point (taken from the end of the
    left part or the start of the right part, depending on the function
    and SIDE) are run through FormatField(..., "SUP(TYPE,)").  The split
    is accepted when nothing is left after that and N characters were
    available; otherwise the split point is advanced.
    """
    fn, par = get_pars(tpl_f)

    while True:
        # Pick the characters adjacent to the current split point.
        if (fn == "NEXT" and par[2] == "R") or (fn == "PREV" and par[2] == "L"):
            test_value = data_item_split[0][-(string.atoi(par[0])):]
        elif (fn == "NEXT" and par[2] == "L") or (fn == "PREV" and par[2] == "R"):
            test_value = data_item_split[1][:(string.atoi(par[0]))]

        leftover = FormatField(test_value, "SUP(" + par[1] + ",)")
        if leftover == "" and len(test_value) >= string.atoi(par[0]):
            # Condition satisfied: keep the split where it is.
            break

        # Condition failed: glue the next chunk back onto the left part.
        remainder = data_item_split[1].split(sep, 1)
        data_item_split[0] = data_item_split[0] + sep + remainder[0]
        if len(remainder) == 1:
            # No further separator available; nothing is left on the right.
            data_item_split[1] = ""
            break
        data_item_split[1] = remainder[1]

    return data_item_split
def parse_field_definition(source_field_definition):
    """Break a field definition into constants and <:...:> variables.

    A definition consisting of exactly four '---' separated parts is
    returned as those four parts.  Any other definition is cut at every
    '<:' and ':>' marker and regrouped so that each top-level
    '<:' ... ':>' span becomes one element, with the text between spans
    kept as separate elements.

    e.g. "a<:b:>c" gives ['a', '<:b:>', 'c']
    """
    parts = source_field_definition.split("---")
    if len(parts) == 4:
        return parts

    # Tokenise: text chunks interleaved with the markers that separated them.
    tokens = []
    for opened in source_field_definition.split("<:"):
        for closed in opened.split(":>"):
            tokens.extend((closed, ":>"))
        # The last ':>' was not in the text; a '<:' followed this chunk.
        tokens[-1] = "<:"
    # ... except after the final chunk, where nothing followed.
    del tokens[-1]

    # Regroup: emit the buffer each time the marker nesting is back at zero.
    grouped = []
    pending = ""
    depth = 0
    for token in tokens:
        pending += token
        if token == "<:":
            depth += 1
        elif token == ":>":
            depth -= 1
        if depth == 0:
            grouped.append(pending)
            pending = ""
    return grouped
def parse_common_template(template,part):
    """
    bibconvert parse one section of a common template
    =================================================
    in  - template   filename of the configuration file
          part       number of the section to parse; the counter starts at
                     0 and is increased by every line beginning with '==='
    out - [ [ field_code , [ field_template_parsed ] ] , ... ]

    Only lines of the requested section that contain '---' and do not
    start with '#' contribute an entry.
    """
    parsed = []
    section = 0
    for raw_line in read_file(template, 1):
        if exp_n(raw_line)[:3] == "===":
            # Section delimiter: move on to the next section.
            section = section + 1
            continue
        if section != part:
            continue
        pieces = raw_line.split("---", 1)
        if len(pieces) <= 1 or raw_line[:1] == "#":
            # Not a "code---definition" line, or commented out.
            continue
        field_tpl = exp_e(parse_field_definition(pieces[1]))
        parsed.append([pieces[0], field_tpl])
    return parsed
item_prev = item[0] stack = [] stack.append(item[1]) try: if (stack[0] != ""): if (out[0][0]==""): out = [] out.append([field_code,stack]) except IndexError, e: out = out return out def parse_input_data_fx(source_tpl): """ bibconvert parse input data ======================== in - input source data location (filehandle) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] extraction_template_entry - input data file - specified by extract_tpl """ count = 0 record = "" field_data_1_in_list = [] out = [['',[]]] while (count <10): line = sys.stdin.readline() if (line == ""): count = count + 1 if (record == "" and count): return (-1) if (re.sub("\s","",line) == separator): count = count + 10 else: record = record + line for field_defined in extract_tpl_parsed: try: field_defined[1][0] = sub_keywd(field_defined[1][0]) field_defined[1][1] = sub_keywd(field_defined[1][1]) except IndexError, e: field_defined = field_defined try: field_defined[1][2] = sub_keywd(field_defined[1][2]) except IndexError, e: field_defined = field_defined field_data_1 ="" if ((field_defined[1][0][0:2] == '//') and (field_defined[1][0][-2:] == '//')): field_defined_regexp = field_defined[1][0][2:-2] try: #### if (len(re.split(field_defined_regexp,record)) == 1): field_data_1 = "" field_data_1_in_list = [] else: field_data_1_tmp = re.split(field_defined_regexp,record,1)[1] field_data_1_in_list = field_data_1_tmp.split(field_defined_regexp) except IndexError, e: field_data_1 = "" else: try: if (len(record.split(field_defined[1][0])) == 1): field_data_1 = "" field_data_1_in_list = [] else: field_data_1_tmp = record.split(field_defined[1][0],1)[1] field_data_1_in_list = field_data_1_tmp.split(field_defined[1][0]) except IndexError, e: field_data_1 = "" spliton = [] outvalue = "" field_data_2 = "" field_data = "" try: if 
((field_defined[1][1])=="EOL"): spliton = ['\n'] elif ((field_defined[1][1])=="MIN"): spliton = ['\n'] elif ((field_defined[1][1])=="MAX"): for item in extract_tpl_parsed: try: spliton.append(item[1][0]) except IndexError, e: spliton = spliton elif (field_defined[1][1][0:2] == '//') and (field_defined[1][1][-2:] == '//'): spliton = [field_defined[1][1][2:-2]] else: spliton = [field_defined[1][1]] except IndexError,e : spliton = "" outvalues = [] for field_data in field_data_1_in_list: outvalue = "" for splitstring in spliton: field_data_2 = "" if (len(field_data.split(splitstring))==1): if (outvalue == ""): field_data_2 = field_data else: field_data_2 = outvalue else: field_data_2 = field_data.split(splitstring)[0] outvalue = field_data_2 field_data = field_data_2 outvalues.append(outvalue) outvalues = exp_e(outvalues) if (len(outvalues) > 0): if (out[0][0]==""): out = [] outstack = [] if (len(field_defined[1])==3): spliton = [field_defined[1][2]] if (field_defined[1][2][0:2] == '//') and (field_defined[1][2][-2:] == '//'): spliton = [field_defined[1][2][2:-2]] for item in outvalues: stack = re.split(spliton[0],item) for stackitem in stack: outstack.append(stackitem) else: outstack = outvalues out.append([field_defined[0],outstack]) return out def parse_input_data_d(source_data, source_tpl): """ bibconvert parse input data ======================== in - input source data location (directory) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][] destination_templace entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[] input data dir; by file: - fieldcode value per line """ out = [] for source_field_tpl in read_file(source_tpl,1): source_field_code = source_field_tpl.split("---")[0] 
def generate(keyword):
    """
    bibconvert generated values:
    ============================
    SYSNO()       - value of the module-level variable sysno500
    SYSNO330()    - value of the module-level variable sysno
    WEEK(N)       - week number (strftime '%V') shifted by N, two digits,
                    "00" when the result is negative
    DATE(format)  - current local time in the specified strftime FORMAT
    XDATE(format) - as DATE, with the second parameter prepended as a
                    literal prefix
    VALUE(value)  - enter value literally
    OAI()         - ':<number>', number being tcounter plus the starting
                    value given at command line as -o

    Any other keyword is returned unchanged.
    """
    out = keyword
    fn = keyword + "()"
    par = get_pars(fn)[1]
    fn = get_pars(fn)[0]

    # Normalise to a one-element parameter list.  Note that this drops every
    # parameter after the first, so in the branches below the defaults are
    # always used for the second and later positions.
    par = set_par_defaults(par,"")

    if (fn == "SYSNO"):
        out = sysno500
    if (fn == "SYSNO330"):
        out = sysno
    if (fn == "WEEK"):
        par = set_par_defaults(par,"0")
        out = "%02d" % (string.atoi(strftime("%V",localtime())) + string.atoi(par[0]))
        if (string.atoi(out)<0):
            out = "00"
    if (fn == "VALUE"):
        par = set_par_defaults(par,"")
        out = par[0]
    if (fn == "DATE"):
        par = set_par_defaults(par,"%w%H%M%S," + "%d" % conv_setting[1])
        out = strftime(par[0],localtime())
        out = out[:string.atoi(par[1])]
    if (fn == "XDATE"):
        par = set_par_defaults(par,"%w%H%M%S," + ",%d" % conv_setting[1])
        out = strftime(par[0],localtime())
        # par[1] is a textual prefix (empty by default); it must be
        # concatenated as a string.  Converting it with string.atoi() fails
        # on the empty default and cannot be added to a string anyway.
        out = par[1] + out[:string.atoi(par[2])]
    if (fn == "OAI"):
        oai_prefix = ""
        out = "%s:%d" % (oai_prefix,tcounter + oai_identifier_from)
    return out
""" bibconvert look-up value in KB_file in one of following modes: =========================================================== 1 - case sensitive / match (default) 2 - not case sensitive / search 3 - case sensitive / search 4 - not case sensitive / match 5 - case sensitive / search (in KB) 6 - not case sensitive / search (in KB) 7 - case sensitive / search (reciprocal) 8 - not case sensitive / search (reciprocal) 9 - replace by _DEFAULT_ only R - not case sensitive / search (reciprocal) (8) replace """ if (os.path.isfile(filename) != 1): pathtmp = string.split(extract_tpl,"/") pathtmp.pop() path = string.join(pathtmp,"/") filename = path + "/" + filename if (os.path.isfile(filename)): file_to_read = open(filename,"r") file_read = file_to_read.readlines() for line in file_read: code = string.split(line,"---") if (mode == "2"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(value_to_cmp,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif ((mode == "3") or (mode == "0")): if ((len(string.split(value,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "4"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((code[0] == value_to_cmp)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "5"): if ((len(string.split(code[0],value)) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "6"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "7"): if ((len(string.split(code[0],value)) > 1)or(len(string.split(value,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "8"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(len(string.split(value_to_cmp,code[0]))>1)or(code[0]=="_DEFAULT_")): 
value = code[1] return value elif (mode == "9"): if (code[0]=="_DEFAULT_"): value = code[1] return value elif (mode == "R"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(len(string.split(value_to_cmp,code[0]))>1)or(code[0]=="_DEFAULT_")): value = value.replace(code[0],code[1]) else: if ((code[0] == value)or(code[0]=="_DEFAULT_")): value = code[1] return value return value def FormatField(value,fn): """ bibconvert formatting functions: ================================ ADD(prefix,suffix) - add prefix/suffix KB(kb_file,mode) - lookup in kb_file and replace value ABR(N,suffix) - abbreviate to N places with suffix ABRX() - abbreviate exclusively words longer ABRW() - abbreviate word (limit from right) REP(x,y) - replace SUP(type) - remove characters of certain TYPE LIM(n,side) - limit to n letters from L/R LIMW(string,side) - L/R after split on string WORDS(n,side) - limit to n words from L/R IF(value,valueT,valueF) - replace on IF condition MINL(n) - replace words shorter than n MINLW(n) - replace words shorter than n MAXL(n) - replace words longer than n EXPW(type) - replace word from value containing TYPE EXP(STR,0/1) - replace word from value containing string NUM() - take only digits in given string SHAPE() - remove extra space UP() - to uppercase DOWN() - to lowercase CAP() - make capitals each word SPLIT(n,h,str,from) - only for final Aleph field, i.e. 
AB , maintain whole words SPLITW(sep,h,str,from) - only for final Aleph field, split on string CONF(filed,value,0/1) - confirm validity of output line (check other field) CONFL(substr,0/1) - confirm validity of output line (check field being processed) CUT(prefix,postfix) - remove substring from side RANGE(MIN,MAX) - select items in repetitive fields RE(regexp) - regular expressions bibconvert character TYPES ========================== ALPHA - alphabetic NALPHA - not alpphabetic NUM - numeric NNUM - not numeric ALNUM - alphanumeric NALNUM - non alphanumeric LOWER - lowercase UPPER - uppercase PUNCT - punctual NPUNCT - non punctual SPACE - space """ out = value fn = fn + "()" par = get_pars(fn)[1] fn = get_pars(fn)[0] regexp = "//" NRE = len(regexp) value = sub_keywd(value) par_tmp = [] for item in par: item = sub_keywd(item) par_tmp.append(item) par = par_tmp if (fn == "RE"): new_value = "" par = set_par_defaults(par,".*,0") if (re.search(par[0],value) and (par[1] == "0")): new_value = value out = new_value if (fn == "KB"): new_value = "" par = set_par_defaults(par,"KB,0") new_value = crawl_KB(par[0],value,par[1]) out = new_value elif (fn == "ADD"): par = set_par_defaults(par,",") out = par[0] + value + par[1] elif (fn == "ABR"): par = set_par_defaults(par,"1,.") out = value[:string.atoi(par[0])] + par[1] elif (fn == "ABRW"): tmp = FormatField(value,"ABR(1,.)") tmp = tmp.upper() out = tmp elif (fn == "ABRX"): par = set_par_defaults(par,",") toout = [] tmp = value.split(" ") for wrd in tmp: if (len(wrd) > string.atoi(par[0])): wrd = wrd[:string.atoi(par[0])] + par[1] toout.append(wrd) out = string.join(toout," ") elif (fn == "SUP"): par = set_par_defaults(par,",") if(par[0]=="NUM"): out = re.sub('\d+',par[1],value) if(par[0]=="NNUM"): out = re.sub('\D+',par[1],value) if(par[0]=="ALPHA"): out = re.sub('[a-zA-Z]+',par[1],value) if(par[0]=="NALPHA"): out = re.sub('[^a-zA-Z]+',par[1],value) if((par[0]=="ALNUM")or(par[0]=="NPUNCT")): out = re.sub('\w+',par[1],value) 
if(par[0]=="NALNUM"): out = re.sub('\W+',par[1],value) if(par[0]=="PUNCT"): out = re.sub('\W+',par[1],value) if(par[0]=="LOWER"): out = re.sub('[a-z]+',par[1],value) if(par[0]=="UPPER"): out = re.sub('[A-Z]+',par[1],value) if(par[0]=="SPACE"): out = re.sub('\s+',par[1],value) elif (fn == "LIM"): par = set_par_defaults(par,",") if (par[1] == "L"): out = value[(len(value) - string.atoi(par[0])):] if (par[1] == "R"): out = value[:string.atoi(par[0])] elif (fn == "LIMW"): par = set_par_defaults(par,",") if (par[0]!= ""): if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] par[0] = re.search(par[0],value).group() tmp = value.split(par[0]) if (par[1] == "L"): out = par[0] + tmp[1] if (par[1] == "R"): out = tmp[0] + par[0] elif (fn == "WORDS"): tmp2 = [value] par = set_par_defaults(par,",") if (par[1] == "R"): tmp = value.split(" ") tmp2 = [] i = 0 while (i < string.atoi(par[0])): tmp2.append(tmp[i]) i = i + 1 if (par[1] == "L"): tmp = value.split(" ") tmp.reverse() tmp2 = [] i = 0 while (i < string.atoi(par[0])): tmp2.append(tmp[i]) i = i + 1 tmp2.reverse() out = string.join(tmp2, " ") elif (fn == "MINL"): par = set_par_defaults(par,"1") tmp = value.split(" ") tmp2 = [] i = 0 for wrd in tmp: if (len(wrd) >= string.atoi(par[0])): tmp2.append(wrd) out = string.join(tmp2, " ") elif (fn == "MINLW"): par = set_par_defaults(par,"1") if (len(value) >= string.atoi(par[0])): out = value else: out = "" elif (fn == "MAXL"): par = set_par_defaults(par,"4096") tmp = value.split(" ") tmp2 = [] i = 0 for wrd in tmp: if (len(wrd) <= string.atoi(par[0])): tmp2.append(wrd) out = string.join(tmp2, " ") elif (fn == "REP"): set_par_defaults(par,",") if (par[0]!= ""): if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] out = re.sub(par[0],value) else: out = value.replace(par[0],par[1]) elif (fn == "SHAPE"): if (value != ""): out = value.strip() elif (fn == "UP"): out = value.upper() elif (fn == "DOWN"): out = value.lower() elif 
(fn == "CAP"): tmp = value.split(" ") out2 = [] for wrd in tmp: wrd2 = wrd.capitalize() out2.append(wrd2) out = string.join(out2," ") elif (fn == "IF"): par = set_par_defaults(par,",,") N = 0 while N < 3: if (par[N][0:NRE] == regexp and par[N][-NRE:] == regexp): par[N] = par[N][NRE:-NRE] par[N] = re.search(par[N],value).group() N += 1 if (value == par[0]): out = par[1] else: out = par[2] if (out == "ORIG"): out = value elif (fn == "EXP"): par = set_par_defaults(par,",0") if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] par[0] = re.search(par[0],value).group() tmp = value.split(" ") out2 = [] for wrd in tmp: if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] if ((re.search(par[0],wrd).group() == wrd) and (par[1]=="1")): out2.append(wrd) if ((re.search(par[0],wrd).group() != wrd) and (par[1]=="0")): out2.append(wrd) else: if ((len(wrd.split(par[0])) == 1)and(par[1]=="1")): out2.append(wrd) if ((len(wrd.split(par[0])) != 1)and(par[1]=="0")): out2.append(wrd) out = string.join(out2," ") elif (fn == "EXPW"): par = set_par_defaults(par,",0") tmp = value.split(" ") out2 = [] for wrd in tmp: if ((FormatField(wrd,"SUP(" + par[0] + ")") == wrd)and(par[1]=="1")): out2.append(wrd) if ((FormatField(wrd,"SUP(" + par[0] + ")") != wrd)and(par[1]=="0")): out2.append(wrd) out = string.join(out2," ") elif (fn == "SPLIT"): par = set_par_defaults(par,"%d,0,,1" % conv_setting[1]) length = string.atoi(par[0]) + (string.atoi(par[1])) header = string.atoi(par[1]) headerplus = par[2] starting = string.atoi(par[3]) line = "" tmp2 = [] tmp3 = [] tmp = value.split(" ") linenumber = 1 if (linenumber >= starting): tmp2.append(headerplus) line = line + headerplus for wrd in tmp: line = line + " " + wrd tmp2.append(wrd) if (len(line) > length): linenumber = linenumber + 1 line = tmp2.pop() toout = string.join(tmp2) tmp3.append(toout) tmp2 = [] line2 = value[:header] if (linenumber >= starting): line3 = line2 + headerplus + line 
else: line3 = line2 + line line = line3 tmp2.append(line) tmp3.append(line) out = string.join(tmp3,"\n") out = FormatField(out,"SHAPE()") elif (fn == "SPLITW"): par = set_par_defaults(par,",0,,1") if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] str = re.search(par[0], value) header = string.atoi(par[1]) headerplus = par[2] starting = string.atoi(par[3]) counter = 1 tmp2 = [] tmp = re.split(par[0],value) last = tmp.pop() for wrd in tmp: counter = counter + 1 if (counter >= starting): tmp2.append(value[:header] + headerplus + wrd + str) else: tmp2.append(value[:header] + wrd + str) if (last != ""): counter = counter + 1 if (counter >= starting): tmp2.append(value[:header] + headerplus + last) else: tmp2.append(value[:header] + last) out = string.join(tmp2,"\n") elif (fn == "CONF"): par = set_par_defaults(par,",,1") found = 0 par1 = "" data = select_line(par[0],data_parsed) for line in data: if (par[1][0:NRE] == regexp and par[1][-NRE:] == regexp): par1 = par[1][NRE:-NRE] else: par1 = par[1] if (par1 == ""): if (line == ""): found = 1 elif (len(re.split(par1,line)) > 1 ): found = 1 if ((found == 1)and(string.atoi(par[2]) == 1)): out = value if ((found == 1)and(string.atoi(par[2]) == 0)): out = "" if ((found == 0)and(string.atoi(par[2]) == 1)): out = "" if ((found == 0)and(string.atoi(par[2]) == 0)): out = value return out elif (fn == "CONFL"): set_par_defaults(par,",1") if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] if (re.search(par[0],value)): if (string.atoi(par[1]) == 1): out = value else: out = "" else: if (string.atoi(par[1]) == 1): out = "" else: out = value return out elif (fn == "CUT"): par = set_par_defaults(par,",") left = value[:len(par[0])] right = value[-(len(par[1])):] if (left == par[0]): out = out[len(par[0]):] if (right == par[1]): out = out[:-(len(par[1]))] return out elif (fn == "NUM"): tmp = re.findall('\d',value) out = string.join(tmp,"") return out def printInfo(): "print 
def match_in_database(record, query_string):
    """Search the database for records matching values taken from `record`.

    in  - record        converted record as text
          query_string  matching definition, e.g.
                        245__a::SHAPE||700__a::MINL(2)
                        (field code followed by formatting functions,
                        several fields separated by '||')
    out - the result of perform_request_search() for up to three
          (pattern, field) pairs

    For every field of the query the value following the corresponding
    datafield/subfield opening tags is cut out of `record` (up to the next
    '<'), passed through the listed formatting functions and used as a
    search pattern for that field.
    """
    search_pattern = []
    search_field = []

    for query_field in parse_query_string(query_string):
        field_code = query_field[0]
        ind1 = field_code[3:4]
        if ind1 == "_":
            ind1 = ""
        ind2 = field_code[4:5]
        if ind2 == "_":
            ind2 = ""

        # The format string here was empty, which makes the '%' operation
        # raise TypeError for every call.  The opening tags are rebuilt from
        # the four values being interpolated (tag, ind1, ind2, subfield code).
        # NOTE(review): confirm the exact attribute layout against the MARCXML
        # produced by the target template.
        stringsplit = "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">" % (field_code[0:3], ind1, ind2, field_code[5:6])

        formatting = query_field[1:]
        record1 = record.split(stringsplit)
        if len(record1) > 1:
            # Value runs up to the start of the next tag.
            matching_value = record1[1].split("<")[0]
            for fn in formatting:
                matching_value = FormatField(matching_value, fn)
            search_pattern.append(matching_value)
            search_field.append(field_code)

    # Pad so that three pattern/field slots always exist.
    search_field.extend(["", "", ""])
    search_pattern.extend(["", "", ""])

    recID_list = perform_request_search(p1=search_pattern[0],
                                        f1=search_field[0],
                                        p2=search_pattern[1],
                                        f2=search_field[1],
                                        p3=search_pattern[2],
                                        f3=search_field[2])
    return recID_list
def exit_on_error(error_message):
    """Write an error message to stderr and terminate the program.

    in  - error_message  text appended to the ' Error: ' line

    Raises SystemExit with status 1, so that a failed run can be detected
    by the calling shell or script.  The function never returns.
    """
    sys.stderr.write("\n bibconvert data convertor\n")
    sys.stderr.write(" Error: %s\n" % error_message)
    # A bare sys.exit() would report success (status 0) for a fatal error.
    sys.exit(1)
(len(DATA) != 0 and DATA[0] != ""): DATA = get_subfields(DATA,subfield,SRC_TPL) FF = field_tpl_item_STRING.split("::") if (len(FF) > 2): FF = FF[2:] for fn2 in FF: DATAFORMATTED = [] for item in DATA: item = FormatField(item,fn) DATAFORMATTED.append(item) DATA = DATAFORMATTED if (len(DATA) > rows): rows = len(DATA) if DATA != "": print_line = 1 to_output.append(DATA) except IndexError, e: pass else: to_output.append([field_tpl_item_STRING]) current = 0 default_print = 0 while (current < rows): line_to_print = [] for item in to_output: if (item==[]): item =[''] if (len(item) <= current): printout = item[0] else: printout = item[current] line_to_print.append(printout) output = exp_n(string.join(line_to_print,"")) global_formatting_functions = T_tpl_item_LIST[0].split("::")[1:] for GFF in global_formatting_functions: if (GFF[:5] == "RANGE"): parR = get_pars(GFF)[1] parR = set_par_defaults(parR,"MIN,MAX") if (parR[0]!="MIN"): if (string.atoi(parR[0]) > (current+1)): output = "" if (parR[1]!="MAX"): if (string.atoi(parR[1]) < (current+1)): output = "" elif (GFF[:4] == "DEFP"): default_print = 1 else: output = FormatField(output,GFF) if ((len(output) > conv_setting[0] and print_line == 1) or default_print): out_to_print = out_to_print + output + "\n" current = current + 1 ### out_flag = 0 if query_string: recID = match_in_database(out_to_print, query_string) if len(recID) == 1 and match_mode == 1: ctrlfield = "%d" % (recID[0]) out_to_print = ctrlfield + "\n" + out_to_print out_flag = 1 if len(recID) == 0 and match_mode == 0: out_flag = 1 if len(recID) > 1 and match_mode == 2: out_flag = 1 if out_flag or match_mode == -1: if begin_record_header != "": out_to_print = begin_record_header + "\n" + out_to_print if ending_record_footer != "": out_to_print = out_to_print + "\n" + ending_record_footer else: out_to_print = "" return out_to_print ### MAIN ### conv_setting = set_conv() sysno = generate("DATE(%w%H%M%S)") +sysno500 = generate("DATE(%w%H%M%S)") separator = "" tcounter 
= 0 source_data = "" query_string = "" match_mode = -1 begin_record_header = "" ending_record_footer = "" output_rec_sep = "" begin_header = "" ending_footer = "" oai_identifier_from = 1 opts, args = getopt.getopt(sys.argv[1:],"c:d:hVl:o:b:e:B:E:s:m:C:", [ "config", "directory", "help", "version", "length", "oai", "header", "footer", "record-header", "record-footer", "separator", "match", "config-alt" ]) # get options and arguments dirmode = 0 +Xcount = 0 for opt, opt_value in opts: if opt in ["-c", "--config"]: extract_tpl = opt_value extract_tpl_parsed = parse_common_template(extract_tpl,1) source_tpl = opt_value source_tpl_parsed = parse_common_template(source_tpl,2) target_tpl = opt_value target_tpl_parsed = parse_common_template(target_tpl,3) elif opt in ["-d", "--directory"]: source_data = opt_value source_data = source_data + "/" extract_tpl = "/" dirmode = 1 elif opt in ["-h", "--help"]: printInfo() sys.exit(0) elif opt in ["-V", "--version"]: print __version__ sys.exit(0) elif opt in ["-l", "--length"]: try: conv_setting[0] = string.atoi(opt_value) except ValueError, e: conv_setting[0] = 1 elif opt in ["-o", "--oai"]: try: oai_identifier_from = string.atoi(opt_value) except ValueError, e: oai_identifier_from = 1 elif opt in ["-b", "--header"]: begin_header = opt_value elif opt in ["-e", "--footer"]: ending_footer = opt_value elif opt in ["-B", "--record-header"]: begin_record_header = opt_value elif opt in ["-E", "--record-footer"]: ending_record_footer = opt_value elif opt in ["-s", "--separator"]: separator = opt_value elif opt in ["-t", "--output_separator"]: output_rec_sep = opt_value elif opt in ["-m", "--match"]: match_mode = string.atoi(opt_value[0:1]) query_string = opt_value[1:] elif opt in ["-C", "--config-alt"]: if opt_value[0:1] == "x": extract_tpl = opt_value[1:] extract_tpl_parsed = parse_template(extract_tpl) if opt_value[0:1] == "t": target_tpl = opt_value[1:] target_tpl_parsed = parse_template(target_tpl) if opt_value[0:1] == "s": 
source_tpl = opt_value[1:] source_tpl_parsed = parse_template(source_tpl) if dirmode: if (os.path.isdir(source_data)): data_parsed = parse_input_data_d(source_data,source_tpl) - record = create_record(begin_record_header, ending_record_footer, query_string, match_mode) + record = create_record(begin_record_header, ending_record_footer, query_string, match_mode, Xcount) if record != "": print record tcounter = tcounter + 1 if output_rec_sep != "": print output_rec_sep else: exit_on_error("Cannot access directory: %s" % source_data) else: done = 0 print begin_header while (done == 0): data_parsed = parse_input_data_fx(source_tpl) if (data_parsed == -1): done = 1 else: if (data_parsed[0][0]!= ''): - record = create_record(begin_record_header, ending_record_footer, query_string, match_mode) + record = create_record(begin_record_header, ending_record_footer, query_string, match_mode, Xcount) + Xcount += 1 if record != "": print record tcounter = tcounter + 1 if output_rec_sep != "": print output_rec_sep print ending_footer diff --git a/modules/bibconvert/bin/bibconvert.wml b/modules/bibconvert/bin/bibconvert.wml index 5dee91eb9..742989f4b 100644 --- a/modules/bibconvert/bin/bibconvert.wml +++ b/modules/bibconvert/bin/bibconvert.wml @@ -1,1575 +1,1587 @@ ## $Id$ ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. 
## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## read config variables: #include "config.wml" #include "configbis.wml" #include "cdswmllib.wml" ## start Python: #! ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. """BibConvert tool to convert bibliographic records from any format to any format.""" __version__ = "<: print generate_pretty_version_string('$Id$'); :>" ## okay, rest of the Python code goes below ####### pylibdir = "/python" try: import fileinput import string import os import re import sys import time import getopt from time import gmtime, strftime, localtime import os.path except ImportError, e: print "Error: %s" % e import sys sys.exit(1) try: sys.path.append('%s' % pylibdir) from cdsware.search_engine import perform_request_search from cdsware.config import * except ImportError, e: print "Error: %s" % e sys.exit(1) ### Matching records with database content def parse_query_string(query_string): """Parse query string, e.g.: Input: 245__a::REP(-, )::SHAPE::SUP(SPACE, )::MINL(4)::MAXL(8)::EXPW(PUNCT)::WORDS(4,L)::SHAPE::SUP(SPACE, )||700__a::MINL(2)::REP(COMMA,). 
Output:[['245__a','REP(-,)','SHAPE','SUP(SPACE, )','MINL(4)','MAXL(8)','EXPW(PUNCT)','WORDS(4,L)','SHAPE','SUP(SPACE, )'],['700__a','MINL(2)','REP(COMMA,)']] """ query_string_out = [] query_string_out_in = [] query_string_split_1 = query_string.split('||') for item_1 in query_string_split_1: query_string_split_2 = item_1.split('::') query_string_out_in = [] for item in query_string_split_2: query_string_out_in.append(item) query_string_out.append(query_string_out_in) return query_string_out def set_conv(): """ bibconvert common settings ======================= minimal length of output line = 1 maximal length of output line = 4096 """ conv_setting = [ 1, 4096 ] return conv_setting def get_pars(fn): "Read function and its parameters into list" out = [] out.append(re.split('\(|\)',fn)[0]) out.append(re.split(',',re.split('\(|\)',fn)[1])) return out def append_to_output_file(filename, output): "bibconvert output file creation by output line" try: file = open(filename,'a') file.write(output) file.close() except IOError, e: exit_on_error("Cannot write into %s" % filename) return 1 def sub_keywd(out): "bibconvert keywords literal substitution" out = string.replace(out,"EOL","\n") out = string.replace(out,"_CR_","\r") out = string.replace(out,"_LF_","\n") out = string.replace(out,"\\",'\\') out = string.replace(out,"\r",'\r') out = string.replace(out,"BSLASH",'\\') out = string.replace(out,"COMMA",',') out = string.replace(out,"LEFTB",'[') out = string.replace(out,"RIGHTB",']') out = string.replace(out,"LEFTP",'(') out = string.replace(out,"RIGHTP",')') return out def check_split_on(data_item_split, sep, tpl_f): """ bibconvert conditional split with following conditions =================================================== ::NEXT(N,TYPE,SIDE) - next N chars are of the TYPE having the separator on the SIDE ::PREV(N,TYPE,SIDE) - prev.N chars are of the TYPE having the separator on the SIDE """ fn = get_pars(tpl_f)[0] par = get_pars(tpl_f)[1] done = 0 while (done == 0): if ( (( 
fn == "NEXT" ) and ( par[2]=="R" )) or (( fn == "PREV" ) and ( par[2]=="L" )) ): test_value = data_item_split[0][-(string.atoi(par[0])):] elif ( ((fn == "NEXT") and ( par[2]=="L")) or ((fn == "PREV") and ( par[2]=="R")) ): test_value = data_item_split[1][:(string.atoi(par[0]))] data_item_split_tmp = [] if ((FormatField(test_value,"SUP(" + par[1] + ",)") != "")or(len(test_value) < string.atoi(par[0]))): data_item_split_tmp = data_item_split[1].split(sep,1) if(len(data_item_split_tmp)==1): done = 1 data_item_split[0] = data_item_split[0] + sep + data_item_split_tmp[0] data_item_split[1] = "" else: data_item_split[0] = data_item_split[0] + sep + data_item_split_tmp[0] data_item_split[1] = data_item_split_tmp[1] else: done = 1 return data_item_split def get_subfields(data,subfield,src_tpl): "Get subfield according to the template" out = [] for data_item in data: found = 0 for src_tpl_item in src_tpl: if (src_tpl_item[:2] == "<:"): if (src_tpl_item[2:-2] == subfield): found = 1 else: sep_in_list = src_tpl_item.split("::") sep = sep_in_list[0] data_item_split = data_item.split(sep,1) if (len(data_item_split)==1): data_item = data_item_split[0] else: if (len(sep_in_list) > 1): data_item_split = check_split_on(data_item.split(sep,1), sep_in_list[0],sep_in_list[1]) if(found == 1): data_item = data_item_split[0] else: data_item = string.join(data_item_split[1:],sep) out.append(data_item) return out def exp_n(word): "Replace newlines and carriage return's from string." 
out = "" for ch in word: if ((ch != '\n') and (ch != '\r')): out = out + ch return out def exp_e(list): "Expunge empty elements from a list" out = [] for item in list: item = exp_n(item) if ((item != '\r\n' and item != '\r' and item != '\n' and item !="" and len(item)!=0)): out.append(item) return out def sup_e(word): "Replace spaces" out = "" for ch in word: if (ch != ' '): out = out + ch return out def select_line(field_code, list): "Return appropriate item from a list" out = [''] for field in list: field[0] = sup_e(field[0]) field_code = sup_e(field_code) if (field[0] == field_code): out = field[1] return out def parse_field_definition(source_field_definition): "Create list of source_field_definition" word_list = [] out = [] word = "" counter = 0 if (len(source_field_definition.split("---"))==4): out = source_field_definition.split("---") else: element_list_high = source_field_definition.split("<:") for word_high in element_list_high: element_list_low = word_high.split(':>') for word_low in element_list_low: word_list.append(word_low) word_list.append(":>") word_list.pop() word_list.append("<:") word_list.pop() for item in word_list: word = word + item if (item == "<:"): counter = counter + 1 if (item == ":>"): counter = counter - 1 if counter == 0: out.append(word) word = "" return out def parse_template(template): """ bibconvert parse template ====================== in - template filename out - [ [ field_code , [ field_template_parsed ] , [] ] """ out = [] for field_def in read_file(template,1): field_tpl_new = [] if ((len(field_def.split("---",1)) > 1) and (field_def[:1]!="#")): field_code = field_def.split("---",1)[0] field_tpl = parse_field_definition(field_def.split("---",1)[1]) field_tpl_new = field_tpl field_tpl = exp_e(field_tpl_new) out_data = [field_code, field_tpl] out.append(out_data) return out def parse_common_template(template,part): """ bibconvert parse template ========================= in - template filename out - [ [ field_code , [ 
field_template_parsed ] , [] ] """ out = [] counter = 0 for field_def in read_file(template,1): if (exp_n(field_def)[:3] == "==="): counter = counter + 1 elif (counter == part): field_tpl_new = [] if ((len(field_def.split("---",1)) > 1) and (field_def[:1]!="#")): field_code = field_def.split("---",1)[0] field_tpl = parse_field_definition(field_def.split("---",1)[1]) field_tpl_new = field_tpl field_tpl = exp_e(field_tpl_new) out_data = [field_code, field_tpl] out.append(out_data) return out def parse_input_data_f(source_data_open, source_tpl): """ bibconvert parse input data ======================== in - input source data location (filehandle) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][] destination_templace entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[] input data file; by line: - fieldcode value """ out = [['',[]]] count = 0 values = [] while (count < 1): line = source_data_open.readline() if (line == ""): return(-1) line_split = line.split(" ",1) if (re.sub("\s","",line) == separator): count = count + 1 if (len(line_split) == 2): field_code = line_split[0] field_value = exp_n(line_split[1]) values.append([field_code,field_value]) item_prev = "" stack = [''] for item in values: if ((item[0]==item_prev)or(item_prev == "")): stack.append(item[1]) item_prev = item[0] else: out.append([item_prev,stack]) item_prev = item[0] stack = [] stack.append(item[1]) try: if (stack[0] != ""): if (out[0][0]==""): out = [] out.append([field_code,stack]) except IndexError, e: out = out return out def parse_input_data_fx(source_tpl): """ bibconvert parse input data ======================== in - input source data location (filehandle) source data template source_field_code list of source field 
codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] extraction_template_entry - input data file - specified by extract_tpl """ count = 0 record = "" field_data_1_in_list = [] out = [['',[]]] while (count <10): line = sys.stdin.readline() if (line == ""): count = count + 1 if (record == "" and count): return (-1) if (re.sub("\s","",line) == separator): count = count + 10 else: record = record + line for field_defined in extract_tpl_parsed: try: field_defined[1][0] = sub_keywd(field_defined[1][0]) field_defined[1][1] = sub_keywd(field_defined[1][1]) except IndexError, e: field_defined = field_defined try: field_defined[1][2] = sub_keywd(field_defined[1][2]) except IndexError, e: field_defined = field_defined field_data_1 ="" if ((field_defined[1][0][0:2] == '//') and (field_defined[1][0][-2:] == '//')): field_defined_regexp = field_defined[1][0][2:-2] try: #### if (len(re.split(field_defined_regexp,record)) == 1): field_data_1 = "" field_data_1_in_list = [] else: field_data_1_tmp = re.split(field_defined_regexp,record,1)[1] field_data_1_in_list = field_data_1_tmp.split(field_defined_regexp) except IndexError, e: field_data_1 = "" else: try: if (len(record.split(field_defined[1][0])) == 1): field_data_1 = "" field_data_1_in_list = [] else: field_data_1_tmp = record.split(field_defined[1][0],1)[1] field_data_1_in_list = field_data_1_tmp.split(field_defined[1][0]) except IndexError, e: field_data_1 = "" spliton = [] outvalue = "" field_data_2 = "" field_data = "" try: if ((field_defined[1][1])=="EOL"): spliton = ['\n'] elif ((field_defined[1][1])=="MIN"): spliton = ['\n'] elif ((field_defined[1][1])=="MAX"): for item in extract_tpl_parsed: try: spliton.append(item[1][0]) except IndexError, e: spliton = spliton elif (field_defined[1][1][0:2] == '//') and (field_defined[1][1][-2:] == '//'): spliton = [field_defined[1][1][2:-2]] else: spliton = 
[field_defined[1][1]] except IndexError,e : spliton = "" outvalues = [] for field_data in field_data_1_in_list: outvalue = "" for splitstring in spliton: field_data_2 = "" if (len(field_data.split(splitstring))==1): if (outvalue == ""): field_data_2 = field_data else: field_data_2 = outvalue else: field_data_2 = field_data.split(splitstring)[0] outvalue = field_data_2 field_data = field_data_2 outvalues.append(outvalue) outvalues = exp_e(outvalues) if (len(outvalues) > 0): if (out[0][0]==""): out = [] outstack = [] if (len(field_defined[1])==3): spliton = [field_defined[1][2]] if (field_defined[1][2][0:2] == '//') and (field_defined[1][2][-2:] == '//'): spliton = [field_defined[1][2][2:-2]] for item in outvalues: stack = re.split(spliton[0],item) for stackitem in stack: outstack.append(stackitem) else: outstack = outvalues out.append([field_defined[0],outstack]) return out def parse_input_data_d(source_data, source_tpl): """ bibconvert parse input data ======================== in - input source data location (directory) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][] destination_templace entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[] input data dir; by file: - fieldcode value per line """ out = [] for source_field_tpl in read_file(source_tpl,1): source_field_code = source_field_tpl.split("---")[0] source_field_data = read_file(source_data + source_field_code,0) source_field_data = exp_e(source_field_data) out_data = [source_field_code, source_field_data] out.append(out_data) return out def sub_empty_lines(value): out = re.sub('\n\n+','',value) return out def set_par_defaults(par1,par2): "Set default parameter when not defined" par_new_in_list = par2.split(",") i = 0 out = [] for 
par in par_new_in_list: if (len(par1)>i): if (par1[i] == ""): out.append(par) else: out.append(par1[i]) else: out.append(par) i = i + 1 return out def generate(keyword): """ bibconvert generaded values: ========================= SYSNO() - generate date as '%w%H%M%S' WEEK(N) - generate date as '%V' with shift (N) DATE(format) - generate date in specifieddate FORMAT VALUE(value) - enter value literarly OAI() - generate oai_identifier, starting value given at command line as -o """ out = keyword fn = keyword + "()" par = get_pars(fn)[1] fn = get_pars(fn)[0] par = set_par_defaults(par,"") if (fn == "SYSNO"): + out = sysno500 + if (fn == "SYSNO330"): out = sysno if (fn == "WEEK"): par = set_par_defaults(par,"0") out = "%02d" % (string.atoi(strftime("%V",localtime())) + string.atoi(par[0])) if (string.atoi(out)<0): out = "00" if (fn == "VALUE"): par = set_par_defaults(par,"") out = par[0] if (fn == "DATE"): par = set_par_defaults(par,"%w%H%M%S," + "%d" % conv_setting[1]) out = strftime(par[0],localtime()) out = out[:string.atoi(par[1])] + if (fn == "XDATE"): + par = set_par_defaults(par,"%w%H%M%S," + ",%d" % conv_setting[1]) + out = strftime(par[0],localtime()) + out = string.atoi(par[1]) + out[:string.atoi(par[2])] if (fn == "OAI"): oai_prefix = "" out = "%s:%d" % (oai_prefix,tcounter + oai_identifier_from) return out def read_file(filename,exception): "Read file into list" out = [] if (os.path.isfile(filename)): file = open(filename,'r') out = file.readlines() file.close() else: if exception: exit_on_error("Cannot access file: %s" % filename) return out def crawl_KB(filename,value,mode): """ bibconvert look-up value in KB_file in one of following modes: =========================================================== 1 - case sensitive / match (default) 2 - not case sensitive / search 3 - case sensitive / search 4 - not case sensitive / match 5 - case sensitive / search (in KB) 6 - not case sensitive / search (in KB) 7 - case sensitive / search (reciprocal) 8 - not case 
sensitive / search (reciprocal) 9 - replace by _DEFAULT_ only R - not case sensitive / search (reciprocal) (8) replace """ if (os.path.isfile(filename) != 1): pathtmp = string.split(extract_tpl,"/") pathtmp.pop() path = string.join(pathtmp,"/") filename = path + "/" + filename if (os.path.isfile(filename)): file_to_read = open(filename,"r") file_read = file_to_read.readlines() for line in file_read: code = string.split(line,"---") if (mode == "2"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(value_to_cmp,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif ((mode == "3") or (mode == "0")): if ((len(string.split(value,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "4"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((code[0] == value_to_cmp)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "5"): if ((len(string.split(code[0],value)) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "6"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "7"): if ((len(string.split(code[0],value)) > 1)or(len(string.split(value,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "8"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(len(string.split(value_to_cmp,code[0]))>1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "9"): if (code[0]=="_DEFAULT_"): value = code[1] return value elif (mode == "R"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(len(string.split(value_to_cmp,code[0]))>1)or(code[0]=="_DEFAULT_")): value = value.replace(code[0],code[1]) else: if ((code[0] == 
value)or(code[0]=="_DEFAULT_")): value = code[1] return value return value def FormatField(value,fn): """ bibconvert formatting functions: ================================ ADD(prefix,suffix) - add prefix/suffix KB(kb_file,mode) - lookup in kb_file and replace value ABR(N,suffix) - abbreviate to N places with suffix ABRX() - abbreviate exclusively words longer ABRW() - abbreviate word (limit from right) REP(x,y) - replace SUP(type) - remove characters of certain TYPE LIM(n,side) - limit to n letters from L/R LIMW(string,side) - L/R after split on string WORDS(n,side) - limit to n words from L/R IF(value,valueT,valueF) - replace on IF condition MINL(n) - replace words shorter than n MINLW(n) - replace words shorter than n MAXL(n) - replace words longer than n EXPW(type) - replace word from value containing TYPE EXP(STR,0/1) - replace word from value containing string NUM() - take only digits in given string SHAPE() - remove extra space UP() - to uppercase DOWN() - to lowercase CAP() - make capitals each word SPLIT(n,h,str,from) - only for final Aleph field, i.e. 
AB , maintain whole words SPLITW(sep,h,str,from) - only for final Aleph field, split on string CONF(filed,value,0/1) - confirm validity of output line (check other field) CONFL(substr,0/1) - confirm validity of output line (check field being processed) CUT(prefix,postfix) - remove substring from side RANGE(MIN,MAX) - select items in repetitive fields RE(regexp) - regular expressions bibconvert character TYPES ========================== ALPHA - alphabetic NALPHA - not alpphabetic NUM - numeric NNUM - not numeric ALNUM - alphanumeric NALNUM - non alphanumeric LOWER - lowercase UPPER - uppercase PUNCT - punctual NPUNCT - non punctual SPACE - space """ out = value fn = fn + "()" par = get_pars(fn)[1] fn = get_pars(fn)[0] regexp = "//" NRE = len(regexp) value = sub_keywd(value) par_tmp = [] for item in par: item = sub_keywd(item) par_tmp.append(item) par = par_tmp if (fn == "RE"): new_value = "" par = set_par_defaults(par,".*,0") if (re.search(par[0],value) and (par[1] == "0")): new_value = value out = new_value if (fn == "KB"): new_value = "" par = set_par_defaults(par,"KB,0") new_value = crawl_KB(par[0],value,par[1]) out = new_value elif (fn == "ADD"): par = set_par_defaults(par,",") out = par[0] + value + par[1] elif (fn == "ABR"): par = set_par_defaults(par,"1,.") out = value[:string.atoi(par[0])] + par[1] elif (fn == "ABRW"): tmp = FormatField(value,"ABR(1,.)") tmp = tmp.upper() out = tmp elif (fn == "ABRX"): par = set_par_defaults(par,",") toout = [] tmp = value.split(" ") for wrd in tmp: if (len(wrd) > string.atoi(par[0])): wrd = wrd[:string.atoi(par[0])] + par[1] toout.append(wrd) out = string.join(toout," ") elif (fn == "SUP"): par = set_par_defaults(par,",") if(par[0]=="NUM"): out = re.sub('\d+',par[1],value) if(par[0]=="NNUM"): out = re.sub('\D+',par[1],value) if(par[0]=="ALPHA"): out = re.sub('[a-zA-Z]+',par[1],value) if(par[0]=="NALPHA"): out = re.sub('[^a-zA-Z]+',par[1],value) if((par[0]=="ALNUM")or(par[0]=="NPUNCT")): out = re.sub('\w+',par[1],value) 
if(par[0]=="NALNUM"): out = re.sub('\W+',par[1],value) if(par[0]=="PUNCT"): out = re.sub('\W+',par[1],value) if(par[0]=="LOWER"): out = re.sub('[a-z]+',par[1],value) if(par[0]=="UPPER"): out = re.sub('[A-Z]+',par[1],value) if(par[0]=="SPACE"): out = re.sub('\s+',par[1],value) elif (fn == "LIM"): par = set_par_defaults(par,",") if (par[1] == "L"): out = value[(len(value) - string.atoi(par[0])):] if (par[1] == "R"): out = value[:string.atoi(par[0])] elif (fn == "LIMW"): par = set_par_defaults(par,",") if (par[0]!= ""): if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] par[0] = re.search(par[0],value).group() tmp = value.split(par[0]) if (par[1] == "L"): out = par[0] + tmp[1] if (par[1] == "R"): out = tmp[0] + par[0] elif (fn == "WORDS"): tmp2 = [value] par = set_par_defaults(par,",") if (par[1] == "R"): tmp = value.split(" ") tmp2 = [] i = 0 while (i < string.atoi(par[0])): tmp2.append(tmp[i]) i = i + 1 if (par[1] == "L"): tmp = value.split(" ") tmp.reverse() tmp2 = [] i = 0 while (i < string.atoi(par[0])): tmp2.append(tmp[i]) i = i + 1 tmp2.reverse() out = string.join(tmp2, " ") elif (fn == "MINL"): par = set_par_defaults(par,"1") tmp = value.split(" ") tmp2 = [] i = 0 for wrd in tmp: if (len(wrd) >= string.atoi(par[0])): tmp2.append(wrd) out = string.join(tmp2, " ") elif (fn == "MINLW"): par = set_par_defaults(par,"1") if (len(value) >= string.atoi(par[0])): out = value else: out = "" elif (fn == "MAXL"): par = set_par_defaults(par,"4096") tmp = value.split(" ") tmp2 = [] i = 0 for wrd in tmp: if (len(wrd) <= string.atoi(par[0])): tmp2.append(wrd) out = string.join(tmp2, " ") elif (fn == "REP"): set_par_defaults(par,",") if (par[0]!= ""): if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] out = re.sub(par[0],value) else: out = value.replace(par[0],par[1]) elif (fn == "SHAPE"): if (value != ""): out = value.strip() elif (fn == "UP"): out = value.upper() elif (fn == "DOWN"): out = value.lower() elif 
(fn == "CAP"): tmp = value.split(" ") out2 = [] for wrd in tmp: wrd2 = wrd.capitalize() out2.append(wrd2) out = string.join(out2," ") elif (fn == "IF"): par = set_par_defaults(par,",,") N = 0 while N < 3: if (par[N][0:NRE] == regexp and par[N][-NRE:] == regexp): par[N] = par[N][NRE:-NRE] par[N] = re.search(par[N],value).group() N += 1 if (value == par[0]): out = par[1] else: out = par[2] if (out == "ORIG"): out = value elif (fn == "EXP"): par = set_par_defaults(par,",0") if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] par[0] = re.search(par[0],value).group() tmp = value.split(" ") out2 = [] for wrd in tmp: if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] if ((re.search(par[0],wrd).group() == wrd) and (par[1]=="1")): out2.append(wrd) if ((re.search(par[0],wrd).group() != wrd) and (par[1]=="0")): out2.append(wrd) else: if ((len(wrd.split(par[0])) == 1)and(par[1]=="1")): out2.append(wrd) if ((len(wrd.split(par[0])) != 1)and(par[1]=="0")): out2.append(wrd) out = string.join(out2," ") elif (fn == "EXPW"): par = set_par_defaults(par,",0") tmp = value.split(" ") out2 = [] for wrd in tmp: if ((FormatField(wrd,"SUP(" + par[0] + ")") == wrd)and(par[1]=="1")): out2.append(wrd) if ((FormatField(wrd,"SUP(" + par[0] + ")") != wrd)and(par[1]=="0")): out2.append(wrd) out = string.join(out2," ") elif (fn == "SPLIT"): par = set_par_defaults(par,"%d,0,,1" % conv_setting[1]) length = string.atoi(par[0]) + (string.atoi(par[1])) header = string.atoi(par[1]) headerplus = par[2] starting = string.atoi(par[3]) line = "" tmp2 = [] tmp3 = [] tmp = value.split(" ") linenumber = 1 if (linenumber >= starting): tmp2.append(headerplus) line = line + headerplus for wrd in tmp: line = line + " " + wrd tmp2.append(wrd) if (len(line) > length): linenumber = linenumber + 1 line = tmp2.pop() toout = string.join(tmp2) tmp3.append(toout) tmp2 = [] line2 = value[:header] if (linenumber >= starting): line3 = line2 + headerplus + line 
else: line3 = line2 + line line = line3 tmp2.append(line) tmp3.append(line) out = string.join(tmp3,"\n") out = FormatField(out,"SHAPE()") elif (fn == "SPLITW"): par = set_par_defaults(par,",0,,1") if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] str = re.search(par[0], value) header = string.atoi(par[1]) headerplus = par[2] starting = string.atoi(par[3]) counter = 1 tmp2 = [] tmp = re.split(par[0],value) last = tmp.pop() for wrd in tmp: counter = counter + 1 if (counter >= starting): tmp2.append(value[:header] + headerplus + wrd + str) else: tmp2.append(value[:header] + wrd + str) if (last != ""): counter = counter + 1 if (counter >= starting): tmp2.append(value[:header] + headerplus + last) else: tmp2.append(value[:header] + last) out = string.join(tmp2,"\n") elif (fn == "CONF"): par = set_par_defaults(par,",,1") found = 0 par1 = "" data = select_line(par[0],data_parsed) for line in data: if (par[1][0:NRE] == regexp and par[1][-NRE:] == regexp): par1 = par[1][NRE:-NRE] else: par1 = par[1] if (par1 == ""): if (line == ""): found = 1 elif (len(re.split(par1,line)) > 1 ): found = 1 if ((found == 1)and(string.atoi(par[2]) == 1)): out = value if ((found == 1)and(string.atoi(par[2]) == 0)): out = "" if ((found == 0)and(string.atoi(par[2]) == 1)): out = "" if ((found == 0)and(string.atoi(par[2]) == 0)): out = value return out elif (fn == "CONFL"): set_par_defaults(par,",1") if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp): par[0] = par[0][NRE:-NRE] if (re.search(par[0],value)): if (string.atoi(par[1]) == 1): out = value else: out = "" else: if (string.atoi(par[1]) == 1): out = "" else: out = value return out elif (fn == "CUT"): par = set_par_defaults(par,",") left = value[:len(par[0])] right = value[-(len(par[1])):] if (left == par[0]): out = out[len(par[0]):] if (right == par[1]): out = out[:-(len(par[1]))] return out elif (fn == "NUM"): tmp = re.findall('\d',value) out = string.join(tmp,"") return out def printInfo(): "print 
out when not enough parmeters given" print """ BibConvert data convertor Usage: bibconvert [options] -ctemplate.cfg < input.dat Options: -c'config' configuration templates file -d'directory' source_data fields are located in separated files in 'directory'one record) -h print this help -V print version number -l'length' minimum line length (default = 1) -o'value' OAI identifier starts with specified value (default = 1) -b'file header' insert file header -e'file footer' insert file footer -B'record header' insert record header -E'record footer' insert record footer -s'record separator' record separator, default empty line (EOLEOL) -m0'query_string' match records using query string, output unmatched -m1'query_string' match records using query string, output matched -m2'query_string' match records using query string, output ambiguous -Cx'field extraction template' alternative to -c when configuration is split to several files -Cs'source data template' alternative to -c when configuration is split to several files -Ct'target data template' alternative to -c when configuration is split to several files """ ## Match records with the database content ## def match_in_database(record, query_string): "Check if record is in alreadey in database with an oai identifier. Returns recID if present, 0 otherwise." 
query_string_parsed = parse_query_string(query_string) search_pattern = [] search_field = [] for query_field in query_string_parsed: ind1 = query_field[0][3:4] if ind1 == "_": ind1 = "" ind2 = query_field[0][4:5] if ind2 == "_": ind2 = "" stringsplit = "" % (query_field[0][0:3], ind1, ind2, query_field[0][5:6]) formatting = query_field[1:] record1 = string.split(record, stringsplit) if len(record1) > 1: matching_value = string.split(record1[1],"<")[0] for fn in formatting: matching_value = FormatField(matching_value, fn) search_pattern.append(matching_value) search_field.append(query_field[0]) search_field.append("") search_field.append("") search_field.append("") search_pattern.append("") search_pattern.append("") search_pattern.append("") recID_list = perform_request_search(p1=search_pattern[0],f1=search_field[0],p2=search_pattern[1],f2=search_field[1],p3=search_pattern[2],f3=search_field[2]) return recID_list def parse_query_string(query_string): """Parse query string, e.g.: Input: 245__a::REP(-, )::SHAPE::SUP(SPACE, )::MINL(4)::MAXL(8)::EXPW(PUNCT)::WORDS(4,L)::SHAPE::SUP(SPACE, )||700__a::MINL(2)::REP(COMMA,). 
def exit_on_error(error_message):
    "Report error_message on stderr and terminate with a non-zero exit status."
    sys.stderr.write("\n bibconvert data convertor\n")
    sys.stderr.write(" Error: %s\n" % error_message)
    # A bare sys.exit() would report success (status 0) to the calling shell.
    sys.exit(1)


def create_record(begin_record_header, ending_record_footer, query_string, match_mode, Xcount):
    """Create output record from the currently parsed input data.

    begin_record_header  -- text put (with a newline) before a non-empty record
    ending_record_footer -- text put (with a newline) after a non-empty record
    query_string         -- if non-empty, the record is matched against the
                            database via match_in_database()
    match_mode           -- -1: always output; 0: output when nothing matched;
                            1: output when exactly one record matched (its ID
                            is prepended); 2: output when several matched
    Xcount               -- record counter, formatted 3 characters wide into
                            the XDATE keyword

    Reads the module-level target_tpl_parsed, source_tpl_parsed, data_parsed
    and conv_setting.  Returns the record text, or "" when it is filtered out.
    """
    out_to_print = ""
    ssn5cnt = "%3d" % Xcount
    # NOTE(review): these two names are local and never read in this function;
    # if generate() is meant to see per-record values through module globals,
    # a 'global' declaration is missing -- confirm against generate().
    sysno = generate("DATE(%w%H%M%S)")
    # Concatenate instead of %-formatting: the strftime directives (%w, %H...)
    # would be parsed as printf conversions and raise ValueError.
    sysno500 = generate("XDATE(%w%H%M%S)," + ssn5cnt)

    for T_tpl_item_LIST in target_tpl_parsed:
        # the line is printed only if the variables inside are not empty
        print_line = 0
        to_output = []
        rows = 1
        for field_tpl_item_STRING in T_tpl_item_LIST[1]:
            DATA = []
            if (field_tpl_item_STRING[:2] == "<:"):
                # "<:field::subfield::fn...:>" placeholder: strip the markers
                field_tpl_item_STRING = field_tpl_item_STRING[2:-2]
                field = field_tpl_item_STRING.split("::")[0]
                if (len(field_tpl_item_STRING.split("::")) == 1):
                    # no subfield given: the name is a generated keyword
                    value = generate(field)
                    to_output.append([value])
                else:
                    subfield = field_tpl_item_STRING.split("::")[1]
                    # trailing "*" marks a repetitive field
                    if (field[-1] == "*"):
                        repetitive = 1
                        field = field[:-1]
                    else:
                        repetitive = 0
                    DATA = select_line(field, data_parsed)
                    if (repetitive == 0):
                        DATA = [" ".join(DATA)]
                    SRC_TPL = select_line(field, source_tpl_parsed)
                    try:
                        if (DATA[0] != ""):
                            DATA = get_subfields(DATA, subfield, SRC_TPL)
                            FF = field_tpl_item_STRING.split("::")
                            if (len(FF) > 2):
                                FF = FF[2:]
                                # NOTE(review): the inner loop variable fn2 is
                                # unused and the outer fn is applied instead;
                                # kept as found -- confirm intent.
                                for fn in FF:
                                    if (len(DATA) != 0 and DATA[0] != ""):
                                        DATA = get_subfields(DATA, subfield, SRC_TPL)
                                        FF = field_tpl_item_STRING.split("::")
                                        if (len(FF) > 2):
                                            FF = FF[2:]
                                            for fn2 in FF:
                                                DATAFORMATTED = []
                                                for item in DATA:
                                                    item = FormatField(item, fn)
                                                    DATAFORMATTED.append(item)
                                                DATA = DATAFORMATTED
                                        if (len(DATA) > rows):
                                            rows = len(DATA)
                            if DATA != "":
                                print_line = 1
                            to_output.append(DATA)
                    except IndexError:
                        # field has no data: nothing to output for it
                        pass
            else:
                # literal template text
                to_output.append([field_tpl_item_STRING])
        current = 0
        default_print = 0
        while (current < rows):
            line_to_print = []
            for item in to_output:
                if (item == []):
                    item = ['']
                # non-repeated items are reused on every row
                if (len(item) <= current):
                    printout = item[0]
                else:
                    printout = item[current]
                line_to_print.append(printout)
            output = exp_n("".join(line_to_print))
            global_formatting_functions = T_tpl_item_LIST[0].split("::")[1:]
            for GFF in global_formatting_functions:
                if (GFF[:5] == "RANGE"):
                    # RANGE(min,max): blank out rows outside the 1-based range
                    parR = get_pars(GFF)[1]
                    parR = set_par_defaults(parR, "MIN,MAX")
                    if (parR[0] != "MIN"):
                        if (int(parR[0]) > (current + 1)):
                            output = ""
                    if (parR[1] != "MAX"):
                        if (int(parR[1]) < (current + 1)):
                            output = ""
                elif (GFF[:4] == "DEFP"):
                    default_print = 1
                else:
                    output = FormatField(output, GFF)
            if ((len(output) > conv_setting[0] and print_line == 1) or default_print):
                out_to_print = out_to_print + output + "\n"
            current = current + 1
    ###
    out_flag = 0
    if query_string:
        recID = match_in_database(out_to_print, query_string)
        if len(recID) == 1 and match_mode == 1:
            # NOTE(review): only the bare record ID is prepended -- confirm no
            # surrounding markup is expected here.
            ctrlfield = "%d" % (recID[0])
            out_to_print = ctrlfield + "\n" + out_to_print
            out_flag = 1
        if len(recID) == 0 and match_mode == 0:
            out_flag = 1
        if len(recID) > 1 and match_mode == 2:
            out_flag = 1
    if out_flag or match_mode == -1:
        if begin_record_header != "":
            out_to_print = begin_record_header + "\n" + out_to_print
        if ending_record_footer != "":
            out_to_print = out_to_print + "\n" + ending_record_footer
    else:
        out_to_print = ""
    return out_to_print
### MAIN ###

# Defaults; overridden by the command-line options parsed below.
conv_setting = set_conv()
sysno = generate("DATE(%w%H%M%S)")
sysno500 = generate("DATE(%w%H%M%S)")
separator = ""
tcounter = 0
source_data = ""
query_string = ""
match_mode = -1
begin_record_header = ""
ending_record_footer = ""
output_rec_sep = ""
begin_header = ""
ending_footer = ""
oai_identifier_from = 1

# "t:" and "output_separator=" make the -t branch below reachable; long
# options that take a value need the trailing "=" to accept one.
try:
    opts, args = getopt.getopt(sys.argv[1:], "c:d:hVl:o:b:e:B:E:s:t:m:C:",
                               ["config=",
                                "directory=",
                                "help",
                                "version",
                                "length=",
                                "oai=",
                                "header=",
                                "footer=",
                                "record-header=",
                                "record-footer=",
                                "separator=",
                                "output_separator=",
                                "match=",
                                "config-alt="])
except getopt.GetoptError:
    # sys.exc_info() keeps this compatible with old interpreters.
    exit_on_error(str(sys.exc_info()[1]))

# get options and arguments
dirmode = 0
Xcount = 0

for opt, opt_value in opts:
    if opt in ["-c", "--config"]:
        # one file holds the extraction, source and target templates
        extract_tpl = opt_value
        extract_tpl_parsed = parse_common_template(extract_tpl, 1)
        source_tpl = opt_value
        source_tpl_parsed = parse_common_template(source_tpl, 2)
        target_tpl = opt_value
        target_tpl_parsed = parse_common_template(target_tpl, 3)
    elif opt in ["-d", "--directory"]:
        source_data = opt_value
        source_data = source_data + "/"
        extract_tpl = "/"
        dirmode = 1
    elif opt in ["-h", "--help"]:
        printInfo()
        sys.exit(0)
    elif opt in ["-V", "--version"]:
        print(__version__)
        sys.exit(0)
    elif opt in ["-l", "--length"]:
        try:
            conv_setting[0] = int(opt_value)
        except ValueError:
            conv_setting[0] = 1
    elif opt in ["-o", "--oai"]:
        try:
            oai_identifier_from = int(opt_value)
        except ValueError:
            oai_identifier_from = 1
    elif opt in ["-b", "--header"]:
        begin_header = opt_value
    elif opt in ["-e", "--footer"]:
        ending_footer = opt_value
    elif opt in ["-B", "--record-header"]:
        begin_record_header = opt_value
    elif opt in ["-E", "--record-footer"]:
        ending_record_footer = opt_value
    elif opt in ["-s", "--separator"]:
        separator = opt_value
    elif opt in ["-t", "--output_separator"]:
        output_rec_sep = opt_value
    elif opt in ["-m", "--match"]:
        # first character is the match mode digit, the rest the query string
        try:
            match_mode = int(opt_value[0:1])
        except ValueError:
            exit_on_error("Invalid match mode in: %s" % opt_value)
        query_string = opt_value[1:]
    elif opt in ["-C", "--config-alt"]:
        # first character selects which template the file provides
        if opt_value[0:1] == "x":
            extract_tpl = opt_value[1:]
            extract_tpl_parsed = parse_template(extract_tpl)
        if opt_value[0:1] == "t":
            target_tpl = opt_value[1:]
            target_tpl_parsed = parse_template(target_tpl)
        if opt_value[0:1] == "s":
            source_tpl = opt_value[1:]
            source_tpl_parsed = parse_template(source_tpl)

if dirmode:
    # directory mode: the directory holds the fields of a single record
    if (os.path.isdir(source_data)):
        data_parsed = parse_input_data_d(source_data, source_tpl)
        record = create_record(begin_record_header, ending_record_footer, query_string, match_mode, Xcount)
        if record != "":
            print(record)
            tcounter = tcounter + 1
            if output_rec_sep != "":
                print(output_rec_sep)
    else:
        exit_on_error("Cannot access directory: %s" % source_data)
else:
    # stream mode: convert records until the input parser returns -1
    done = 0
    print(begin_header)
    while (done == 0):
        data_parsed = parse_input_data_fx(source_tpl)
        if (data_parsed == -1):
            done = 1
        else:
            if (data_parsed[0][0] != ''):
                record = create_record(begin_record_header, ending_record_footer, query_string, match_mode, Xcount)
                Xcount += 1
                if record != "":
                    print(record)
                    tcounter = tcounter + 1
                    if output_rec_sep != "":
                        print(output_rec_sep)
    print(ending_footer)