diff --git a/modules/bibconvert/bin/bibconvert.in b/modules/bibconvert/bin/bibconvert.in index ce88b8096..c55c7759b 100644 --- a/modules/bibconvert/bin/bibconvert.in +++ b/modules/bibconvert/bin/bibconvert.in @@ -1,1427 +1,1395 @@ ## $Id$ ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## read config variables: #include "config.wml" #include "configbis.wml" ## start Python: #! ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. ## okay, rest of the Python code goes below ####### def set_conv(): """ bibconvert common settings ======================= minimal length of output line = 1 maximal length of output line = 4096 """ conv_setting = [ 1, 4096 ] return conv_setting def get_options(): "Read command line options into list" out = [] for arg in sys.argv: if (arg[:1] == "-"): out.append(arg) return out def get_arguments(): "Read command line arguments into list" out = [] for arg in sys.argv: if (arg[:1] != "-"): out.append(arg) return out def get_pars(fn): "Read function and its parameters into list" out = [] out.append(re.split('\(|\)',fn)[0]) out.append(re.split(',',re.split('\(|\)',fn)[1])) return out def is_opt(seek,opt_list): "Return entire argument if given in the list of options" out = "" for arg in opt_list: if (seek == arg[:2]): out = arg if (seek == arg[:3]): out = arg return out def append_to_output_file(filename, output): "bibconvert output file creation by output line" try: file = open(filename,'a') file.write(output) file.close() except IOError, e: exit_on_error("Cannot write into %s" % filename) return 1 def sub_keywd(out): "bibconvert keywords literal substitution" out = string.replace(out,"EOL","\n") out = string.replace(out,"_CR_","\r") out = string.replace(out,"_LF_","\n") out = string.replace(out,"\\",'\\') out = string.replace(out,"\r",'\r') out = string.replace(out,"BSLASH",'\\') out = string.replace(out,"COMMA",',') out = string.replace(out,"LEFTB",'[') out = string.replace(out,"RIGHTB",']') out = string.replace(out,"LEFTP",'(') out = string.replace(out,"RIGHTP",')') return out def check_split_on(data_item_split, sep, tpl_f): """ bibconvert conditional split with following conditions =================================================== ::NEXT(N,TYPE,SIDE) - next N chars are of the TYPE having the separator on the SIDE ::PREV(N,TYPE,SIDE) - prev.N chars are of the TYPE having the separator on the SIDE """ fn = get_pars(tpl_f)[0] par = get_pars(tpl_f)[1] done = 0 while (done == 0): if ( (( fn == "NEXT" ) and ( par[2]=="R" )) or (( fn == "PREV" ) and ( par[2]=="L" )) ): test_value = data_item_split[0][-(string.atoi(par[0])):] elif ( ((fn == "NEXT") and ( par[2]=="L")) or ((fn == "PREV") and ( par[2]=="R")) ): test_value = data_item_split[1][:(string.atoi(par[0]))] data_item_split_tmp = [] if 
((FormatField(test_value,"SUP(" + par[1] + ",)") != "")or(len(test_value) < string.atoi(par[0]))): data_item_split_tmp = data_item_split[1].split(sep,1) if(len(data_item_split_tmp)==1): done = 1 data_item_split[0] = data_item_split[0] + sep + data_item_split_tmp[0] data_item_split[1] = "" else: data_item_split[0] = data_item_split[0] + sep + data_item_split_tmp[0] data_item_split[1] = data_item_split_tmp[1] else: done = 1 return data_item_split def get_subfields(data,subfield,src_tpl): "Get subfield according to the template" out = [] for data_item in data: found = 0 for src_tpl_item in src_tpl: if (src_tpl_item[:2] == "<:"): if (src_tpl_item[2:-2] == subfield): found = 1 else: sep_in_list = src_tpl_item.split("::") sep = sep_in_list[0] data_item_split = data_item.split(sep,1) if (len(data_item_split)==1): data_item = data_item_split[0] else: if (len(sep_in_list) > 1): data_item_split = check_split_on(data_item.split(sep,1), sep_in_list[0],sep_in_list[1]) if(found == 1): data_item = data_item_split[0] else: data_item = string.join(data_item_split[1:],sep) out.append(data_item) return out def exp_n(word): "Replace newlines and carriage return's from string." out = "" for ch in word: if ((ch != '\n') and (ch != '\r')): out = out + ch return out def exp_e(list): "Expunge empty elements from a list" out = [] for item in list: item = exp_n(item) if ((item != '\r\n' and item != '\r' and item != '\n' and item !="" and len(item)!=0)): out.append(item) return out def sup_e(word): "Replace spaces" out = "" for ch in word: if (ch != ' '): out = out + ch return out def select_line(field_code, list): "Return appropriate item from a list" out = [''] for field in list: field[0] = sup_e(field[0]) field_code = sup_e(field_code) if (field[0] == field_code): out = field[1] return out def parse_field_definition(source_field_definition): "Create list of source_field_definition" word_list = [] out = [] word = "" counter = 0 if (len(source_field_definition.split("---"))==4): out = source_field_definition.split("---") else: element_list_high = source_field_definition.split("<:") for word_high in element_list_high: element_list_low = word_high.split(':>') for word_low in element_list_low: word_list.append(word_low) word_list.append(":>") word_list.pop() word_list.append("<:") word_list.pop() for item in word_list: word = word + item if (item == "<:"): counter = counter + 1 if (item == ":>"): counter = counter - 1 if counter == 0: out.append(word) word = "" return out def parse_template(template): """ bibconvert parse template ====================== in - template filename out - [ [ field_code , [ field_template_parsed ] , [] ] """ out = [] for field_def in read_file(template,1): field_tpl_new = [] if ((len(field_def.split("---",1)) > 1) and (field_def[:1]!="#")): field_code = field_def.split("---",1)[0] field_tpl = parse_field_definition(field_def.split("---",1)[1]) field_tpl_new = field_tpl field_tpl = exp_e(field_tpl_new) out_data = [field_code, field_tpl] out.append(out_data) return out def parse_common_template(template,part): """ bibconvert parse template ========================= in - template filename out - [ [ field_code , [ field_template_parsed ] , [] ] """ out = [] counter = 0 for field_def in read_file(template,1): if (exp_n(field_def)[:3] == "==="): counter = counter + 1 elif (counter == part): field_tpl_new = [] if ((len(field_def.split("---",1)) > 1) and (field_def[:1]!="#")): field_code = field_def.split("---",1)[0] field_tpl = parse_field_definition(field_def.split("---",1)[1]) field_tpl_new = 
field_tpl field_tpl = exp_e(field_tpl_new) out_data = [field_code, field_tpl] out.append(out_data) return out def parse_input_data_f(source_data_open, source_tpl): """ bibconvert parse input data ======================== in - input source data location (filehandle) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][] destination_templace entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[] input data file; by line: - fieldcode value """ out = [['',[]]] count = 0 values = [] while (count < 1): line = source_data_open.readline() if (line == ""): return(-1) line_split = line.split(" ",1) if (re.sub("\s","",line) == separator): count = count + 1 if (len(line_split) == 2): field_code = line_split[0] field_value = exp_n(line_split[1]) values.append([field_code,field_value]) item_prev = "" stack = [''] for item in values: if ((item[0]==item_prev)or(item_prev == "")): stack.append(item[1]) item_prev = item[0] else: out.append([item_prev,stack]) item_prev = item[0] stack = [] stack.append(item[1]) try: if (stack[0] != ""): if (out[0][0]==""): out = [] out.append([field_code,stack]) except IndexError, e: out = out return out def parse_input_data_fx(source_tpl): """ bibconvert parse input data ======================== in - input source data location (filehandle) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] extraction_template_entry - - input data file - specified by extract_tpl + input data file - specified by extract_tpl """ count = 0 record = "" field_data_1_in_list = [] out = [['',[]]] while (count <1): line = sys.stdin.readline() if (line == ""): if (record == ""): return (-1) if (re.sub("\s","",line) == separator): count = count + 1 else: - record = record + line + record = record + line for field_defined in extract_tpl_parsed: try: field_defined[1][0] = sub_keywd(field_defined[1][0]) field_defined[1][1] = sub_keywd(field_defined[1][1]) except IndexError, e: field_defined = field_defined try: field_defined[1][2] = sub_keywd(field_defined[1][2]) except IndexError, e: field_defined = field_defined field_data_1 ="" try: if (len(record.split(field_defined[1][0])) == 1): field_data_1 = "" field_data_1_in_list = [] else: field_data_1_tmp = record.split(field_defined[1][0],1)[1] field_data_1_in_list = field_data_1_tmp.split(field_defined[1][0]) except IndexError, e: field_data_1 = "" spliton = [] outvalue = "" field_data_2 = "" field_data = "" - + try: if ((field_defined[1][1])=="EOL"): spliton = ['\n'] elif ((field_defined[1][1])=="MIN"): spliton = ['\n'] elif ((field_defined[1][1])=="MAX"): for item in extract_tpl_parsed: try: spliton.append(item[1][0]) except IndexError, e: - spliton = spliton + spliton = spliton else: spliton = [field_defined[1][1]] except IndexError,e : spliton = "" outvalues = [] for field_data in field_data_1_in_list: outvalue = "" for splitstring in spliton: field_data_2 = "" if (len(field_data.split(splitstring))==1): if (outvalue == ""): field_data_2 = field_data else: field_data_2 = outvalue else: field_data_2 = field_data.split(splitstring)[0] outvalue = field_data_2 field_data = field_data_2 outvalues.append(outvalue) 
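## The extraction above finds each occurrence of a field by splitting the
## raw record on the field's start marker and then cutting every chunk at
## the configured end marker (EOL, MIN, MAX or a literal string).  A
## minimal standalone sketch of that idea; extract_occurrences() is a
## hypothetical helper, not part of this patch:
def extract_occurrences(record, start_marker, end_markers):
    """Return the text found after each start_marker, cut at the nearest
    of the given end_markers; text before the first marker is dropped."""
    values = []
    for chunk in record.split(start_marker)[1:]:
        for end in end_markers:
            chunk = chunk.split(end)[0]     # keep only text before the end marker
        values.append(chunk.strip())
    return [v for v in values if v]         # drop empty occurrences

## e.g. extract_occurrences("TI: A title\nAU: Smith\n", "TI:", ["\n"])
## returns ['A title'].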
outvalues = exp_e(outvalues) if (len(outvalues) > 0): if (out[0][0]==""): out = [] outstack = [] if (len(field_defined[1])==3): for item in outvalues: stack = item.split(field_defined[1][2]) for stackitem in stack: outstack.append(stackitem) else: outstack = outvalues out.append([field_defined[0],outstack]) return out def parse_input_data_d(source_data, source_tpl): """ bibconvert parse input data ======================== in - input source data location (directory) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][] destination_templace entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[] input data dir; by file: - fieldcode value per line """ out = [] for source_field_tpl in read_file(source_tpl,1): source_field_code = source_field_tpl.split("---")[0] source_field_data = read_file(source_data + source_field_code,0) source_field_data = exp_e(source_field_data) out_data = [source_field_code, source_field_data] out.append(out_data) return out def sub_empty_lines(value): out = re.sub('\n\n+','',value) return out def set_par_defaults(par1,par2): "Set default parameter when not defined" par_new_in_list = par2.split(",") i = 0 out = [] for par in par_new_in_list: if (len(par1)>i): if (par1[i] == ""): out.append(par) else: out.append(par1[i]) else: out.append(par) i = i + 1 return out def generate(keyword): """ bibconvert generaded values: ========================= SYSNO() - generate date as '%w%H%M%S' WEEK(N) - generate date as '%V' with shift (N) DATE(format) - generate date in specifieddate FORMAT VALUE(value) - enter value literarly OAI() - generate oai_identifier, starting value given at command line as -o """ out = keyword fn = keyword + "()" par = get_pars(fn)[1] fn = get_pars(fn)[0] par = set_par_defaults(par,"") if (fn == "SYSNO"): out = sysno if (fn == "WEEK"): par = set_par_defaults(par,"0") out = "%02d" % (string.atoi(strftime("%V",localtime())) + string.atoi(par[0])) if (string.atoi(out)<0): out = "00" if (fn == "VALUE"): par = set_par_defaults(par,"") out = par[0] if (fn == "DATE"): par = set_par_defaults(par,"%w%H%M%S," + "%d" % conv_setting[1]) out = strftime(par[0],localtime()) out = out[:string.atoi(par[1])] if (fn == "OAI"): oai_prefix = "" out = "%s:%d" % (oai_prefix,tcounter + oai_identifier_from) return out def read_file(filename,exception): "Read file into list" out = [] if (os.path.isfile(filename)): file = open(filename,'r') out = file.readlines() file.close() else: if exception: exit_on_error("Cannot access file: %s" % filename) return out def crawl_KB(filename,value,mode): """ bibconvert look-up value in KB_file in one of following modes: =========================================================== 1 - case sensitive / match (default) 2 - not case sensitive / search 3 - case sensitive / search 4 - not case sensitive / match 5 - case sensitive / search (in KB) 6 - not case sensitive / search (in KB) 7 - case sensitive / search (reciprocal) 8 - not case sensitive / search (reciprocal) 9 - replace by _DEFAULT_ only """ if (os.path.isfile(filename) != 1): pathtmp = string.split(extract_tpl,"/") pathtmp.pop() path = string.join(pathtmp,"/") filename = path + "/" + filename if (os.path.isfile(filename)): file_to_read = open(filename,"r") file_read = file_to_read.readlines() for line in file_read: code = 
string.split(line,"---") if (mode == "2"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(value_to_cmp,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif ((mode == "3") or (mode == "0")): if ((len(string.split(value,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "4"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((code[0] == value_to_cmp)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "5"): if ((len(string.split(code[0],value)) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "6"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "7"): if ((len(string.split(code[0],value)) > 1)or(len(string.split(value,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "8"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(len(string.split(value_to_cmp,code[0]))>1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "9"): if (code[0]=="_DEFAULT_"): value = code[1] return value else: if ((code[0] == value)or(code[0]=="_DEFAULT_")): value = code[1] return value return value def FormatField(value,fn): """ bibconvert formatting functions: ============================= ADD(prefix,suffix) - add prefix/suffix KB(kb_file) - lookup in kb_file and replace value ABR(N,suffix) - abbreviate to N places with suffix ABRX() - abbreviate exclusively words longer ABRW() - abbreviate word (limit from right) REP(x,y) - replace SUP(type) - remove characters of certain TYPE LIM(n,side) - limit to n letters from L/R LIMW(string,side) - L/R after split on string WORDS(n,side) - limit to n words from L/R IF(value,valueT,valueF) - replace on IF condition MINL(n) - replace words shorter than n MINLW(n) - replace words shorter than n MAXL(n) - replace words longer than n EXPW(type) - replace word from value containing TYPE EXP(STR,0/1) - replace word from value containing string NUM() - take only digits in given string SHAPE() - remove extra space UP() - to uppercase DOWN() - to lowercase CAP() - make capitals each word SPLIT(n,h,str,from) - only for final Aleph field, i.e. 
AB , maintain whole words SPLITW(sep,h,str,from) - only for final Aleph field, split on string CONF(filed,value,0/1) - confirm validity of output line (check other field) CONFL(substr,0/1) - confirm validity of output line (check field being processed) CUT(prefix,postfix) - remove substring from side RANGE(MIN,MAX) - select items in repetitive fields bibconvert character TYPES ======================= ALPHA - alphabetic NALPHA - not alpphabetic NUM - numeric NNUM - not numeric ALNUM - alphanumeric NALNUM - non alphanumeric LOWER - lowercase UPPER - uppercase PUNCT - punctual NPUNCT - non punctual SPACE - space """ out = value fn = fn + "()" par = get_pars(fn)[1] fn = get_pars(fn)[0] value = sub_keywd(value) par_tmp =[] for item in par: item = sub_keywd(item) par_tmp.append(item) par = par_tmp if (fn == "KB"): new_value = "" par = set_par_defaults(par,"KB,0") new_value = crawl_KB(par[0],value,par[1]) out = new_value elif (fn == "ADD"): par = set_par_defaults(par,",") out = par[0] + value + par[1] elif (fn == "ABR"): par = set_par_defaults(par,"1,.") out = value[:string.atoi(par[0])] + par[1] elif (fn == "ABRW"): tmp = FormatField(value,"ABR(1,.)") tmp = tmp.upper() out = tmp elif (fn == "ABRX"): par = set_par_defaults(par,",") toout = [] tmp = value.split(" ") for wrd in tmp: if (len(wrd) > string.atoi(par[0])): wrd = wrd[:string.atoi(par[0])] + par[1] toout.append(wrd) out = string.join(toout," ") elif (fn == "SUP"): par = set_par_defaults(par,",") if(par[0]=="NUM"): out = re.sub('\d+',par[1],value) if(par[0]=="NNUM"): out = re.sub('\D+',par[1],value) if(par[0]=="ALPHA"): out = re.sub('[a-zA-Z]+',par[1],value) if(par[0]=="NALPHA"): out = re.sub('[^a-zA-Z]+',par[1],value) if((par[0]=="ALNUM")or(par[0]=="NPUNCT")): out = re.sub('\w+',par[1],value) if(par[0]=="NALNUM"): out = re.sub('\W+',par[1],value) if(par[0]=="PUNCT"): out = re.sub('\W+',par[1],value) if(par[0]=="LOWER"): out = re.sub('[a-z]+',par[1],value) if(par[0]=="UPPER"): out = re.sub('[A-Z]+',par[1],value) if(par[0]=="SPACE"): out = re.sub('\s+',par[1],value) elif (fn == "LIM"): par = set_par_defaults(par,",") if (par[1] == "L"): out = value[(len(value) - string.atoi(par[0])):] if (par[1] == "R"): out = value[:string.atoi(par[0])] elif (fn == "LIMW"): par = set_par_defaults(par,",") tmp = value.split(par[0]) if (par[1] == "L"): out = par[0] + tmp[1] if (par[1] == "R"): out = tmp[0] + par[0] elif (fn == "WORDS"): tmp2 = [value] par = set_par_defaults(par,",") if (par[1] == "R"): tmp = value.split(" ") tmp2 = [] i = 0 while (i < string.atoi(par[0])): tmp2.append(tmp[i]) i = i + 1 if (par[1] == "L"): tmp = value.split(" ") tmp.reverse() tmp2 = [] i = 0 while (i < string.atoi(par[0])): tmp2.append(tmp[i]) i = i + 1 tmp2.reverse() out = string.join(tmp2, " ") elif (fn == "MINL"): par = set_par_defaults(par,"1") tmp = value.split(" ") tmp2 = [] i = 0 for wrd in tmp: if (len(wrd) >= string.atoi(par[0])): tmp2.append(wrd) out = string.join(tmp2, " ") elif (fn == "MINLW"): par = set_par_defaults(par,"1") if (len(value) >= string.atoi(par[0])): out = value else: out = "" elif (fn == "MAXL"): par = set_par_defaults(par,"4096") tmp = value.split(" ") tmp2 = [] i = 0 for wrd in tmp: if (len(wrd) <= string.atoi(par[0])): tmp2.append(wrd) out = string.join(tmp2, " ") elif (fn == "REP"): set_par_defaults(par,",") if (par[0]!= ""): out = value.replace(par[0],par[1]) elif (fn == "SHAPE"): if (value != ""): out = value.strip() elif (fn == "UP"): out = value.upper() elif (fn == "DOWN"): out = value.lower() elif (fn == "CAP"): tmp = value.split(" ") 
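## FormatField() above implements a small formatting mini-language
## ("ABR(1,.)", "LIM(3,R)", "UP()", ...): the function name and its
## comma-separated parameters are parsed out of the template string and
## dispatched to one transformation each.  A minimal standalone sketch of
## that parse-then-dispatch idea; apply_format() is a hypothetical helper,
## not the module's own API, and covers only three of the functions:
import re

def apply_format(value, spec):
    """Apply one 'NAME(arg1,arg2)'-style formatting spec to value."""
    name, raw_args = re.match(r'(\w+)\((.*)\)', spec).groups()
    args = raw_args.split(',') if raw_args else []
    if name == 'UP':                        # uppercase the whole value
        return value.upper()
    if name == 'ABR':                       # abbreviate to N chars plus suffix
        return value[:int(args[0])] + args[1]
    if name == 'LIM':                       # keep N chars from the L/R side
        return value[-int(args[0]):] if args[1] == 'L' else value[:int(args[0])]
    return value                            # unknown specs pass through unchanged

## e.g. apply_format("Physics Letters", "ABR(4,.)") returns "Phys."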
out2 = [] for wrd in tmp: wrd2 = wrd.capitalize() out2.append(wrd2) out = string.join(out2," ") elif (fn == "IF"): par = set_par_defaults(par,",,") if (value == par[0]): out = par[1] else: out = par[2] if (out == "ORIG"): out = value elif (fn == "EXP"): par = set_par_defaults(par,",0") tmp = value.split(" ") out2 = [] for wrd in tmp: if ((len(wrd.split(par[0])) == 1)and(par[1]=="1")): out2.append(wrd) if ((len(wrd.split(par[0])) != 1)and(par[1]=="0")): out2.append(wrd) out = string.join(out2," ") elif (fn == "EXPW"): par = set_par_defaults(par,",0") tmp = value.split(" ") out2 = [] for wrd in tmp: if ((FormatField(wrd,"SUP(" + par[0] + ")") == wrd)and(par[1]=="1")): out2.append(wrd) if ((FormatField(wrd,"SUP(" + par[0] + ")") != wrd)and(par[1]=="0")): out2.append(wrd) out = string.join(out2," ") elif (fn == "SPLIT"): par = set_par_defaults(par,"%d,0,,1" % conv_setting[1]) length = string.atoi(par[0]) + (string.atoi(par[1])) header = string.atoi(par[1]) headerplus = par[2] starting = string.atoi(par[3]) line = "" tmp2 = [] tmp3 = [] tmp = value.split(" ") linenumber = 1 if (linenumber >= starting): tmp2.append(headerplus) line = line + headerplus for wrd in tmp: line = line + " " + wrd tmp2.append(wrd) if (len(line) > length): linenumber = linenumber + 1 line = tmp2.pop() toout = string.join(tmp2) tmp3.append(toout) tmp2 = [] line2 = value[:header] if (linenumber >= starting): line3 = line2 + headerplus + line else: line3 = line2 + line line = line3 tmp2.append(line) tmp3.append(line) out = string.join(tmp3,"\n") out = FormatField(out,"SHAPE()") elif (fn == "SPLITW"): par = set_par_defaults(par,",0,,1") str = par[0] header = string.atoi(par[1]) headerplus = par[2] starting = string.atoi(par[3]) counter = 1 tmp2 = [] tmp = value.split(par[0]) last = tmp.pop() for wrd in tmp: counter = counter + 1 if (counter >= starting): tmp2.append(value[:header] + headerplus + wrd + par[0]) else: tmp2.append(value[:header] + wrd + par[0]) if (last != ""): counter = counter + 1 if (counter >= starting): tmp2.append(value[:header] + headerplus + last) else: tmp2.append(value[:header] + last) out = string.join(tmp2,"\n") elif (fn == "CONF"): par = set_par_defaults(par,",,1") found = 0 data = select_line(par[0],data_parsed) for line in data: if (par[1] == ""): if (line == ""): found = 1 elif (len(re.split(par[1],line)) > 1): found = 1 if ((found == 1)and(string.atoi(par[2]) == 1)): out = value if ((found == 1)and(string.atoi(par[2]) == 0)): out = "" if ((found == 0)and(string.atoi(par[2]) == 1)): out = "" if ((found == 0)and(string.atoi(par[2]) == 0)): out = value return out elif (fn == "CONFL"): set_par_defaults(par,",1") if (re.search(par[0],value)): if (string.atoi(par[1]) == 1): out = value else: out = "" else: if (string.atoi(par[1]) == 1): out = "" else: out = value return out elif (fn == "CUT"): par = set_default_pars(par,",") left = value[:len(par[0])] right = value[-(len(par[1])):] if (left == par[0]): out = out[len(par[0]):] if (right == par[1]): out = out[:-(len(par[1]))] return out elif (fn == "NUM"): tmp = re.findall('\d',value) out = string.join(tmp,"") return out def printInfo(): "print out when not enough parmeters given" print """ BibConvert data convertor Usage: bibconvert [options] -ctemplate.cfg < input.dat Options: - -c'config' configuration templates file - -d'directory' source_data fields are located in separated files in 'directory'one record) - -h help - -l'length' minimum line length (default = 1) - -o'value' OAI identifier starts with specified value (default = 1) - -b'file header' 
insert file header - -e'file footer' insert file footer - -s'record separator' record separator, default empty line (EOLEOL) + -c'config' configuration templates file + -d'directory' source_data fields are located in separated files in 'directory'one record) + -h help + -l'length' minimum line length (default = 1) + -o'value' OAI identifier starts with specified value (default = 1) + -b'file header' insert file header + -e'file footer' insert file footer + -s'record separator' record separator, default empty line (EOLEOL) - -Cx'field extraction template' alternative to -c when configuration is split to several files - -Cs'source data template' alternative to -c when configuration is split to several files - -Ct'target data template' alternative to -c when configuration is split to several files + -Cx'field extraction template' alternative to -c when configuration is split to several files + -Cs'source data template' alternative to -c when configuration is split to several files + -Ct'target data template' alternative to -c when configuration is split to several files """ def printHelp(): "print out help" print """ BibConvert data convertor Usage: bibconvert [options] -ctemplate.cfg < input.dat Options: - -c'config' configuration templates file - -d'directory' source_data fields are located in separated files in 'directory'one record) - -h help - -l'length' minimum line length (default = 1) - -o'value' OAI identifier starts with specified value (default = 1) - -b'file header' insert file header - -e'file footer' insert file footer - -s'record separator' record separator, default empty line (EOLEOL) + -c'config' configuration templates file + -d'directory' source_data fields are located in separated files in 'directory'one record) + -h help + -l'length' minimum line length (default = 1) + -o'value' OAI identifier starts with specified value (default = 1) + -b'file header' insert file header + -e'file footer' insert file footer + -s'record separator' record separator, default empty line (EOLEOL) - -Cx'field extraction template' alternative to -c when configuration is split to several files - -Cs'source data template' alternative to -c when configuration is split to several files - -Ct'target data template' alternative to -c when configuration is split to several files + -Cx'field extraction template' alternative to -c when configuration is split to several files + -Cs'source data template' alternative to -c when configuration is split to several files + -Ct'target data template' alternative to -c when configuration is split to several files Example: -------- Creation of an XML metadata container in output.xml file from text input file, bibconvert -o1 -l1 -csample.cfg < sample.dat > output.xml -l1 print out all output lines -o1 create OAI identifiers starting with value 1 -c* data conversion configuration templates """ def exit_on_error(error_message): "exit when error occured" sys.stderr.write("\n bibconvert data convertor\n") sys.stderr.write(" Error: %s\n" % error_message) sys.exit() return 0 def create_record(): "Create output record" - out = [] field_data_item_LIST = [] for T_tpl_item_LIST in target_tpl_parsed: - + # the line is printed only if the variables inside are not empty + print_line = 0 to_output = [] rows = 1 - for field_tpl_item_STRING in T_tpl_item_LIST[1]: - DATA = [] - if (field_tpl_item_STRING[:2]=="<:"): field_tpl_item_STRING = field_tpl_item_STRING[2:-2] - field = field_tpl_item_STRING.split("::")[0] if (len(field_tpl_item_STRING.split("::")) == 1): value = 
generate(field) to_output.append([value]) else: subfield = field_tpl_item_STRING.split("::")[1] if (field[-1] == "*"): repetitive = 1 field = field[:-1] else: repetitive = 0 - if (is_opt("-d",opt_list)[:2]=="-d"): DATA = select_line(field,data_parsed) else: DATA = select_line(field,data_parsed) - if (repetitive == 0): DATA = [string.join(DATA," ")] - SRC_TPL = select_line(field,source_tpl_parsed) try: if (DATA[0] != ""): DATA = get_subfields(DATA,subfield,SRC_TPL) - FF = field_tpl_item_STRING.split("::") if (len(FF) > 2): FF = FF[2:] for fn in FF: - - # DATAFORMATTED = [] - if (len(DATA) != 0 and DATA[0] != ""): DATA = get_subfields(DATA,subfield,SRC_TPL) - FF = field_tpl_item_STRING.split("::") if (len(FF) > 2): FF = FF[2:] for fn2 in FF: - DATAFORMATTED = [] - - for item in DATA: item = FormatField(item,fn) DATAFORMATTED.append(item) - DATA = DATAFORMATTED - if (len(DATA) > rows): - rows = len(DATA) + if DATA != "": + print_line = 1 to_output.append(DATA) except IndexError, e: - pass - + pass else: to_output.append([field_tpl_item_STRING]) - current = 0 + default_print = 0 while (current < rows): line_to_print = [] for item in to_output: - if (item==[]): item =[''] - if (len(item) <= current): printout = item[0] else: printout = item[current] - line_to_print.append(printout) - - output = exp_n(string.join(line_to_print,"")) - global_formatting_functions = T_tpl_item_LIST[0].split("::")[1:] - for GFF in global_formatting_functions: if (GFF[:5] == "RANGE"): - parR = get_pars(GFF)[1] parR = set_par_defaults(parR,"MIN,MAX") - if (parR[0]!="MIN"): if (string.atoi(parR[0]) > (current+1)): output = "" - if (parR[1]!="MAX"): if (string.atoi(parR[1]) < (current+1)): output = "" - + elif (GFF[:4] == "DEFP"): + default_print = 1 else: output = FormatField(output,GFF) - if (len(output) > conv_setting[0]): - + if ((len(output) > conv_setting[0] and print_line == 1) or default_print): print output current = current + 1 return ### MAIN ### try: import fileinput import string import os import re import sys import time except ImportError, e: print "Error: %s" % e import sys sys.exit(1) from time import gmtime, strftime, localtime import os.path tcounter = 0 conv_setting = set_conv() sysno = generate("DATE(%w%H%M%S)") if(len(sys.argv) < 2): printInfo() sys.exit(0) opt_list = get_options() arg_list = get_arguments() if(len(opt_list) == 0): printInfo() sys.exit(0) elif (is_opt("-h",opt_list)[:2] == "-h"): printHelp() sys.exit(0) else: source_data = "" separator = "" if (is_opt("-s",opt_list)[:2] == "-s"): separator = is_opt("-s",opt_list)[2:] if (is_opt("-d",opt_list)[:2] == "-d"): source_data = is_opt("-d",opt_list)[2:] source_data = source_data + "/" extract_tpl = "/" else: if (is_opt("-Cx",opt_list)[:3] == "-Cx"): extract_tpl = is_opt("-Cx",opt_list)[3:] extract_tpl_parsed = parse_template(extract_tpl) elif (is_opt("-c",opt_list)[:2] == "-c"): extract_tpl = is_opt("-c",opt_list)[2:] extract_tpl_parsed = parse_common_template(extract_tpl,1) else: printInfo() sys.exit(0) if (is_opt("-Cs",opt_list)[:3] == "-Cs"): source_tpl = is_opt("-Cs",opt_list)[3:] source_tpl_parsed = parse_template(source_tpl) elif (is_opt("-c",opt_list)[:2] == "-c"): source_tpl = is_opt("-c",opt_list)[2:] source_tpl_parsed = parse_common_template(source_tpl,2) else: printInfo() sys.exit(0) if (is_opt("-Ct",opt_list)[:3] == "-Ct"): target_tpl = is_opt("-Ct",opt_list)[3:] target_tpl_parsed = parse_template(target_tpl) elif (is_opt("-c",opt_list)[:2] == "-c"): target_tpl = is_opt("-c",opt_list)[2:] target_tpl_parsed = 
parse_common_template(target_tpl,3) else: printInfo() sys.exit(0) if (is_opt("-t",opt_list)[:2] == "-t"): output_rec_sep = is_opt("-t",opt_list)[2:] else: output_rec_sep = "" if (is_opt("-b",opt_list)[:2] == "-b"): begin_header = is_opt("-b",opt_list)[2:] else: begin_header = "" if (is_opt("-e",opt_list)[:2] == "-e"): ending_footer = is_opt("-e",opt_list)[2:] else: ending_footer = "" if (is_opt("-l",opt_list)[:2] == "-l"): try: - conv_setting[0] = string.atoi(is_opt("-l",opt_list)[2:]) + conv_setting[0] = string.atoi(is_opt("-l",opt_list)[2:]) except ValueError, e: conv_setting[0] = 1 if (is_opt("-o",opt_list)[:2] == "-o"): try: oai_identifier_from = string.atoi(is_opt("-o",opt_list)[2:]) except ValueError, e: oai_identifier_from = 1 else: oai_identifier_from = 1 if (is_opt("-d",opt_list)[:2] == "-d"): if (os.path.isdir(source_data)): data_parsed = parse_input_data_d(source_data,source_tpl) create_record() tcounter = tcounter + 1 else: exit_on_error("Cannot access directory: %s" % source_data) if (is_opt("-d",opt_list)[:2] == ""): - done = 0 print begin_header while (done == 0): - data_parsed = parse_input_data_fx(source_tpl) - if (data_parsed == -1): done = 1 else: if (data_parsed[0][0]!= ''): - create_record() tcounter = tcounter + 1 print output_rec_sep - print ending_footer diff --git a/modules/bibconvert/bin/bibconvert.wml b/modules/bibconvert/bin/bibconvert.wml index ce88b8096..c55c7759b 100644 --- a/modules/bibconvert/bin/bibconvert.wml +++ b/modules/bibconvert/bin/bibconvert.wml @@ -1,1427 +1,1395 @@ ## $Id$ ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## read config variables: #include "config.wml" #include "configbis.wml" ## start Python: #! ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
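## The main loop above reads standard input record by record (records are
## delimited by the -s separator, an empty line by default), converts each
## record and wraps the output in the optional -b header and -e footer.
## A standalone sketch of that driver shape, assuming a convert_record()
## callable that is not part of this patch:
import sys

def run(convert_record, separator="", header="", footer=""):
    print(header)
    record_lines = []
    for line in sys.stdin:
        if line.strip() == separator:       # record boundary reached
            if record_lines:
                print(convert_record("".join(record_lines)))
            record_lines = []
        else:
            record_lines.append(line)
    if record_lines:                        # flush the last record
        print(convert_record("".join(record_lines)))
    print(footer)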
## okay, rest of the Python code goes below ####### def set_conv(): """ bibconvert common settings ======================= minimal length of output line = 1 maximal length of output line = 4096 """ conv_setting = [ 1, 4096 ] return conv_setting def get_options(): "Read command line options into list" out = [] for arg in sys.argv: if (arg[:1] == "-"): out.append(arg) return out def get_arguments(): "Read command line arguments into list" out = [] for arg in sys.argv: if (arg[:1] != "-"): out.append(arg) return out def get_pars(fn): "Read function and its parameters into list" out = [] out.append(re.split('\(|\)',fn)[0]) out.append(re.split(',',re.split('\(|\)',fn)[1])) return out def is_opt(seek,opt_list): "Return entire argument if given in the list of options" out = "" for arg in opt_list: if (seek == arg[:2]): out = arg if (seek == arg[:3]): out = arg return out def append_to_output_file(filename, output): "bibconvert output file creation by output line" try: file = open(filename,'a') file.write(output) file.close() except IOError, e: exit_on_error("Cannot write into %s" % filename) return 1 def sub_keywd(out): "bibconvert keywords literal substitution" out = string.replace(out,"EOL","\n") out = string.replace(out,"_CR_","\r") out = string.replace(out,"_LF_","\n") out = string.replace(out,"\\",'\\') out = string.replace(out,"\r",'\r') out = string.replace(out,"BSLASH",'\\') out = string.replace(out,"COMMA",',') out = string.replace(out,"LEFTB",'[') out = string.replace(out,"RIGHTB",']') out = string.replace(out,"LEFTP",'(') out = string.replace(out,"RIGHTP",')') return out def check_split_on(data_item_split, sep, tpl_f): """ bibconvert conditional split with following conditions =================================================== ::NEXT(N,TYPE,SIDE) - next N chars are of the TYPE having the separator on the SIDE ::PREV(N,TYPE,SIDE) - prev.N chars are of the TYPE having the separator on the SIDE """ fn = get_pars(tpl_f)[0] par = get_pars(tpl_f)[1] done = 0 while (done == 0): if ( (( fn == "NEXT" ) and ( par[2]=="R" )) or (( fn == "PREV" ) and ( par[2]=="L" )) ): test_value = data_item_split[0][-(string.atoi(par[0])):] elif ( ((fn == "NEXT") and ( par[2]=="L")) or ((fn == "PREV") and ( par[2]=="R")) ): test_value = data_item_split[1][:(string.atoi(par[0]))] data_item_split_tmp = [] if ((FormatField(test_value,"SUP(" + par[1] + ",)") != "")or(len(test_value) < string.atoi(par[0]))): data_item_split_tmp = data_item_split[1].split(sep,1) if(len(data_item_split_tmp)==1): done = 1 data_item_split[0] = data_item_split[0] + sep + data_item_split_tmp[0] data_item_split[1] = "" else: data_item_split[0] = data_item_split[0] + sep + data_item_split_tmp[0] data_item_split[1] = data_item_split_tmp[1] else: done = 1 return data_item_split def get_subfields(data,subfield,src_tpl): "Get subfield according to the template" out = [] for data_item in data: found = 0 for src_tpl_item in src_tpl: if (src_tpl_item[:2] == "<:"): if (src_tpl_item[2:-2] == subfield): found = 1 else: sep_in_list = src_tpl_item.split("::") sep = sep_in_list[0] data_item_split = data_item.split(sep,1) if (len(data_item_split)==1): data_item = data_item_split[0] else: if (len(sep_in_list) > 1): data_item_split = check_split_on(data_item.split(sep,1), sep_in_list[0],sep_in_list[1]) if(found == 1): data_item = data_item_split[0] else: data_item = string.join(data_item_split[1:],sep) out.append(data_item) return out def exp_n(word): "Replace newlines and carriage return's from string." 
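## sub_keywd() above maps the template placeholders (EOL, COMMA, LEFTB,
## RIGHTP, BSLASH, ...) to the literal characters they stand for, so that
## separators containing those characters can still be written inside the
## comma/parenthesis-based template syntax.  A compact sketch of the same
## substitution table; substitute_placeholders() is a hypothetical name:
PLACEHOLDERS = {
    "EOL": "\n", "_CR_": "\r", "_LF_": "\n",
    "BSLASH": "\\", "COMMA": ",",
    "LEFTB": "[", "RIGHTB": "]",
    "LEFTP": "(", "RIGHTP": ")",
}

def substitute_placeholders(text):
    for keyword, literal in PLACEHOLDERS.items():
        text = text.replace(keyword, literal)   # literal, not regex, replacement
    return text

## e.g. substitute_placeholders("AULEFTB1RIGHTB") returns "AU[1]"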
out = "" for ch in word: if ((ch != '\n') and (ch != '\r')): out = out + ch return out def exp_e(list): "Expunge empty elements from a list" out = [] for item in list: item = exp_n(item) if ((item != '\r\n' and item != '\r' and item != '\n' and item !="" and len(item)!=0)): out.append(item) return out def sup_e(word): "Replace spaces" out = "" for ch in word: if (ch != ' '): out = out + ch return out def select_line(field_code, list): "Return appropriate item from a list" out = [''] for field in list: field[0] = sup_e(field[0]) field_code = sup_e(field_code) if (field[0] == field_code): out = field[1] return out def parse_field_definition(source_field_definition): "Create list of source_field_definition" word_list = [] out = [] word = "" counter = 0 if (len(source_field_definition.split("---"))==4): out = source_field_definition.split("---") else: element_list_high = source_field_definition.split("<:") for word_high in element_list_high: element_list_low = word_high.split(':>') for word_low in element_list_low: word_list.append(word_low) word_list.append(":>") word_list.pop() word_list.append("<:") word_list.pop() for item in word_list: word = word + item if (item == "<:"): counter = counter + 1 if (item == ":>"): counter = counter - 1 if counter == 0: out.append(word) word = "" return out def parse_template(template): """ bibconvert parse template ====================== in - template filename out - [ [ field_code , [ field_template_parsed ] , [] ] """ out = [] for field_def in read_file(template,1): field_tpl_new = [] if ((len(field_def.split("---",1)) > 1) and (field_def[:1]!="#")): field_code = field_def.split("---",1)[0] field_tpl = parse_field_definition(field_def.split("---",1)[1]) field_tpl_new = field_tpl field_tpl = exp_e(field_tpl_new) out_data = [field_code, field_tpl] out.append(out_data) return out def parse_common_template(template,part): """ bibconvert parse template ========================= in - template filename out - [ [ field_code , [ field_template_parsed ] , [] ] """ out = [] counter = 0 for field_def in read_file(template,1): if (exp_n(field_def)[:3] == "==="): counter = counter + 1 elif (counter == part): field_tpl_new = [] if ((len(field_def.split("---",1)) > 1) and (field_def[:1]!="#")): field_code = field_def.split("---",1)[0] field_tpl = parse_field_definition(field_def.split("---",1)[1]) field_tpl_new = field_tpl field_tpl = exp_e(field_tpl_new) out_data = [field_code, field_tpl] out.append(out_data) return out def parse_input_data_f(source_data_open, source_tpl): """ bibconvert parse input data ======================== in - input source data location (filehandle) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][] destination_templace entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[] input data file; by line: - fieldcode value """ out = [['',[]]] count = 0 values = [] while (count < 1): line = source_data_open.readline() if (line == ""): return(-1) line_split = line.split(" ",1) if (re.sub("\s","",line) == separator): count = count + 1 if (len(line_split) == 2): field_code = line_split[0] field_value = exp_n(line_split[1]) values.append([field_code,field_value]) item_prev = "" stack = [''] for item in values: if ((item[0]==item_prev)or(item_prev == "")): stack.append(item[1]) 
item_prev = item[0] else: out.append([item_prev,stack]) item_prev = item[0] stack = [] stack.append(item[1]) try: if (stack[0] != ""): if (out[0][0]==""): out = [] out.append([field_code,stack]) except IndexError, e: out = out return out def parse_input_data_fx(source_tpl): """ bibconvert parse input data ======================== in - input source data location (filehandle) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] extraction_template_entry - - input data file - specified by extract_tpl + input data file - specified by extract_tpl """ count = 0 record = "" field_data_1_in_list = [] out = [['',[]]] while (count <1): line = sys.stdin.readline() if (line == ""): if (record == ""): return (-1) if (re.sub("\s","",line) == separator): count = count + 1 else: - record = record + line + record = record + line for field_defined in extract_tpl_parsed: try: field_defined[1][0] = sub_keywd(field_defined[1][0]) field_defined[1][1] = sub_keywd(field_defined[1][1]) except IndexError, e: field_defined = field_defined try: field_defined[1][2] = sub_keywd(field_defined[1][2]) except IndexError, e: field_defined = field_defined field_data_1 ="" try: if (len(record.split(field_defined[1][0])) == 1): field_data_1 = "" field_data_1_in_list = [] else: field_data_1_tmp = record.split(field_defined[1][0],1)[1] field_data_1_in_list = field_data_1_tmp.split(field_defined[1][0]) except IndexError, e: field_data_1 = "" spliton = [] outvalue = "" field_data_2 = "" field_data = "" - + try: if ((field_defined[1][1])=="EOL"): spliton = ['\n'] elif ((field_defined[1][1])=="MIN"): spliton = ['\n'] elif ((field_defined[1][1])=="MAX"): for item in extract_tpl_parsed: try: spliton.append(item[1][0]) except IndexError, e: - spliton = spliton + spliton = spliton else: spliton = [field_defined[1][1]] except IndexError,e : spliton = "" outvalues = [] for field_data in field_data_1_in_list: outvalue = "" for splitstring in spliton: field_data_2 = "" if (len(field_data.split(splitstring))==1): if (outvalue == ""): field_data_2 = field_data else: field_data_2 = outvalue else: field_data_2 = field_data.split(splitstring)[0] outvalue = field_data_2 field_data = field_data_2 outvalues.append(outvalue) outvalues = exp_e(outvalues) if (len(outvalues) > 0): if (out[0][0]==""): out = [] outstack = [] if (len(field_defined[1])==3): for item in outvalues: stack = item.split(field_defined[1][2]) for stackitem in stack: outstack.append(stackitem) else: outstack = outvalues out.append([field_defined[0],outstack]) return out def parse_input_data_d(source_data, source_tpl): """ bibconvert parse input data ======================== in - input source data location (directory) source data template source_field_code list of source field codes source_field_data list of source field data values (repetitive fields each line one occurence) out - [ [ source_field_code , [ source_field_data ] ] , [] ] source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][] destination_templace entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[] input data dir; by file: - fieldcode value per line """ out = [] for source_field_tpl in read_file(source_tpl,1): source_field_code = source_field_tpl.split("---")[0] source_field_data = read_file(source_data + source_field_code,0) source_field_data = exp_e(source_field_data) out_data = 
[source_field_code, source_field_data] out.append(out_data) return out def sub_empty_lines(value): out = re.sub('\n\n+','',value) return out def set_par_defaults(par1,par2): "Set default parameter when not defined" par_new_in_list = par2.split(",") i = 0 out = [] for par in par_new_in_list: if (len(par1)>i): if (par1[i] == ""): out.append(par) else: out.append(par1[i]) else: out.append(par) i = i + 1 return out def generate(keyword): """ bibconvert generaded values: ========================= SYSNO() - generate date as '%w%H%M%S' WEEK(N) - generate date as '%V' with shift (N) DATE(format) - generate date in specifieddate FORMAT VALUE(value) - enter value literarly OAI() - generate oai_identifier, starting value given at command line as -o """ out = keyword fn = keyword + "()" par = get_pars(fn)[1] fn = get_pars(fn)[0] par = set_par_defaults(par,"") if (fn == "SYSNO"): out = sysno if (fn == "WEEK"): par = set_par_defaults(par,"0") out = "%02d" % (string.atoi(strftime("%V",localtime())) + string.atoi(par[0])) if (string.atoi(out)<0): out = "00" if (fn == "VALUE"): par = set_par_defaults(par,"") out = par[0] if (fn == "DATE"): par = set_par_defaults(par,"%w%H%M%S," + "%d" % conv_setting[1]) out = strftime(par[0],localtime()) out = out[:string.atoi(par[1])] if (fn == "OAI"): oai_prefix = "" out = "%s:%d" % (oai_prefix,tcounter + oai_identifier_from) return out def read_file(filename,exception): "Read file into list" out = [] if (os.path.isfile(filename)): file = open(filename,'r') out = file.readlines() file.close() else: if exception: exit_on_error("Cannot access file: %s" % filename) return out def crawl_KB(filename,value,mode): """ bibconvert look-up value in KB_file in one of following modes: =========================================================== 1 - case sensitive / match (default) 2 - not case sensitive / search 3 - case sensitive / search 4 - not case sensitive / match 5 - case sensitive / search (in KB) 6 - not case sensitive / search (in KB) 7 - case sensitive / search (reciprocal) 8 - not case sensitive / search (reciprocal) 9 - replace by _DEFAULT_ only """ if (os.path.isfile(filename) != 1): pathtmp = string.split(extract_tpl,"/") pathtmp.pop() path = string.join(pathtmp,"/") filename = path + "/" + filename if (os.path.isfile(filename)): file_to_read = open(filename,"r") file_read = file_to_read.readlines() for line in file_read: code = string.split(line,"---") if (mode == "2"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(value_to_cmp,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif ((mode == "3") or (mode == "0")): if ((len(string.split(value,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "4"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((code[0] == value_to_cmp)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "5"): if ((len(string.split(code[0],value)) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "6"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "7"): if ((len(string.split(code[0],value)) > 1)or(len(string.split(value,code[0])) > 1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "8"): value_to_cmp = string.lower(value) code[0] = string.lower(code[0]) if ((len(string.split(code[0],value_to_cmp)) > 
1)or(len(string.split(value_to_cmp,code[0]))>1)or(code[0]=="_DEFAULT_")): value = code[1] return value elif (mode == "9"): if (code[0]=="_DEFAULT_"): value = code[1] return value else: if ((code[0] == value)or(code[0]=="_DEFAULT_")): value = code[1] return value return value def FormatField(value,fn): """ bibconvert formatting functions: ============================= ADD(prefix,suffix) - add prefix/suffix KB(kb_file) - lookup in kb_file and replace value ABR(N,suffix) - abbreviate to N places with suffix ABRX() - abbreviate exclusively words longer ABRW() - abbreviate word (limit from right) REP(x,y) - replace SUP(type) - remove characters of certain TYPE LIM(n,side) - limit to n letters from L/R LIMW(string,side) - L/R after split on string WORDS(n,side) - limit to n words from L/R IF(value,valueT,valueF) - replace on IF condition MINL(n) - replace words shorter than n MINLW(n) - replace words shorter than n MAXL(n) - replace words longer than n EXPW(type) - replace word from value containing TYPE EXP(STR,0/1) - replace word from value containing string NUM() - take only digits in given string SHAPE() - remove extra space UP() - to uppercase DOWN() - to lowercase CAP() - make capitals each word SPLIT(n,h,str,from) - only for final Aleph field, i.e. AB , maintain whole words SPLITW(sep,h,str,from) - only for final Aleph field, split on string CONF(filed,value,0/1) - confirm validity of output line (check other field) CONFL(substr,0/1) - confirm validity of output line (check field being processed) CUT(prefix,postfix) - remove substring from side RANGE(MIN,MAX) - select items in repetitive fields bibconvert character TYPES ======================= ALPHA - alphabetic NALPHA - not alpphabetic NUM - numeric NNUM - not numeric ALNUM - alphanumeric NALNUM - non alphanumeric LOWER - lowercase UPPER - uppercase PUNCT - punctual NPUNCT - non punctual SPACE - space """ out = value fn = fn + "()" par = get_pars(fn)[1] fn = get_pars(fn)[0] value = sub_keywd(value) par_tmp =[] for item in par: item = sub_keywd(item) par_tmp.append(item) par = par_tmp if (fn == "KB"): new_value = "" par = set_par_defaults(par,"KB,0") new_value = crawl_KB(par[0],value,par[1]) out = new_value elif (fn == "ADD"): par = set_par_defaults(par,",") out = par[0] + value + par[1] elif (fn == "ABR"): par = set_par_defaults(par,"1,.") out = value[:string.atoi(par[0])] + par[1] elif (fn == "ABRW"): tmp = FormatField(value,"ABR(1,.)") tmp = tmp.upper() out = tmp elif (fn == "ABRX"): par = set_par_defaults(par,",") toout = [] tmp = value.split(" ") for wrd in tmp: if (len(wrd) > string.atoi(par[0])): wrd = wrd[:string.atoi(par[0])] + par[1] toout.append(wrd) out = string.join(toout," ") elif (fn == "SUP"): par = set_par_defaults(par,",") if(par[0]=="NUM"): out = re.sub('\d+',par[1],value) if(par[0]=="NNUM"): out = re.sub('\D+',par[1],value) if(par[0]=="ALPHA"): out = re.sub('[a-zA-Z]+',par[1],value) if(par[0]=="NALPHA"): out = re.sub('[^a-zA-Z]+',par[1],value) if((par[0]=="ALNUM")or(par[0]=="NPUNCT")): out = re.sub('\w+',par[1],value) if(par[0]=="NALNUM"): out = re.sub('\W+',par[1],value) if(par[0]=="PUNCT"): out = re.sub('\W+',par[1],value) if(par[0]=="LOWER"): out = re.sub('[a-z]+',par[1],value) if(par[0]=="UPPER"): out = re.sub('[A-Z]+',par[1],value) if(par[0]=="SPACE"): out = re.sub('\s+',par[1],value) elif (fn == "LIM"): par = set_par_defaults(par,",") if (par[1] == "L"): out = value[(len(value) - string.atoi(par[0])):] if (par[1] == "R"): out = value[:string.atoi(par[0])] elif (fn == "LIMW"): par = set_par_defaults(par,",") tmp 
= value.split(par[0])
            if (par[1] == "L"):
                out = par[0] + tmp[1]
            if (par[1] == "R"):
                out = tmp[0] + par[0]
    elif (fn == "WORDS"):
        tmp2 = [value]
        par = set_par_defaults(par,",")
        if (par[1] == "R"):
            tmp = value.split(" ")
            tmp2 = []
            i = 0
            while (i < string.atoi(par[0])):
                tmp2.append(tmp[i])
                i = i + 1
        if (par[1] == "L"):
            tmp = value.split(" ")
            tmp.reverse()
            tmp2 = []
            i = 0
            while (i < string.atoi(par[0])):
                tmp2.append(tmp[i])
                i = i + 1
            tmp2.reverse()
        out = string.join(tmp2, " ")
    elif (fn == "MINL"):
        # keep only words with at least par[0] characters
        par = set_par_defaults(par,"1")
        tmp = value.split(" ")
        tmp2 = []
        i = 0
        for wrd in tmp:
            if (len(wrd) >= string.atoi(par[0])):
                tmp2.append(wrd)
        out = string.join(tmp2, " ")
    elif (fn == "MINLW"):
        par = set_par_defaults(par,"1")
        if (len(value) >= string.atoi(par[0])):
            out = value
        else:
            out = ""
    elif (fn == "MAXL"):
        # keep only words with at most par[0] characters
        par = set_par_defaults(par,"4096")
        tmp = value.split(" ")
        tmp2 = []
        i = 0
        for wrd in tmp:
            if (len(wrd) <= string.atoi(par[0])):
                tmp2.append(wrd)
        out = string.join(tmp2, " ")
    elif (fn == "REP"):
        # replace par[0] by par[1]
        par = set_par_defaults(par,",")
        if (par[0]!= ""):
            out = value.replace(par[0],par[1])
    elif (fn == "SHAPE"):
        if (value != ""):
            out = value.strip()
    elif (fn == "UP"):
        out = value.upper()
    elif (fn == "DOWN"):
        out = value.lower()
    elif (fn == "CAP"):
        tmp = value.split(" ")
        out2 = []
        for wrd in tmp:
            wrd2 = wrd.capitalize()
            out2.append(wrd2)
        out = string.join(out2," ")
    elif (fn == "IF"):
        par = set_par_defaults(par,",,")
        if (value == par[0]):
            out = par[1]
        else:
            out = par[2]
        if (out == "ORIG"):
            out = value
    elif (fn == "EXP"):
        par = set_par_defaults(par,",0")
        tmp = value.split(" ")
        out2 = []
        for wrd in tmp:
            if ((len(wrd.split(par[0])) == 1)and(par[1]=="1")):
                out2.append(wrd)
            if ((len(wrd.split(par[0])) != 1)and(par[1]=="0")):
                out2.append(wrd)
        out = string.join(out2," ")
    elif (fn == "EXPW"):
        par = set_par_defaults(par,",0")
        tmp = value.split(" ")
        out2 = []
        for wrd in tmp:
            if ((FormatField(wrd,"SUP(" + par[0] + ")") == wrd)and(par[1]=="1")):
                out2.append(wrd)
            if ((FormatField(wrd,"SUP(" + par[0] + ")") != wrd)and(par[1]=="0")):
                out2.append(wrd)
        out = string.join(out2," ")
    elif (fn == "SPLIT"):
        par = set_par_defaults(par,"%d,0,,1" % conv_setting[1])
        length = string.atoi(par[0]) + (string.atoi(par[1]))
        header = string.atoi(par[1])
        headerplus = par[2]
        starting = string.atoi(par[3])
        line = ""
        tmp2 = []
        tmp3 = []
        tmp = value.split(" ")
        linenumber = 1
        if (linenumber >= starting):
            tmp2.append(headerplus)
            line = line + headerplus
        for wrd in tmp:
            line = line + " " + wrd
            tmp2.append(wrd)
            if (len(line) > length):
                linenumber = linenumber + 1
                line = tmp2.pop()
                toout = string.join(tmp2)
                tmp3.append(toout)
                tmp2 = []
                line2 = value[:header]
                if (linenumber >= starting):
                    line3 = line2 + headerplus + line
                else:
                    line3 = line2 + line
                line = line3
                tmp2.append(line)
        tmp3.append(line)
        out = string.join(tmp3,"\n")
        out = FormatField(out,"SHAPE()")
    elif (fn == "SPLITW"):
        par = set_par_defaults(par,",0,,1")
        str = par[0]
        header = string.atoi(par[1])
        headerplus = par[2]
        starting = string.atoi(par[3])
        counter = 1
        tmp2 = []
        tmp = value.split(par[0])
        last = tmp.pop()
        for wrd in tmp:
            counter = counter + 1
            if (counter >= starting):
                tmp2.append(value[:header] + headerplus + wrd + par[0])
            else:
                tmp2.append(value[:header] + wrd + par[0])
        if (last != ""):
            counter = counter + 1
            if (counter >= starting):
                tmp2.append(value[:header] + headerplus + last)
            else:
                tmp2.append(value[:header] + last)
        out = string.join(tmp2,"\n")
    elif (fn == "CONF"):
        par = set_par_defaults(par,",,1")
        found = 0
        data = select_line(par[0],data_parsed)
        for line in data:
            if (par[1] == ""):
                if (line == ""):
                    found = 1
            elif (len(re.split(par[1],line)) > 1):
                found = 1
        if ((found == 1)and(string.atoi(par[2]) == 1)):
            out = value
        if ((found == 1)and(string.atoi(par[2]) == 0)):
            out = ""
        if ((found == 0)and(string.atoi(par[2]) == 1)):
            out = ""
        if ((found == 0)and(string.atoi(par[2]) == 0)):
            out = value
        return out
    elif (fn == "CONFL"):
        par = set_par_defaults(par,",1")
        if (re.search(par[0],value)):
            if (string.atoi(par[1]) == 1):
                out = value
            else:
                out = ""
        else:
            if (string.atoi(par[1]) == 1):
                out = ""
            else:
                out = value
        return out
    elif (fn == "CUT"):
        par = set_par_defaults(par,",")
        left = value[:len(par[0])]
        right = value[-(len(par[1])):]
        if (left == par[0]):
            out = out[len(par[0]):]
        if (right == par[1]):
            out = out[:-(len(par[1]))]
        return out
    elif (fn == "NUM"):
        # keep digits only
        tmp = re.findall('\d',value)
        out = string.join(tmp,"")
    return out

def printInfo():
    "print out when not enough parameters given"
    print """
 BibConvert data convertor

 Usage: bibconvert [options] -ctemplate.cfg < input.dat

 Options:

- -c'config' configuration templates file
- -d'directory' source_data fields are located in separated files in 'directory' (one record)
- -h help
- -l'length' minimum line length (default = 1)
- -o'value' OAI identifier starts with specified value (default = 1)
- -b'file header' insert file header
- -e'file footer' insert file footer
- -s'record separator' record separator, default empty line (EOLEOL)
+ -c'config' configuration templates file
+ -d'directory' source_data fields are located in separated files in 'directory' (one record)
+ -h help
+ -l'length' minimum line length (default = 1)
+ -o'value' OAI identifier starts with specified value (default = 1)
+ -b'file header' insert file header
+ -e'file footer' insert file footer
+ -s'record separator' record separator, default empty line (EOLEOL)

- -Cx'field extraction template' alternative to -c when configuration is split to several files
- -Cs'source data template' alternative to -c when configuration is split to several files
- -Ct'target data template' alternative to -c when configuration is split to several files
+ -Cx'field extraction template' alternative to -c when configuration is split to several files
+ -Cs'source data template' alternative to -c when configuration is split to several files
+ -Ct'target data template' alternative to -c when configuration is split to several files
"""

def printHelp():
    "print out help"
    print """
 BibConvert data convertor

 Usage: bibconvert [options] -ctemplate.cfg < input.dat

 Options:

- -c'config' configuration templates file
- -d'directory' source_data fields are located in separated files in 'directory' (one record)
- -h help
- -l'length' minimum line length (default = 1)
- -o'value' OAI identifier starts with specified value (default = 1)
- -b'file header' insert file header
- -e'file footer' insert file footer
- -s'record separator' record separator, default empty line (EOLEOL)
+ -c'config' configuration templates file
+ -d'directory' source_data fields are located in separated files in 'directory' (one record)
+ -h help
+ -l'length' minimum line length (default = 1)
+ -o'value' OAI identifier starts with specified value (default = 1)
+ -b'file header' insert file header
+ -e'file footer' insert file footer
+ -s'record separator' record separator, default empty line (EOLEOL)

- -Cx'field extraction template' alternative to -c when configuration is split to several files
- -Cs'source data template' alternative to -c when configuration is split to several files
- -Ct'target data template' alternative to -c when configuration is split to several files
+ -Cx'field extraction template' alternative to -c when configuration is split to several files
+ -Cs'source data template' alternative to -c when configuration is split to several files
+ -Ct'target data template' alternative to -c when configuration is split to several files

 Example:
 --------

 Creation of an XML metadata container in output.xml file from text input file,

 bibconvert -o1 -l1 -csample.cfg < sample.dat > output.xml

 -l1 print out all output lines
 -o1 create OAI identifiers starting with value 1
 -c* data conversion configuration templates
"""

def exit_on_error(error_message):
    "exit when error occurred"
    sys.stderr.write("\n bibconvert data convertor\n")
    sys.stderr.write(" Error: %s\n" % error_message)
    sys.exit()
    return 0

def create_record():
    "Create output record"
-    out = []
    field_data_item_LIST = []
    for T_tpl_item_LIST in target_tpl_parsed:
-
+        # the line is printed only if the variables inside are not empty
+        print_line = 0
        to_output = []
        rows = 1
-
        for field_tpl_item_STRING in T_tpl_item_LIST[1]:
-
            DATA = []
-
            if (field_tpl_item_STRING[:2]=="<:"):
                field_tpl_item_STRING = field_tpl_item_STRING[2:-2]
-
                field = field_tpl_item_STRING.split("::")[0]
                if (len(field_tpl_item_STRING.split("::")) == 1):
                    value = generate(field)
                    to_output.append([value])
                else:
                    subfield = field_tpl_item_STRING.split("::")[1]
                    if (field[-1] == "*"):
                        repetitive = 1
                        field = field[:-1]
                    else:
                        repetitive = 0
-
                    if (is_opt("-d",opt_list)[:2]=="-d"):
                        DATA = select_line(field,data_parsed)
                    else:
                        DATA = select_line(field,data_parsed)
-
                    if (repetitive == 0):
                        DATA = [string.join(DATA," ")]
-
                    SRC_TPL = select_line(field,source_tpl_parsed)
                    try:
                        if (DATA[0] != ""):
                            DATA = get_subfields(DATA,subfield,SRC_TPL)
-
                            FF = field_tpl_item_STRING.split("::")
                            if (len(FF) > 2):
                                FF = FF[2:]
                            for fn in FF:
-
-                                # DATAFORMATTED = []
-
                                if (len(DATA) != 0 and DATA[0] != ""):
                                    DATA = get_subfields(DATA,subfield,SRC_TPL)
-
                                    FF = field_tpl_item_STRING.split("::")
                                    if (len(FF) > 2):
                                        FF = FF[2:]
                                    for fn2 in FF:
-
                                        DATAFORMATTED = []
-
-
                                        for item in DATA:
                                            item = FormatField(item,fn)
                                            DATAFORMATTED.append(item)
-
                                        DATA = DATAFORMATTED
-
                        if (len(DATA) > rows):
-
                            rows = len(DATA)
+                        if DATA != "":
+                            print_line = 1
                        to_output.append(DATA)
                    except IndexError, e:
-                        pass
-
+                        pass
            else:
                to_output.append([field_tpl_item_STRING])
-
        current = 0
+        default_print = 0
        while (current < rows):
            line_to_print = []
            for item in to_output:
-
                if (item==[]):
                    item =['']
-
                if (len(item) <= current):
                    printout = item[0]
                else:
                    printout = item[current]
-
                line_to_print.append(printout)
-
-
            output = exp_n(string.join(line_to_print,""))
-
            global_formatting_functions = T_tpl_item_LIST[0].split("::")[1:]
-
            for GFF in global_formatting_functions:
                if (GFF[:5] == "RANGE"):
-
                    parR = get_pars(GFF)[1]
                    parR = set_par_defaults(parR,"MIN,MAX")
-
                    if (parR[0]!="MIN"):
                        if (string.atoi(parR[0]) > (current+1)):
                            output = ""
-
                    if (parR[1]!="MAX"):
                        if (string.atoi(parR[1]) < (current+1)):
                            output = ""
-
+                elif (GFF[:4] == "DEFP"):
+                    default_print = 1
                else:
                    output = FormatField(output,GFF)
-            if (len(output) > conv_setting[0]):
-
+            if ((len(output) > conv_setting[0] and print_line == 1) or default_print):
                print output
            current = current + 1
    return

### MAIN ###

try:
    import fileinput
    import string
    import os
    import re
    import sys
    import time
except ImportError, e:
    print "Error: %s" % e
    import sys
    sys.exit(1)

from time import gmtime, strftime, localtime
import os.path

tcounter = 0

conv_setting = set_conv()

sysno = generate("DATE(%w%H%M%S)")

if(len(sys.argv) < 2):
    printInfo()
    sys.exit(0)

opt_list = get_options()
arg_list = get_arguments()

if(len(opt_list) == 0):
    printInfo()
    sys.exit(0)
elif (is_opt("-h",opt_list)[:2] == "-h"):
    printHelp()
    sys.exit(0)
else:
    source_data = ""
    separator = ""
    if (is_opt("-s",opt_list)[:2] == "-s"):
        separator = is_opt("-s",opt_list)[2:]
    if (is_opt("-d",opt_list)[:2] == "-d"):
        source_data = is_opt("-d",opt_list)[2:]
        source_data = source_data + "/"
        extract_tpl = "/"
    else:
        if (is_opt("-Cx",opt_list)[:3] == "-Cx"):
            extract_tpl = is_opt("-Cx",opt_list)[3:]
            extract_tpl_parsed = parse_template(extract_tpl)
        elif (is_opt("-c",opt_list)[:2] == "-c"):
            extract_tpl = is_opt("-c",opt_list)[2:]
            extract_tpl_parsed = parse_common_template(extract_tpl,1)
        else:
            printInfo()
            sys.exit(0)
    if (is_opt("-Cs",opt_list)[:3] == "-Cs"):
        source_tpl = is_opt("-Cs",opt_list)[3:]
        source_tpl_parsed = parse_template(source_tpl)
    elif (is_opt("-c",opt_list)[:2] == "-c"):
        source_tpl = is_opt("-c",opt_list)[2:]
        source_tpl_parsed = parse_common_template(source_tpl,2)
    else:
        printInfo()
        sys.exit(0)
    if (is_opt("-Ct",opt_list)[:3] == "-Ct"):
        target_tpl = is_opt("-Ct",opt_list)[3:]
        target_tpl_parsed = parse_template(target_tpl)
    elif (is_opt("-c",opt_list)[:2] == "-c"):
        target_tpl = is_opt("-c",opt_list)[2:]
        target_tpl_parsed = parse_common_template(target_tpl,3)
    else:
        printInfo()
        sys.exit(0)
    if (is_opt("-t",opt_list)[:2] == "-t"):
        output_rec_sep = is_opt("-t",opt_list)[2:]
    else:
        output_rec_sep = ""
    if (is_opt("-b",opt_list)[:2] == "-b"):
        begin_header = is_opt("-b",opt_list)[2:]
    else:
        begin_header = ""
    if (is_opt("-e",opt_list)[:2] == "-e"):
        ending_footer = is_opt("-e",opt_list)[2:]
    else:
        ending_footer = ""
    if (is_opt("-l",opt_list)[:2] == "-l"):
        try:
-            conv_setting[0] = string.atoi(is_opt("-l",opt_list)[2:])
+            conv_setting[0] = string.atoi(is_opt("-l",opt_list)[2:])
        except ValueError, e:
            conv_setting[0] = 1
    if (is_opt("-o",opt_list)[:2] == "-o"):
        try:
            oai_identifier_from = string.atoi(is_opt("-o",opt_list)[2:])
        except ValueError, e:
            oai_identifier_from = 1
    else:
        oai_identifier_from = 1
    if (is_opt("-d",opt_list)[:2] == "-d"):
        if (os.path.isdir(source_data)):
            data_parsed = parse_input_data_d(source_data,source_tpl)
            create_record()
            tcounter = tcounter + 1
        else:
            exit_on_error("Cannot access directory: %s" % source_data)
    if (is_opt("-d",opt_list)[:2] == ""):
-
        done = 0
        print begin_header
        while (done == 0):
-
            data_parsed = parse_input_data_fx(source_tpl)
-
            if (data_parsed == -1):
                done = 1
            else:
                if (data_parsed[0][0]!= ''):
-
                    create_record()
                    tcounter = tcounter + 1
                print output_rec_sep
-
        print ending_footer
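
## Illustrative usage (a sketch only, not executed by this script). Assuming a
## configuration template sample.cfg whose target template applies the
## formatting functions defined in FormatField() above, a typical run reads
## records from standard input and writes the converted output to a file,
## as in the example from printHelp():
##
##   bibconvert -o1 -l1 -csample.cfg < sample.dat > output.xml
##
## Individual formatting functions behave roughly as follows (derived from the
## FormatField() branches above; the input strings here are made up):
##
##   FormatField("lkb report 2002", "UP()")        -> "LKB REPORT 2002"
##   FormatField("lkb report 2002", "WORDS(2,R)")  -> "lkb report"
##   FormatField("CERN-2002-001",   "NUM()")       -> "2002001"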