Page MenuHomec4science

bibconvert.py
No OneTemporary

File Metadata

Created
Wed, Nov 13, 08:31

bibconvert.py

## $Id$
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002, 2003, 2004, 2005 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibConvert tool to convert bibliographic records from any format to any format."""
__version__ = "$Id$"
try:
import fileinput
import string
import os
import re
import sys
import time
import getopt
from time import gmtime, strftime, localtime
import os.path
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
try:
from config import *
from search_engine import perform_request_search
from oai_repository_config import oaiidprefix
except ImportError, e:
print "Error: %s" % e
sys.exit(1)
### Matching records with database content
def parse_query_string(query_string):
    """Split a match query string into per-field lists.

    Fields are separated by '||'; within a field, the field code and its
    formatting functions are separated by '::'.

    Example:
      Input:  245__a::REP(-, )::SHAPE||700__a::MINL(2)::REP(COMMA,)
      Output: [['245__a', 'REP(-, )', 'SHAPE'], ['700__a', 'MINL(2)', 'REP(COMMA,)']]
    """
    return [field_part.split('::') for field_part in query_string.split('||')]
def set_conv():
    """
    bibconvert common settings
    =======================
    Returns a fresh two-element list:
    [0] minimal length of output line = 1
    [1] maximal length of output line = 4096
    """
    minimal_line_length = 1
    maximal_line_length = 4096
    return [minimal_line_length, maximal_line_length]
def get_pars(fn):
    """Split 'NAME(p1,p2,...)' into [NAME, [p1, p2, ...]].

    The text before the first parenthesis is the function name; the text
    between the first pair of parentheses is split on commas.  Raises
    IndexError when `fn` contains no parenthesis at all.
    """
    pieces = re.split(r'\(|\)', fn)
    return [pieces[0], pieces[1].split(',')]
def append_to_output_file(filename, output):
    """Append `output` to the file `filename`, creating it if necessary.

    The file is always closed, even when the write itself fails.  On
    IOError the program is terminated through exit_on_error().

    Returns 1.
    """
    try:
        out_file = open(filename, 'a')
        try:
            out_file.write(output)
        finally:
            # close the handle even if write() raised
            out_file.close()
    except IOError:
        exit_on_error("Cannot write into %s" % filename)
    return 1
def sub_keywd(out):
    """Replace bibconvert keywords in `out` by the literal characters they stand for.

    Substitutions are applied one after another, in the order listed below.
    """
    substitutions = (
        ("EOL", "\n"),
        ("_CR_", "\r"),
        ("_LF_", "\n"),
        # the next two entries map a character onto itself (no visible effect)
        ("\\", '\\'),
        ("\r", '\r'),
        ("BSLASH", '\\'),
        ("COMMA", ','),
        ("LEFTB", '['),
        ("RIGHTB", ']'),
        ("LEFTP", '('),
        ("RIGHTP", ')'),
    )
    for keyword, literal in substitutions:
        out = out.replace(keyword, literal)
    return out
def check_split_on(data_item_split, sep, tpl_f):
    """
    bibconvert conditional split with following conditions
    ===================================================
    ::NEXT(N,TYPE,SIDE) - next N chars are of the TYPE having the separator on the SIDE
    ::PREV(N,TYPE,SIDE) - prev.N chars are of the TYPE having the separator on the SIDE

    data_item_split - two-element list [left, right] produced by splitting a
                      value once on `sep`; it is modified in place and returned
    sep             - the separator string
    tpl_f           - the condition, e.g. 'NEXT(2,NUM,L)'

    While the condition does not hold for the current split point, the split
    point is moved to the next occurrence of `sep` in the right-hand part.
    """
    fn = get_pars(tpl_f)[0]
    par = get_pars(tpl_f)[1]
    done = 0
    while (done == 0):
        # pick the N characters adjacent to the separator on the requested side
        if ( (( fn == "NEXT" ) and ( par[2]=="R" )) or
             (( fn == "PREV" ) and ( par[2]=="L" )) ):
            test_value = data_item_split[0][-(string.atoi(par[0])):]
        elif ( ((fn == "NEXT") and ( par[2]=="L")) or
               ((fn == "PREV") and ( par[2]=="R")) ):
            test_value = data_item_split[1][:(string.atoi(par[0]))]
        # NOTE(review): for any other fn/SIDE combination test_value is never
        # assigned, so the line below would raise NameError on first use.
        data_item_split_tmp = []
        # SUP(TYPE,) strips characters of TYPE; a non-empty remainder means some
        # tested characters are not of TYPE, i.e. the condition is not met
        if ((FormatField(test_value, "SUP(" + par[1] + ",)") != "") or (len(test_value) < string.atoi(par[0]))):
            data_item_split_tmp = data_item_split[1].split(sep, 1)
            if(len(data_item_split_tmp)==1):
                # no further separator: everything goes to the left part
                done = 1
                data_item_split[0] = data_item_split[0] + sep + data_item_split_tmp[0]
                data_item_split[1] = ""
            else:
                # move the split point to the next separator and test again
                data_item_split[0] = data_item_split[0] + sep + data_item_split_tmp[0]
                data_item_split[1] = data_item_split_tmp[1]
        else:
            done = 1
    return data_item_split
def get_subfields(data, subfield, src_tpl):
    """Get subfield according to the template.

    data     - list of field values (strings)
    subfield - subfield code to extract
    src_tpl  - parsed source template for the field: a list whose items are
               either '<:code:>' placeholders or separator strings (optionally
               followed by '::CONDITION', see check_split_on)

    Returns a list with one extracted string per item of `data`.
    """
    out = []
    for data_item in data:
        # becomes 1 once the placeholder of the requested subfield has been passed
        found = 0
        for src_tpl_item in src_tpl:
            if (src_tpl_item[:2] == "<:"):
                if (src_tpl_item[2:-2] == subfield):
                    found = 1
            else:
                # template item is a separator, possibly with a split condition
                sep_in_list = src_tpl_item.split("::")
                sep = sep_in_list[0]
                data_item_split = data_item.split(sep, 1)
                if (len(data_item_split)==1):
                    data_item = data_item_split[0]
                else:
                    if (len(sep_in_list) > 1):
                        data_item_split = check_split_on(data_item.split(sep, 1), sep_in_list[0], sep_in_list[1])
                    if(found == 1):
                        # requested subfield precedes this separator: keep the left part
                        data_item = data_item_split[0]
                    else:
                        # not reached yet: drop the left part and continue with the rest
                        data_item = string.join(data_item_split[1:], sep)
        out.append(data_item)
    return out
def exp_n(word):
    "Return `word` with every newline and carriage return character removed."
    return "".join([ch for ch in word if ch != '\n' and ch != '\r'])
def exp_e(list):
    """Expunge empty elements from a list.

    Each element is first stripped of newline / carriage return characters
    with exp_n(); elements that are empty afterwards are dropped.
    """
    kept = []
    for raw_item in list:
        cleaned = exp_n(raw_item)
        if cleaned == "":
            continue
        kept.append(cleaned)
    return kept
def sup_e(word):
    "Return `word` with every space character removed."
    return "".join([ch for ch in word if ch != ' '])
def select_line(field_code, list):
    """Return the data stored under `field_code` in a list of [code, data] pairs.

    Spaces are removed from both the requested code and the stored codes
    before comparing; the stored codes are updated in place.  When several
    entries match, the last one wins.  Returns [''] when nothing matches.
    """
    selected = ['']
    for entry in list:
        entry[0] = sup_e(entry[0])
        field_code = sup_e(field_code)
        if entry[0] == field_code:
            selected = entry[1]
    return selected
def parse_field_definition(source_field_definition):
    """Split a field definition into constant text and '<:...:>' placeholders.

    A definition consisting of exactly four '---'-separated parts is returned
    as those four parts.  Otherwise the definition is cut into consecutive
    chunks, where every balanced '<:' ... ':>' group (nesting allowed) forms
    one chunk and the text between groups forms the other chunks.
    """
    dash_parts = source_field_definition.split("---")
    if len(dash_parts) == 4:
        return dash_parts

    # Tokenise, keeping the '<:' and ':>' delimiters as separate tokens.
    tokens = []
    for opened in source_field_definition.split("<:"):
        for closed in opened.split(":>"):
            tokens.append(closed)
            tokens.append(":>")
        # the last ':>' was one too many; a '<:' followed this piece instead
        tokens[-1] = "<:"
    # ...and the very last '<:' was one too many as well
    tokens.pop()

    chunks = []
    pending = ""
    depth = 0
    for token in tokens:
        pending = pending + token
        if token == "<:":
            depth = depth + 1
        elif token == ":>":
            depth = depth - 1
        if depth == 0:
            chunks.append(pending)
            pending = ""
    return chunks
def parse_template(template):
    """
    bibconvert parse template
    ======================
    in  - template filename
    out - [ [ field_code , [ field_template_parsed ] ] , ... ]

    Lines starting with '#' and lines without a '---' separator are skipped.
    """
    parsed = []
    for line in read_file(template, 1):
        if line[:1] == "#":
            continue
        pieces = line.split("---", 1)
        if len(pieces) < 2:
            continue
        field_tpl = exp_e(parse_field_definition(pieces[1]))
        parsed.append([pieces[0], field_tpl])
    return parsed
def parse_common_template(template, part):
    """
    bibconvert parse one part of a common template
    ==============================================
    in  - template filename
          part: index of the section to parse; sections are delimited by
          lines starting with '===' (the text before the first such line
          is section 0)
    out - [ [ field_code , [ field_template_parsed ] ] , ... ]

    Within the selected section, lines starting with '#' and lines without
    a '---' separator are skipped.
    """
    parsed = []
    section = 0
    for line in read_file(template, 1):
        if exp_n(line)[:3] == "===":
            section = section + 1
            continue
        if section != part:
            continue
        if line[:1] == "#":
            continue
        pieces = line.split("---", 1)
        if len(pieces) < 2:
            continue
        field_tpl = exp_e(parse_field_definition(pieces[1]))
        parsed.append([pieces[0], field_tpl])
    return parsed
def parse_input_data_f(source_data_open, source_tpl):
    """
    bibconvert parse input data
    ========================
    in - input source data location (filehandle)
    source data template
    source_field_code list of source field codes
    source_field_data list of source field data values (repetitive fields each line one occurrence)
    out - [ [ source_field_code , [ source_field_data ] ] , [] ]
    source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][]
    destination_template entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[]
    input data file; by line: - fieldcode value

    Reads lines from `source_data_open` up to and including the first line
    that equals the global `separator` once whitespace is removed.
    Returns -1 when the end of the file is reached before a separator line.
    `source_tpl` is not used by this function.
    """
    global separator
    out = [['',[]]]
    count = 0
    values = []
    # collect [field_code, field_value] pairs of one record
    while (count < 1):
        line = source_data_open.readline()
        if (line == ""):
            # readline() returns "" at end of file
            return(-1)
        line_split = line.split(" ", 1)
        if (re.sub("\s", "", line) == separator):
            count = count + 1
        if (len(line_split) == 2):
            field_code = line_split[0]
            field_value = exp_n(line_split[1])
            values.append([field_code, field_value])
    # group consecutive values carrying the same field code
    item_prev = ""
    stack = ['']
    for item in values:
        if ((item[0]==item_prev)or(item_prev == "")):
            stack.append(item[1])
            item_prev = item[0]
        else:
            out.append([item_prev, stack])
            item_prev = item[0]
            stack = []
            stack.append(item[1])
    # NOTE(review): the first group starts from stack == [''], so its leading
    # element is '' and, when the record holds a single field code only, the
    # check below skips it - confirm whether that is intended.
    try:
        if (stack[0] != ""):
            if (out[0][0]==""):
                # drop the initial placeholder entry
                out = []
            out.append([field_code, stack])
    except IndexError, e:
        out = out
    return out
def parse_input_data_fx(source_tpl):
    """
    bibconvert parse input data
    ========================
    in - input source data is read from standard input
    source data template
    source_field_code list of source field codes
    source_field_data list of source field data values (repetitive fields each line one occurrence)
    out - [ [ source_field_code , [ source_field_data ] ] , [] ]
    extraction_template_entry -
    input data file - specified by extract_tpl

    Reads one record from sys.stdin (up to a line equal to the global
    `separator` once whitespace is removed, or end of input) and extracts the
    fields described by the global `extract_tpl_parsed`.
    Returns -1 when end of input is reached and no record text was read.
    `source_tpl` is not used by this function.
    """
    global separator
    count = 0
    record = ""
    field_data_1_in_list = []
    out = [['',[]]]
    # --- read one record from standard input ---
    while (count <10):
        line = sys.stdin.readline()
        if (line == ""):
            # end of input
            count = count + 1
            if (record == "" and count):
                return (-1)
        if (re.sub("\s", "", line) == separator):
            # separator line: adding 10 terminates the loop
            count = count + 10
        else:
            record = record + line
    # --- extract every field defined in the extraction template ---
    for field_defined in extract_tpl_parsed:
        # expand keywords (EOL, COMMA, ...) in the start / end / split strings;
        # entries with fewer elements simply stop at the IndexError
        try:
            field_defined[1][0] = sub_keywd(field_defined[1][0])
            field_defined[1][1] = sub_keywd(field_defined[1][1])
        except IndexError, e:
            field_defined = field_defined
        try:
            field_defined[1][2] = sub_keywd(field_defined[1][2])
        except IndexError, e:
            field_defined = field_defined
        field_data_1 =""
        # element [1][0] marks where the field starts; '//...//' denotes a regexp
        if ((field_defined[1][0][0:2] == '//') and (field_defined[1][0][-2:] == '//')):
            field_defined_regexp = field_defined[1][0][2:-2]
            try:
                ####
                if (len(re.split(field_defined_regexp, record)) == 1):
                    field_data_1 = ""
                    field_data_1_in_list = []
                else:
                    field_data_1_tmp = re.split(field_defined_regexp, record, 1)[1]
                    # NOTE(review): str.split treats the regexp as a literal
                    # string here - re.split may have been intended; confirm.
                    field_data_1_in_list = field_data_1_tmp.split(field_defined_regexp)
            except IndexError, e:
                field_data_1 = ""
        else:
            try:
                if (len(record.split(field_defined[1][0])) == 1):
                    field_data_1 = ""
                    field_data_1_in_list = []
                else:
                    # text after the first start marker, cut at further markers
                    field_data_1_tmp = record.split(field_defined[1][0], 1)[1]
                    field_data_1_in_list = field_data_1_tmp.split(field_defined[1][0])
            except IndexError, e:
                field_data_1 = ""
        spliton = []
        outvalue = ""
        field_data_2 = ""
        field_data = ""
        # element [1][1] tells where the field ends
        try:
            if ((field_defined[1][1])=="EOL"):
                spliton = ['\n']
            elif ((field_defined[1][1])=="MIN"):
                spliton = ['\n']
            elif ((field_defined[1][1])=="MAX"):
                # end at the start marker of any field in the template
                for item in extract_tpl_parsed:
                    try:
                        spliton.append(item[1][0])
                    except IndexError, e:
                        spliton = spliton
            elif (field_defined[1][1][0:2] == '//') and (field_defined[1][1][-2:] == '//'):
                spliton = [field_defined[1][1][2:-2]]
            else:
                spliton = [field_defined[1][1]]
        except IndexError,e :
            spliton = ""
        outvalues = []
        for field_data in field_data_1_in_list:
            outvalue = ""
            # successively cut the value at each end marker
            for splitstring in spliton:
                field_data_2 = ""
                if (len(field_data.split(splitstring))==1):
                    if (outvalue == ""):
                        field_data_2 = field_data
                    else:
                        field_data_2 = outvalue
                else:
                    field_data_2 = field_data.split(splitstring)[0]
                outvalue = field_data_2
                field_data = field_data_2
            outvalues.append(outvalue)
        outvalues = exp_e(outvalues)
        if (len(outvalues) > 0):
            if (out[0][0]==""):
                # drop the initial placeholder entry
                out = []
            outstack = []
            # optional element [1][2]: split each value into repeated occurrences
            if (len(field_defined[1])==3):
                spliton = [field_defined[1][2]]
                if (field_defined[1][2][0:2] == '//') and (field_defined[1][2][-2:] == '//'):
                    spliton = [field_defined[1][2][2:-2]]
                for item in outvalues:
                    stack = re.split(spliton[0], item)
                    for stackitem in stack:
                        outstack.append(stackitem)
            else:
                outstack = outvalues
            out.append([field_defined[0], outstack])
    return out
def parse_input_data_d(source_data, source_tpl):
    """
    bibconvert parse input data (directory mode)
    ============================================
    in  - source_data: input source data location (directory prefix); the
          field code is appended directly to it to form each file name
          source_tpl: source data template filename
    out - [ [ source_field_code , [ source_field_data ] ] , ... ]
    source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][]
    input data dir; by file:   - fieldcode value per line

    A missing data file yields an empty value list for that field code.
    """
    parsed = []
    for tpl_line in read_file(source_tpl, 1):
        field_code = tpl_line.split("---")[0]
        field_values = exp_e(read_file(source_data + field_code, 0))
        parsed.append([field_code, field_values])
    return parsed
def sub_empty_lines(value):
    "Remove every run of two or more consecutive newline characters from `value`."
    return re.sub('\n\n+', '', value)
def set_par_defaults(par1, par2):
    """Fill in default values for parameters that are missing or empty.

    par1 - list of given parameter values
    par2 - comma-separated string of default values

    The result has exactly as many items as `par2` has defaults: position i
    holds par1[i] when it exists and is not '', otherwise the i-th default.
    """
    merged = []
    for position, default in enumerate(par2.split(",")):
        if position < len(par1) and par1[position] != "":
            merged.append(par1[position])
        else:
            merged.append(default)
    return merged
def generate(keyword):
    """
    bibconvert generated values:
    =========================
    SYSNO()      - value of the module-level variable sysno500
    SYSNO330()   - value of the module-level variable sysno
    WEEK(N)      - generate date as '%V' with shift (N)
    DATE(format) - generate date in the specified FORMAT
    XDATE(format)- like DATE
    VALUE(value) - enter value literally
    OAI()        - generate oai_identifier, starting value given at command line as -o<value>

    Any other keyword is returned unchanged.
    """
    parsed = get_pars(keyword + "()")
    fn = parsed[0]
    # NOTE(review): this keeps the first parameter only, so the later
    # per-keyword defaults always apply from the second position on.
    par = set_par_defaults(parsed[1], "")
    out = keyword
    if fn == "SYSNO":
        out = sysno500
    elif fn == "SYSNO330":
        out = sysno
    elif fn == "WEEK":
        par = set_par_defaults(par, "0")
        shifted_week = string.atoi(strftime("%V", localtime())) + string.atoi(par[0])
        out = "%02d" % shifted_week
        if string.atoi(out) < 0:
            out = "00"
    elif fn == "VALUE":
        par = set_par_defaults(par, "")
        out = par[0]
    elif fn == "DATE":
        par = set_par_defaults(par, "%w%H%M%S," + "%d" % set_conv()[1])
        stamp = strftime(par[0], localtime())
        out = stamp[:string.atoi(par[1])]
    elif fn == "XDATE":
        par = set_par_defaults(par, "%w%H%M%S," + ",%d" % set_conv()[1])
        stamp = strftime(par[0], localtime())
        out = par[1] + stamp[:string.atoi(par[2])]
    elif fn == "OAI":
        out = "%s:%d" % (oaiidprefix, tcounter + oai_identifier_from)
    return out
def read_file(filename,exception):
    """Read a file into a list of lines (line endings kept).

    When `filename` is not an existing regular file the result is [];
    if `exception` is true, exit_on_error() is called first.
    """
    if not os.path.isfile(filename):
        if exception:
            exit_on_error("Cannot access file: %s" % filename)
        return []
    handle = open(filename,'r')
    lines = handle.readlines()
    handle.close()
    return lines
def crawl_KB(filename,value,mode):
    """
    bibconvert look-up value in KB_file in one of following modes:
    ===========================================================
    1 - case sensitive     / match  (default)
    2 - not case sensitive / search
    3 - case sensitive     / search
    4 - not case sensitive / match
    5 - case sensitive     / search (in KB)
    6 - not case sensitive / search (in KB)
    7 - case sensitive     / search (reciprocal)
    8 - not case sensitive / search (reciprocal)
    9 - replace by _DEFAULT_ only
    R - not case sensitive / search (reciprocal) (8) replace

    filename - knowledge base file with one 'key---replacement' entry per
               line; when it is not an existing file it is looked up in the
               directory of the module-level `extract_tpl`
    value    - the value to look up
    mode     - one of the mode strings above (mode "0" behaves like "3")

    Returns the replacement of the first matching line (the text after
    '---', including the line ending), or `value` itself when nothing
    matches or the KB file does not exist.  In mode R every matching key
    is replaced inside the value and all lines are processed.
    """
    if not os.path.isfile(filename):
        # fall back to the directory holding the extraction template
        filename = "/".join(extract_tpl.split("/")[:-1]) + "/" + filename
    if not os.path.isfile(filename):
        return value

    kb_file = open(filename,"r")
    try:
        kb_lines = kb_file.readlines()
    finally:
        # the original never closed this handle
        kb_file.close()

    def contains(haystack, needle):
        # substring test expressed through split, as in the original code
        return len(haystack.split(needle)) > 1

    for line in kb_lines:
        code = line.split("---")
        if mode in ("2", "4", "6", "8", "R"):
            value_to_cmp = value.lower()
            code[0] = code[0].lower()
        else:
            value_to_cmp = value
        key = code[0]
        # NOTE(review): in the case-insensitive modes the key has been
        # lowercased above, so it never equals "_DEFAULT_" there - confirm
        # whether the default entry is meant to apply in those modes.
        is_default = (key == "_DEFAULT_")

        if mode in ("2", "3", "0"):
            hit = contains(value_to_cmp, key) or is_default
        elif mode in ("5", "6"):
            hit = contains(key, value_to_cmp) or is_default
        elif mode in ("7", "8", "R"):
            hit = (contains(key, value_to_cmp) or contains(value_to_cmp, key)
                   or is_default)
        elif mode == "9":
            hit = is_default
        else:
            # modes "1", "4" and any unrecognised mode: exact match
            hit = (key == value_to_cmp) or is_default

        if hit:
            if mode == "R":
                value = value.replace(code[0],code[1])
            else:
                return code[1]
    return value
def FormatField(value, fn):
    """
    bibconvert formatting functions:
    ================================
    ADD(prefix,suffix) - add prefix/suffix
    KB(kb_file,mode) - lookup in kb_file and replace value
    ABR(N,suffix) - abbreviate to N places with suffix
    ABRX() - abbreviate exclusively words longer
    ABRW() - abbreviate word (limit from right)
    REP(x,y) - replace
    SUP(type) - remove characters of certain TYPE
    LIM(n,side) - limit to n letters from L/R
    LIMW(string,side) - L/R after split on string
    WORDS(n,side) - limit to n words from L/R
    IF(value,valueT,valueF) - replace on IF condition
    MINL(n) - replace words shorter than n
    MINLW(n) - replace words shorter than n
    MAXL(n) - replace words longer than n
    EXPW(type) - replace word from value containing TYPE
    EXP(STR,0/1) - replace word from value containing string
    NUM() - take only digits in given string
    SHAPE() - remove extra space
    UP() - to uppercase
    DOWN() - to lowercase
    CAP() - make capitals each word
    SPLIT(n,h,str,from) - only for final Aleph field, i.e. AB , maintain whole words
    SPLITW(sep,h,str,from) - only for final Aleph field, split on string
    CONF(field,value,0/1) - confirm validity of output line (check other field)
    CONFL(substr,0/1) - confirm validity of output line (check field being processed)
    CUT(prefix,postfix) - remove substring from side
    RANGE(MIN,MAX) - select items in repetitive fields
    RE(regexp) - regular expressions
    bibconvert character TYPES
    ==========================
    ALPHA - alphabetic
    NALPHA - not alphabetic
    NUM - numeric
    NNUM - not numeric
    ALNUM - alphanumeric
    NALNUM - non alphanumeric
    LOWER - lowercase
    UPPER - uppercase
    PUNCT - punctual
    NPUNCT - non punctual
    SPACE - space

    value - the string to format
    fn    - formatting function call, e.g. 'REP(a,b)' or 'SHAPE'

    Keywords (see sub_keywd) are expanded in both the value and the
    parameters.  A parameter wrapped in '//' is treated as a regular
    expression by the functions that support it.  Returns the formatted
    string; an unrecognised function name returns `value` as passed in.
    """
    global data_parsed
    out = value
    # a trailing "()" lets get_pars() handle names given without parentheses
    fn = fn + "()"
    par = get_pars(fn)[1]
    fn = get_pars(fn)[0]
    regexp = "//"
    NRE = len(regexp)
    value = sub_keywd(value)
    par_tmp = []
    for item in par:
        item = sub_keywd(item)
        par_tmp.append(item)
    par = par_tmp
    if (fn == "RE"):
        new_value = ""
        par = set_par_defaults(par,".*,0")
        if (re.search(par[0],value) and (par[1] == "0")):
            new_value = value
        out = new_value
    if (fn == "KB"):
        new_value = ""
        par = set_par_defaults(par,"KB,0")
        new_value = crawl_KB(par[0],value,par[1])
        out = new_value
    elif (fn == "ADD"):
        par = set_par_defaults(par,",")
        out = par[0] + value + par[1]
    elif (fn == "ABR"):
        par = set_par_defaults(par,"1,.")
        out = value[:string.atoi(par[0])] + par[1]
    elif (fn == "ABRW"):
        tmp = FormatField(value,"ABR(1,.)")
        tmp = tmp.upper()
        out = tmp
    elif (fn == "ABRX"):
        par = set_par_defaults(par,",")
        toout = []
        tmp = value.split(" ")
        for wrd in tmp:
            if (len(wrd) > string.atoi(par[0])):
                wrd = wrd[:string.atoi(par[0])] + par[1]
            toout.append(wrd)
        out = string.join(toout," ")
    elif (fn == "SUP"):
        par = set_par_defaults(par,",")
        if(par[0]=="NUM"):
            out = re.sub('\d+',par[1],value)
        if(par[0]=="NNUM"):
            out = re.sub('\D+',par[1],value)
        if(par[0]=="ALPHA"):
            out = re.sub('[a-zA-Z]+',par[1],value)
        if(par[0]=="NALPHA"):
            out = re.sub('[^a-zA-Z]+',par[1],value)
        if((par[0]=="ALNUM")or(par[0]=="NPUNCT")):
            out = re.sub('\w+',par[1],value)
        if(par[0]=="NALNUM"):
            out = re.sub('\W+',par[1],value)
        if(par[0]=="PUNCT"):
            out = re.sub('\W+',par[1],value)
        if(par[0]=="LOWER"):
            out = re.sub('[a-z]+',par[1],value)
        if(par[0]=="UPPER"):
            out = re.sub('[A-Z]+',par[1],value)
        if(par[0]=="SPACE"):
            out = re.sub('\s+',par[1],value)
    elif (fn == "LIM"):
        par = set_par_defaults(par,",")
        if (par[1] == "L"):
            out = value[(len(value) - string.atoi(par[0])):]
        if (par[1] == "R"):
            out = value[:string.atoi(par[0])]
    elif (fn == "LIMW"):
        par = set_par_defaults(par,",")
        if (par[0]!= ""):
            if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp):
                par[0] = par[0][NRE:-NRE]
                par[0] = re.search(par[0],value).group()
        # NOTE(review): nesting of the following lines was reconstructed from
        # flattened source - confirm they belong outside the check above.
        tmp = value.split(par[0])
        if (par[1] == "L"):
            out = par[0] + tmp[1]
        if (par[1] == "R"):
            out = tmp[0] + par[0]
    elif (fn == "WORDS"):
        tmp2 = [value]
        par = set_par_defaults(par,",")
        if (par[1] == "R"):
            tmp = value.split(" ")
            tmp2 = []
            i = 0
            while (i < string.atoi(par[0])):
                tmp2.append(tmp[i])
                i = i + 1
        if (par[1] == "L"):
            tmp = value.split(" ")
            tmp.reverse()
            tmp2 = []
            i = 0
            while (i < string.atoi(par[0])):
                tmp2.append(tmp[i])
                i = i + 1
            tmp2.reverse()
        out = string.join(tmp2, " ")
    elif (fn == "MINL"):
        par = set_par_defaults(par,"1")
        tmp = value.split(" ")
        tmp2 = []
        i = 0
        for wrd in tmp:
            if (len(wrd) >= string.atoi(par[0])):
                tmp2.append(wrd)
        out = string.join(tmp2, " ")
    elif (fn == "MINLW"):
        par = set_par_defaults(par,"1")
        if (len(value) >= string.atoi(par[0])):
            out = value
        else:
            out = ""
    elif (fn == "MAXL"):
        par = set_par_defaults(par,"4096")
        tmp = value.split(" ")
        tmp2 = []
        i = 0
        for wrd in tmp:
            if (len(wrd) <= string.atoi(par[0])):
                tmp2.append(wrd)
        out = string.join(tmp2, " ")
    elif (fn == "REP"):
        # fix: the defaults used to be computed and then thrown away
        par = set_par_defaults(par,",")
        if (par[0]!= ""):
            if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp):
                par[0] = par[0][NRE:-NRE]
                # fix: re.sub needs (pattern, replacement, string); the
                # replacement argument was missing
                out = re.sub(par[0],par[1],value)
            else:
                out = value.replace(par[0],par[1])
    elif (fn == "SHAPE"):
        if (value != ""):
            out = value.strip()
    elif (fn == "UP"):
        out = value.upper()
    elif (fn == "DOWN"):
        out = value.lower()
    elif (fn == "CAP"):
        tmp = value.split(" ")
        out2 = []
        for wrd in tmp:
            wrd2 = wrd.capitalize()
            out2.append(wrd2)
        out = string.join(out2," ")
    elif (fn == "IF"):
        par = set_par_defaults(par,",,")
        N = 0
        while N < 3:
            if (par[N][0:NRE] == regexp and par[N][-NRE:] == regexp):
                par[N] = par[N][NRE:-NRE]
                par[N] = re.search(par[N],value).group()
            N += 1
        if (value == par[0]):
            out = par[1]
        else:
            out = par[2]
        # the literal result "ORIG" stands for the unchanged input value
        if (out == "ORIG"):
            out = value
    elif (fn == "EXP"):
        par = set_par_defaults(par,",0")
        if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp):
            par[0] = par[0][NRE:-NRE]
            par[0] = re.search(par[0],value).group()
        tmp = value.split(" ")
        out2 = []
        for wrd in tmp:
            if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp):
                par[0] = par[0][NRE:-NRE]
                if ((re.search(par[0],wrd).group() == wrd) and (par[1]=="1")):
                    out2.append(wrd)
                if ((re.search(par[0],wrd).group() != wrd) and (par[1]=="0")):
                    out2.append(wrd)
            else:
                if ((len(wrd.split(par[0])) == 1)and(par[1]=="1")):
                    out2.append(wrd)
                if ((len(wrd.split(par[0])) != 1)and(par[1]=="0")):
                    out2.append(wrd)
        out = string.join(out2," ")
    elif (fn == "EXPW"):
        par = set_par_defaults(par,",0")
        tmp = value.split(" ")
        out2 = []
        for wrd in tmp:
            if ((FormatField(wrd,"SUP(" + par[0] + ")") == wrd)and(par[1]=="1")):
                out2.append(wrd)
            if ((FormatField(wrd,"SUP(" + par[0] + ")") != wrd)and(par[1]=="0")):
                out2.append(wrd)
        out = string.join(out2," ")
    elif (fn == "SPLIT"):
        par = set_par_defaults(par,"%d,0,,1" % conv_setting[1])
        length = string.atoi(par[0]) + (string.atoi(par[1]))
        header = string.atoi(par[1])
        headerplus = par[2]
        starting = string.atoi(par[3])
        line = ""
        tmp2 = []
        tmp3 = []
        tmp = value.split(" ")
        linenumber = 1
        if (linenumber >= starting):
            tmp2.append(headerplus)
            line = line + headerplus
        for wrd in tmp:
            line = line + " " + wrd
            tmp2.append(wrd)
            if (len(line) > length):
                # line is full: move the last word over to a new line
                linenumber = linenumber + 1
                line = tmp2.pop()
                toout = string.join(tmp2)
                tmp3.append(toout)
                tmp2 = []
                line2 = value[:header]
                if (linenumber >= starting):
                    line3 = line2 + headerplus + line
                else:
                    line3 = line2 + line
                line = line3
                tmp2.append(line)
        tmp3.append(line)
        out = string.join(tmp3,"\n")
        out = FormatField(out,"SHAPE()")
    elif (fn == "SPLITW"):
        par = set_par_defaults(par,",0,,1")
        if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp):
            par[0] = par[0][NRE:-NRE]
        # fix: the original concatenated a match object (or, without a regexp
        # parameter, the builtin `str`) onto each piece, which raises
        # TypeError; use the text of the first separator match instead
        sep_match = re.search(par[0], value)
        if sep_match:
            sep_text = sep_match.group(0)
        else:
            sep_text = ""
        header = string.atoi(par[1])
        headerplus = par[2]
        starting = string.atoi(par[3])
        counter = 1
        tmp2 = []
        tmp = re.split(par[0],value)
        last = tmp.pop()
        for wrd in tmp:
            counter = counter + 1
            if (counter >= starting):
                tmp2.append(value[:header] + headerplus + wrd + sep_text)
            else:
                tmp2.append(value[:header] + wrd + sep_text)
        if (last != ""):
            counter = counter + 1
            if (counter >= starting):
                tmp2.append(value[:header] + headerplus + last)
            else:
                tmp2.append(value[:header] + last)
        out = string.join(tmp2,"\n")
    elif (fn == "CONF"):
        par = set_par_defaults(par,",,1")
        found = 0
        par1 = ""
        data = select_line(par[0],data_parsed)
        for line in data:
            if (par[1][0:NRE] == regexp and par[1][-NRE:] == regexp):
                par1 = par[1][NRE:-NRE]
            else:
                par1 = par[1]
            if (par1 == ""):
                if (line == ""):
                    found = 1
            elif (len(re.split(par1,line)) > 1 ):
                found = 1
        # par[2] == 1: keep the value when found; par[2] == 0: keep when not found
        if ((found == 1)and(string.atoi(par[2]) == 1)):
            out = value
        if ((found == 1)and(string.atoi(par[2]) == 0)):
            out = ""
        if ((found == 0)and(string.atoi(par[2]) == 1)):
            out = ""
        if ((found == 0)and(string.atoi(par[2]) == 0)):
            out = value
        return out
    elif (fn == "CONFL"):
        # fix: the defaults used to be computed and then thrown away
        par = set_par_defaults(par,",1")
        if (par[0][0:NRE] == regexp and par[0][-NRE:] == regexp):
            par[0] = par[0][NRE:-NRE]
        if (re.search(par[0],value)):
            if (string.atoi(par[1]) == 1):
                out = value
            else:
                out = ""
        else:
            if (string.atoi(par[1]) == 1):
                out = ""
            else:
                out = value
        return out
    elif (fn == "CUT"):
        par = set_par_defaults(par,",")
        left = value[:len(par[0])]
        right = value[-(len(par[1])):]
        if (left == par[0]):
            out = out[len(par[0]):]
        if (right == par[1]):
            out = out[:-(len(par[1]))]
        return out
    elif (fn == "NUM"):
        tmp = re.findall('\d',value)
        out = string.join(tmp,"")
    return out
def printInfo():
    "Print the usage summary; used when not enough parameters are given."
    usage = """
BibConvert data convertor
Usage: bibconvert [options] -ctemplate.cfg < input.dat
Options:
-c'config' configuration templates file
-d'directory' source_data fields are located in separated files in 'directory'one record)
-h print this help
-V print version number
-l'length' minimum line length (default = 1)
-o'value' OAI identifier starts with specified value (default = 1)
-b'file header' insert file header
-e'file footer' insert file footer
-B'record header' insert record header
-E'record footer' insert record footer
-s'record separator' record separator, default empty line (EOLEOL)
-m0'query_string' match records using query string, output unmatched
-m1'query_string' match records using query string, output matched
-m2'query_string' match records using query string, output ambiguous
-Cx'field extraction template' alternative to -c when configuration is split to several files
-Cs'source data template' alternative to -c when configuration is split to several files
-Ct'target data template' alternative to -c when configuration is split to several files
"""
    print(usage)
## Match records with the database content
##
def match_in_database(record, query_string):
    """Search the database for records matching the given (MARCXML) record.

    For every field of the parsed query string, the value of the first
    matching '<datafield ...><subfield ...>' element is taken from `record`,
    run through the field's formatting functions and used as a search
    pattern.  At most the first three pattern/field pairs are passed to
    perform_request_search(); its result is returned unchanged.
    """
    patterns = []
    fields = []
    for query_field in parse_query_string(query_string):
        tag_spec = query_field[0]
        ind1 = tag_spec[3:4]
        if ind1 == "_":
            ind1 = ""
        ind2 = tag_spec[4:5]
        if ind2 == "_":
            ind2 = ""
        marker = "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">" % (tag_spec[0:3], ind1, ind2, tag_spec[5:6])
        chunks = record.split(marker)
        if len(chunks) <= 1:
            # field not present in the record
            continue
        matching_value = chunks[1].split("<")[0]
        for fn in query_field[1:]:
            matching_value = FormatField(matching_value, fn)
        patterns.append(matching_value)
        fields.append(tag_spec)
    # pad so that three pattern/field pairs are always available
    patterns = patterns + ["", "", ""]
    fields = fields + ["", "", ""]
    return perform_request_search(p1=patterns[0],f1=fields[0],p2=patterns[1],f2=fields[1],p3=patterns[2],f3=fields[2])
def parse_query_string(query_string):
    """Split a match query string into per-field lists.

    Fields are separated by '||'; within a field, the field code and its
    formatting functions are separated by '::'.

    Example:
      Input:  245__a::REP(-, )::SHAPE||700__a::MINL(2)::REP(COMMA,)
      Output: [['245__a', 'REP(-, )', 'SHAPE'], ['700__a', 'MINL(2)', 'REP(COMMA,)']]
    """
    # NOTE(review): a function of the same name is also defined earlier in
    # this module; this later definition is the one in effect.
    parsed = []
    for field_part in query_string.split('||'):
        parsed.append(field_part.split('::'))
    return parsed
def exit_on_error(error_message):
    """Report `error_message` on standard error and terminate the program.

    Exits with status 1 so that calling scripts can detect the failure
    (a bare sys.exit() would report success).  This function never returns.
    """
    sys.stderr.write("\n bibconvert data convertor\n")
    sys.stderr.write(" Error: %s\n" % error_message)
    sys.exit(1)
def create_record(begin_record_header, ending_record_footer, query_string, match_mode, Xcount):
    """Create one output record from the currently parsed input data.

    Every entry of the module-level `target_tpl_parsed` produces zero or more
    output lines, filled from the module-level `data_parsed`.

    begin_record_header  - text put before the record when non-empty
    ending_record_footer - text put after the record when non-empty
    query_string         - when non-empty, the record is matched against the
                           database (see match_in_database)
    match_mode           - -1: always output; 0: output when no record
                           matched; 1: output when exactly one matched (its
                           id is prepended as controlfield 001); 2: output
                           when more than one matched
    Xcount               - record counter

    Returns the record text, or "" when the record is not to be output.
    """
    global data_parsed
    out_to_print = ""
    out = []
    field_data_item_LIST = []
    ssn5cnt = "%3d" % Xcount
    # NOTE(review): sysno and sysno500 are not declared global here, so these
    # two assignments bind locals that are not read again in this function.
    sysno = generate("DATE(%w%H%M%S)")
    sysno500 = generate("XDATE(%w%H%M%S)," + ssn5cnt)
    for T_tpl_item_LIST in target_tpl_parsed:
        # the line is printed only if the variables inside are not empty
        print_line = 0
        to_output = []
        # number of output lines for this template entry (longest value list)
        rows = 1
        for field_tpl_item_STRING in T_tpl_item_LIST[1]:
            DATA = []
            if (field_tpl_item_STRING[:2]=="<:"):
                # placeholder: strip the '<:' and ':>' delimiters
                field_tpl_item_STRING = field_tpl_item_STRING[2:-2]
                field = field_tpl_item_STRING.split("::")[0]
                if (len(field_tpl_item_STRING.split("::")) == 1):
                    # no subfield given: generated value (DATE(), OAI(), ...)
                    value = generate(field)
                    to_output.append([value])
                else:
                    subfield = field_tpl_item_STRING.split("::")[1]
                    # a trailing '*' marks a repetitive field
                    if (field[-1] == "*"):
                        repetitive = 1
                        field = field[:-1]
                    else:
                        repetitive = 0
                    if dirmode:
                        DATA = select_line(field,data_parsed)
                    else:
                        DATA = select_line(field,data_parsed)
                    # NOTE(review): nesting reconstructed from flattened
                    # source - confirm this check applies to both branches.
                    if (repetitive == 0):
                        DATA = [string.join(DATA," ")]
                    SRC_TPL = select_line(field,source_tpl_parsed)
                    try:
                        if (DATA[0] != ""):
                            DATA = get_subfields(DATA,subfield,SRC_TPL)
                            FF = field_tpl_item_STRING.split("::")
                            if (len(FF) > 2):
                                # everything after field::subfield is a formatting function
                                FF = FF[2:]
                                for fn in FF:
                                    # NOTE(review): the block below repeats the
                                    # subfield extraction and applies `fn` once
                                    # per entry of FF (fn2 is unused) - confirm
                                    # this is intended.
                                    if (len(DATA) != 0 and DATA[0] != ""):
                                        DATA = get_subfields(DATA,subfield,SRC_TPL)
                                        FF = field_tpl_item_STRING.split("::")
                                        if (len(FF) > 2):
                                            FF = FF[2:]
                                            for fn2 in FF:
                                                DATAFORMATTED = []
                                                for item in DATA:
                                                    item = FormatField(item,fn)
                                                    DATAFORMATTED.append(item)
                                                DATA = DATAFORMATTED
                            if (len(DATA) > rows):
                                rows = len(DATA)
                            if DATA != "":
                                print_line = 1
                            to_output.append(DATA)
                    except IndexError, e:
                        pass
            else:
                # constant text of the template
                to_output.append([field_tpl_item_STRING])
        current = 0
        default_print = 0
        while (current < rows):
            line_to_print = []
            for item in to_output:
                if (item==[]):
                    item =['']
                # lists shorter than the row index repeat their first element
                if (len(item) <= current):
                    printout = item[0]
                else:
                    printout = item[current]
                line_to_print.append(printout)
            output = exp_n(string.join(line_to_print,""))
            # functions following the field code apply to the whole output line
            global_formatting_functions = T_tpl_item_LIST[0].split("::")[1:]
            for GFF in global_formatting_functions:
                if (GFF[:5] == "RANGE"):
                    # blank the line when its 1-based row number is out of range
                    parR = get_pars(GFF)[1]
                    parR = set_par_defaults(parR,"MIN,MAX")
                    if (parR[0]!="MIN"):
                        if (string.atoi(parR[0]) > (current+1)):
                            output = ""
                    if (parR[1]!="MAX"):
                        if (string.atoi(parR[1]) < (current+1)):
                            output = ""
                elif (GFF[:4] == "DEFP"):
                    # DEFP: output the line even when the checks below fail
                    default_print = 1
                else:
                    output = FormatField(output,GFF)
            if ((len(output) > set_conv()[0] and print_line == 1) or default_print):
                out_to_print = out_to_print + output + "\n"
            current = current + 1
    # decide whether the record is to be output, depending on the match mode
    out_flag = 0
    if query_string:
        recID = match_in_database(out_to_print, query_string)
        if len(recID) == 1 and match_mode == 1:
            ctrlfield = "<controlfield tag=\"001\">%d</controlfield>" % (recID[0])
            out_to_print = ctrlfield + "\n" + out_to_print
            out_flag = 1
        if len(recID) == 0 and match_mode == 0:
            out_flag = 1
        if len(recID) > 1 and match_mode == 2:
            out_flag = 1
    if out_flag or match_mode == -1:
        if begin_record_header != "":
            out_to_print = begin_record_header + "\n" + out_to_print
        if ending_record_footer != "":
            out_to_print = out_to_print + "\n" + ending_record_footer
    else:
        out_to_print = ""
    return out_to_print
def convert(ar_):
    """Run the conversion and print the resulting records to standard output.

    `ar_` is a sequence of 22 settings which are unpacked into module-level
    variables (see the global statement for their order).

    In directory mode a single record is built from the files found under
    `source_data`.  Otherwise records are read from standard input until it
    is exhausted, framed by `begin_header` and `ending_footer`.
    """
    global dirmode, Xcount, conv_setting, sysno, sysno500, separator, tcounter, source_data, query_string, match_mode, begin_record_header ,ending_record_footer,output_rec_sep, begin_header, ending_footer, oai_identifier_from, source_tpl, source_tpl_parsed, target_tpl, target_tpl_parsed, extract_tpl, extract_tpl_parsed, data_parsed
    (dirmode, Xcount, conv_setting, sysno, sysno500, separator, tcounter,
     source_data, query_string, match_mode, begin_record_header,
     ending_record_footer, output_rec_sep, begin_header, ending_footer,
     oai_identifier_from, source_tpl, source_tpl_parsed, target_tpl,
     target_tpl_parsed, extract_tpl, extract_tpl_parsed) = ar_

    if dirmode:
        if not os.path.isdir(source_data):
            exit_on_error("Cannot access directory: %s" % source_data)
        else:
            data_parsed = parse_input_data_d(source_data,source_tpl)
            record = create_record(begin_record_header, ending_record_footer, query_string, match_mode, Xcount)
            if record != "":
                print(record)
                tcounter = tcounter + 1
                if output_rec_sep != "":
                    print(output_rec_sep)
        return

    print(begin_header)
    while True:
        data_parsed = parse_input_data_fx(source_tpl)
        if data_parsed == -1:
            # input exhausted
            break
        if data_parsed[0][0] == '':
            # nothing extracted for this record
            continue
        record = create_record(begin_record_header, ending_record_footer, query_string, match_mode, Xcount)
        Xcount += 1
        if record != "":
            print(record)
            tcounter = tcounter + 1
            if output_rec_sep != "":
                print(output_rec_sep)
    print(ending_footer)
    return

Event Timeline