Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90379951
bibconvert.wml
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Nov 1, 03:12
Size
49 KB
Mime Type
text/x-c
Expires
Sun, Nov 3, 03:12 (2 d)
Engine
blob
Format
Raw Data
Handle
22065148
Attached To
R3600 invenio-infoscience
bibconvert.wml
View Options
## $Id$
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
#include "cdswmllib.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
"""BibConvert tool to convert bibliographic records from any format to any format."""
__version__ = "<: print generate_pretty_version_string('$Id$'); :>"
## okay, rest of the Python code goes below
#######
pylibdir = "<LIBDIR>/python"
try:
import fileinput
import string
import os
import re
import sys
import time
import getopt
from time import gmtime, strftime, localtime
import os.path
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
try:
sys.path.append('%s' % pylibdir)
from cdsware.search_engine import perform_request_search
from cdsware.config import *
except ImportError, e:
print "Error: %s" % e
sys.exit(1)
<protect>
### Matching records with database content
def parse_query_string(query_string):
    """Split a match query string into one token list per field clause.

    The string is cut on '||' into field clauses; every clause is then cut
    on '::' into the field code followed by its formatting functions, e.g.:

    Input:  245__a::REP(-, )::SHAPE||700__a::MINL(2)::REP(COMMA,)
    Output: [['245__a', 'REP(-, )', 'SHAPE'],
             ['700__a', 'MINL(2)', 'REP(COMMA,)']]

    An empty query string yields [['']].

    NOTE: a function with the same name and the same behaviour is defined
    again further down in this file; the later definition is the one bound
    at run time.
    """
    return [clause.split('::') for clause in query_string.split('||')]
def set_conv():
    """
    bibconvert common settings
    =======================
    Returns a fresh two-element list:
    [0] minimal length of output line = 1
    [1] maximal length of output line = 4096
    """
    minimal_line_length = 1
    maximal_line_length = 4096
    return [minimal_line_length, maximal_line_length]
def get_pars(fn):
    """Split a function-call string into its name and its parameter list.

    fn is cut at every '(' and ')'.  The text before the first parenthesis
    is the function name; the text between the first pair of parentheses is
    split on ',' to give the parameters, e.g.:

    'REP(a,b)' -> ['REP', ['a', 'b']]
    'SHAPE()'  -> ['SHAPE', ['']]

    Raises IndexError when fn contains no parenthesis at all.
    """
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    pieces = re.split(r'\(|\)', fn)
    return [pieces[0], pieces[1].split(',')]
def append_to_output_file(filename, output):
    """Append output to the file filename, creating the file if needed.

    On IOError the problem is reported through exit_on_error().
    Returns 1.
    """
    try:
        out_file = open(filename, 'a')
        try:
            out_file.write(output)
        finally:
            # close the handle even when the write itself fails
            out_file.close()
    except IOError:
        exit_on_error("Cannot write into %s" % filename)
    return 1
def sub_keywd(out):
    """Replace bibconvert keywords in out by the characters they stand for.

    The substitutions are applied in this order:
    EOL -> newline, _CR_ -> carriage return, _LF_ -> newline,
    BSLASH -> backslash, COMMA -> ',', LEFTB -> '[', RIGHTB -> ']',
    LEFTP -> '(', RIGHTP -> ')'.

    The previous implementation additionally replaced a backslash by a
    backslash and a carriage return by a carriage return; both were
    identity substitutions with no effect and are left out here.
    NOTE(review): they were presumably meant to unescape literal
    two-character sequences - confirm the intent before adding them back.
    """
    substitutions = (
        ("EOL", "\n"),
        ("_CR_", "\r"),
        ("_LF_", "\n"),
        ("BSLASH", "\\"),
        ("COMMA", ","),
        ("LEFTB", "["),
        ("RIGHTB", "]"),
        ("LEFTP", "("),
        ("RIGHTP", ")"),
    )
    for keyword, literal in substitutions:
        out = out.replace(keyword, literal)
    return out
def check_split_on(data_item_split, sep, tpl_f):
    """
    bibconvert conditional split with following conditions
    ===================================================
    ::NEXT(N,TYPE,SIDE) - next N chars are of the TYPE having the separator on the SIDE
    ::PREV(N,TYPE,SIDE) - prev.N chars are of the TYPE having the separator on the SIDE

    data_item_split - two-element list [left, right] obtained by splitting
                      a value once on sep; it is modified in place
    sep             - the separator the value was split on
    tpl_f           - the condition, e.g. 'NEXT(2,NUM,L)'

    While the N characters next to the split point are not all of TYPE
    (or fewer than N characters are available), the split point is moved
    to the next occurrence of sep.  When no further separator exists the
    whole value ends up in the left part and the right part becomes "".

    Returns data_item_split.
    Raises ValueError when the condition is neither NEXT nor PREV or SIDE
    is neither 'L' nor 'R' (the previous code failed with an unbound local
    variable in that case).
    """
    fn, par = get_pars(tpl_f)
    side = par[2]
    char_type = par[1]
    width = int(par[0])

    # Which half of the split holds the characters to be tested.
    look_left = (fn == "NEXT" and side == "R") or (fn == "PREV" and side == "L")
    look_right = (fn == "NEXT" and side == "L") or (fn == "PREV" and side == "R")
    if not (look_left or look_right):
        raise ValueError("Unsupported split condition: %s" % tpl_f)

    while 1:
        if look_left:
            test_value = data_item_split[0][-width:]
        else:
            test_value = data_item_split[1][:width]
        # SUP(TYPE,) deletes every character of TYPE, so a non-empty
        # remainder means at least one tested character is not of TYPE.
        if not (FormatField(test_value, "SUP(" + char_type + ",)") != ""
                or len(test_value) < width):
            break
        remainder = data_item_split[1].split(sep, 1)
        data_item_split[0] = data_item_split[0] + sep + remainder[0]
        if len(remainder) == 1:
            # no separator left: everything now belongs to the left part
            data_item_split[1] = ""
            break
        data_item_split[1] = remainder[1]
    return data_item_split
def get_subfields(data,subfield,src_tpl):
"""Extract the value of one subfield from each data item, guided by the source template.

data     - list of field value strings
subfield - code of the subfield to extract
src_tpl  - parsed source template: "<:code:>" placeholder entries mixed with
           separator entries; a separator entry may carry a split condition
           after "::" which is handed to check_split_on()

Returns the list of extracted strings.
"""
out = []
for data_item in data:
# found becomes 1 once the placeholder of the requested subfield has been seen
found = 0
for src_tpl_item in src_tpl:
if (src_tpl_item[:2] == "<:"):
# placeholder entry: the code is the text between "<:" and ":>"
if (src_tpl_item[2:-2] == subfield):
found = 1
else:
# separator entry: "separator" or "separator::condition"
sep_in_list = src_tpl_item.split("::")
sep = sep_in_list[0]
data_item_split = data_item.split(sep,1)
if (len(data_item_split)==1):
# separator not present in the value: keep the value as it is
data_item = data_item_split[0]
else:
if (len(sep_in_list) > 1):
# a condition is attached: let check_split_on() adjust the split point
data_item_split = check_split_on(data_item.split(sep,1), sep_in_list[0],sep_in_list[1])
if(found == 1):
# the wanted subfield precedes this separator: keep the left part
data_item = data_item_split[0]
else:
# the wanted subfield comes later: drop the left part and continue with the rest
data_item = string.join(data_item_split[1:],sep)
out.append(data_item)
return out
def exp_n(word):
    """Return word with every newline and carriage return removed."""
    return word.replace('\n', '').replace('\r', '')
def exp_e(list):
    """Expunge empty elements from a list.

    Every element first has its newlines and carriage returns removed with
    exp_n(); elements that are empty afterwards are dropped.  Returns a new
    list of the cleaned, non-empty elements in their original order.

    The parameter keeps its historical name although it shadows the
    builtin.
    """
    cleaned = [exp_n(item) for item in list]
    # After exp_n() an element cannot contain CR or LF any more, so the
    # only remaining "empty" form is the empty string.
    return [item for item in cleaned if item != ""]
def sup_e(word):
    """Return word with every space character removed."""
    return word.replace(' ', '')
def select_line(field_code, list):
    """Return the data stored under field_code in a list of [code, data] pairs.

    Spaces are ignored when comparing codes.  As a side effect the code of
    every entry in list is rewritten in place without spaces.  When several
    entries match, the data of the last one is returned; when none matches,
    [''] is returned.

    The parameter keeps its historical name although it shadows the
    builtin.
    """
    # Normalise the requested code once instead of once per entry.
    wanted = sup_e(field_code)
    out = ['']
    for field in list:
        field[0] = sup_e(field[0])
        if field[0] == wanted:
            out = field[1]
    return out
def parse_field_definition(source_field_definition):
    """Cut a template field definition into constants and <:...:> placeholders.

    A definition that consists of exactly four '---'-separated parts is
    returned as those four parts.  Any other definition is tokenised on the
    '<:' and ':>' markers; a placeholder, including placeholders nested
    inside it, is kept together as one element, e.g.:

    'a<:b:>c' -> ['a', '<:b:>', 'c']

    Empty elements are kept (a definition starting with '<:' yields a
    leading ''), and text after an unbalanced '<:' is dropped.
    """
    parts = source_field_definition.split("---")
    if len(parts) == 4:
        return parts

    out = []
    word = ""
    depth = 0
    # The capturing group makes re.split() return the markers as tokens.
    for token in re.split(r'(<:|:>)', source_field_definition):
        word = word + token
        if token == "<:":
            depth = depth + 1
        elif token == ":>":
            depth = depth - 1
        if depth == 0:
            out.append(word)
            word = ""
    return out
def parse_template(template):
    """
    bibconvert parse template
    ======================
    in  - template filename
    out - [ [ field_code , [ field_template_parsed ] ] , ... ]

    Every line of the form 'field_code---definition' that does not start
    with '#' contributes one entry; the definition is cut up with
    parse_field_definition() and cleaned with exp_e().  All other lines are
    ignored.  The file is read with read_file(template, 1).
    """
    out = []
    for field_def in read_file(template, 1):
        pieces = field_def.split("---", 1)
        if len(pieces) > 1 and field_def[:1] != "#":
            field_tpl = exp_e(parse_field_definition(pieces[1]))
            out.append([pieces[0], field_tpl])
    return out
def parse_common_template(template, part):
    """
    bibconvert parse one section of a common template
    =================================================
    in  - template filename
          part: number of the section to read
    out - [ [ field_code , [ field_template_parsed ] ] , ... ]

    A line starting with '===' starts the next section; the section counter
    begins at 0, so the lines following the first '===' line form
    section 1.  Within the requested section every line of the form
    'field_code---definition' that does not start with '#' contributes one
    entry, parsed with parse_field_definition() and cleaned with exp_e().
    The file is read with read_file(template, 1).
    """
    out = []
    section = 0
    for field_def in read_file(template, 1):
        if exp_n(field_def)[:3] == "===":
            section = section + 1
        elif section == part:
            pieces = field_def.split("---", 1)
            if len(pieces) > 1 and field_def[:1] != "#":
                field_tpl = exp_e(parse_field_definition(pieces[1]))
                out.append([pieces[0], field_tpl])
    return out
def parse_input_data_f(source_data_open, source_tpl):
"""
bibconvert parse input data
========================
in - input source data location (filehandle)
source data template
source_field_code list of source field codes
source_field_data list of source field data values (repetitive fields each line one occurrence)
out - [ [ source_field_code , [ source_field_data ] ] , [] ]
      or -1 when the filehandle is exhausted
source_data_template entry - field_code---[const]<:subfield_code:>[const][<:subfield_code:>][]
destination_template entry - [::GFF()]---[const]<:field_code::subfield_code[::FF()]:>[]
input data file; by line: - fieldcode value

source_tpl is not referenced in the body, and the record separator is taken
from the module-level variable `separator`.
NOTE(review): no caller of this function appears in the visible part of this file.
"""
out = [['',[]]]
count = 0
values = []
# read lines until one equals the record separator once all whitespace is removed
while (count < 1):
line = source_data_open.readline()
if (line == ""):
# readline() returned nothing: end of input, reported as -1
return(-1)
# a data line is "fieldcode value": split on the first space only
line_split = line.split(" ",1)
if (re.sub("\s","",line) == separator):
count = count + 1
if (len(line_split) == 2):
field_code = line_split[0]
field_value = exp_n(line_split[1])
values.append([field_code,field_value])
# group consecutive values that share a field code
item_prev = ""
stack = ['']
for item in values:
if ((item[0]==item_prev)or(item_prev == "")):
stack.append(item[1])
item_prev = item[0]
else:
# the field code changed: store the finished group and start a new one
out.append([item_prev,stack])
item_prev = item[0]
stack = []
stack.append(item[1])
# NOTE(review): stack is seeded with '' and is only replaced when the field code
# changes, so for a record whose lines all share one field code stack[0] stays ''
# and the group below appears never to be stored - confirm this is intended.
try:
if (stack[0] != ""):
if (out[0][0]==""):
# drop the [['', []]] placeholder before storing real data
out = []
out.append([field_code,stack])
except IndexError, e:
out = out
return out
def parse_input_data_fx(source_tpl):
"""
bibconvert parse input data
========================
Reads one record from standard input and cuts it into fields according to
the module-level extraction template extract_tpl_parsed.

in  - source_tpl (not referenced in the body)
out - [ [ source_field_code , [ source_field_data ] ] , [] ]
      or -1 when standard input is exhausted and no record text was collected

Each entry of extract_tpl_parsed is [ field_code , [ start , end , split ] ]
where the elements of the inner list are optional: the text that starts a
field value, the text that ends it, and a pattern on which the value is
split into several occurrences. An element wrapped in // ... // is handled
as a regular expression where noted below.
"""
count = 0
record = ""
field_data_1_in_list = []
out = [['',[]]]
# collect lines from stdin into `record` until a separator line is met
while (count <10):
line = sys.stdin.readline()
if (line == ""):
# readline() returned nothing: end of input
count = count + 1
if (record == "" and count):
return (-1)
# a line equal to the module-level `separator` (whitespace ignored) ends the record
if (re.sub("\s","",line) == separator):
count = count + 10
else:
record = record + line
for field_defined in extract_tpl_parsed:
# expand keywords (COMMA, EOL, ...) in the template elements, in place;
# missing elements raise IndexError, which is deliberately ignored
try:
field_defined[1][0] = sub_keywd(field_defined[1][0])
field_defined[1][1] = sub_keywd(field_defined[1][1])
except IndexError, e:
field_defined = field_defined
try:
field_defined[1][2] = sub_keywd(field_defined[1][2])
except IndexError, e:
field_defined = field_defined
field_data_1 =""
# step 1: find the text that follows each occurrence of the start element
if ((field_defined[1][0][0:2] == '//') and (field_defined[1][0][-2:] == '//')):
field_defined_regexp = field_defined[1][0][2:-2]
try:
####
if (len(re.split(field_defined_regexp,record)) == 1):
field_data_1 = ""
field_data_1_in_list = []
else:
field_data_1_tmp = re.split(field_defined_regexp,record,1)[1]
# NOTE(review): the first cut above uses re.split(), but this one uses str.split()
# with the pattern text as a literal separator - confirm this is intended
field_data_1_in_list = field_data_1_tmp.split(field_defined_regexp)
except IndexError, e:
field_data_1 = ""
else:
try:
if (len(record.split(field_defined[1][0])) == 1):
field_data_1 = ""
field_data_1_in_list = []
else:
field_data_1_tmp = record.split(field_defined[1][0],1)[1]
field_data_1_in_list = field_data_1_tmp.split(field_defined[1][0])
except IndexError, e:
field_data_1 = ""
# step 2: build the list of strings that terminate a value
spliton = []
outvalue = ""
field_data_2 = ""
field_data = ""
try:
# NOTE(review): sub_keywd() has already been applied to this element, and it
# replaces "EOL" by a newline, so the "EOL" comparison looks unreachable;
# the final else branch produces the same ['\n'] for it
if ((field_defined[1][1])=="EOL"):
spliton = ['\n']
elif ((field_defined[1][1])=="MIN"):
spliton = ['\n']
elif ((field_defined[1][1])=="MAX"):
# MAX: a value is ended by the start element of any template entry
for item in extract_tpl_parsed:
try:
spliton.append(item[1][0])
except IndexError, e:
spliton = spliton
elif (field_defined[1][1][0:2] == '//') and (field_defined[1][1][-2:] == '//'):
spliton = [field_defined[1][1][2:-2]]
else:
spliton = [field_defined[1][1]]
except IndexError,e :
spliton = ""
# step 3: cut every candidate value at the terminators collected above
outvalues = []
for field_data in field_data_1_in_list:
outvalue = ""
for splitstring in spliton:
field_data_2 = ""
if (len(field_data.split(splitstring))==1):
if (outvalue == ""):
field_data_2 = field_data
else:
field_data_2 = outvalue
else:
field_data_2 = field_data.split(splitstring)[0]
outvalue = field_data_2
field_data = field_data_2
outvalues.append(outvalue)
outvalues = exp_e(outvalues)
if (len(outvalues) > 0):
if (out[0][0]==""):
# drop the [['', []]] placeholder before storing real data
out = []
outstack = []
# step 4: optional third element - split each value into several occurrences
if (len(field_defined[1])==3):
spliton = [field_defined[1][2]]
if (field_defined[1][2][0:2] == '//') and (field_defined[1][2][-2:] == '//'):
spliton = [field_defined[1][2][2:-2]]
for item in outvalues:
# re.split() is used here, so the element acts as a pattern even without // ... //
stack = re.split(spliton[0],item)
for stackitem in stack:
outstack.append(stackitem)
else:
outstack = outvalues
out.append([field_defined[0],outstack])
return out
def parse_input_data_d(source_data, source_tpl):
    """
    bibconvert parse input data (directory mode)
    ============================================
    in  - source_data: location of the input directory, used as a plain
          prefix of the per-field file names
          source_tpl: filename of the source data template
    out - [ [ source_field_code , [ source_field_data ] ] , ... ]

    For every line of the template, the text before the first '---' is the
    field code.  The values of the field are the non-empty lines of the
    file source_data + field_code; a missing data file gives an empty value
    list, whereas a missing template is reported by read_file().
    """
    parsed = []
    for template_line in read_file(source_tpl, 1):
        code = template_line.split("---")[0]
        values = exp_e(read_file(source_data + code, 0))
        parsed.append([code, values])
    return parsed
def sub_empty_lines(value):
    """Return value with every run of two or more consecutive newlines deleted."""
    blank_run = re.compile('\n\n+')
    return blank_run.sub('', value)
def set_par_defaults(par1, par2):
    """Fill missing or empty parameters with defaults.

    par1 - list of parameter strings as given
    par2 - comma-separated string of default values

    Returns a new list with exactly one element per default: the value from
    par1 when it exists at that position and is not "", the default
    otherwise.  Parameters in par1 beyond the number of defaults are
    dropped.
    """
    out = []
    for position, default in enumerate(par2.split(",")):
        if position < len(par1) and par1[position] != "":
            out.append(par1[position])
        else:
            out.append(default)
    return out
def generate(keyword):
    """
    bibconvert generated values:
    =========================
    SYSNO()           - the module-level sysno value
    WEEK(N)           - week number ('%V') shifted by N, two digits, '00' when negative
    DATE(format,len)  - current local time in the given strftime FORMAT,
                        cut to LEN characters (defaults: '%w%H%M%S' and conv_setting[1])
    VALUE(value)      - enter value literally
    OAI()             - oai identifier built from the configured prefix and
                        tcounter + oai_identifier_from (start value given at
                        command line as -o<value>)

    Any other keyword is returned unchanged.

    The parameter list used to be cut down to one element before the
    keyword was dispatched, which silently discarded the length parameter
    of DATE; the parameters are now passed on untouched.
    """
    fn, par = get_pars(keyword + "()")
    out = keyword
    if fn == "SYSNO":
        out = sysno
    elif fn == "WEEK":
        par = set_par_defaults(par, "0")
        out = "%02d" % (int(strftime("%V", localtime())) + int(par[0]))
        if int(out) < 0:
            out = "00"
    elif fn == "VALUE":
        par = set_par_defaults(par, "")
        out = par[0]
    elif fn == "DATE":
        par = set_par_defaults(par, "%w%H%M%S," + "%d" % conv_setting[1])
        out = strftime(par[0], localtime())
        out = out[:int(par[1])]
    elif fn == "OAI":
        # the prefix placeholder is filled in when the WML source is processed
        oai_prefix = "</protect><OAIIDPREFIX><protect>"
        out = "%s:%d" % (oai_prefix, tcounter + oai_identifier_from)
    return out
def read_file(filename, exception):
    """Read a file into a list of lines (line ends included).

    filename  - path of the file to read
    exception - when true, a missing file is reported through
                exit_on_error(); when false, a missing file yields []

    Returns the list of lines.
    """
    out = []
    if os.path.isfile(filename):
        in_file = open(filename, 'r')
        try:
            out = in_file.readlines()
        finally:
            # close the handle even when reading fails
            in_file.close()
    elif exception:
        exit_on_error("Cannot access file: %s" % filename)
    return out
def crawl_KB(filename, value, mode):
    """
    bibconvert look-up value in KB_file in one of following modes:
    ===========================================================
    1 - case sensitive / match (also used for any unlisted mode)
    2 - not case sensitive / search
    3 - case sensitive / search (mode 0 behaves the same)
    4 - not case sensitive / match
    5 - case sensitive / search (in KB)
    6 - not case sensitive / search (in KB)
    7 - case sensitive / search (reciprocal)
    8 - not case sensitive / search (reciprocal)
    9 - replace by _DEFAULT_ only
    R - not case sensitive / search (reciprocal) (8) replace

    The knowledge base holds one 'key---replacement' entry per line.  The
    entries are tried in file order and the replacement of the first
    matching entry is returned exactly as stored, i.e. including the line
    end.  An entry whose key is _DEFAULT_ matches every value.  In mode R
    nothing is returned early: for every matching entry the lowercased key
    is replaced by the replacement inside value, and the final value is
    returned.

    When filename is not an existing file it is looked up next to the
    extraction template (module-level extract_tpl).  When no file is found
    or no entry matches, value is returned unchanged.

    Fixes over the previous version: the file handle is closed, and the
    _DEFAULT_ entry is recognised in the case-insensitive modes as well
    (the key used to be lowercased before it was compared with
    '_DEFAULT_', so the comparison could never succeed).
    """
    if not os.path.isfile(filename):
        pathtmp = extract_tpl.split("/")
        pathtmp.pop()
        filename = "/".join(pathtmp) + "/" + filename
    if not os.path.isfile(filename):
        return value

    kb_file = open(filename, "r")
    try:
        kb_lines = kb_file.readlines()
    finally:
        kb_file.close()

    ignore_case = mode in ("2", "4", "6", "8", "R")
    for line in kb_lines:
        code = line.split("---")
        # test the default marker on the key as written in the file
        is_default = (code[0] == "_DEFAULT_")
        if ignore_case:
            wanted = value.lower()
            key = code[0].lower()
        else:
            wanted = value
            key = code[0]

        if mode in ("2", "3", "0"):
            # search: the key occurs inside the value
            matched = len(wanted.split(key)) > 1
        elif mode in ("5", "6"):
            # search in KB: the value occurs inside the key
            matched = len(key.split(wanted)) > 1
        elif mode in ("7", "8", "R"):
            # reciprocal: either one occurs inside the other
            matched = len(key.split(wanted)) > 1 or len(wanted.split(key)) > 1
        elif mode == "9":
            matched = 0
        else:
            # exact match: mode 4, mode 1 and every unlisted mode
            matched = (key == wanted)

        if matched or is_default:
            if mode == "R":
                value = value.replace(key, code[1])
            else:
                return code[1]
    return value
def _regexp_par_to_match(par_value, value):
    """Resolve a //regexp// parameter to the text it matches in value.

    A parameter that is not wrapped in '//' is returned unchanged.  A
    wrapped one is unwrapped and replaced by its first match in value; when
    nothing matches, the unwrapped pattern text is returned.
    """
    if par_value[0:2] == "//" and par_value[-2:] == "//":
        par_value = par_value[2:-2]
        found = re.search(par_value, value)
        if found is not None:
            par_value = found.group()
    return par_value


def FormatField(value, fn):
    """
    bibconvert formatting functions:
    ================================
    ADD(prefix,suffix)      - add prefix/suffix
    KB(kb_file,mode)        - lookup in kb_file and replace value
    ABR(N,suffix)           - abbreviate to N places with suffix
    ABRX()                  - abbreviate exclusively words longer
    ABRW()                  - abbreviate word (limit from right)
    REP(x,y)                - replace
    SUP(type)               - remove characters of certain TYPE
    LIM(n,side)             - limit to n letters from L/R
    LIMW(string,side)       - L/R after split on string
    WORDS(n,side)           - limit to n words from L/R
    IF(value,valueT,valueF) - replace on IF condition
    MINL(n)                 - replace words shorter than n
    MINLW(n)                - replace words shorter than n
    MAXL(n)                 - replace words longer than n
    EXPW(type)              - replace word from value containing TYPE
    EXP(STR,0/1)            - replace word from value containing string
    NUM()                   - take only digits in given string
    SHAPE()                 - remove extra space
    UP()                    - to uppercase
    DOWN()                  - to lowercase
    CAP()                   - make capitals each word
    SPLIT(n,h,str,from)     - only for final Aleph field, i.e. AB , maintain whole words
    SPLITW(sep,h,str,from)  - only for final Aleph field, split on string
    CONF(field,value,0/1)   - confirm validity of output line (check other field)
    CONFL(substr,0/1)       - confirm validity of output line (check field being processed)
    CUT(prefix,postfix)     - remove substring from side
    RANGE(MIN,MAX)          - select items in repetitive fields
    RE(regexp)              - regular expressions

    bibconvert character TYPES
    ==========================
    ALPHA  - alphabetic
    NALPHA - not alphabetic
    NUM    - numeric
    NNUM   - not numeric
    ALNUM  - alphanumeric
    NALNUM - non alphanumeric
    LOWER  - lowercase
    UPPER  - uppercase
    PUNCT  - punctual
    NPUNCT - non punctual
    SPACE  - space

    value - the string to format
    fn    - the formatting function with its parameters, e.g. 'REP(a,b)';
            a bare name such as 'SHAPE' is accepted as well

    Keywords (COMMA, EOL, ...) in value and in the parameters are expanded
    with sub_keywd() before the function is applied.  A parameter wrapped
    in // ... // is treated as a regular expression by the functions that
    support it.  An unknown function returns value unchanged.  CONF reads
    the module-level data_parsed; SPLIT reads conv_setting.

    Returns the formatted string.

    Fixes over the previous version:
    - REP with a regular expression called re.sub() without the
      replacement argument, and REP/CONFL threw away their defaults;
    - SPLITW appended a match object instead of the matched separator;
    - LIM(n,L) returned a wrong tail when value was shorter than n;
    - a regular expression parameter that matches nothing no longer
      raises AttributeError in LIMW, IF and EXP;
    - LIMW with an empty separator leaves value unchanged.
    """
    out = value
    fn, par = get_pars(fn + "()")
    regexp = "//"
    NRE = len(regexp)
    value = sub_keywd(value)
    par = [sub_keywd(item) for item in par]

    if fn == "RE":
        par = set_par_defaults(par, ".*,0")
        if re.search(par[0], value) and par[1] == "0":
            out = value
        else:
            out = ""

    elif fn == "KB":
        par = set_par_defaults(par, "KB,0")
        out = crawl_KB(par[0], value, par[1])

    elif fn == "ADD":
        par = set_par_defaults(par, ",")
        out = par[0] + value + par[1]

    elif fn == "ABR":
        par = set_par_defaults(par, "1,.")
        out = value[:int(par[0])] + par[1]

    elif fn == "ABRW":
        out = FormatField(value, "ABR(1,.)").upper()

    elif fn == "ABRX":
        par = set_par_defaults(par, ",")
        limit = int(par[0])
        words = []
        for wrd in value.split(" "):
            if len(wrd) > limit:
                wrd = wrd[:limit] + par[1]
            words.append(wrd)
        out = " ".join(words)

    elif fn == "SUP":
        par = set_par_defaults(par, ",")
        # NOTE(review): PUNCT and NALNUM share one pattern, as do ALNUM and
        # NPUNCT; kept as found.
        type_patterns = {
            "NUM": r'\d+',
            "NNUM": r'\D+',
            "ALPHA": r'[a-zA-Z]+',
            "NALPHA": r'[^a-zA-Z]+',
            "ALNUM": r'\w+',
            "NPUNCT": r'\w+',
            "NALNUM": r'\W+',
            "PUNCT": r'\W+',
            "LOWER": r'[a-z]+',
            "UPPER": r'[A-Z]+',
            "SPACE": r'\s+',
        }
        if par[0] in type_patterns:
            out = re.sub(type_patterns[par[0]], par[1], value)

    elif fn == "LIM":
        par = set_par_defaults(par, ",")
        if par[1] == "L":
            # clamp at 0: a negative start would count from the end again
            out = value[max(len(value) - int(par[0]), 0):]
        if par[1] == "R":
            out = value[:int(par[0])]

    elif fn == "LIMW":
        par = set_par_defaults(par, ",")
        if par[0] != "":
            par[0] = _regexp_par_to_match(par[0], value)
            tmp = value.split(par[0])
            if par[1] == "L":
                out = par[0] + tmp[1]
            if par[1] == "R":
                out = tmp[0] + par[0]

    elif fn == "WORDS":
        par = set_par_defaults(par, ",")
        words = [value]
        if par[1] == "R":
            tmp = value.split(" ")
            words = []
            # indexing (not slicing) on purpose: too few words raise IndexError
            for i in range(int(par[0])):
                words.append(tmp[i])
        if par[1] == "L":
            tmp = value.split(" ")
            tmp.reverse()
            words = []
            for i in range(int(par[0])):
                words.append(tmp[i])
            words.reverse()
        out = " ".join(words)

    elif fn == "MINL":
        par = set_par_defaults(par, "1")
        limit = int(par[0])
        out = " ".join([wrd for wrd in value.split(" ") if len(wrd) >= limit])

    elif fn == "MINLW":
        par = set_par_defaults(par, "1")
        if len(value) >= int(par[0]):
            out = value
        else:
            out = ""

    elif fn == "MAXL":
        par = set_par_defaults(par, "4096")
        limit = int(par[0])
        out = " ".join([wrd for wrd in value.split(" ") if len(wrd) <= limit])

    elif fn == "REP":
        par = set_par_defaults(par, ",")
        if par[0] != "":
            if par[0][0:NRE] == regexp and par[0][-NRE:] == regexp:
                par[0] = par[0][NRE:-NRE]
                out = re.sub(par[0], par[1], value)
            else:
                out = value.replace(par[0], par[1])

    elif fn == "SHAPE":
        if value != "":
            out = value.strip()

    elif fn == "UP":
        out = value.upper()

    elif fn == "DOWN":
        out = value.lower()

    elif fn == "CAP":
        out = " ".join([wrd.capitalize() for wrd in value.split(" ")])

    elif fn == "IF":
        par = set_par_defaults(par, ",,")
        for position in range(3):
            par[position] = _regexp_par_to_match(par[position], value)
        if value == par[0]:
            out = par[1]
        else:
            out = par[2]
        if out == "ORIG":
            out = value

    elif fn == "EXP":
        par = set_par_defaults(par, ",0")
        par[0] = _regexp_par_to_match(par[0], value)
        kept = []
        for wrd in value.split(" "):
            if par[0][0:NRE] == regexp and par[0][-NRE:] == regexp:
                # only reached when the parameter is still wrapped in //
                # after the unwrapping above
                par[0] = par[0][NRE:-NRE]
                found = re.search(par[0], wrd)
                whole_word = found is not None and found.group() == wrd
                if whole_word and par[1] == "1":
                    kept.append(wrd)
                if not whole_word and par[1] == "0":
                    kept.append(wrd)
            else:
                contains = len(wrd.split(par[0])) != 1
                if not contains and par[1] == "1":
                    kept.append(wrd)
                if contains and par[1] == "0":
                    kept.append(wrd)
        out = " ".join(kept)

    elif fn == "EXPW":
        par = set_par_defaults(par, ",0")
        kept = []
        for wrd in value.split(" "):
            unchanged = FormatField(wrd, "SUP(" + par[0] + ")") == wrd
            if unchanged and par[1] == "1":
                kept.append(wrd)
            if not unchanged and par[1] == "0":
                kept.append(wrd)
        out = " ".join(kept)

    elif fn == "SPLIT":
        par = set_par_defaults(par, "%d,0,,1" % conv_setting[1])
        length = int(par[0]) + int(par[1])
        header = int(par[1])
        headerplus = par[2]
        starting = int(par[3])
        line = ""
        pending = []
        finished = []
        linenumber = 1
        if linenumber >= starting:
            pending.append(headerplus)
            line = line + headerplus
        for wrd in value.split(" "):
            line = line + " " + wrd
            pending.append(wrd)
            if len(line) > length:
                # the line overflowed: close it and carry the last word over
                linenumber = linenumber + 1
                line = pending.pop()
                finished.append(" ".join(pending))
                if linenumber >= starting:
                    line = value[:header] + headerplus + line
                else:
                    line = value[:header] + line
                pending = [line]
        finished.append(line)
        out = "\n".join(finished)
        out = FormatField(out, "SHAPE()")

    elif fn == "SPLITW":
        par = set_par_defaults(par, ",0,,1")
        if par[0][0:NRE] == regexp and par[0][-NRE:] == regexp:
            par[0] = par[0][NRE:-NRE]
        # text of the first separator match, re-appended to every piece
        sep_match = re.search(par[0], value)
        if sep_match is not None:
            sep_text = sep_match.group()
        else:
            sep_text = ""
        header = int(par[1])
        headerplus = par[2]
        starting = int(par[3])
        counter = 1
        lines = []
        pieces = re.split(par[0], value)
        last = pieces.pop()
        for wrd in pieces:
            counter = counter + 1
            if counter >= starting:
                lines.append(value[:header] + headerplus + wrd + sep_text)
            else:
                lines.append(value[:header] + wrd + sep_text)
        if last != "":
            counter = counter + 1
            if counter >= starting:
                lines.append(value[:header] + headerplus + last)
            else:
                lines.append(value[:header] + last)
        out = "\n".join(lines)

    elif fn == "CONF":
        par = set_par_defaults(par, ",,1")
        if par[1][0:NRE] == regexp and par[1][-NRE:] == regexp:
            wanted = par[1][NRE:-NRE]
        else:
            wanted = par[1]
        found = 0
        for line in select_line(par[0], data_parsed):
            if wanted == "":
                if line == "":
                    found = 1
            elif len(re.split(wanted, line)) > 1:
                found = 1
        keep_when_found = int(par[2])
        if keep_when_found == 1:
            if found:
                out = value
            else:
                out = ""
        elif keep_when_found == 0:
            if found:
                out = ""
            else:
                out = value

    elif fn == "CONFL":
        par = set_par_defaults(par, ",1")
        if par[0][0:NRE] == regexp and par[0][-NRE:] == regexp:
            par[0] = par[0][NRE:-NRE]
        keep_when_found = (int(par[1]) == 1)
        if re.search(par[0], value):
            found = 1
        else:
            found = 0
        if found == keep_when_found:
            out = value
        else:
            out = ""

    elif fn == "CUT":
        par = set_par_defaults(par, ",")
        left = value[:len(par[0])]
        right = value[-(len(par[1])):]
        if left == par[0]:
            out = out[len(par[0]):]
        if right == par[1]:
            out = out[:-(len(par[1]))]

    elif fn == "NUM":
        out = "".join(re.findall(r'\d', value))

    return out
def printInfo():
"Print the command-line usage summary of bibconvert to standard output."
print """
BibConvert data convertor
Usage: bibconvert [options] -ctemplate.cfg < input.dat
Options:
-c'config' configuration templates file
-d'directory' source_data fields are located in separated files in 'directory'one record)
-h print this help
-V print version number
-l'length' minimum line length (default = 1)
-o'value' OAI identifier starts with specified value (default = 1)
-b'file header' insert file header
-e'file footer' insert file footer
-B'record header' insert record header
-E'record footer' insert record footer
-s'record separator' record separator, default empty line (EOLEOL)
-m0'query_string' match records using query string, output unmatched
-m1'query_string' match records using query string, output matched
-m2'query_string' match records using query string, output ambiguous
-Cx'field extraction template' alternative to -c when configuration is split to several files
-Cs'source data template' alternative to -c when configuration is split to several files
-Ct'target data template' alternative to -c when configuration is split to several files
"""
## Match records with the database content
##
def match_in_database(record, query_string):
    """Search the database for records matching fields of the given record.

    record       - the record as text; the values are located by searching
                   for '<datafield ...><subfield code="x">' start tags
    query_string - field clauses as understood by parse_query_string():
                   a six-character field code (tag, two indicators with '_'
                   for blank, subfield code) followed by the formatting
                   functions applied to the value before searching

    For every clause whose field occurs in the record, the text between the
    start tag and the next '<' is taken, run through the formatting
    functions with FormatField(), and used as a search pattern for that
    field.  Only the first three pattern/field pairs are passed on to
    perform_request_search(); unused slots are empty strings.

    Returns whatever perform_request_search() returns (a list of record
    ids, judging by the way the result is used in create_record()).
    """
    search_pattern = []
    search_field = []
    for query_field in parse_query_string(query_string):
        field_code = query_field[0]
        ind1 = field_code[3:4]
        if ind1 == "_":
            ind1 = ""
        ind2 = field_code[4:5]
        if ind2 == "_":
            ind2 = ""
        stringsplit = "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">" % (field_code[0:3], ind1, ind2, field_code[5:6])
        record1 = record.split(stringsplit)
        if len(record1) > 1:
            matching_value = record1[1].split("<")[0]
            for fn in query_field[1:]:
                matching_value = FormatField(matching_value, fn)
            search_pattern.append(matching_value)
            search_field.append(field_code)

    # make sure three slots exist even when fewer fields were found
    search_field = search_field + ["", "", ""]
    search_pattern = search_pattern + ["", "", ""]
    recID_list = perform_request_search(p1=search_pattern[0], f1=search_field[0],
                                        p2=search_pattern[1], f2=search_field[1],
                                        p3=search_pattern[2], f3=search_field[2])
    return recID_list
def parse_query_string(query_string):
    """Split a match query string into one token list per field clause.

    The string is cut on '||' into field clauses; every clause is then cut
    on '::' into the field code followed by its formatting functions, e.g.:

    Input:  245__a::REP(-, )::SHAPE||700__a::MINL(2)::REP(COMMA,)
    Output: [['245__a', 'REP(-, )', 'SHAPE'],
             ['700__a', 'MINL(2)', 'REP(COMMA,)']]

    An empty query string yields [['']].

    NOTE: a function with the same name and the same behaviour is also
    defined earlier in this file; this later definition is the one bound at
    run time.
    """
    return [clause.split('::') for clause in query_string.split('||')]
def exit_on_error(error_message):
    """Write error_message to standard error and terminate the program.

    The process exits with status 1 so that callers can detect the failure
    (the previous version exited with status 0).  Never returns.
    """
    sys.stderr.write("\n bibconvert data convertor\n")
    sys.stderr.write(" Error: %s\n" % error_message)
    sys.exit(1)
def create_record(begin_record_header, ending_record_footer, query_string, match_mode):
"""Create the output text of the current record from the parsed target template.

begin_record_header  - text put before the record when not empty
ending_record_footer - text put after the record when not empty
query_string         - when not empty, the record text is looked up with
                       match_in_database()
match_mode           - -1: always output the record;
                       0 / 1 / 2: output it only when the lookup returned
                       no / exactly one / more than one record id

Reads the module-level target_tpl_parsed, source_tpl_parsed, data_parsed,
dirmode and conv_setting.

Returns the record text, or "" when the record is filtered out.
"""
out_to_print = ""
out = []
field_data_item_LIST = []
# one pass per line of the target template
for T_tpl_item_LIST in target_tpl_parsed:
# the line is printed only if the variables inside are not empty
print_line = 0
to_output = []
# number of output rows for this template line (more than 1 for repetitive fields)
rows = 1
for field_tpl_item_STRING in T_tpl_item_LIST[1]:
DATA = []
if (field_tpl_item_STRING[:2]=="<:"):
# placeholder: strip the "<:" and ":>" markers; the content is field[::subfield[::FF()...]]
field_tpl_item_STRING = field_tpl_item_STRING[2:-2]
field = field_tpl_item_STRING.split("::")[0]
if (len(field_tpl_item_STRING.split("::")) == 1):
# no subfield given: the placeholder is a generated value (see generate())
value = generate(field)
to_output.append([value])
else:
subfield = field_tpl_item_STRING.split("::")[1]
# a trailing "*" on the field code marks a repetitive field
if (field[-1] == "*"):
repetitive = 1
field = field[:-1]
else:
repetitive = 0
# NOTE(review): both branches of this test run the same statement
if dirmode:
DATA = select_line(field,data_parsed)
else:
DATA = select_line(field,data_parsed)
if (repetitive == 0):
# non-repetitive field: merge all occurrences into one value
DATA = [string.join(DATA," ")]
SRC_TPL = select_line(field,source_tpl_parsed)
try:
if (DATA[0] != ""):
DATA = get_subfields(DATA,subfield,SRC_TPL)
# everything after field::subfield is a chain of formatting functions
FF = field_tpl_item_STRING.split("::")
if (len(FF) > 2):
FF = FF[2:]
for fn in FF:
# DATAFORMATTED = []
# NOTE(review): the statements from here down to the inner loop repeat the
# subfield extraction and the FF computation above, and the inner loop
# iterates fn2 but formats with fn - this looks like duplicated code; confirm
if (len(DATA) != 0 and DATA[0] != ""):
DATA = get_subfields(DATA,subfield,SRC_TPL)
FF = field_tpl_item_STRING.split("::")
if (len(FF) > 2):
FF = FF[2:]
for fn2 in FF:
DATAFORMATTED = []
for item in DATA:
item = FormatField(item,fn)
DATAFORMATTED.append(item)
DATA = DATAFORMATTED
if (len(DATA) > rows):
rows = len(DATA)
# NOTE(review): DATA is a list at this point, so comparing it with "" is presumably always true
if DATA != "":
print_line = 1
to_output.append(DATA)
except IndexError, e:
# an IndexError raised while extracting or formatting drops this placeholder silently
pass
else:
# constant text of the template line
to_output.append([field_tpl_item_STRING])
current = 0
default_print = 0
# assemble one output line per row
while (current < rows):
line_to_print = []
for item in to_output:
if (item==[]):
item =['']
# items with fewer values than rows repeat their first value
if (len(item) <= current):
printout = item[0]
else:
printout = item[current]
line_to_print.append(printout)
output = exp_n(string.join(line_to_print,""))
# global formatting functions follow the field code of the template line after "::"
global_formatting_functions = T_tpl_item_LIST[0].split("::")[1:]
for GFF in global_formatting_functions:
if (GFF[:5] == "RANGE"):
# RANGE(MIN,MAX): blank the rows whose 1-based number is outside the range
parR = get_pars(GFF)[1]
parR = set_par_defaults(parR,"MIN,MAX")
if (parR[0]!="MIN"):
if (string.atoi(parR[0]) > (current+1)):
output = ""
if (parR[1]!="MAX"):
if (string.atoi(parR[1]) < (current+1)):
output = ""
elif (GFF[:4] == "DEFP"):
# DEFP: print the line even when it would otherwise be suppressed
default_print = 1
else:
output = FormatField(output,GFF)
# conv_setting[0] is the minimal line length; the comparison is strict
if ((len(output) > conv_setting[0] and print_line == 1) or default_print):
out_to_print = out_to_print + output + "\n"
current = current + 1
###
# decide whether the record is output, depending on the database lookup
out_flag = 0
if query_string:
recID = match_in_database(out_to_print, query_string)
if len(recID) == 1 and match_mode == 1:
# unique match: prepend the matched record id as controlfield 001
ctrlfield = "<controlfield tag=\"001\">%d</controlfield>" % (recID[0])
out_to_print = ctrlfield + "\n" + out_to_print
out_flag = 1
if len(recID) == 0 and match_mode == 0:
out_flag = 1
if len(recID) > 1 and match_mode == 2:
out_flag = 1
if out_flag or match_mode == -1:
if begin_record_header != "":
out_to_print = begin_record_header + "\n" + out_to_print
if ending_record_footer != "":
out_to_print = out_to_print + "\n" + ending_record_footer
else:
out_to_print = ""
return out_to_print
### MAIN ###
# Script entry: set the defaults, parse the command line, then convert either one
# record made of per-field files (directory mode) or a stream of records from stdin.
conv_setting = set_conv()
sysno = generate("DATE(%w%H%M%S)")
separator = ""
# number of records written so far; used by generate() for OAI identifiers
tcounter = 0
source_data = ""
query_string = ""
# -1 means no database matching: every record is output
match_mode = -1
begin_record_header = ""
ending_record_footer = ""
output_rec_sep = ""
begin_header = ""
ending_footer = ""
oai_identifier_from = 1
# NOTE(review): the long option names carry no trailing "=", so according to the
# getopt documentation they would not accept a value - confirm they are usable
opts, args = getopt.getopt(sys.argv[1:],"c:d:hVl:o:b:e:B:E:s:m:C:",
[
"config",
"directory",
"help",
"version",
"length",
"oai",
"header",
"footer",
"record-header",
"record-footer",
"separator",
"match",
"config-alt"
])
# get options and arguments
dirmode = 0
for opt, opt_value in opts:
if opt in ["-c", "--config"]:
# a single configuration file holds the extraction, source and target
# templates as its sections 1, 2 and 3
extract_tpl = opt_value
extract_tpl_parsed = parse_common_template(extract_tpl,1)
source_tpl = opt_value
source_tpl_parsed = parse_common_template(source_tpl,2)
target_tpl = opt_value
target_tpl_parsed = parse_common_template(target_tpl,3)
elif opt in ["-d", "--directory"]:
source_data = opt_value
source_data = source_data + "/"
extract_tpl = "/"
dirmode = 1
elif opt in ["-h", "--help"]:
printInfo()
sys.exit(0)
elif opt in ["-V", "--version"]:
print __version__
sys.exit(0)
elif opt in ["-l", "--length"]:
# a value that is not a number falls back to the default of 1
try:
conv_setting[0] = string.atoi(opt_value)
except ValueError, e:
conv_setting[0] = 1
elif opt in ["-o", "--oai"]:
try:
oai_identifier_from = string.atoi(opt_value)
except ValueError, e:
oai_identifier_from = 1
elif opt in ["-b", "--header"]:
begin_header = opt_value
elif opt in ["-e", "--footer"]:
ending_footer = opt_value
elif opt in ["-B", "--record-header"]:
begin_record_header = opt_value
elif opt in ["-E", "--record-footer"]:
ending_record_footer = opt_value
elif opt in ["-s", "--separator"]:
separator = opt_value
# NOTE(review): neither "t" nor "output_separator" is declared in the getopt call
# above, so this branch looks unreachable - confirm
elif opt in ["-t", "--output_separator"]:
output_rec_sep = opt_value
elif opt in ["-m", "--match"]:
# the first character is the match mode, the rest is the query string
match_mode = string.atoi(opt_value[0:1])
query_string = opt_value[1:]
elif opt in ["-C", "--config-alt"]:
# the first character selects which template the file holds: x, t or s
if opt_value[0:1] == "x":
extract_tpl = opt_value[1:]
extract_tpl_parsed = parse_template(extract_tpl)
if opt_value[0:1] == "t":
target_tpl = opt_value[1:]
target_tpl_parsed = parse_template(target_tpl)
if opt_value[0:1] == "s":
source_tpl = opt_value[1:]
source_tpl_parsed = parse_template(source_tpl)
if dirmode:
# directory mode: the directory holds one record, one file per field
if (os.path.isdir(source_data)):
data_parsed = parse_input_data_d(source_data,source_tpl)
record = create_record(begin_record_header, ending_record_footer, query_string, match_mode)
if record != "":
print record
tcounter = tcounter + 1
if output_rec_sep != "":
print output_rec_sep
else:
exit_on_error("Cannot access directory: %s" % source_data)
else:
# stream mode: read records from stdin until parse_input_data_fx() reports -1
done = 0
print begin_header
while (done == 0):
data_parsed = parse_input_data_fx(source_tpl)
if (data_parsed == -1):
done = 1
else:
# a first field code of '' means no field was extracted from this record
if (data_parsed[0][0]!= ''):
record = create_record(begin_record_header, ending_record_footer, query_string, match_mode)
if record != "":
print record
tcounter = tcounter + 1
if output_rec_sep != "":
print output_rec_sep
print ending_footer
</protect>
Event Timeline
Log In to Comment