Page MenuHomec4science

bibrankgkb.wml
No OneTemporary

File Metadata

Created
Wed, Jul 24, 09:31

bibrankgkb.wml

# Utility for creating knowledge base for use with bibrank
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002, 2003, 2004, 2005 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect># -*- coding: utf-8 -*-</protect>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
"""
Usage: bibrankgkb [options]
Examples:
bibrankgkb --input=bibrankgkb.cfg --output=test.kb
bibrankgkb -otest.kb -v9
bibrankgkb -v9
Generate options:
-i, --input=file input file, default from /etc/bibrank/bibrankgkb.cfg
-o, --output=file output file, will be placed in current folder
General options:
-h, --help print this help and exit
-V, --version print version and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
"""
## fill config variables:
pylibdir = "<LIBDIR>/python"
__version__ = "$Id$"
try:
from marshal import loads,dumps
from zlib import compress,decompress
from string import split,translate,lower,upper
import getopt
import getpass
import string
import os
import sre
import sys
import time
import MySQLdb
import Numeric
import urllib
import signal
import tempfile
import unicodedata
import traceback
import cStringIO
import re
import copy
import types
import ConfigParser
except ImportError, e:
import sys
try:
sys.path.append('%s' % pylibdir)
from cdsware.config import *
from cdsware.search_engine_config import cfg_max_recID
from cdsware.search_engine import perform_request_search, strip_accents
from cdsware.search_engine import HitSet
from cdsware.dbquery import run_sql
except ImportError, e:
import sys
# Optional speed-up: if psyco is installed, ask it to compile the serializer.
# NOTE(review): serialize_via_numeric_array is defined further down in this
# file, so the name appears to be unbound when this statement runs; the
# resulting NameError would be swallowed by the bare except below and the
# binding would silently never happen -- confirm, and consider moving this
# after the function definition.
try:
import psyco
psyco.bind(serialize_via_numeric_array)
except:
pass
# Run-time options ("input", "output", "verbose"); command_line() replaces
# this dict with the parsed command-line values.
opts_dict = {}
# NOTE(review): task_id is not referenced anywhere else in the visible code.
task_id = -1
def _collect_numbered_urls(config, option_pattern):
    """Return {index: url} for the consecutive options option_pattern % 0, 1, ..."""
    urls = {}
    index = 0
    while config.has_option("bibrankgkb", option_pattern % index):
        urls[index] = config.get("bibrankgkb", option_pattern % index)
        index = index + 1
    return urls


def bibrankgkb(config):
    """Generate a knowledge base (.kb) from the [bibrankgkb] config section.

    config must offer has_option() and get() (command_line() passes a
    ConfigParser instance).  The options create_0, create_1, ... are read
    in order until the first missing index.  Each one is a ",,"-separated
    list describing an optional conversion source followed by a value
    source; a source is one of:

        file,,<path>
        www,,<url option pattern containing %s>,,<regexp>
        db,,<sql statement>,,<sql statement>

    Entries read from the value source are renamed through convert() when
    a conversion table was obtained, then merged into the result; a name
    already present from an earlier create_N line is kept unchanged.

    The result is written as "name---value" lines to opts_dict["output"]
    when that option is non-empty, and echoed via write_message() at
    verbose level 9.  Nothing is returned.
    """
    verbose = opts_dict["verbose"]
    if verbose >= 1:
        write_message("Running: Generate Knowledgebase.")

    journals = {}
    i = 0

    # Reading the configuration file
    while config.has_option("bibrankgkb", "create_%s" % i):
        cfg = config.get("bibrankgkb", "create_%s" % i).split(",,")
        conv = {}
        temp = {}

        # Input source 1 (conversion table), either file, www or from db.
        # The fields consumed by the source are removed from cfg so that
        # cfg[0] afterwards names the second source.
        if cfg[0] == "file":
            conv = get_from_source(cfg[0], cfg[1])
            del cfg[0:2]
        elif cfg[0] == "www":
            urls = _collect_numbered_urls(config, cfg[1])
            conv = get_from_source(cfg[0], (urls, cfg[2]))
            del cfg[0:3]
        elif cfg[0] == "db":
            conv = get_from_source(cfg[0], (cfg[1], cfg[2]))
            del cfg[0:3]

        # NOTE(review): the nesting of this test was reconstructed from a
        # copy of the file that had lost its indentation; it is placed at
        # loop level so that two leading placeholder fields are skipped
        # when no conversion table was read -- confirm against the
        # original sources.
        if not conv:
            del cfg[0:2]
        elif verbose >= 9:
            write_message("Using last resource for converting values.")

        # Input source 2 (the values), either file, www or from db
        if cfg[0] == "file":
            temp = get_from_source(cfg[0], cfg[1])
        elif cfg[0] == "www":
            urls = _collect_numbered_urls(config, cfg[1])
            temp = get_from_source(cfg[0], (urls, cfg[2]))
        elif cfg[0] == "db":
            temp = get_from_source(cfg[0], (cfg[1], cfg[2]))

        i = i + 1

        # If a conversion table is given, the names are converted to the
        # correct naming convention.
        if len(conv) != 0:
            if verbose >= 9:
                write_message("Converting between naming conventions given.")
            temp = convert(conv, temp)

        # Merge: entries collected from earlier create_N lines win.
        if len(journals) != 0:
            for element in temp.keys():
                if element not in journals:
                    journals[element] = temp[element]
        else:
            journals = temp

    # Writing output file; try/finally closes the handle even if a write fails.
    if opts_dict["output"]:
        f = open(opts_dict["output"], 'w')
        try:
            f.write("#Created by %s\n" % __version__)
            f.write("#Sources:\n")
            for key in journals.keys():
                f.write("%s---%s\n" % (key, journals[key]))
        finally:
            f.close()

    if verbose >= 9:
        write_message("Output complete: %s" % opts_dict["output"])
        write_message("Number of hits: %s" % len(journals))

    if verbose >= 9:
        write_message("Result:")
        for key in journals.keys():
            write_message("%s---%s" % (key, journals[key]))
        write_message("Total nr of lines: %s" % len(journals))
def showtime(timeused):
    """Log the elapsed time (in whole seconds) when the verbose level is 9 or more."""
    if opts_dict["verbose"] < 9:
        return
    write_message("Time used: %d second(s)." % timeused)
def get_from_source(type, data):
    """Read name/value pairs from a database, a file or web pages.

    Parameters:
        type -- "db", "file" or "www"; any other value yields an empty dict.
        data -- depends on type:
            "db":   a pair of SQL statements.  data[0] is run for
                    (key, value) rows and data[1] for (key, name) rows;
                    a value is stored under the stripped name whose key
                    matches.
            "file": path of a text file with one "name---value" entry per
                    line.  Lines starting with "#" are skipped, as are
                    lines that do not contain the "---" separator (this
                    includes blank lines).
            "www":  a pair (urls, regexp).  urls is a dict whose values
                    are fetched with urllib.urlopen; regexp must define
                    the named groups "key" and "value".  Commas in the
                    matched value are replaced by dots.

    Returns a dict mapping name to value.
    """
    datastruct = {}
    verbose = opts_dict["verbose"]

    if type == "db":
        jvalue = run_sql(data[0])
        jname = dict(run_sql(data[1]))
        if verbose >= 9:
            write_message("Reading data from database using SQL statements:")
            write_message(jvalue)
            write_message(jname)
        for key, value in jvalue:
            if key in jname:
                datastruct[jname[key].strip()] = value

    elif type == "file":
        source = open(data, 'r')
        try:
            if verbose >= 9:
                write_message("Reading data from file: %s" % data)
            lines = source.readlines()
        finally:
            # Close the handle even if reading fails.
            source.close()
        for line in lines:
            if line[0:1] == "#":
                continue
            fields = line.strip().split("---")
            if len(fields) < 2:
                # No separator on this line (e.g. a blank line): nothing
                # to store, and indexing fields[1] would raise IndexError.
                continue
            datastruct[fields[0].strip()] = fields[1]

    elif type == "www":
        if verbose >= 9:
            write_message("Reading data from www using regexp: %s" % data[1])
            write_message("Reading data from url:")
        # Using the regexp from config file; compiled once for all URLs.
        reg = re.compile(data[1])
        for link in data[0].keys():
            if verbose >= 9:
                write_message(data[0][link])
            page = urllib.urlopen(data[0][link])
            try:
                content = page.read()
            finally:
                page.close()
            for match in reg.finditer(content):
                if match.group("value"):
                    key = match.group("key").strip()
                    value = match.group("value").replace(",", ".")
                    datastruct[key] = value
                    if verbose == 9:
                        # Raw echo without the write_message timestamp.
                        sys.stdout.write("%s---%s\n" % (key, value))
    return datastruct
def convert(convstruct, journals):
    """Rename the keys of journals according to the table convstruct.

    convstruct maps a source name to its target name.  For every name in
    journals:
      * if it is a source name in convstruct, its value is stored under
        the corresponding target name;
      * otherwise, if it already is one of the target names, it is kept
        as it is;
      * otherwise it is dropped.

    When either argument is empty, journals is returned unchanged (the
    same object); otherwise a new dict is returned.
    """
    if not convstruct or not journals:
        return journals

    # Only membership in the set of target names is needed; a dict is
    # used as the lookup structure to stay compatible with old Pythons.
    target_names = dict([(target, 1) for target in convstruct.values()])

    tempjour = {}
    for name in journals.keys():
        if name in convstruct:
            tempjour[convstruct[name]] = journals[name]
        elif name in target_names:
            tempjour[name] = journals[name]
    return tempjour
def serialize_via_numeric_array_dumps(arr):
    """Return the result of Numeric.dumps() applied to arr."""
    dumped = Numeric.dumps(arr)
    return dumped
def serialize_via_numeric_array_compr(str):
    """Return str compressed with zlib's compress()."""
    packed = compress(str)
    return packed
def serialize_via_numeric_array_escape(str):
    """Return str passed through MySQLdb.escape_string()."""
    escaped = MySQLdb.escape_string(str)
    return escaped
def serialize_via_numeric_array(arr):
    """Turn a Numeric array into a compressed, MySQL-escaped string.

    The array is dumped, the dump is compressed and the compressed data
    is escaped, in that order.
    """
    dumped = serialize_via_numeric_array_dumps(arr)
    compressed = serialize_via_numeric_array_compr(dumped)
    return serialize_via_numeric_array_escape(compressed)
def deserialize_via_numeric_array(string):
    """Rebuild a Numeric array from a zlib-compressed dump."""
    raw = decompress(string)
    return Numeric.loads(raw)
def write_message(msg, stream=None):
    """Write a timestamped message to stdout or stderr and flush the stream.

    Parameters:
        msg    -- the message; it is formatted with "%s".
        stream -- sys.stdout or sys.stderr.  When omitted (None) the
                  current sys.stdout is used.  It is looked up at call
                  time, so the default follows a later redirection of
                  sys.stdout instead of staying bound to the object that
                  existed when this function was defined.

    Any other stream is rejected: nothing is written to it and a
    complaint is sent to sys.stderr instead.  Useful for debugging stuff.
    """
    if stream is None:
        stream = sys.stdout
    if stream in (sys.stdout, sys.stderr):
        stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
        stream.write("%s\n" % msg)
        stream.flush()
    else:
        sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
def usage(code, msg=''):
    """Print the command-line help on stderr and exit.

    When msg is non-empty, an "Error: <msg>." line is written first.
    The process then terminates through sys.exit(code).
    """
    if msg:
        sys.stderr.write("Error: %s.\n" % msg)
    program = sys.argv[0]
    help_text = """ Usage: %s [options]
Examples:
%s --input=bibrankgkb.cfg --output=test.kb
%s -otest.kb -v9
%s -v9
Generate options:
-i, --input=file input file, default from /etc/bibrank/bibrankgkb.cfg
-o, --output=file output file, will be placed in current folder
General options:
-h, --help print this help and exit
-V, --version print version and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
""" % (program, program, program, program)
    # The trailing newline is the one the print statement used to append.
    sys.stderr.write(help_text + "\n")
    sys.exit(code)
def command_line():
    """Parse sys.argv, load the configuration file and run bibrankgkb().

    Recognised options: -i/--input, -o/--output, -v/--verbose, -h/--help
    and -V/--version.  The global opts_dict is replaced by a dict holding
    "input" (default "<etcdir>/bibrank/bibrankgkb.cfg"), "output"
    (default "") and "verbose" (default 1).

    Exit behaviour:
      * -h/--help prints the usage text and exits with status 0;
      * -V/--version prints __version__ and exits with status 0;
      * an unknown option or any positional argument prints the usage
        text and exits with status 1;
      * a StandardError raised while reading the options, loading the
        configuration or generating the knowledge base is reported on
        stderr and the process exits with status 1.
    """
    global opts_dict
    long_flags = ["input=", "output=", "help", "version", "verbose="]
    short_flags = "i:o:hVv:"

    try:
        opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags)
    except getopt.GetoptError:
        # sys.exc_info() yields the exception object on every Python version.
        write_message(sys.exc_info()[1], sys.stderr)
        usage(1)
    if args:
        usage(1)

    opts_dict = {"input": "%s/bibrank/bibrankgkb.cfg" % etcdir, "output": "", "verbose": 1}

    try:
        for flag, value in opts:
            if flag in ("-h", "--help"):
                # Help was explicitly requested, so this is not a failure.
                usage(0)
            elif flag in ("-V", "--version"):
                sys.stdout.write("%s\n" % __version__)
                sys.exit(0)
            elif flag in ("--input", "-i"):
                opts_dict["input"] = value
            elif flag in ("--output", "-o"):
                opts_dict["output"] = value
            elif flag in ("--verbose", "-v"):
                opts_dict["verbose"] = int(value)
            else:
                usage(1)

        start_time = time.time()
        config = ConfigParser.ConfigParser()
        config_file = open(opts_dict["input"])
        try:
            config.readfp(config_file)
        finally:
            # Do not keep the configuration file open during the run.
            config_file.close()
        bibrankgkb(config)
        if opts_dict["verbose"] >= 9:
            showtime(time.time() - start_time)
    except StandardError:
        # SystemExit is not a StandardError, so the sys.exit() calls
        # above pass through this handler untouched.
        write_message(sys.exc_info()[1], sys.stderr)
        sys.exit(1)
def main():
    """Script entry point: delegate to command_line()."""
    command_line()
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":
    main()

Event Timeline