Page MenuHomec4science

bibrank_tag_based_indexer.py
No OneTemporary

File Metadata

Created
Sun, Jul 7, 00:29

bibrank_tag_based_indexer.py

# -*- coding: utf-8 -*-
## $Id$
## Ranking of records using different parameters and methods.
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
__revision__ = "$Id$"
import sys
import time
import marshal
import traceback
import ConfigParser
from invenio.config import \
CFG_SITE_LANG, \
CFG_ETCDIR
from invenio.search_engine import perform_request_search, HitSet
from invenio.bibrank_citation_indexer import get_citation_weight
from invenio.bibrank_downloads_indexer import *
from invenio.dbquery import run_sql, serialize_via_marshal, deserialize_via_marshal
from invenio.bibtask import task_get_option, write_message
options = {}
def citation_exec(rank_method_code, name, config):
"""Creating the rank method data for citation"""
dict = get_citation_weight(rank_method_code, config)
date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
if dict: intoDB(dict, date, rank_method_code)
else: write_message("no need to update the indexes for citations")
def download_weight_filtering_user(run):
return bibrank_engine(run)
def download_weight_total(run):
return bibrank_engine(run)
def file_similarity_by_times_downloaded(run):
return bibrank_engine(run)
def download_weight_filtering_user_exec (rank_method_code, name, config):
"""Ranking by number of downloads per User.
Only one full Text Download is taken in account for one specific userIP address"""
time1 = time.time()
dic = fromDB(rank_method_code)
last_updated = get_lastupdated(rank_method_code)
keys = new_downloads_to_index(last_updated)
filter_downloads_per_hour(keys, last_updated)
dic = get_download_weight_filtering_user(dic, keys)
date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
intoDB(dic, date, rank_method_code)
time2 = time.time()
return {"time":time2-time1}
def download_weight_total_exec(rank_method_code, name, config):
"""rankink by total number of downloads without check the user ip
if users downloads 3 time the same full text document it has to be count as 3 downloads"""
time1 = time.time()
dic = fromDB(rank_method_code)
last_updated = get_lastupdated(rank_method_code)
keys = new_downloads_to_index(last_updated)
filter_downloads_per_hour(keys, last_updated)
dic = get_download_weight_total(dic, keys)
date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
intoDB(dic, date, rank_method_code)
time2 = time.time()
return {"time":time2-time1}
def file_similarity_by_times_downloaded_exec(rank_method_code, name, config):
"""update dictionnary {recid:[(recid, nb page similarity), ()..]}"""
time1 = time.time()
dic = fromDB(rank_method_code)
last_updated = get_lastupdated(rank_method_code)
keys = new_downloads_to_index(last_updated)
filter_downloads_per_hour(keys, last_updated)
dic = get_file_similarity_by_times_downloaded(dic, keys)
date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
intoDB(dic, date, rank_method_code)
time2 = time.time()
return {"time":time2-time1}
def single_tag_rank_method_exec(rank_method_code, name, config):
"""Creating the rank method data"""
startCreate = time.time()
rnkset = {}
rnkset_old = fromDB(rank_method_code)
date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
rnkset_new = single_tag_rank(config)
rnkset = union_dicts(rnkset_old, rnkset_new)
intoDB(rnkset, date, rank_method_code)
def single_tag_rank(config):
"""Connect the given tag with the data from the kb file given"""
write_message("Loading knowledgebase file", verbose=9)
kb_data = {}
records = []
write_message("Reading knowledgebase file: %s" % config.get(config.get("rank_method", "function"), "kb_src"))
input = open(config.get(config.get("rank_method", "function"), "kb_src"), 'r')
data = input.readlines()
for line in data:
if not line[0:1] == "#":
kb_data[string.strip((string.split(string.strip(line), "---"))[0])] = (string.split(string.strip(line), "---"))[1]
write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))
tag = config.get(config.get("rank_method", "function"), "tag")
tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(", ")
if tags == ['']:
tags = ""
records = []
for (recids, recide) in options["recid_range"]:
write_message("......Processing records #%s-%s" % (recids, recide))
recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide))
valid = HitSet(trailing_bits=1)
valid.discard(0)
for key in tags:
newset = HitSet()
newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))]
valid.intersection_update(newset)
if tags:
recs = filter(lambda x: x[0] in valid, recs)
records = records + list(recs)
write_message("Number of records found with the necessary tags: %s" % len(records))
records = filter(lambda x: x[0] in options["validset"], records)
rnkset = {}
for key, value in records:
if kb_data.has_key(value):
if not rnkset.has_key(key):
rnkset[key] = float(kb_data[value])
else:
if kb_data.has_key(rnkset[key]) and float(kb_data[value]) > float((rnkset[key])[1]):
rnkset[key] = float(kb_data[value])
else:
rnkset[key] = 0
write_message("Number of records available in rank method: %s" % len(rnkset))
return rnkset
def get_lastupdated(rank_method_code):
"""Get the last time the rank method was updated"""
res = run_sql("SELECT rnkMETHOD.last_updated FROM rnkMETHOD WHERE name=%s", (rank_method_code, ))
if res:
return res[0][0]
else:
raise Exception("Is this the first run? Please do a complete update.")
def intoDB(dict, date, rank_method_code):
"""Insert the rank method data into the database"""
mid = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
del_rank_method_codeDATA(rank_method_code)
serdata = serialize_via_marshal(dict);
midstr = str(mid[0][0]);
run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) VALUES (%s,%s)", (midstr, serdata,))
run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", (date, rank_method_code))
def fromDB(rank_method_code):
"""Get the data for a rank method"""
id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], ))
if res:
return deserialize_via_marshal(res[0][0])
else:
return {}
def del_rank_method_codeDATA(rank_method_code):
"""Delete the data for a rank method"""
id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
res = run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], ))
def del_recids(rank_method_code, range_rec):
"""Delete some records from the rank method"""
id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0] ))
if res:
rec_dict = deserialize_via_marshal(res[0][0])
write_message("Old size: %s" % len(rec_dict))
for (recids, recide) in range_rec:
for i in range(int(recids), int(recide)):
if rec_dict.has_key(i):
del rec_dict[i]
write_message("New size: %s" % len(rec_dict))
date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
intoDB(rec_dict, date, rank_method_code)
else:
write_message("Create before deleting!")
def union_dicts(dict1, dict2):
"Returns union of the two dicts."
union_dict = {}
for (key, value) in dict1.iteritems():
union_dict[key] = value
for (key, value) in dict2.iteritems():
union_dict[key] = value
return union_dict
def rank_method_code_statistics(rank_method_code):
"""Print statistics"""
method = fromDB(rank_method_code)
max = ('', -999999)
maxcount = 0
min = ('', 999999)
mincount = 0
for (recID, value) in method.iteritems():
if value < min and value > 0:
min = value
if value > max:
max = value
for (recID, value) in method.iteritems():
if value == min:
mincount += 1
if value == max:
maxcount += 1
write_message("Showing statistic for selected method")
write_message("Method name: %s" % getName(rank_method_code))
write_message("Short name: %s" % rank_method_code)
write_message("Last run: %s" % get_lastupdated(rank_method_code))
write_message("Number of records: %s" % len(method))
write_message("Lowest value: %s - Number of records: %s" % (min, mincount))
write_message("Highest value: %s - Number of records: %s" % (max, maxcount))
write_message("Divided into 10 sets:")
for i in range(1, 11):
setcount = 0
distinct_values = {}
lower = -1.0 + ((float(max + 1) / 10)) * (i - 1)
upper = -1.0 + ((float(max + 1) / 10)) * i
for (recID, value) in method.iteritems():
if value >= lower and value <= upper:
setcount += 1
distinct_values[value] = 1
write_message("Set %s (%s-%s) %s Distinct values: %s" % (i, lower, upper, len(distinct_values), setcount))
def check_method(rank_method_code):
write_message("Checking rank method...")
if len(fromDB(rank_method_code)) == 0:
write_message("Rank method not yet executed, please run it to create the necessary data.")
else:
if len(add_recIDs_by_date(rank_method_code)) > 0:
write_message("Records modified, update recommended")
else:
write_message("No records modified, update not necessary")
def bibrank_engine(run):
"""Run the indexing task.
Return 1 in case of success and 0 in case of failure.
"""
try:
import psyco
psyco.bind(single_tag_rank)
psyco.bind(single_tag_rank_method_exec)
psyco.bind(serialize_via_marshal)
psyco.bind(deserialize_via_marshal)
except StandardError, e:
print "Psyco ERROR", e
startCreate = time.time()
sets = {}
try:
options["run"] = []
options["run"].append(run)
for rank_method_code in options["run"]:
cfg_name = getName(rank_method_code)
write_message("Running rank method: %s." % cfg_name)
file = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg"
config = ConfigParser.ConfigParser()
try:
config.readfp(open(file))
except StandardError, e:
write_message("Cannot find configurationfile: %s" % file, sys.stderr)
raise StandardError
cfg_short = rank_method_code
cfg_function = config.get("rank_method", "function") + "_exec"
cfg_name = getName(cfg_short)
options["validset"] = get_valid_range(rank_method_code)
if task_get_option("collection"):
l_of_colls = string.split(task_get_option("collection"), ", ")
recIDs = perform_request_search(c=l_of_colls)
recIDs_range = []
for recID in recIDs:
recIDs_range.append([recID, recID])
options["recid_range"] = recIDs_range
elif task_get_option("id"):
options["recid_range"] = task_get_option("id")
elif task_get_option("modified"):
options["recid_range"] = add_recIDs_by_date(rank_method_code, task_get_option("modified"))
elif task_get_option("last_updated"):
options["recid_range"] = add_recIDs_by_date(rank_method_code)
else:
write_message("No records specified, updating all", verbose=2)
min_id = run_sql("SELECT min(id) from bibrec")[0][0]
max_id = run_sql("SELECT max(id) from bibrec")[0][0]
options["recid_range"] = [[min_id, max_id]]
if task_get_option("quick") == "no":
write_message("Recalculate parameter not used, parameter ignored.", verbose=9)
if task_get_option("cmd") == "del":
del_recids(cfg_short, options["recid_range"])
elif task_get_option("cmd") == "add":
func_object = globals().get(cfg_function)
func_object(rank_method_code, cfg_name, config)
elif task_get_option("cmd") == "stat":
rank_method_code_statistics(rank_method_code)
elif task_get_option("cmd") == "check":
check_method(rank_method_code)
elif task_get_option("cmd") == "repair":
pass
else:
write_message("Invalid command found processing %s" % rank_method_code, sys.stderr)
raise StandardError
except StandardError, e:
write_message("\nException caught: %s" % e, sys.stderr)
if task_get_option("verbose") >= 9:
traceback.print_tb(sys.exc_info()[2])
raise StandardError
if task_get_option("verbose"):
showtime((time.time() - startCreate))
return 1
def get_valid_range(rank_method_code):
"""Return a range of records"""
write_message("Getting records from collections enabled for rank method.", verbose=9)
res = run_sql("SELECT collection.name FROM collection, collection_rnkMETHOD, rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name=%s", (rank_method_code, ))
l_of_colls = []
for coll in res:
l_of_colls.append(coll[0])
if len(l_of_colls) > 0:
recIDs = perform_request_search(c=l_of_colls)
else:
recIDs = []
valid = HitSet()
valid += recIDs
return valid
def add_recIDs_by_date(rank_method_code, dates=""):
"""Return recID range from records modified between DATES[0] and DATES[1].
If DATES is not set, then add records modified since the last run of
the ranking method RANK_METHOD_CODE.
"""
if not dates:
try:
dates = (get_lastupdated(rank_method_code), '')
except Exception, e:
dates = ("0000-00-00 00:00:00", '')
if dates[0] is None:
dates = ("0000-00-00 00:00:00", '')
query = """SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s"""
if dates[1]:
query += " and b.modification_date <= %s"
query += " ORDER BY b.id ASC"""
if dates[1]:
res = run_sql(query, (dates[0], dates[1]))
else:
res = run_sql(query, (dates[0], ))
list = create_range_list(res)
if not list:
write_message("No new records added since last time method was run")
return list
def getName(rank_method_code, ln=CFG_SITE_LANG, type='ln'):
"""Returns the name of the method if it exists"""
try:
rnkid = run_sql("SELECT id FROM rnkMETHOD where name=%s", (rank_method_code, ))
if rnkid:
rnkid = str(rnkid[0][0])
res = run_sql("SELECT value FROM rnkMETHODNAME where type=%s and ln=%s and id_rnkMETHOD=%s", (type, ln, rnkid))
if not res:
res = run_sql("SELECT value FROM rnkMETHODNAME WHERE ln=%s and id_rnkMETHOD=%s and type=%s", (CFG_SITE_LANG, rnkid, type))
if not res:
return rank_method_code
return res[0][0]
else:
raise Exception
except Exception, e:
write_message("Cannot run rank method, either given code for method is wrong, or it has not been added using the webinterface.")
raise Exception
def create_range_list(res):
"""Creates a range list from a recID select query result contained
in res. The result is expected to have ascending numerical order."""
if not res:
return []
row = res[0]
if not row:
return []
else:
range_list = [[row[0], row[0]]]
for row in res[1:]:
id = row[0]
if id == range_list[-1][1] + 1:
range_list[-1][1] = id
else:
range_list.append([id, id])
return range_list
def single_tag_rank_method(run):
return bibrank_engine(run)
def showtime(timeused):
"""Show time used for method"""
write_message("Time used: %d second(s)." % timeused, verbose=9)
def citation(run):
return bibrank_engine(run)

Event Timeline