Page MenuHomec4science

bibrank_citerank_indexer.py
No OneTemporary

File Metadata

Created
Sun, May 19, 13:57

bibrank_citerank_indexer.py

# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Implementation of different ranking methods based on
the citation graph:
- citation count/ time decayed citation count
- pagerank / pagerank with external citations
- time decayed pagerank
"""
# pylint: disable-msg=E0611
import marshal
import ConfigParser
from math import exp
import datetime
import time
import re
import sys
from numpy import array, ones, zeros, int32, float32, sqrt, dot
from zlib import decompress
if sys.hexversion < 0x2040000:
# pylint: disable-msg=W0622
from sets import Set as set
# pylint: enable-msg=W0622
from invenio.dbquery import run_sql, serialize_via_marshal
from invenio.bibtask import write_message
from invenio.config import CFG_ETCDIR
def get_citations_from_file(filename):
    """Read the citation graph (who cites whom) from a text file.

    Each line of the file must hold two integers:
    "<citing_recid> <cited_recid>".

    Returns a pair:
    - cit: dict mapping a cited recid to the list of recids that cite it
      (duplicate pairs and self-citations are skipped)
    - dict_of_ids: dict mapping every recid seen to a dense 0-based
      index 'b' (used to address the numpy arrays elsewhere)

    Raises StandardError if the file cannot be opened.
    """
    cit = {}
    dict_of_ids = {}
    count = 0  # next free dense index
    try:
        citation_file = open(filename,"r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in citation_file:
        tokens = line.strip().split()
        recid_cites = int(tokens[0])    # the citing paper
        recid_cited = int(tokens[1])    # the cited paper
        if recid_cited not in cit:
            cit[recid_cited] = []
        #without this, duplicates might be introduced
        if recid_cites not in cit[recid_cited] and recid_cites != recid_cited:
            cit[recid_cited].append(recid_cites)
        # both endpoints of the edge get a dense index
        if recid_cites not in dict_of_ids:
            dict_of_ids[recid_cites] = count
            count += 1
        if recid_cited not in dict_of_ids:
            dict_of_ids[recid_cited] = count
            count += 1
    citation_file.close()
    write_message("Citation data collected from file: %s" %filename, verbose=2)
    write_message("Ids and recids corespondace: %s" \
        %str(dict_of_ids), verbose=9)
    write_message("Citations: %s" % str(cit), verbose=9)
    return cit, dict_of_ids
def get_citations_from_db():
    """Read the citation graph (who cites whom) from rnkCITATIONDATA.

    The citation dictionary is stored marshal-serialized and
    zlib-compressed in the table.

    Returns a pair:
    - cit: dict mapping a cited recid to the set of recids that cite it
      (self-citations are dropped)
    - dict_of_ids: dict mapping every recid seen to a dense 0-based index
    Returns ({}, {}) when the citation data cannot be extracted.
    """
    dict_of_ids = {}
    count = 0  # next free dense index
    query = "select object_value from rnkCITATIONDATA \
where object_name = 'citationdict'"
    cit_compressed = run_sql(query)
    cit = []
    if cit_compressed and cit_compressed[0] and cit_compressed[0][0]:
        # decompress + unmarshal the stored citation dictionary
        cit = marshal.loads(decompress(cit_compressed[0][0]))
        if cit:
            for item in cit:
                #check for duplicates in citation dictionary
                cit[item] = set(cit[item])
                if item in cit[item]:
                    # a paper must not cite itself
                    cit[item].remove(item)
                if item not in dict_of_ids:
                    dict_of_ids[item] = count
                    count += 1
                for value in cit[item]:
                    if value not in dict_of_ids:
                        dict_of_ids[value] = count
                        count += 1
            write_message ("Citation data collected\
 from rnkCITATIONDATA", verbose=2)
            write_message("Ids and recids corespondace: %s" \
                % str(dict_of_ids), verbose=9)
            write_message("Citations: %s" % str(cit), verbose=9)
            return cit, dict_of_ids
        else:
            write_message("Error while extracting citation data \
from rnkCITATIONDATA table", verbose=1)
    else:
        write_message("Error while extracting citation data \
from rnkCITATIONDATA table", verbose=1)
    # fall-through: no usable citation data
    return {}, {}
def construct_ref_array(cit, dict_of_ids, len_):
    """Return an int32 numpy array with the number of references each
    paper has inside our citation graph.

    cit maps a cited recid to the recids citing it, so every recid
    appearing in a citation list gains one outgoing reference.

    :param cit: dict cited_recid -> iterable of citing recids
    :param dict_of_ids: dict recid -> dense 0-based index
    :param len_: total number of papers (array length)
    """
    # The original code first bound `ref` to an empty array and then
    # immediately rebound it to zeros(); the dead assignment is dropped.
    ref = zeros(len_, int32)
    for cited in cit:
        for citing in cit[cited]:
            ref[dict_of_ids[citing]] += 1
    write_message("Number of references: %s" %str(ref), verbose=9)
    write_message("Finished computing total number \
of references for each paper.", verbose=5)
    return ref
def get_external_links_from_file(filename, ref, dict_of_ids):
    """Read the number of external links for each recid from a file.

    An external link is a citation that is not in our database.  Each
    file line is "<recid> <count>"; the internal reference count (ref)
    is subtracted from the count, so the file presumably stores total
    reference counts -- TODO confirm with the file producer.  Negative
    results are clamped to 0.

    Returns a dict mapping the paper index to its external-link count.
    Raises StandardError if the file cannot be opened.
    """
    ext_links = {}
    #format: ext_links[dict_of_ids[recid]]=number of total external links
    try:
        external_file = open(filename,"r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in external_file:
        tokens = line.strip().split()
        recid = int(tokens[0])
        nr_of_external = int(tokens[1])
        # external = total from file minus internal references
        ext_links[dict_of_ids[recid]] = nr_of_external - ref[dict_of_ids[recid]]
        if ext_links[dict_of_ids[recid]] < 0:
            ext_links[dict_of_ids[recid]] = 0
    external_file.close()
    write_message("External link information extracted", verbose=2)
    return ext_links
def get_external_links_from_db(ref, dict_of_ids, reference_indicator):
    """Return a dict mapping each paper index to its number of external
    links (citations pointing outside our database).

    For every recid the total number of reference fields is counted from
    bibrec_bib99x/bib99x; the internal reference count (ref) is
    subtracted and negative results are clamped to 0.

    :param ref: array of internal reference counts per paper index
    :param dict_of_ids: dict recid -> dense 0-based index
    :param reference_indicator: reference tag prefix; the query matches
        it followed by any subfield letter (e.g. "999C5" -> "999C5[a-z]")
    """
    ext_links = {}
    reference_tag_regex = reference_indicator + "[a-z]"
    for recid in dict_of_ids:
        idx = dict_of_ids[recid]
        # Parameterized query instead of string interpolation: avoids
        # SQL injection / quoting problems with the tag and recid values.
        result_set = run_sql("select COUNT(DISTINCT field_number) \
from bibrec_bib99x where id_bibrec=%s and id_bibxxx in \
(select id from bib99x where tag RLIKE %s)", \
            (str(recid), reference_tag_regex))
        if result_set:
            total_links = int(result_set[0][0])
            internal_links = ref[idx]
            ext_links[idx] = total_links - internal_links
            if ext_links[idx] < 0:
                ext_links[idx] = 0
        else:
            ext_links[idx] = 0
    write_message ("External link information extracted", verbose=2)
    write_message("External links: %s" % str(ext_links), verbose=9)
    return ext_links
def avg_ext_links_with_0(ext_links):
    """Average number of external links per paper, counting also the
    papers that have no external link at all."""
    total = sum(ext_links.values(), 0.0)
    avg_ext = total/len(ext_links)
    write_message("The average number of external links per paper (including \
papers with 0 external links) is: %s" % str(avg_ext), verbose=3)
    return avg_ext
def avg_ext_links_without_0(ext_links):
    """Average number of external links per paper, ignoring the papers
    that have no external link."""
    nonzero_counts = [links for links in ext_links.values() if links != 0]
    avg_ext = sum(nonzero_counts, 0.0)/len(nonzero_counts)
    write_message ("The average number of external links per paper (excluding \
papers with 0 external links) is: %s" % str(avg_ext), verbose=3)
    return avg_ext
def leaves(ref):
    """Return the number of papers that do not cite any other paper,
    i.e. whose entry in the reference-count array `ref` is zero."""
    nr_of_leaves = 0
    for nr_of_refs in ref:
        if nr_of_refs == 0:
            nr_of_leaves += 1
    # Bug fix: the message used to interpolate the function object
    # `leaves` instead of the computed count `nr_of_leaves`.
    write_message ("The number of papers that do not cite \
any other papers: %s" % str(nr_of_leaves), verbose=3)
    return nr_of_leaves
def get_dates_from_file(filename, dict_of_ids):
    """Read the publication year of each paper from a file.

    Each line must hold two integers: "<recid> <year>".

    Returns a dict mapping the paper index (dict_of_ids[recid]) to its
    year.  Raises StandardError if the file cannot be opened.
    """
    dates = {}
    # the format is: dates[dict_of_ids[recid]] = year
    try:
        dates_file = open(filename,"r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in dates_file:
        tokens = line.strip().split()
        recid = int(tokens[0])
        year = int(tokens[1])
        dates[dict_of_ids[recid]] = year
    dates_file.close()
    write_message("Dates extracted", verbose=2)
    write_message("Dates dictionary %s" % str(dates), verbose=9)
    return dates
def get_dates_from_db(dict_of_ids):
    """Return the publication year of each paper, read from the database.

    The year is first looked for in the 260__c field (publication date);
    for records still uncovered, the first four characters of the 961__x
    field (apparently the submission date -- confirm against schema) are
    used.  Papers with no usable year at all get the truncated average
    year of the covered papers.

    Returns a dict mapping the paper index (dict_of_ids[recid]) to a year.
    """
    current_year = int(datetime.datetime.now().strftime("%Y"))
    total = 0    # sum of all accepted years (for the fallback average)
    count = 0    # number of papers with an accepted year
    dict_of_dates = {}
    for recid in dict_of_ids:
        dict_of_dates[recid] = 0    # 0 == "year not found yet"
    query1 = "select * from bib26x where tag='260__c';"
    date_list = run_sql(query1)
    date_dict = {}
    for item in date_list:
        date_dict[int(item[0])] = item[2]
    # any 4-digit group inside the date string is taken as the year
    pattern = re.compile('.*(\d{4}).*')
    query2 = "select * from bibrec_bib26x;"
    date_list = run_sql(query2)
    for item in date_list:
        recid = int(item[0])
        id_ = int(item[1])
        if id_ in date_dict and recid in dict_of_dates:
            reg = pattern.match(date_dict[id_])
            if reg:
                date = int(reg.group(1))
                # plausibility check on the extracted year
                if date > 1000 and date <= current_year:
                    dict_of_dates[recid] = date
                    total += date
                    count += 1
    # second pass: records that got no year from 260__c
    not_covered = []
    for recid in dict_of_dates:
        if dict_of_dates[recid] == 0:
            not_covered.append(recid)
    query3 = "select * from bib96x where tag='961__x';"
    date_list = run_sql(query3)
    date_dict = {}
    for item in date_list:
        date_dict[int(item[0])] = item[2]
    query4 = "select * from bibrec_bib96x;"
    date_list = run_sql(query4)
    for item in date_list:
        recid = int(item[0])
        id_ = int(item[1])
        if id_ in date_dict and recid in not_covered:
            # the field value starts with the 4-digit year
            date = int(str(date_dict[id_])[0:4])
            if date > 1000 and date <= current_year:
                dict_of_dates[recid] = date
                total += date
                count += 1
    dates = {}
    # NOTE(review): Python 2 integer division -- the average year is
    # truncated to an int, which looks intentional here
    med = total/count
    for recid in dict_of_dates:
        if dict_of_dates[recid] == 0:
            dates[dict_of_ids[recid]] = med
        else:
            dates[dict_of_ids[recid]] = dict_of_dates[recid]
    write_message("Dates extracted", verbose=2)
    write_message("Dates dictionary %s" % str(dates), verbose=9)
    return dates
def construct_sparse_matrix(cit, ref, dict_of_ids, len_, damping_factor):
    """Build the sparse structures used by the plain PageRank iteration,
    so the full dense transition matrix never has to be kept in memory.

    Returns a triple (sparse, semi_sparse, semi_sparse_coeficient):
    - sparse maps (cited_index, citing_index) to the damped weight
      transferred along that citation edge
    - semi_sparse lists the indices of dangling papers (no references)
    - semi_sparse_coeficient is the uniform weight a dangling paper
      spreads over the whole graph
    """
    sparse = {}
    for cited in cit:
        row = dict_of_ids[cited]
        for citing in cit[cited]:
            col = dict_of_ids[citing]
            # each citing paper splits its (damped) weight evenly
            # among its references
            sparse[(row, col)] = damping_factor * 1.0/ref[col]
    semi_sparse = [j for j in range(len_) if ref[j] == 0]
    semi_sparse_coeficient = damping_factor/len_
    #zero_coeficient = (1-damping_factor)/len_
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse, semi_sparse_coeficient
def construct_sparse_matrix_ext(cit, ref, ext_links, dict_of_ids, alpha, beta):
    """Build the sparse structures for the PAGERANK_EXT method.

    The matrix is conceptually (len_+1) x (len_+1): row/column 0 is an
    artificial "external world" node.  alpha is the probability mass the
    external node sends towards the papers; beta weights each paper's
    external citations against its internal references.

    Returns (sparse, semi_sparse):
    - sparse maps (row, col) to the transition weight
    - semi_sparse maps dangling paper columns (no internal references)
      to the uniform weight they spread over the papers
    """
    len_ = len(dict_of_ids)
    sparse = {}
    semi_sparse = {}
    # external node receives alpha/len_ from every paper ...
    for i in range(len_):
        sparse[i+1, 0] = alpha/(len_)
    # ... and keeps 1-alpha for itself
    sparse[0, 0] = 1.0 - alpha
    for j in range(len_):
        if j not in ext_links:
            sparse[0, j+1] = beta/(len_ + beta)
        else:
            if ext_links[j] == 0:
                sparse[0, j+1] = beta/(len_ + beta)
            else:
                # share of the paper's weight that leaks to the
                # external node, proportional to its external citations
                aux = beta * ext_links[j]
                if ref[j] == 0:
                    sparse[0, j+1] = aux/(aux + len_)
                else:
                    sparse[0, j+1] = aux/(aux + ref[j])
        if ref[j] == 0:
            # dangling paper: what does not leak externally is spread
            # uniformly over all papers
            semi_sparse[j+1] = (1.0 - sparse[0, j + 1])/len_
    for item in cit:
        for value in cit[item]:
            # internal citation edges share the citing paper's
            # non-external weight evenly among its references
            sparse[(dict_of_ids[item] + 1, dict_of_ids[value] + 1)] = \
                (1.0 - sparse[0, dict_of_ids[value] + 1])/ref[dict_of_ids[value]]
    # debug helper: print the dense matrix (kept for reference)
    #for i in range(len_ + 1):
    #    a = ""
    #    for j in range (len_ + 1):
    #        if (i,j) in sparse:
    #            a += str(sparse[(i,j)]) + "\t"
    #        else:
    #            a += "0\t"
    #    print a
    #print semi_sparse
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse
def construct_sparse_matrix_time(cit, ref, dict_of_ids, \
        damping_factor, date_coef):
    """Build the sparse structures for the PAGERANK_time method.

    Same shape as construct_sparse_matrix, but every edge weight is
    additionally multiplied by the time-decay coefficient of the citing
    paper, so citations from old papers transfer less weight.

    Returns (sparse, semi_sparse, semi_sparse_coeficient) with the same
    meaning as in construct_sparse_matrix.
    """
    len_ = len(dict_of_ids)
    sparse = {}
    for item in cit:
        for value in cit[item]:
            sparse[(dict_of_ids[item], dict_of_ids[value])] = damping_factor * \
                date_coef[dict_of_ids[value]]/ref[dict_of_ids[value]]
    # indices of dangling papers (no references)
    semi_sparse = []
    for j in range(len_):
        if ref[j] == 0:
            semi_sparse.append(j)
    semi_sparse_coeficient = damping_factor/len_
    #zero_coeficient = (1-damping_factor)/len_
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse, semi_sparse_coeficient
def statistics_on_sparse(sparse):
    """Count (and report) the papers that cite themselves, i.e. the
    diagonal entries present in the sparse citation matrix."""
    count_diag = sum(1 for (row, col) in sparse.keys() if row == col)
    write_message("The number of papers that cite themselves: %s" % \
        str(count_diag), verbose=3)
    return count_diag
def pagerank(conv_threshold, check_point, len_, sparse, \
        semi_sparse, semi_sparse_coef):
    """Core power-iteration loop of the plain PageRank method.

    Runs batches of check_point iterations; after each batch the
    normalized L2 distance between successive weight vectors is compared
    with conv_threshold to decide convergence.

    Returns a numpy float32 array with the rank weight of each paper
    index.
    """
    weights_old = array((), float32)
    weights_old = ones((len_), float32) # initial weights
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0  # number of completed batches
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            # sparse matrix-vector product over the citation edges
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            # mass contributed by dangling papers (no references)
            semi_total = 0.0
            for j in semi_sparse:
                semi_total += weights_old[j]
            weights_new = weights_new + semi_sparse_coef * semi_total + \
                (1.0/len_ - semi_sparse_coef) * sum(weights_old)
            if step == check_point - 1:
                # convergence is only measured at the end of each batch
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message( "Finished step: %s, %s " \
                    %(str(check_point*(nr_of_check_points-1) + step), \
                    str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps. \
The threshold was %s" % (str(nr_of_check_points), str(difference)),\
        verbose=2)
    return weights_old
def pagerank_ext( conv_threshold, check_point, len_, sparse, semi_sparse):
    """Core power-iteration loop of the PAGERANK_EXT method.

    Index 0 of the weight vector is the artificial "external world" node
    built by construct_sparse_matrix_ext; it is stripped from the
    returned array, which holds the rank weight of each paper index.
    """
    weights_old = array((), float32)
    weights_old = ones((len_), float32)
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0  # number of completed batches
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            # sparse matrix-vector product (rows/cols shifted by 1 for
            # the external node at index 0)
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            # uniform mass spread by dangling papers over all papers
            total_sum = 0.0
            for j in semi_sparse:
                total_sum += semi_sparse[j]*weights_old[j]
            weights_new[1:len_] = weights_new[1:len_] + total_sum
            if step == check_point - 1:
                # convergence is only measured at the end of each batch
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message( "Finished step: %s, %s " \
                    % (str(check_point*(nr_of_check_points-1) + step), \
                    str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps. \
The threshold was %s" % (str(nr_of_check_points), \
        str(difference)), verbose=2)
    #return weights_old[1:len_]/(len_ - weights_old[0])
    return weights_old[1:len_]
def pagerank_time(conv_threshold, check_point, len_, \
        sparse, semi_sparse, semi_sparse_coeficient, date_coef):
    """Core power-iteration loop of the PAGERANK_TIME method
    (PageRank combined with time decay).

    Like pagerank(), but both the dangling-paper mass and the uniform
    teleportation mass are weighted by each paper's time-decay
    coefficient date_coef.

    Returns a numpy float32 array with the rank weight of each paper
    index.
    """
    weights_old = array((), float32)
    weights_old = ones((len_), float32) # initial weights
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0  # number of completed batches
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            # sparse matrix-vector product over the citation edges
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            # time-decayed mass from dangling papers
            semi_total = 0.0
            for j in semi_sparse:
                semi_total += weights_old[j]*date_coef[j]
            # time-decayed total mass (teleportation term)
            zero_total = 0.0
            for i in range(len_):
                zero_total += weights_old[i]*date_coef[i]
            #dates = array(date_coef.keys())
            #zero_total = dot(weights_old, dates)
            weights_new = weights_new + semi_sparse_coeficient * semi_total + \
                (1.0/len_ - semi_sparse_coeficient) * zero_total
            if step == check_point - 1:
                # convergence is only measured at the end of each batch
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message( "Finished step: %s, %s " \
                    % (str(check_point*(nr_of_check_points-1) + step), \
                    str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps.\
 The threshold was %s" % (str(nr_of_check_points), \
        str(difference)), verbose=2)
    return weights_old
def citation_rank_time(cit, dict_of_ids, date_coef, dates, decimals):
    """Return a recid -> weight dict where the weight is the sum of the
    paper's time-decayed citations, plus a tiny date term that breaks
    ties from recent to past."""
    dict_of_ranks = {}
    tie_break_unit = pow(10, 0-4-decimals)
    for recid in dict_of_ids:
        tie_break = dates[dict_of_ids[recid]] * tie_break_unit
        if recid not in cit:
            # paper never cited: only the date component remains
            dict_of_ranks[recid] = tie_break
        else:
            decayed_total = 0
            for citing in cit[recid]:
                decayed_total += date_coef[dict_of_ids[citing]]
            dict_of_ranks[recid] = round(decayed_total, decimals) + tie_break
    write_message("Citation rank calculated", verbose=2)
    return dict_of_ranks
def get_ranks(weights, dict_of_ids, mult, dates, decimals):
    """Map each recid to its final rank value.

    The primary component is the scaled weight rounded to `decimals`
    places; a tiny date-based term is added so equal weights order in
    reverse time order, from recent to past."""
    tie_break_unit = pow(10, 0-4-decimals)
    dict_of_ranks = {}
    for recid in dict_of_ids:
        idx = dict_of_ids[recid]
        primary = round(weights[idx] * mult, decimals)
        dict_of_ranks[recid] = primary + dates[idx] * tie_break_unit
    return dict_of_ranks
def sort_weights(dict_of_ranks):
    """Return the recids sorted by decreasing rank value.

    The rank values already embed a small date component (see
    get_ranks), so equal raw weights are implicitly ordered from recent
    to past.

    Fix: the original passed a Python 2 `cmp` function as the second
    positional argument of sorted(); key=/reverse= is equivalent
    (including stability for equal values) and also works on Python 3.
    """
    ranks_by_citations = sorted(dict_of_ranks.keys(),
                                key=lambda recid: dict_of_ranks[recid],
                                reverse=True)
    return ranks_by_citations
def write_first_ranks_to_file(ranks_by_citations, dict_of_ranks, \
        nr_of_ranks, filename):
    """Write the first nr_of_ranks results of the ranking into a file.

    Each output line holds, tab-separated: the 1-based position, the
    recid, and its rank value.  Raises StandardError if the file cannot
    be opened for writing.
    """
    try:
        ranks_file = open(filename,"w")
    except StandardError:
        write_message("Problems with file: %s" % filename, sys.stderr)
        raise StandardError
    for i in range(nr_of_ranks):
        ranks_file.write(str(i+1) + "\t" + str(ranks_by_citations[i]) + \
            "\t" + str(dict_of_ranks[ranks_by_citations[i]]) + "\n")
    ranks_file.close()
    write_message("The first %s pairs recid:rank in the ranking order \
are written into this file: %s" % (nr_of_ranks, filename), verbose=2)
def del_rank_method_data(rank_method_code):
    """Delete the data for a rank method from rnkMETHODDATA table"""
    # NOTE(review): if rank_method_code is not present in rnkMETHOD this
    # raises IndexError on id_[0][0] -- callers are expected to pass a
    # configured method name
    id_ = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code,))
    run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id_[0][0], ))
def into_db(dict_of_ranks, rank_method_code):
    """Store the ranking results in the rnkMETHODDATA table.

    The previous data for the method is deleted first; the ranks dict is
    marshal-serialized, inserted, and the method's last_updated
    timestamp is refreshed.
    """
    # numeric id of the ranking method
    method_id = run_sql("SELECT id from rnkMETHOD where name=%s", \
        (rank_method_code, ))
    # replace any previously stored data for this method
    del_rank_method_data(rank_method_code)
    serialized_data = serialize_via_marshal(dict_of_ranks)
    method_id_str = str(method_id[0][0])
    run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) \
VALUES (%s,%s)",(method_id_str, serialized_data,))
    # record when this ranking was last recomputed
    date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", \
        (date, rank_method_code))
    write_message("Finished writing the ranks into rnkMETHOD table", verbose=5)
def run_pagerank(cit, dict_of_ids, len_, ref, damping_factor, \
        conv_threshold, check_point, dates):
    """Orchestrate the plain PageRank computation and return the final
    recid -> rank dictionary (weights rounded to 2 decimals, with the
    date tie-break applied by get_ranks)."""
    write_message("Running the PageRank method", verbose=5)
    matrix_parts = construct_sparse_matrix(cit, ref, dict_of_ids, len_,
                                           damping_factor)
    sparse, semi_sparse, semi_sparse_coeficient = matrix_parts
    final_weights = pagerank(conv_threshold, check_point, len_,
                             sparse, semi_sparse, semi_sparse_coeficient)
    return get_ranks(final_weights, dict_of_ids, 1, dates, 2)
def run_pagerank_ext(cit, dict_of_ids, ref, ext_links, \
        conv_threshold, check_point, alpha, beta, dates):
    """Orchestrate the PageRank-with-external-citations computation and
    return the final recid -> rank dictionary.  The iteration runs on
    len_+1 nodes: the extra node models the external world."""
    write_message( "Running the PageRank with external links method", verbose=5)
    nr_of_papers = len(dict_of_ids)
    sparse, semi_sparse = construct_sparse_matrix_ext(cit, ref, ext_links,
                                                      dict_of_ids, alpha, beta)
    final_weights = pagerank_ext(conv_threshold, check_point,
                                 nr_of_papers + 1, sparse, semi_sparse)
    return get_ranks(final_weights, dict_of_ids, 1, dates, 2)
def run_pagerank_time(cit, dict_of_ids, len_, ref, damping_factor, \
        conv_threshold, check_point, date_coef, dates):
    """Orchestrate the PageRank-with-time-decay computation and return
    the final recid -> rank dictionary.  The weights are passed to
    get_ranks with a 100000 multiplier."""
    write_message("Running the PageRank_time method", verbose=5)
    matrix_parts = construct_sparse_matrix_time(cit, ref, dict_of_ids,
                                                damping_factor, date_coef)
    sparse, semi_sparse, semi_sparse_coeficient = matrix_parts
    final_weights = pagerank_time(conv_threshold, check_point, len_,
                                  sparse, semi_sparse,
                                  semi_sparse_coeficient, date_coef)
    return get_ranks(final_weights, dict_of_ids, 100000, dates, 2)
def run_citation_rank_time(cit, dict_of_ids, date_coef, dates):
    """Orchestrate the time-decayed citation-count method and return the
    final recid -> rank dictionary (2-decimal rounding)."""
    write_message("Running the citation rank with time decay method", verbose=5)
    return citation_rank_time(cit, dict_of_ids, date_coef, dates, 2)
def spearman_rank_correlation_coef(rank1, rank2, len_):
    """Spearman rank correlation coefficient between two rankings.

    rank1 and rank2 are sequences holding the recids in ranking order.
    Returns c with -1 <= c <= 1; the closer c is to 1, the more similar
    the two rankings are.

    Improvement: a position table for rank2 is built once, making the
    whole computation O(n) instead of the O(n^2) repeated list.index
    calls of the original (first occurrence wins, as with list.index).
    """
    position_in_rank2 = {}
    for pos, recid in enumerate(rank2):
        if recid not in position_in_rank2:
            position_in_rank2[recid] = pos
    total = 0
    for i in range(len_):
        diff = i - position_in_rank2[rank1[i]]
        total += diff * diff
    return 1 - (6.0 * total) / (len_*(len_*len_ - 1))
def remove_loops(cit, dates, dict_of_ids):
    """Return a copy of the citation dict with simple citation loops
    removed.

    With time decay, new papers inside a loop would keep feeding fake
    weight to each other, so a citation is kept only when the citing
    paper is not older than the cited one and the two papers do not cite
    each other mutually."""
    new_cit = {}
    for recid in cit:
        kept_citers = []
        for cited_by in cit[recid]:
            if dates[dict_of_ids[cited_by]] < dates[dict_of_ids[recid]]:
                # citing paper is older than the cited one: drop it
                write_message("Loop removed: %s <-> %s" \
                    %(cited_by, recid), verbose=9)
                continue
            if cited_by in cit and recid in cit[cited_by]:
                # mutual citation pair: drop this direction
                write_message("Loop removed: %s <-> %s" \
                    %(cited_by, recid), verbose=9)
                continue
            kept_citers.append(cited_by)
        new_cit[recid] = kept_citers
    write_message("Simple loops removed", verbose=5)
    return new_cit
def calculate_time_weights(len_, time_decay, dates):
    """Compute the exponential time-decay coefficient of every paper:
    exp(time_decay * (publication_year - current_year)), so older papers
    get exponentially smaller coefficients."""
    this_year = int(datetime.datetime.now().strftime("%Y"))
    date_coef = dict((j, exp(time_decay*(dates[j] - this_year)))
                     for j in range(len_))
    write_message("Time weights calculated", verbose=5)
    write_message("Time weights: %s" % str(date_coef), verbose=9)
    return date_coef
def get_dates(function, config, dict_of_ids):
    """Return a dict with the publication year of each paper index.

    If the config section defines 'file_with_dates' the years are read
    from that file; otherwise (or on any read error) they are read from
    the database.
    """
    try:
        file_for_dates = config.get(function, "file_with_dates")
        dates = get_dates_from_file(file_for_dates, dict_of_ids)
    except (ConfigParser.NoOptionError, StandardError), err:
        write_message("If you want to read the dates from file set up the \
'file_for_dates' variable in the config file [%s]" %err, verbose=3)
        dates = get_dates_from_db(dict_of_ids)
    return dates
def citerank(rank_method_code):
    """Entry point of the citation-graph ranking methods.

    Reads the method configuration from
    CFG_ETCDIR/bibrank/<rank_method_code>.cfg, collects the citation
    graph (from file or DB), dispatches to one of the methods
    'citation_time', 'pagerank_classic' (optionally with external
    citations) or 'pagerank_time', optionally dumps the top ranks to a
    file, and finally stores the ranks in the database.

    Raises StandardError/Exception on missing configuration or data.
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)
    try:
        file_ = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg"
        config = ConfigParser.ConfigParser()
        config.readfp(open(file_))
    except StandardError:
        write_message("Cannot find configuration file: %s" % file_, sys.stderr)
        raise StandardError
    # the file for citations needs to have the following format:
    #each line needs to be x[tab]y, where x cites y; x,y are recids
    function = config.get("rank_method", "function")
    try:
        file_for_citations = config.get(function, "file_with_citations")
        cit, dict_of_ids = get_citations_from_file(file_for_citations)
    except (ConfigParser.NoOptionError, StandardError), err:
        # fall back to the citation data stored in the database
        write_message("If you want to read the citation data from file set up \
the file_for_citations parameter in the config file [%s]" %err, verbose=2)
        cit, dict_of_ids = get_citations_from_db()
    len_ = len(dict_of_ids.keys())
    write_message("Number of nodes(papers) to rank : %s" % str(len_), verbose=3)
    if len_ == 0:
        write_message("Error: No citations to read!", sys.stderr)
        raise Exception
    try:
        method = config.get(function, "citerank_method")
    except ConfigParser.NoOptionError, err:
        write_message("Exception: %s " %err, sys.stderr)
        raise Exception
    write_message("Running %s method." % method, verbose=2)
    dates = get_dates(function, config, dict_of_ids)
    if method == "citation_time":
        # time-decayed citation counting
        try:
            time_decay = float(config.get(function, "time_decay"))
        except (ConfigParser.NoOptionError, ValueError), err:
            write_message("Exception: %s" % err, sys.stderr)
            raise Exception
        date_coef = calculate_time_weights(len_, time_decay, dates)
        #cit = remove_loops(cit, dates, dict_of_ids)
        dict_of_ranks = \
            run_citation_rank_time(cit, dict_of_ids, date_coef, dates)
    else:
        # the PageRank variants share these three parameters
        try:
            conv_threshold = float(config.get(function, "conv_threshold"))
            check_point = int(config.get(function, "check_point"))
            damping_factor = float(config.get(function, "damping_factor"))
            write_message("Parameters: d = %s, conv_threshold = %s, \
check_point = %s" %(str(damping_factor), \
                str(conv_threshold), str(check_point)), verbose=5)
        except (ConfigParser.NoOptionError, StandardError), err:
            write_message("Exception: %s" % err, sys.stderr)
            raise Exception
        if method == "pagerank_classic":
            ref = construct_ref_array(cit, dict_of_ids, len_)
            use_ext_cit = ""
            try:
                use_ext_cit = config.get(function, "use_external_citations")
                write_message("Pagerank will use external citations: %s" \
                    %str(use_ext_cit), verbose=5)
            except (ConfigParser.NoOptionError, StandardError), err:
                write_message("%s" % err, verbose=2)
            if use_ext_cit == "yes":
                # external citations: read from file if configured,
                # otherwise count them from the reference tags in the DB
                try:
                    ext_citation_file = config.get(function, "ext_citation_file")
                    ext_links = get_external_links_from_file\
                        (ext_citation_file, ref, dict_of_ids)
                except (ConfigParser.NoOptionError, StandardError):
                    write_message("If you want to read the external citation data \
from file set up the ext_citation_file parameter in the config. file", verbose=3)
                    try:
                        reference_tag = config.get(function, "ext_reference_tag")
                        # the first 3 characters must be numeric
                        dummy = int(reference_tag[0:3])
                    except (ConfigParser.NoOptionError, StandardError):
                        write_message("You need to set up correctly the \
reference_tag in the cfg file", sys.stderr)
                        raise Exception
                    ext_links = get_external_links_from_db(ref, \
                        dict_of_ids, reference_tag)
                    avg = avg_ext_links_with_0(ext_links)
                    if avg < 1:
                        write_message("This method can't be ran. There is not enough \
information about the external citation. Hint: check the reference tag", sys.stderr)
                        raise Exception
                    avg_ext_links_without_0(ext_links)
                try:
                    alpha = float(config.get(function, "ext_alpha"))
                    beta = float(config.get(function, "ext_beta"))
                except (ConfigParser.NoOptionError, StandardError), err:
                    write_message("Exception: %s" % err, sys.stderr)
                    raise Exception
                dict_of_ranks = run_pagerank_ext(cit, dict_of_ids, ref, \
                    ext_links, conv_threshold, check_point, alpha, beta, dates)
            else:
                dict_of_ranks = run_pagerank(cit, dict_of_ids, len_, ref, \
                    damping_factor, conv_threshold, check_point, dates)
        elif method == "pagerank_time":
            try:
                time_decay = float(config.get(function, "time_decay"))
                write_message("Parameter: time_decay = %s" %str(time_decay), verbose=5)
            except (ConfigParser.NoOptionError, StandardError), err:
                write_message("Exception: %s" % err, sys.stderr)
                raise Exception
            date_coef = calculate_time_weights(len_, time_decay, dates)
            # loops would accumulate fake weight under time decay
            cit = remove_loops(cit, dates, dict_of_ids)
            ref = construct_ref_array(cit, dict_of_ids, len_)
            dict_of_ranks = run_pagerank_time(cit, dict_of_ids, len_, ref, \
                damping_factor, conv_threshold, check_point, date_coef, dates)
        else:
            write_message("Error: Unknown ranking method. \
Please check the ranking_method parameter in the config. file.", sys.stderr)
            raise Exception
    # optionally dump the first N ranks to a file
    try:
        filename_ranks = config.get(function, "output_ranks_to_filename")
        max_ranks = config.get(function, "output_rank_limit")
        if not max_ranks.isdigit():
            max_ranks = len_
        else:
            max_ranks = int(max_ranks)
            if max_ranks > len_:
                max_ranks = len_
        ranks = sort_weights(dict_of_ranks)
        write_message("Ranks: %s" % str(ranks), verbose=9)
        write_first_ranks_to_file(ranks, dict_of_ranks, \
            max_ranks, filename_ranks)
    except (ConfigParser.NoOptionError, StandardError):
        write_message("If you want the ranks to be printed in a file you have \
to set output_ranks_to_filename and output_rank_limit \
parameters in the configuration file", verbose=3)
    # persist the ranks for the search engine to use
    into_db(dict_of_ranks, rank_method_code)

Event Timeline