
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Implementation of different ranking methods based on
the citation graph:
- citation count/ time decayed citation count
- pagerank / pagerank with external citations
- time decayed pagerank
"""
# pylint: disable=E0611
import ConfigParser
from math import exp
import datetime
import time
import re
import sys
try:
    from numpy import array, ones, zeros, int32, float32, sqrt, dot
    import_numpy = 1
except ImportError:
    import_numpy = 0

if sys.hexversion < 0x2040000:
    # pylint: disable=W0622
    from sets import Set as set
    # pylint: enable=W0622

from invenio.dbquery import run_sql, serialize_via_marshal, \
                            deserialize_via_marshal
from invenio.bibtask import write_message
from invenio.config import CFG_ETCDIR

def get_citations_from_file(filename):
    """gets the citation data (who cites who) from a file and returns
       - a dictionary of type x:{x1,x2..}, where x is cited by x1,x2..
       - a dictionary of type a:{b}, where recid 'a' is associated
         with an index 'b' """
    cit = {}
    dict_of_ids = {}
    count = 0
    try:
        citation_file = open(filename, "r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in citation_file:
        tokens = line.strip().split()
        recid_cites = int(tokens[0])
        recid_cited = int(tokens[1])
        if recid_cited not in cit:
            cit[recid_cited] = []
        # without this check, duplicates might be introduced
        if recid_cites not in cit[recid_cited] and recid_cites != recid_cited:
            cit[recid_cited].append(recid_cites)
        if recid_cites not in dict_of_ids:
            dict_of_ids[recid_cites] = count
            count += 1
        if recid_cited not in dict_of_ids:
            dict_of_ids[recid_cited] = count
            count += 1
    citation_file.close()
    write_message("Citation data collected from file: %s" % filename, verbose=2)
    write_message("Ids and recids correspondence: %s" % str(dict_of_ids),
                  verbose=9)
    write_message("Citations: %s" % str(cit), verbose=9)
    return cit, dict_of_ids
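
# Illustrative example (not part of the original module): a citation file
# containing the two lines "3 1" and "4 1" (recid 3 cites recid 1, recid 4
# cites recid 1) yields cit = {1: [3, 4]} and dict_of_ids = {3: 0, 1: 1, 4: 2};
# recids are given dense indexes 0, 1, 2, ... in the order they are first seen.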

def get_citations_from_db():
    """gets the citation data (who cites who) from the rnkCITATIONDATA table
    and returns:
       - a dictionary of type x:{x1,x2..}, where x is cited by x1,x2..
       - a dictionary of type a:{b}, where recid 'a' is associated
         with an index 'b' """
    dict_of_ids = {}
    count = 0
    query = "select object_value from rnkCITATIONDATA " \
            "where object_name = 'citationdict'"
    cit_compressed = run_sql(query)
    cit = []
    if cit_compressed and cit_compressed[0] and cit_compressed[0][0]:
        cit = deserialize_via_marshal(cit_compressed[0][0])
        if cit:
            for item in cit:
                # check for duplicates in the citation dictionary
                cit[item] = set(cit[item])
                if item in cit[item]:
                    cit[item].remove(item)
                if item not in dict_of_ids:
                    dict_of_ids[item] = count
                    count += 1
                for value in cit[item]:
                    if value not in dict_of_ids:
                        dict_of_ids[value] = count
                        count += 1
            write_message("Citation data collected from rnkCITATIONDATA",
                          verbose=2)
            write_message("Ids and recids correspondence: %s"
                          % str(dict_of_ids), verbose=9)
            write_message("Citations: %s" % str(cit), verbose=9)
            return cit, dict_of_ids
        else:
            write_message("Error while extracting citation data "
                          "from the rnkCITATIONDATA table", verbose=1)
    else:
        write_message("Error while extracting citation data "
                      "from the rnkCITATIONDATA table", verbose=1)
    return {}, {}

def construct_ref_array(cit, dict_of_ids, len_):
    """returns an array with the number of references that each recid has"""
    ref = zeros(len_, int32)
    for key in cit:
        for value in cit[key]:
            ref[dict_of_ids[value]] += 1
    write_message("Number of references: %s" % str(ref), verbose=9)
    write_message("Finished computing the total number "
                  "of references for each paper.", verbose=5)
    return ref
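
# Continuing the example above: with cit = {1: [3, 4]} and
# dict_of_ids = {3: 0, 1: 1, 4: 2}, the result is ref = [1, 0, 1]: papers
# 3 and 4 each hold one reference into the dataset, paper 1 holds none.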

def get_external_links_from_file(filename, ref, dict_of_ids):
    """returns a dictionary containing the number of
    external links for each recid
    external link = citation that is not in our database"""
    ext_links = {}
    # format: ext_links[dict_of_ids[recid]] = number of total external links
    try:
        external_file = open(filename, "r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in external_file:
        tokens = line.strip().split()
        recid = int(tokens[0])
        nr_of_external = int(tokens[1])
        ext_links[dict_of_ids[recid]] = nr_of_external - ref[dict_of_ids[recid]]
        if ext_links[dict_of_ids[recid]] < 0:
            ext_links[dict_of_ids[recid]] = 0
    external_file.close()
    write_message("External link information extracted", verbose=2)
    return ext_links

def get_external_links_from_db_old(ref, dict_of_ids, reference_indicator):
    """returns a dictionary containing the number of
    external links for each recid
    external link = citation that is not in our database"""
    ext_links = {}
    reference_tag_regex = reference_indicator + "[a-z]"
    for recid in dict_of_ids:
        query = "select COUNT(DISTINCT field_number) from bibrec_bib99x " \
                "where id_bibrec='%s' and id_bibxxx in " \
                "(select id from bib99x where tag RLIKE '%s');" \
                % (str(recid), reference_tag_regex)
        result_set = run_sql(query)
        if result_set:
            total_links = int(result_set[0][0])
            internal_links = ref[dict_of_ids[recid]]
            ext_links[dict_of_ids[recid]] = total_links - internal_links
            if ext_links[dict_of_ids[recid]] < 0:
                ext_links[dict_of_ids[recid]] = 0
        else:
            ext_links[dict_of_ids[recid]] = 0
    write_message("External link information extracted", verbose=2)
    write_message("External links: %s" % str(ext_links), verbose=9)
    return ext_links

def get_external_links_from_db(ref, dict_of_ids, reference_indicator):
    """returns a dictionary containing the number of
    external links for each recid
    external link = citation that is not in our database"""
    ext_links = {}
    dict_all_ref = {}
    for recid in dict_of_ids:
        dict_all_ref[recid] = 0
        ext_links[dict_of_ids[recid]] = 0
    reference_db_id = reference_indicator[0:2]
    reference_tag_regex = reference_indicator + "[a-z]"
    tag_list = run_sql("select id from bib" + reference_db_id +
                       "x where tag RLIKE %s", (reference_tag_regex, ))
    tag_set = set()
    for tag in tag_list:
        tag_set.add(tag[0])
    ref_list = run_sql("select id_bibrec, id_bibxxx, field_number from "
                       "bibrec_bib" + reference_db_id + "x group by "
                       "id_bibrec, field_number")
    for item in ref_list:
        recid = int(item[0])
        id_bib = int(item[1])
        if recid in dict_of_ids and id_bib in tag_set:
            dict_all_ref[recid] += 1
    for recid in dict_of_ids:
        total_links = dict_all_ref[recid]
        internal_links = ref[dict_of_ids[recid]]
        ext_links[dict_of_ids[recid]] = total_links - internal_links
        if ext_links[dict_of_ids[recid]] < 0:
            ext_links[dict_of_ids[recid]] = 0
    write_message("External link information extracted", verbose=2)
    write_message("External links: %s" % str(ext_links), verbose=9)
    return ext_links

def avg_ext_links_with_0(ext_links):
    """returns the average number of external links per paper,
    counting also the papers with 0 external links"""
    total = 0.0
    for item in ext_links:
        total += ext_links[item]
    avg_ext = total / len(ext_links)
    write_message("The average number of external links per paper (including "
                  "papers with 0 external links) is: %s" % str(avg_ext),
                  verbose=3)
    return avg_ext

def avg_ext_links_without_0(ext_links):
    """returns the average number of external links per paper,
    excluding the papers with 0 external links from the count"""
    count = 0.0
    total = 0.0
    for item in ext_links:
        if ext_links[item] != 0:
            count += 1
            total += ext_links[item]
    avg_ext = total / count
    write_message("The average number of external links per paper (excluding "
                  "papers with 0 external links) is: %s" % str(avg_ext),
                  verbose=3)
    return avg_ext

def leaves(ref):
    """returns the number of papers that do not cite any other paper"""
    nr_of_leaves = 0
    for i in ref:
        if i == 0:
            nr_of_leaves += 1
    write_message("The number of papers that do not cite "
                  "any other papers: %s" % str(nr_of_leaves), verbose=3)
    return nr_of_leaves

def get_dates_from_file(filename, dict_of_ids):
    """Returns the year of publication for each paper, read from a file."""
    dates = {}
    # the format is: dates[dict_of_ids[recid]] = year
    try:
        dates_file = open(filename, "r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in dates_file:
        tokens = line.strip().split()
        recid = int(tokens[0])
        year = int(tokens[1])
        dates[dict_of_ids[recid]] = year
    dates_file.close()
    write_message("Dates extracted", verbose=2)
    write_message("Dates dictionary %s" % str(dates), verbose=9)
    return dates

def get_dates_from_db(dict_of_ids, publication_year_tag, creation_date_tag):
    """Returns the year of publication for each paper.
    In case the year is not in the db, the year of the submission is taken."""
    current_year = int(datetime.datetime.now().strftime("%Y"))
    publication_year_db_id = publication_year_tag[0:2]
    creation_date_db_id = creation_date_tag[0:2]
    total = 0
    count = 0
    dict_of_dates = {}
    for recid in dict_of_ids:
        dict_of_dates[recid] = 0
    date_list = run_sql("select id, tag, value from bib" +
                        publication_year_db_id + "x where tag=%s",
                        (publication_year_tag, ))
    date_dict = {}
    for item in date_list:
        date_dict[int(item[0])] = item[2]
    pattern = re.compile(r'.*(\d{4}).*')
    date_list = run_sql("select id_bibrec, id_bibxxx, field_number "
                        "from bibrec_bib" + publication_year_db_id + "x")
    for item in date_list:
        recid = int(item[0])
        id_ = int(item[1])
        if id_ in date_dict and recid in dict_of_dates:
            reg = pattern.match(date_dict[id_])
            if reg:
                date = int(reg.group(1))
                if date > 1000 and date <= current_year:
                    dict_of_dates[recid] = date
                    total += date
                    count += 1
    not_covered = []
    for recid in dict_of_dates:
        if dict_of_dates[recid] == 0:
            not_covered.append(recid)
    date_list = run_sql("select id, tag, value from bib" +
                        creation_date_db_id + "x where tag=%s",
                        (creation_date_tag, ))
    date_dict = {}
    for item in date_list:
        date_dict[int(item[0])] = item[2]
    date_list = run_sql("select id_bibrec, id_bibxxx, field_number "
                        "from bibrec_bib" + creation_date_db_id + "x")
    for item in date_list:
        recid = int(item[0])
        id_ = int(item[1])
        if id_ in date_dict and recid in not_covered:
            date = int(str(date_dict[id_])[0:4])
            if date > 1000 and date <= current_year:
                dict_of_dates[recid] = date
                total += date
                count += 1
    dates = {}
    # mean publication year, used for records with no recoverable date
    med = total / count
    for recid in dict_of_dates:
        if dict_of_dates[recid] == 0:
            dates[dict_of_ids[recid]] = med
        else:
            dates[dict_of_ids[recid]] = dict_of_dates[recid]
    write_message("Dates extracted", verbose=2)
    write_message("Dates dictionary %s" % str(dates), verbose=9)
    return dates

def construct_sparse_matrix(cit, ref, dict_of_ids, len_, damping_factor):
    """returns several structures needed in the calculation
    of the PAGERANK method; using these structures, we don't need
    to keep the full matrix in memory"""
    sparse = {}
    for item in cit:
        for value in cit[item]:
            sparse[(dict_of_ids[item], dict_of_ids[value])] = \
                damping_factor * 1.0 / ref[dict_of_ids[value]]
    semi_sparse = []
    for j in range(len_):
        if ref[j] == 0:
            semi_sparse.append(j)
    semi_sparse_coeficient = damping_factor / len_
    #zero_coeficient = (1 - damping_factor) / len_
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse, semi_sparse_coeficient
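
# These structures encode the PageRank transition matrix without ever
# materializing it: `sparse` holds the entries d/ref[j] for each citation
# j -> i, `semi_sparse` lists the dangling columns (papers that cite nothing
# inside the dataset), and the uniform teleportation term (1-d)/len_ is
# applied implicitly inside pagerank() below.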

def construct_sparse_matrix_ext(cit, ref, ext_links, dict_of_ids, alpha, beta):
    """returns several structures needed in the calculation
    of the PAGERANK_EXT method;
    a paper that doesn't cite anyone is treated as if it
    cited everyone with equal probability (1/len_)"""
    len_ = len(dict_of_ids)
    sparse = {}
    semi_sparse = {}
    sparse[0, 0] = 1.0 - alpha
    for j in range(len_):
        sparse[j+1, 0] = alpha / (len_)
        if j not in ext_links:
            sparse[0, j+1] = beta / (len_ + beta)
        else:
            if ext_links[j] == 0:
                sparse[0, j+1] = beta / (len_ + beta)
            else:
                aux = beta * ext_links[j]
                if ref[j] == 0:
                    sparse[0, j+1] = aux / (aux + len_)
                else:
                    sparse[0, j+1] = aux / (aux + ref[j])
        if ref[j] == 0:
            semi_sparse[j+1] = (1.0 - sparse[0, j + 1]) / len_
    for item in cit:
        for value in cit[item]:
            sparse[(dict_of_ids[item] + 1, dict_of_ids[value] + 1)] = \
                (1.0 - sparse[0, dict_of_ids[value] + 1]) / ref[dict_of_ids[value]]
    #for i in range(len_ + 1):
    #    a = ""
    #    for j in range(len_ + 1):
    #        if (i, j) in sparse:
    #            a += str(sparse[(i, j)]) + "\t"
    #        else:
    #            a += "0\t"
    #    print a
    #print semi_sparse
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse
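
# A note on the structure above: index 0 of the extended matrix is a
# pseudo-node that stands for the world outside the database. For a paper j
# that has both internal references and external links, the fraction
# beta*ext_links[j] / (beta*ext_links[j] + ref[j]) of its weight flows to the
# pseudo-node, which keeps 1 - alpha for itself and sends alpha/len_ back to
# every paper.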

def construct_sparse_matrix_time(cit, ref, dict_of_ids,
                                 damping_factor, date_coef):
    """returns several structures needed in the calculation
    of the PAGERANK_TIME method; using these structures,
    we don't need to keep the full matrix in memory"""
    len_ = len(dict_of_ids)
    sparse = {}
    for item in cit:
        for value in cit[item]:
            sparse[(dict_of_ids[item], dict_of_ids[value])] = damping_factor * \
                date_coef[dict_of_ids[value]] / ref[dict_of_ids[value]]
    semi_sparse = []
    for j in range(len_):
        if ref[j] == 0:
            semi_sparse.append(j)
    semi_sparse_coeficient = damping_factor / len_
    #zero_coeficient = (1 - damping_factor) / len_
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse, semi_sparse_coeficient

def statistics_on_sparse(sparse):
    """returns the number of papers that cite themselves"""
    count_diag = 0
    for (i, j) in sparse.keys():
        if i == j:
            count_diag += 1
    write_message("The number of papers that cite themselves: %s" %
                  str(count_diag), verbose=3)
    return count_diag

def pagerank(conv_threshold, check_point, len_, sparse,
             semi_sparse, semi_sparse_coef):
    """the core function of the PAGERANK method;
    returns an array with the ranks corresponding to each recid"""
    weights_old = ones((len_), float32)  # initial weights
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in range(check_point):
            weights_new = zeros((len_), float32)
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)] * weights_old[j]
            semi_total = 0.0
            for j in semi_sparse:
                semi_total += weights_old[j]
            weights_new = weights_new + semi_sparse_coef * semi_total + \
                (1.0 / len_ - semi_sparse_coef) * sum(weights_old)
            if step == check_point - 1:
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff)) / len_
                write_message("Finished step: %s, %s "
                              % (str(check_point * (nr_of_check_points - 1) + step),
                                 str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculation for all recids finished in %s check "
                  "points; the final difference was %s"
                  % (str(nr_of_check_points), str(difference)), verbose=2)
    return weights_old
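
# A sketch of the update computed by each pagerank() sweep, in dense notation:
#   w_new = d*A*w_old + (d/n) * sum_{j dangling} w_old[j]
#           + ((1 - d)/n) * sum(w_old)
# where A[i][j] = 1/ref[j] for every citation j -> i, d is the damping factor,
# n = len_, and "dangling" papers are those with ref[j] == 0; convergence is
# declared once sqrt(dot(diff, diff))/n drops below conv_threshold.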

def pagerank_ext(conv_threshold, check_point, len_, sparse, semi_sparse):
    """the core function of the PAGERANK_EXT method;
    returns an array with the ranks corresponding to each recid"""
    weights_old = ones((len_), float32)
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in range(check_point):
            weights_new = zeros((len_), float32)
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)] * weights_old[j]
            total_sum = 0.0
            for j in semi_sparse:
                total_sum += semi_sparse[j] * weights_old[j]
            weights_new[1:len_] = weights_new[1:len_] + total_sum
            if step == check_point - 1:
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff)) / len_
                write_message("Finished step: %s, %s "
                              % (str(check_point * (nr_of_check_points - 1) + step),
                                 str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculation for all recids finished in %s check "
                  "points; the final difference was %s"
                  % (str(nr_of_check_points), str(difference)), verbose=2)
    #return weights_old[1:len_]/(len_ - weights_old[0])
    return weights_old[1:len_]

def pagerank_time(conv_threshold, check_point, len_,
                  sparse, semi_sparse, semi_sparse_coeficient, date_coef):
    """the core function of the PAGERANK_TIME method: PageRank + time decay;
    returns an array with the ranks corresponding to each recid"""
    weights_old = ones((len_), float32)  # initial weights
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in range(check_point):
            weights_new = zeros((len_), float32)
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)] * weights_old[j]
            semi_total = 0.0
            for j in semi_sparse:
                semi_total += weights_old[j] * date_coef[j]
            zero_total = 0.0
            for i in range(len_):
                zero_total += weights_old[i] * date_coef[i]
            #dates = array(date_coef.keys())
            #zero_total = dot(weights_old, dates)
            weights_new = weights_new + semi_sparse_coeficient * semi_total + \
                (1.0 / len_ - semi_sparse_coeficient) * zero_total
            if step == check_point - 1:
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff)) / len_
                write_message("Finished step: %s, %s "
                              % (str(check_point * (nr_of_check_points - 1) + step),
                                 str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculation for all recids finished in %s check "
                  "points; the final difference was %s"
                  % (str(nr_of_check_points), str(difference)), verbose=2)
    return weights_old

def citation_rank_time(cit, dict_of_ids, date_coef, dates, decimals):
    """returns a dictionary recid:weight based on the total number of
    citations as a function of time"""
    dict_of_ranks = {}
    for key in dict_of_ids:
        if key in cit:
            dict_of_ranks[key] = 0
            for recid in cit[key]:
                dict_of_ranks[key] += date_coef[dict_of_ids[recid]]
            dict_of_ranks[key] = round(dict_of_ranks[key], decimals) + \
                dates[dict_of_ids[key]] * pow(10, 0 - 4 - decimals)
        else:
            dict_of_ranks[key] = dates[dict_of_ids[key]] * \
                pow(10, 0 - 4 - decimals)
    write_message("Citation rank calculated", verbose=2)
    return dict_of_ranks

def get_ranks(weights, dict_of_ids, mult, dates, decimals):
    """returns a dictionary recid:value, where the value is the weight of the
    recid paper; the secondary sort order is reverse time order,
    from recent to past"""
    dict_of_ranks = {}
    for item in dict_of_ids:
        dict_of_ranks[item] = round(weights[dict_of_ids[item]] * mult, decimals) \
            + dates[dict_of_ids[item]] * pow(10, 0 - 4 - decimals)
        #dict_of_ranks[item] = weights[dict_of_ids[item]]
    return dict_of_ranks

def sort_weights(dict_of_ranks):
    """sorts the recids based on weights (first order)
    and on dates (second order)"""
    ranks_by_citations = sorted(dict_of_ranks.keys(), lambda x, y:
                                cmp(dict_of_ranks[y], dict_of_ranks[x]))
    return ranks_by_citations

def normalize_weights(dict_of_ranks):
    """normalizes the weights to a maximum of 100, so that they are
    comparable with the weights of the other ranking methods"""
    max_weight = 0.0
    for recid in dict_of_ranks:
        weight = dict_of_ranks[recid]
        if weight > max_weight:
            max_weight = weight
    for recid in dict_of_ranks:
        dict_of_ranks[recid] = round(dict_of_ranks[recid] * 100.0 / max_weight, 3)

def write_first_ranks_to_file(ranks_by_citations, dict_of_ranks,
                              nr_of_ranks, filename):
    """Writes the first n results of the ranking method into a file"""
    try:
        ranks_file = open(filename, "w")
    except StandardError:
        write_message("Problems with file: %s" % filename, sys.stderr)
        raise StandardError
    for i in range(nr_of_ranks):
        ranks_file.write(str(i + 1) + "\t" + str(ranks_by_citations[i]) +
                         "\t" + str(dict_of_ranks[ranks_by_citations[i]]) + "\n")
    ranks_file.close()
    write_message("The first %s recid:rank pairs, in ranking order, "
                  "were written into the file: %s" % (nr_of_ranks, filename),
                  verbose=2)

def del_rank_method_data(rank_method_code):
    """Deletes the data for a rank method from the rnkMETHODDATA table"""
    id_ = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
    run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id_[0][0], ))

def into_db(dict_of_ranks, rank_method_code):
    """Writes the ranking results into the rnkMETHODDATA table"""
    method_id = run_sql("SELECT id from rnkMETHOD where name=%s",
                        (rank_method_code, ))
    del_rank_method_data(rank_method_code)
    serialized_data = serialize_via_marshal(dict_of_ranks)
    method_id_str = str(method_id[0][0])
    run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) "
            "VALUES(%s, %s) ", (method_id_str, serialized_data, ))
    date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s",
            (date, rank_method_code))
    write_message("Finished writing the ranks into the rnkMETHODDATA table",
                  verbose=5)

def run_pagerank(cit, dict_of_ids, len_, ref, damping_factor,
                 conv_threshold, check_point, dates):
    """returns the final form of the ranks when using the pagerank method"""
    write_message("Running the PageRank method", verbose=5)
    sparse, semi_sparse, semi_sparse_coeficient = \
        construct_sparse_matrix(cit, ref, dict_of_ids, len_, damping_factor)
    weights = pagerank(conv_threshold, check_point, len_,
                       sparse, semi_sparse, semi_sparse_coeficient)
    dict_of_ranks = get_ranks(weights, dict_of_ids, 1, dates, 2)
    return dict_of_ranks

def run_pagerank_ext(cit, dict_of_ids, ref, ext_links,
                     conv_threshold, check_point, alpha, beta, dates):
    """returns the final form of the ranks when using the pagerank_ext method"""
    write_message("Running the PageRank with external links method", verbose=5)
    len_ = len(dict_of_ids)
    sparse, semi_sparse = construct_sparse_matrix_ext(cit, ref, ext_links,
                                                      dict_of_ids, alpha, beta)
    weights = pagerank_ext(conv_threshold, check_point,
                           len_ + 1, sparse, semi_sparse)
    dict_of_ranks = get_ranks(weights, dict_of_ids, 1, dates, 2)
    return dict_of_ranks

def run_pagerank_time(cit, dict_of_ids, len_, ref, damping_factor,
                      conv_threshold, check_point, date_coef, dates):
    """returns the final form of the ranks when using the
    pagerank + time decay method"""
    write_message("Running the PageRank_time method", verbose=5)
    sparse, semi_sparse, semi_sparse_coeficient = \
        construct_sparse_matrix_time(cit, ref, dict_of_ids,
                                     damping_factor, date_coef)
    weights = pagerank_time(conv_threshold, check_point, len_,
                            sparse, semi_sparse, semi_sparse_coeficient,
                            date_coef)
    dict_of_ranks = get_ranks(weights, dict_of_ids, 100000, dates, 2)
    return dict_of_ranks

def run_citation_rank_time(cit, dict_of_ids, date_coef, dates):
    """returns the final form of the ranks when using the citation count
    as a function of time method"""
    write_message("Running the citation rank with time decay method", verbose=5)
    dict_of_ranks = citation_rank_time(cit, dict_of_ids, date_coef, dates, 2)
    return dict_of_ranks

def spearman_rank_correlation_coef(rank1, rank2, len_):
    """rank1 and rank2 are arrays containing the recids in ranking order;
    returns the correlation coefficient (-1 <= c <= 1) between the 2 rankings;
    the closer c is to 1, the more correlated the two ranking methods are"""
    total = 0
    for i in range(len_):
        rank_value = rank2.index(rank1[i])
        total += (i - rank_value) * (i - rank_value)
    return 1 - (6.0 * total) / (len_ * (len_ * len_ - 1))
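
# This is the usual Spearman formula rho = 1 - 6*sum(d_i^2) / (n*(n^2 - 1)),
# with d_i the displacement of the same recid between the two orderings.
# For example, rank1 = [a, b, c] and rank2 = [a, c, b] give displacements
# (0, 1, 1), so rho = 1 - 6*2 / (3*8) = 0.5.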

def remove_loops(cit, dates, dict_of_ids):
    """when using time decay, new papers that are part of a citation loop
    accumulate a lot of fake weight, so such loops are removed"""
    new_cit = {}
    for recid in cit:
        new_cit[recid] = []
        for cited_by in cit[recid]:
            if dates[dict_of_ids[cited_by]] >= dates[dict_of_ids[recid]]:
                if cited_by in cit:
                    if recid not in cit[cited_by]:
                        new_cit[recid].append(cited_by)
                    else:
                        write_message("Loop removed: %s <-> %s"
                                      % (cited_by, recid), verbose=9)
                else:
                    new_cit[recid].append(cited_by)
            else:
                write_message("Loop removed: %s <-> %s"
                              % (cited_by, recid), verbose=9)
    write_message("Simple loops removed", verbose=5)
    return new_cit
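
# Only "simple" (two-paper) loops are handled: a citation edge is dropped
# either when the citing paper is older than the cited one, or when the two
# papers cite each other; longer citation cycles are left untouched.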

def calculate_time_weights(len_, time_decay, dates):
    """calculates the time coefficient for each paper"""
    current_year = int(datetime.datetime.now().strftime("%Y"))
    date_coef = {}
    for j in range(len_):
        date_coef[j] = exp(time_decay * (dates[j] - current_year))
    write_message("Time weights calculated", verbose=5)
    write_message("Time weights: %s" % str(date_coef), verbose=9)
    return date_coef
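
# For example, with time_decay = 0.2 (an illustrative value), a paper from
# the current year keeps the coefficient exp(0) = 1.0, while a ten-year-old
# paper is damped to exp(-2), roughly 0.135.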

def get_dates(function, config, dict_of_ids):
    """returns a dictionary containing the year of
    publication for each paper"""
    try:
        file_for_dates = config.get(function, "file_with_dates")
        dates = get_dates_from_file(file_for_dates, dict_of_ids)
    except (ConfigParser.NoOptionError, StandardError), err:
        write_message("If you want to read the dates from a file, set the "
                      "'file_with_dates' option in the config file [%s]" % err,
                      verbose=3)
        try:
            publication_year_tag = config.get(function, "publication_year_tag")
            dummy = int(publication_year_tag[0:3])
        except (ConfigParser.NoOptionError, StandardError):
            write_message("You need to set up correctly the "
                          "publication_year_tag in the cfg file", sys.stderr)
            raise Exception
        try:
            creation_date_tag = config.get(function, "creation_date_tag")
            dummy = int(creation_date_tag[0:3])
        except (ConfigParser.NoOptionError, StandardError):
            write_message("You need to set up correctly the "
                          "creation_date_tag in the cfg file", sys.stderr)
            raise Exception
        dates = get_dates_from_db(dict_of_ids, publication_year_tag,
                                  creation_date_tag)
    return dates

def citerank(rank_method_code):
    """new ranking method based on the citation graph"""
    write_message("Running rank method: %s" % rank_method_code, verbose=0)
    if not import_numpy:
        write_message("The numpy package could not be imported. "
                      "This package is compulsory for running the "
                      "citerank methods.")
        return
    try:
        file_ = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg"
        config = ConfigParser.ConfigParser()
        config.readfp(open(file_))
    except StandardError:
        write_message("Cannot find configuration file: %s" % file_, sys.stderr)
        raise StandardError
    # the citation file needs to have the following format:
    # each line is x[tab]y, where x cites y; x and y are recids
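    # A minimal sketch of the expected .cfg layout; the option names are the
    # ones read below and in get_dates(), while the section name and all
    # values are purely illustrative:
    #
    #   [rank_method]
    #   function = citerank_citation_time
    #
    #   [citerank_citation_time]
    #   citerank_method = pagerank_classic
    #   damping_factor = 0.85
    #   conv_threshold = 0.0001
    #   check_point = 10
    #   publication_year_tag = 260__c
    #   creation_date_tag = 961__x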
function = config.get("rank_method", "function")
try:
file_for_citations = config.get(function, "file_with_citations")
cit, dict_of_ids = get_citations_from_file(file_for_citations)
except (ConfigParser.NoOptionError, StandardError), err:
write_message("If you want to read the citation data from file set up \
the file_for_citations parameter in the config file [%s]" %err, verbose=2)
cit, dict_of_ids = get_citations_from_db()
len_ = len(dict_of_ids.keys())
write_message("Number of nodes(papers) to rank : %s" % str(len_), verbose=3)
if len_ == 0:
write_message("Error: No citations to read!", sys.stderr)
raise Exception
try:
method = config.get(function, "citerank_method")
except ConfigParser.NoOptionError, err:
write_message("Exception: %s " %err, sys.stderr)
raise Exception
write_message("Running %s method." % method, verbose=2)
dates = get_dates(function, config, dict_of_ids)
if method == "citation_time":
try:
time_decay = float(config.get(function, "time_decay"))
except (ConfigParser.NoOptionError, ValueError), err:
write_message("Exception: %s" % err, sys.stderr)
raise Exception
date_coef = calculate_time_weights(len_, time_decay, dates)
#cit = remove_loops(cit, dates, dict_of_ids)
dict_of_ranks = \
run_citation_rank_time(cit, dict_of_ids, date_coef, dates)
else:
try:
conv_threshold = float(config.get(function, "conv_threshold"))
check_point = int(config.get(function, "check_point"))
damping_factor = float(config.get(function, "damping_factor"))
write_message("Parameters: d = %s, conv_threshold = %s, \
check_point = %s" %(str(damping_factor), \
str(conv_threshold), str(check_point)), verbose=5)
except (ConfigParser.NoOptionError, StandardError), err:
write_message("Exception: %s" % err, sys.stderr)
raise Exception
if method == "pagerank_classic":
ref = construct_ref_array(cit, dict_of_ids, len_)
use_ext_cit = ""
try:
use_ext_cit = config.get(function, "use_external_citations")
write_message("Pagerank will use external citations: %s" \
%str(use_ext_cit), verbose=5)
except (ConfigParser.NoOptionError, StandardError), err:
write_message("%s" % err, verbose=2)
if use_ext_cit == "yes":
try:
ext_citation_file = config.get(function, "ext_citation_file")
ext_links = get_external_links_from_file(ext_citation_file,
ref, dict_of_ids)
except (ConfigParser.NoOptionError, StandardError):
write_message("If you want to read the external citation \
data from file set up the ext_citation_file parameter in the config. file", \
verbose=3)
try:
reference_tag = config.get(function, "ext_reference_tag")
dummy = int(reference_tag[0:3])
except (ConfigParser.NoOptionError, StandardError):
write_message("You need to set up correctly the \
reference_tag in the cfg file", sys.stderr)
raise Exception
ext_links = get_external_links_from_db(ref, \
dict_of_ids, reference_tag)
avg = avg_ext_links_with_0(ext_links)
if avg < 1:
write_message("This method can't be ran. There is not \
enough information about the external citation. Hint: check the reference tag", \
sys.stderr)
raise Exception
avg_ext_links_without_0(ext_links)
try:
alpha = float(config.get(function, "ext_alpha"))
beta = float(config.get(function, "ext_beta"))
except (ConfigParser.NoOptionError, StandardError), err:
write_message("Exception: %s" % err, sys.stderr)
raise Exception
dict_of_ranks = run_pagerank_ext(cit, dict_of_ids, ref, \
ext_links, conv_threshold, check_point, alpha, beta, dates)
else:
dict_of_ranks = run_pagerank(cit, dict_of_ids, len_, ref, \
damping_factor, conv_threshold, check_point, dates)
elif method == "pagerank_time":
try:
time_decay = float(config.get(function, "time_decay"))
write_message("Parameter: time_decay = %s" \
%str(time_decay), verbose=5)
except (ConfigParser.NoOptionError, StandardError), err:
write_message("Exception: %s" % err, sys.stderr)
raise Exception
date_coef = calculate_time_weights(len_, time_decay, dates)
cit = remove_loops(cit, dates, dict_of_ids)
ref = construct_ref_array(cit, dict_of_ids, len_)
dict_of_ranks = run_pagerank_time(cit, dict_of_ids, len_, ref, \
damping_factor, conv_threshold, check_point, date_coef, dates)
else:
write_message("Error: Unknown ranking method. \
Please check the ranking_method parameter in the config. file.", sys.stderr)
raise Exception
try:
filename_ranks = config.get(function, "output_ranks_to_filename")
max_ranks = config.get(function, "output_rank_limit")
if not max_ranks.isdigit():
max_ranks = len_
else:
max_ranks = int(max_ranks)
if max_ranks > len_:
max_ranks = len_
ranks = sort_weights(dict_of_ranks)
write_message("Ranks: %s" % str(ranks), verbose=9)
write_first_ranks_to_file(ranks, dict_of_ranks, \
max_ranks, filename_ranks)
except (ConfigParser.NoOptionError, StandardError):
write_message("If you want the ranks to be printed in a file you have \
to set output_ranks_to_filename and output_rank_limit \
parameters in the configuration file", verbose=3)
normalize_weights(dict_of_ranks)
into_db(dict_of_ranks, rank_method_code)
