Page MenuHomec4science

bibrank_citerank_indexer.py
No OneTemporary

File Metadata

Created
Sun, May 19, 13:57

bibrank_citerank_indexer.py

# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Implementation of different ranking methods based on
the citation graph:
- citation count/ time decayed citation count
- pagerank / pagerank with external citations
- time decayed pagerank
"""
# pylint: disable-msg=E0611
import marshal
import ConfigParser
from math import exp
import datetime
import time
import re
import sys
from numpy import array, ones, zeros, int32, float32, sqrt, dot
from zlib import decompress
if sys.hexversion < 0x2040000:
# pylint: disable-msg=W0622
from sets import Set as set
# pylint: enable-msg=W0622
from invenio.dbquery import run_sql, serialize_via_marshal
from invenio.bibtask import write_message
from invenio.config import CFG_ETCDIR
def get_citations_from_file(filename):
    """Read the citation graph (who cites whom) from a text file.

    Each line of the file must hold two integers:
    "<citing_recid> <cited_recid>".

    Returns a pair:
    - cit: dict mapping a cited recid to the list of recids that cite it
      (duplicate pairs and self-citations are skipped)
    - dict_of_ids: dict mapping every recid seen to a dense 0-based
      index 'b' (used to address the numpy arrays elsewhere)

    Raises StandardError if the file cannot be opened.
    """
    cit = {}
    dict_of_ids = {}
    count = 0  # next free dense index
    try:
        citation_file = open(filename,"r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in citation_file:
        tokens = line.strip().split()
        recid_cites = int(tokens[0])    # the citing paper
        recid_cited = int(tokens[1])    # the cited paper
        if recid_cited not in cit:
            cit[recid_cited] = []
        #without this, duplicates might be introduced
        if recid_cites not in cit[recid_cited] and recid_cites != recid_cited:
            cit[recid_cited].append(recid_cites)
        # both endpoints of the edge get a dense index
        if recid_cites not in dict_of_ids:
            dict_of_ids[recid_cites] = count
            count += 1
        if recid_cited not in dict_of_ids:
            dict_of_ids[recid_cited] = count
            count += 1
    citation_file.close()
    write_message("Citation data collected from file: %s" %filename, verbose=2)
    write_message("Ids and recids corespondace: %s" \
        %str(dict_of_ids), verbose=9)
    write_message("Citations: %s" % str(cit), verbose=9)
    return cit, dict_of_ids
def get_citations_from_db():
    """Read the citation graph (who cites whom) from rnkCITATIONDATA.

    The citation dictionary is stored marshal-serialized and
    zlib-compressed in the table.

    Returns a pair:
    - cit: dict mapping a cited recid to the set of recids that cite it
      (self-citations are dropped)
    - dict_of_ids: dict mapping every recid seen to a dense 0-based index
    Returns ({}, {}) when the citation data cannot be extracted.
    """
    dict_of_ids = {}
    count = 0  # next free dense index
    query = "select object_value from rnkCITATIONDATA \
where object_name = 'citationdict'"
    cit_compressed = run_sql(query)
    cit = []
    if cit_compressed and cit_compressed[0] and cit_compressed[0][0]:
        # decompress + unmarshal the stored citation dictionary
        cit = marshal.loads(decompress(cit_compressed[0][0]))
        if cit:
            for item in cit:
                #check for duplicates in citation dictionary
                cit[item] = set(cit[item])
                if item in cit[item]:
                    # a paper must not cite itself
                    cit[item].remove(item)
                if item not in dict_of_ids:
                    dict_of_ids[item] = count
                    count += 1
                for value in cit[item]:
                    if value not in dict_of_ids:
                        dict_of_ids[value] = count
                        count += 1
            write_message ("Citation data collected\
 from rnkCITATIONDATA", verbose=2)
            write_message("Ids and recids corespondace: %s" \
                % str(dict_of_ids), verbose=9)
            write_message("Citations: %s" % str(cit), verbose=9)
            return cit, dict_of_ids
        else:
            write_message("Error while extracting citation data \
from rnkCITATIONDATA table", verbose=1)
    else:
        write_message("Error while extracting citation data \
from rnkCITATIONDATA table", verbose=1)
    # fall-through: no usable citation data
    return {}, {}
def construct_ref_array(cit, dict_of_ids, len_):
    """Return an int32 numpy array with the number of references each
    paper has inside our citation graph.

    cit maps a cited recid to the recids citing it, so every recid
    appearing in a citation list gains one outgoing reference.

    :param cit: dict cited_recid -> iterable of citing recids
    :param dict_of_ids: dict recid -> dense 0-based index
    :param len_: total number of papers (array length)
    """
    # The original code first bound `ref` to an empty array and then
    # immediately rebound it to zeros(); the dead assignment is dropped.
    ref = zeros(len_, int32)
    for cited in cit:
        for citing in cit[cited]:
            ref[dict_of_ids[citing]] += 1
    write_message("Number of references: %s" %str(ref), verbose=9)
    write_message("Finished computing total number \
of references for each paper.", verbose=5)
    return ref
def get_external_links_from_file(filename, ref, dict_of_ids):
    """Read the number of external links for each recid from a file.

    An external link is a citation that is not in our database.  Each
    file line is "<recid> <count>"; the internal reference count (ref)
    is subtracted from the count, so the file presumably stores total
    reference counts -- TODO confirm with the file producer.  Negative
    results are clamped to 0.

    Returns a dict mapping the paper index to its external-link count.
    Raises StandardError if the file cannot be opened.
    """
    ext_links = {}
    #format: ext_links[dict_of_ids[recid]]=number of total external links
    try:
        external_file = open(filename,"r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in external_file:
        tokens = line.strip().split()
        recid = int(tokens[0])
        nr_of_external = int(tokens[1])
        # external = total from file minus internal references
        ext_links[dict_of_ids[recid]] = nr_of_external - ref[dict_of_ids[recid]]
        if ext_links[dict_of_ids[recid]] < 0:
            ext_links[dict_of_ids[recid]] = 0
    external_file.close()
    write_message("External link information extracted", verbose=2)
    return ext_links
def get_external_links_from_db(ref, dict_of_ids, reference_indicator):
    """Return a dict mapping each paper index to its number of external
    links (citations pointing outside our database).

    For every recid the total number of reference fields is counted from
    bibrec_bib99x/bib99x; the internal reference count (ref) is
    subtracted and negative results are clamped to 0.

    :param ref: array of internal reference counts per paper index
    :param dict_of_ids: dict recid -> dense 0-based index
    :param reference_indicator: reference tag prefix; the query matches
        it followed by any subfield letter (e.g. "999C5" -> "999C5[a-z]")
    """
    ext_links = {}
    reference_tag_regex = reference_indicator + "[a-z]"
    for recid in dict_of_ids:
        idx = dict_of_ids[recid]
        # Parameterized query instead of string interpolation: avoids
        # SQL injection / quoting problems with the tag and recid values.
        result_set = run_sql("select COUNT(DISTINCT field_number) \
from bibrec_bib99x where id_bibrec=%s and id_bibxxx in \
(select id from bib99x where tag RLIKE %s)", \
            (str(recid), reference_tag_regex))
        if result_set:
            total_links = int(result_set[0][0])
            internal_links = ref[idx]
            ext_links[idx] = total_links - internal_links
            if ext_links[idx] < 0:
                ext_links[idx] = 0
        else:
            ext_links[idx] = 0
    write_message ("External link information extracted", verbose=2)
    write_message("External links: %s" % str(ext_links), verbose=9)
    return ext_links
def avg_ext_links_with_0(ext_links):
    """Average number of external links per paper, counting also the
    papers that have no external link at all."""
    total = sum(ext_links.values(), 0.0)
    avg_ext = total/len(ext_links)
    write_message("The average number of external links per paper (including \
papers with 0 external links) is: %s" % str(avg_ext), verbose=3)
    return avg_ext
def avg_ext_links_without_0(ext_links):
    """Average number of external links per paper, ignoring the papers
    that have no external link."""
    nonzero_counts = [links for links in ext_links.values() if links != 0]
    avg_ext = sum(nonzero_counts, 0.0)/len(nonzero_counts)
    write_message ("The average number of external links per paper (excluding \
papers with 0 external links) is: %s" % str(avg_ext), verbose=3)
    return avg_ext
def leaves(ref):
    """Return the number of papers that do not cite any other paper,
    i.e. whose entry in the reference-count array `ref` is zero."""
    nr_of_leaves = 0
    for nr_of_refs in ref:
        if nr_of_refs == 0:
            nr_of_leaves += 1
    # Bug fix: the message used to interpolate the function object
    # `leaves` instead of the computed count `nr_of_leaves`.
    write_message ("The number of papers that do not cite \
any other papers: %s" % str(nr_of_leaves), verbose=3)
    return nr_of_leaves
def get_dates_from_file(filename, dict_of_ids):
    """Read the publication year of each paper from a file.

    Each line must hold two integers: "<recid> <year>".

    Returns a dict mapping the paper index (dict_of_ids[recid]) to its
    year.  Raises StandardError if the file cannot be opened.
    """
    dates = {}
    # the format is: dates[dict_of_ids[recid]] = year
    try:
        dates_file = open(filename,"r")
    except StandardError:
        write_message("Cannot find file: %s" % filename, sys.stderr)
        raise StandardError
    for line in dates_file:
        tokens = line.strip().split()
        recid = int(tokens[0])
        year = int(tokens[1])
        dates[dict_of_ids[recid]] = year
    dates_file.close()
    write_message("Dates extracted", verbose=2)
    write_message("Dates dictionary %s" % str(dates), verbose=9)
    return dates
def get_dates_from_db(dict_of_ids):
    """Return the publication year of each paper, read from the database.

    The year is first looked for in the 260__c field (publication date);
    for records still uncovered, the first four characters of the 961__x
    field (apparently the submission date -- confirm against schema) are
    used.  Papers with no usable year at all get the truncated average
    year of the covered papers.

    Returns a dict mapping the paper index (dict_of_ids[recid]) to a year.
    """
    current_year = int(datetime.datetime.now().strftime("%Y"))
    total = 0    # sum of all accepted years (for the fallback average)
    count = 0    # number of papers with an accepted year
    dict_of_dates = {}
    for recid in dict_of_ids:
        dict_of_dates[recid] = 0    # 0 == "year not found yet"
    query1 = "select * from bib26x where tag='260__c';"
    date_list = run_sql(query1)
    date_dict = {}
    for item in date_list:
        date_dict[int(item[0])] = item[2]
    # any 4-digit group inside the date string is taken as the year
    pattern = re.compile('.*(\d{4}).*')
    query2 = "select * from bibrec_bib26x;"
    date_list = run_sql(query2)
    for item in date_list:
        recid = int(item[0])
        id_ = int(item[1])
        if id_ in date_dict and recid in dict_of_dates:
            reg = pattern.match(date_dict[id_])
            if reg:
                date = int(reg.group(1))
                # plausibility check on the extracted year
                if date > 1000 and date <= current_year:
                    dict_of_dates[recid] = date
                    total += date
                    count += 1
    # second pass: records that got no year from 260__c
    not_covered = []
    for recid in dict_of_dates:
        if dict_of_dates[recid] == 0:
            not_covered.append(recid)
    query3 = "select * from bib96x where tag='961__x';"
    date_list = run_sql(query3)
    date_dict = {}
    for item in date_list:
        date_dict[int(item[0])] = item[2]
    query4 = "select * from bibrec_bib96x;"
    date_list = run_sql(query4)
    for item in date_list:
        recid = int(item[0])
        id_ = int(item[1])
        if id_ in date_dict and recid in not_covered:
            # the field value starts with the 4-digit year
            date = int(str(date_dict[id_])[0:4])
            if date > 1000 and date <= current_year:
                dict_of_dates[recid] = date
                total += date
                count += 1
    dates = {}
    # NOTE(review): Python 2 integer division -- the average year is
    # truncated to an int, which looks intentional here
    med = total/count
    for recid in dict_of_dates:
        if dict_of_dates[recid] == 0:
            dates[dict_of_ids[recid]] = med
        else:
            dates[dict_of_ids[recid]] = dict_of_dates[recid]
    write_message("Dates extracted", verbose=2)
    write_message("Dates dictionary %s" % str(dates), verbose=9)
    return dates
def construct_sparse_matrix(cit, ref, dict_of_ids, len_, damping_factor):
    """Build the sparse structures used by the plain PageRank iteration,
    so the full dense transition matrix never has to be kept in memory.

    Returns a triple (sparse, semi_sparse, semi_sparse_coeficient):
    - sparse maps (cited_index, citing_index) to the damped weight
      transferred along that citation edge
    - semi_sparse lists the indices of dangling papers (no references)
    - semi_sparse_coeficient is the uniform weight a dangling paper
      spreads over the whole graph
    """
    sparse = {}
    for cited in cit:
        row = dict_of_ids[cited]
        for citing in cit[cited]:
            col = dict_of_ids[citing]
            # each citing paper splits its (damped) weight evenly
            # among its references
            sparse[(row, col)] = damping_factor * 1.0/ref[col]
    semi_sparse = [j for j in range(len_) if ref[j] == 0]
    semi_sparse_coeficient = damping_factor/len_
    #zero_coeficient = (1-damping_factor)/len_
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse, semi_sparse_coeficient
def construct_sparse_matrix_ext(cit, ref, ext_links, dict_of_ids, alpha, beta):
    """Build the sparse structures for the PAGERANK_EXT method.

    The matrix is conceptually (len_+1) x (len_+1): row/column 0 is an
    artificial "external world" node.  alpha is the probability mass the
    external node sends towards the papers; beta weights each paper's
    external citations against its internal references.

    Returns (sparse, semi_sparse):
    - sparse maps (row, col) to the transition weight
    - semi_sparse maps dangling paper columns (no internal references)
      to the uniform weight they spread over the papers
    """
    len_ = len(dict_of_ids)
    sparse = {}
    semi_sparse = {}
    # external node receives alpha/len_ from every paper ...
    for i in range(len_):
        sparse[i+1, 0] = alpha/(len_)
    # ... and keeps 1-alpha for itself
    sparse[0, 0] = 1.0 - alpha
    for j in range(len_):
        if j not in ext_links:
            sparse[0, j+1] = beta/(len_ + beta)
        else:
            if ext_links[j] == 0:
                sparse[0, j+1] = beta/(len_ + beta)
            else:
                # share of the paper's weight that leaks to the
                # external node, proportional to its external citations
                aux = beta * ext_links[j]
                if ref[j] == 0:
                    sparse[0, j+1] = aux/(aux + len_)
                else:
                    sparse[0, j+1] = aux/(aux + ref[j])
        if ref[j] == 0:
            # dangling paper: what does not leak externally is spread
            # uniformly over all papers
            semi_sparse[j+1] = (1.0 - sparse[0, j + 1])/len_
    for item in cit:
        for value in cit[item]:
            # internal citation edges share the citing paper's
            # non-external weight evenly among its references
            sparse[(dict_of_ids[item] + 1, dict_of_ids[value] + 1)] = \
                (1.0 - sparse[0, dict_of_ids[value] + 1])/ref[dict_of_ids[value]]
    # debug helper: print the dense matrix (kept for reference)
    #for i in range(len_ + 1):
    #    a = ""
    #    for j in range (len_ + 1):
    #        if (i,j) in sparse:
    #            a += str(sparse[(i,j)]) + "\t"
    #        else:
    #            a += "0\t"
    #    print a
    #print semi_sparse
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse
def construct_sparse_matrix_time(cit, ref, dict_of_ids, \
        damping_factor, date_coef):
    """Build the sparse structures for the PAGERANK_time method.

    Same shape as construct_sparse_matrix, but every edge weight is
    additionally multiplied by the time-decay coefficient of the citing
    paper, so citations from old papers transfer less weight.

    Returns (sparse, semi_sparse, semi_sparse_coeficient) with the same
    meaning as in construct_sparse_matrix.
    """
    len_ = len(dict_of_ids)
    sparse = {}
    for item in cit:
        for value in cit[item]:
            sparse[(dict_of_ids[item], dict_of_ids[value])] = damping_factor * \
                date_coef[dict_of_ids[value]]/ref[dict_of_ids[value]]
    # indices of dangling papers (no references)
    semi_sparse = []
    for j in range(len_):
        if ref[j] == 0:
            semi_sparse.append(j)
    semi_sparse_coeficient = damping_factor/len_
    #zero_coeficient = (1-damping_factor)/len_
    write_message("Sparse information calculated", verbose=3)
    return sparse, semi_sparse, semi_sparse_coeficient
def statistics_on_sparse(sparse):
    """Count (and report) the papers that cite themselves, i.e. the
    diagonal entries present in the sparse citation matrix."""
    count_diag = sum(1 for (row, col) in sparse.keys() if row == col)
    write_message("The number of papers that cite themselves: %s" % \
        str(count_diag), verbose=3)
    return count_diag
def pagerank(conv_threshold, check_point, len_, sparse, \
        semi_sparse, semi_sparse_coef):
    """Core power-iteration loop of the plain PageRank method.

    Runs batches of check_point iterations; after each batch the
    normalized L2 distance between successive weight vectors is compared
    with conv_threshold to decide convergence.

    Returns a numpy float32 array with the rank weight of each paper
    index.
    """
    weights_old = array((), float32)
    weights_old = ones((len_), float32) # initial weights
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0  # number of completed batches
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            # sparse matrix-vector product over the citation edges
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            # mass contributed by dangling papers (no references)
            semi_total = 0.0
            for j in semi_sparse:
                semi_total += weights_old[j]
            weights_new = weights_new + semi_sparse_coef * semi_total + \
                (1.0/len_ - semi_sparse_coef) * sum(weights_old)
            if step == check_point - 1:
                # convergence is only measured at the end of each batch
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message( "Finished step: %s, %s " \
                    %(str(check_point*(nr_of_check_points-1) + step), \
                    str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps. \
The threshold was %s" % (str(nr_of_check_points), str(difference)),\
        verbose=2)
    return weights_old
def pagerank_ext( conv_threshold, check_point, len_, sparse, semi_sparse):
    """Core power-iteration loop of the PAGERANK_EXT method.

    Index 0 of the weight vector is the artificial "external world" node
    built by construct_sparse_matrix_ext; it is stripped from the
    returned array, which holds the rank weight of each paper index.
    """
    weights_old = array((), float32)
    weights_old = ones((len_), float32)
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0  # number of completed batches
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            # sparse matrix-vector product (rows/cols shifted by 1 for
            # the external node at index 0)
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            # uniform mass spread by dangling papers over all papers
            total_sum = 0.0
            for j in semi_sparse:
                total_sum += semi_sparse[j]*weights_old[j]
            weights_new[1:len_] = weights_new[1:len_] + total_sum
            if step == check_point - 1:
                # convergence is only measured at the end of each batch
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message( "Finished step: %s, %s " \
                    % (str(check_point*(nr_of_check_points-1) + step), \
                    str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps. \
The threshold was %s" % (str(nr_of_check_points), \
        str(difference)), verbose=2)
    #return weights_old[1:len_]/(len_ - weights_old[0])
    return weights_old[1:len_]
def pagerank_time(conv_threshold, check_point, len_, \
        sparse, semi_sparse, semi_sparse_coeficient, date_coef):
    """Core power-iteration loop of the PAGERANK_TIME method
    (PageRank combined with time decay).

    Like pagerank(), but both the dangling-paper mass and the uniform
    teleportation mass are weighted by each paper's time-decay
    coefficient date_coef.

    Returns a numpy float32 array with the rank weight of each paper
    index.
    """
    weights_old = array((), float32)
    weights_old = ones((len_), float32) # initial weights
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0  # number of completed batches
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            # sparse matrix-vector product over the citation edges
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            # time-decayed mass from dangling papers
            semi_total = 0.0
            for j in semi_sparse:
                semi_total += weights_old[j]*date_coef[j]
            # time-decayed total mass (teleportation term)
            zero_total = 0.0
            for i in range(len_):
                zero_total += weights_old[i]*date_coef[i]
            #dates = array(date_coef.keys())
            #zero_total = dot(weights_old, dates)
            weights_new = weights_new + semi_sparse_coeficient * semi_total + \
                (1.0/len_ - semi_sparse_coeficient) * zero_total
            if step == check_point - 1:
                # convergence is only measured at the end of each batch
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message( "Finished step: %s, %s " \
                    % (str(check_point*(nr_of_check_points-1) + step), \
                    str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps.\
 The threshold was %s" % (str(nr_of_check_points), \
        str(difference)), verbose=2)
    return weights_old
def citation_rank_time(cit, dict_of_ids, date_coef, dates, decimals):
    """Return a recid -> weight dict where the weight is the sum of the
    paper's time-decayed citations, plus a tiny date term that breaks
    ties from recent to past."""
    dict_of_ranks = {}
    tie_break_unit = pow(10, 0-4-decimals)
    for recid in dict_of_ids:
        tie_break = dates[dict_of_ids[recid]] * tie_break_unit
        if recid not in cit:
            # paper never cited: only the date component remains
            dict_of_ranks[recid] = tie_break
        else:
            decayed_total = 0
            for citing in cit[recid]:
                decayed_total += date_coef[dict_of_ids[citing]]
            dict_of_ranks[recid] = round(decayed_total, decimals) + tie_break
    write_message("Citation rank calculated", verbose=2)
    return dict_of_ranks
def get_ranks(weights, dict_of_ids, mult, dates, decimals):
    """Map each recid to its final rank value.

    The primary component is the scaled weight rounded to `decimals`
    places; a tiny date-based term is added so equal weights order in
    reverse time order, from recent to past."""
    tie_break_unit = pow(10, 0-4-decimals)
    dict_of_ranks = {}
    for recid in dict_of_ids:
        idx = dict_of_ids[recid]
        primary = round(weights[idx] * mult, decimals)
        dict_of_ranks[recid] = primary + dates[idx] * tie_break_unit
    return dict_of_ranks
def sort_weights(dict_of_ranks):
    """Return the recids sorted by decreasing rank value.

    The rank values already embed a small date component (see
    get_ranks), so equal raw weights are implicitly ordered from recent
    to past.

    Fix: the original passed a Python 2 `cmp` function as the second
    positional argument of sorted(); key=/reverse= is equivalent
    (including stability for equal values) and also works on Python 3.
    """
    ranks_by_citations = sorted(dict_of_ranks.keys(),
                                key=lambda recid: dict_of_ranks[recid],
                                reverse=True)
    return ranks_by_citations
def write_first_ranks_to_file(ranks_by_citations, dict_of_ranks, \
        nr_of_ranks, filename):
    """Write the first nr_of_ranks results of the ranking into a file.

    Each output line holds, tab-separated: the 1-based position, the
    recid, and its rank value.  Raises StandardError if the file cannot
    be opened for writing.
    """
    try:
        ranks_file = open(filename,"w")
    except StandardError:
        write_message("Problems with file: %s" % filename, sys.stderr)
        raise StandardError
    for i in range(nr_of_ranks):
        ranks_file.write(str(i+1) + "\t" + str(ranks_by_citations[i]) + \
            "\t" + str(dict_of_ranks[ranks_by_citations[i]]) + "\n")
    ranks_file.close()
    write_message("The first %s pairs recid:rank in the ranking order \
are written into this file: %s" % (nr_of_ranks, filename), verbose=2)
def del_rank_method_data(rank_method_code):
    """Delete the data for a rank method from rnkMETHODDATA table"""
    # NOTE(review): if rank_method_code is not present in rnkMETHOD this
    # raises IndexError on id_[0][0] -- callers are expected to pass a
    # configured method name
    id_ = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code,))
    run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id_[0][0], ))
def into_db(dict_of_ranks, rank_method_code):
    """Store the ranking results in the rnkMETHODDATA table.

    The previous data for the method is deleted first; the ranks dict is
    marshal-serialized, inserted, and the method's last_updated
    timestamp is refreshed.
    """
    # numeric id of the ranking method
    method_id = run_sql("SELECT id from rnkMETHOD where name=%s", \
        (rank_method_code, ))
    # replace any previously stored data for this method
    del_rank_method_data(rank_method_code)
    serialized_data = serialize_via_marshal(dict_of_ranks)
    method_id_str = str(method_id[0][0])
    run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) \
VALUES (%s,%s)",(method_id_str, serialized_data,))
    # record when this ranking was last recomputed
    date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", \
        (date, rank_method_code))
    write_message("Finished writing the ranks into rnkMETHOD table", verbose=5)
def run_pagerank(cit, dict_of_ids, len_, ref, damping_factor, \
        conv_threshold, check_point, dates):
    """Orchestrate the plain PageRank computation and return the final
    recid -> rank dictionary (weights rounded to 2 decimals, with the
    date tie-break applied by get_ranks)."""
    write_message("Running the PageRank method", verbose=5)
    matrix_parts = construct_sparse_matrix(cit, ref, dict_of_ids, len_,
                                           damping_factor)
    sparse, semi_sparse, semi_sparse_coeficient = matrix_parts
    final_weights = pagerank(conv_threshold, check_point, len_,
                             sparse, semi_sparse, semi_sparse_coeficient)
    return get_ranks(final_weights, dict_of_ids, 1, dates, 2)
def run_pagerank_ext(cit, dict_of_ids, ref, ext_links, \
        conv_threshold, check_point, alpha, beta, dates):
    """Orchestrate the PageRank-with-external-citations computation and
    return the final recid -> rank dictionary.  The iteration runs on
    len_+1 nodes: the extra node models the external world."""
    write_message( "Running the PageRank with external links method", verbose=5)
    nr_of_papers = len(dict_of_ids)
    sparse, semi_sparse = construct_sparse_matrix_ext(cit, ref, ext_links,
                                                      dict_of_ids, alpha, beta)
    final_weights = pagerank_ext(conv_threshold, check_point,
                                 nr_of_papers + 1, sparse, semi_sparse)
    return get_ranks(final_weights, dict_of_ids, 1, dates, 2)
def run_pagerank_time(cit, dict_of_ids, len_, ref, damping_factor, \
        conv_threshold, check_point, date_coef, dates):
    """Orchestrate the PageRank-with-time-decay computation and return
    the final recid -> rank dictionary.  The weights are passed to
    get_ranks with a 100000 multiplier."""
    write_message("Running the PageRank_time method", verbose=5)
    matrix_parts = construct_sparse_matrix_time(cit, ref, dict_of_ids,
                                                damping_factor, date_coef)
    sparse, semi_sparse, semi_sparse_coeficient = matrix_parts
    final_weights = pagerank_time(conv_threshold, check_point, len_,
                                  sparse, semi_sparse,
                                  semi_sparse_coeficient, date_coef)
    return get_ranks(final_weights, dict_of_ids, 100000, dates, 2)
def run_citation_rank_time(cit, dict_of_ids, date_coef, dates):
    """Orchestrate the time-decayed citation-count method and return the
    final recid -> rank dictionary (2-decimal rounding)."""
    write_message("Running the citation rank with time decay method", verbose=5)
    return citation_rank_time(cit, dict_of_ids, date_coef, dates, 2)
def spearman_rank_correlation_coef(rank1, rank2, len_):
    """Spearman rank correlation coefficient between two rankings.

    rank1 and rank2 are sequences holding the recids in ranking order.
    Returns c with -1 <= c <= 1; the closer c is to 1, the more similar
    the two rankings are.

    Improvement: a position table for rank2 is built once, making the
    whole computation O(n) instead of the O(n^2) repeated list.index
    calls of the original (first occurrence wins, as with list.index).
    """
    position_in_rank2 = {}
    for pos, recid in enumerate(rank2):
        if recid not in position_in_rank2:
            position_in_rank2[recid] = pos
    total = 0
    for i in range(len_):
        diff = i - position_in_rank2[rank1[i]]
        total += diff * diff
    return 1 - (6.0 * total) / (len_*(len_*len_ - 1))
def remove_loops(cit, dates, dict_of_ids):
    """Return a copy of the citation dict with simple citation loops
    removed.

    With time decay, new papers inside a loop would keep feeding fake
    weight to each other, so a citation is kept only when the citing
    paper is not older than the cited one and the two papers do not cite
    each other mutually."""
    new_cit = {}
    for recid in cit:
        kept_citers = []
        for cited_by in cit[recid]:
            if dates[dict_of_ids[cited_by]] < dates[dict_of_ids[recid]]:
                # citing paper is older than the cited one: drop it
                write_message("Loop removed: %s <-> %s" \
                    %(cited_by, recid), verbose=9)
                continue
            if cited_by in cit and recid in cit[cited_by]:
                # mutual citation pair: drop this direction
                write_message("Loop removed: %s <-> %s" \
                    %(cited_by, recid), verbose=9)
                continue
            kept_citers.append(cited_by)
        new_cit[recid] = kept_citers
    write_message("Simple loops removed", verbose=5)
    return new_cit
def calculate_time_weights(len_, time_decay, dates):
    """Compute the exponential time-decay coefficient of every paper:
    exp(time_decay * (publication_year - current_year)), so older papers
    get exponentially smaller coefficients."""
    this_year = int(datetime.datetime.now().strftime("%Y"))
    date_coef = dict((j, exp(time_decay*(dates[j] - this_year)))
                     for j in range(len_))
    write_message("Time weights calculated", verbose=5)
    write_message("Time weights: %s" % str(date_coef), verbose=9)
    return date_coef
def get_dates(function, config, dict_of_ids):
    """Return a dict with the publication year of each paper index.

    If the config section defines 'file_with_dates' the years are read
    from that file; otherwise (or on any read error) they are read from
    the database.
    """
    try:
        file_for_dates = config.get(function, "file_with_dates")
        dates = get_dates_from_file(file_for_dates, dict_of_ids)
    except (ConfigParser.NoOptionError, StandardError), err:
        write_message("If you want to read the dates from file set up the \
'file_for_dates' variable in the config file [%s]" %err, verbose=3)
        dates = get_dates_from_db(dict_of_ids)
    return dates
def citerank(rank_method_code):
    """Entry point of the citation-graph ranking methods.

    Reads the method configuration from
    CFG_ETCDIR/bibrank/<rank_method_code>.cfg, collects the citation
    graph (from file or DB), dispatches to one of the methods
    'citation_time', 'pagerank_classic' (optionally with external
    citations) or 'pagerank_time', optionally dumps the top ranks to a
    file, and finally stores the ranks in the database.

    Raises StandardError/Exception on missing configuration or data.
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)
    try:
        file_ = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg"
        config = ConfigParser.ConfigParser()
        config.readfp(open(file_))
    except StandardError:
        write_message("Cannot find configuration file: %s" % file_, sys.stderr)
        raise StandardError
    # the file for citations needs to have the following format:
    #each line needs to be x[tab]y, where x cites y; x,y are recids
    function = config.get("rank_method", "function")
    try:
        file_for_citations = config.get(function, "file_with_citations")
        cit, dict_of_ids = get_citations_from_file(file_for_citations)
    except (ConfigParser.NoOptionError, StandardError), err:
        # fall back to the citation data stored in the database
        write_message("If you want to read the citation data from file set up \
the file_for_citations parameter in the config file [%s]" %err, verbose=2)
        cit, dict_of_ids = get_citations_from_db()
    len_ = len(dict_of_ids.keys())
    write_message("Number of nodes(papers) to rank : %s" % str(len_), verbose=3)
    if len_ == 0:
        write_message("Error: No citations to read!", sys.stderr)
        raise Exception
    try:
        method = config.get(function, "citerank_method")
    except ConfigParser.NoOptionError, err:
        write_message("Exception: %s " %err, sys.stderr)
        raise Exception
    write_message("Running %s method." % method, verbose=2)
    dates = get_dates(function, config, dict_of_ids)
    if method == "citation_time":
        # time-decayed citation counting
        try:
            time_decay = float(config.get(function, "time_decay"))
        except (ConfigParser.NoOptionError, ValueError), err:
            write_message("Exception: %s" % err, sys.stderr)
            raise Exception
        date_coef = calculate_time_weights(len_, time_decay, dates)
        #cit = remove_loops(cit, dates, dict_of_ids)
        dict_of_ranks = \
            run_citation_rank_time(cit, dict_of_ids, date_coef, dates)
    else:
        # the PageRank variants share these three parameters
        try:
            conv_threshold = float(config.get(function, "conv_threshold"))
            check_point = int(config.get(function, "check_point"))
            damping_factor = float(config.get(function, "damping_factor"))
            write_message("Parameters: d = %s, conv_threshold = %s, \
check_point = %s" %(str(damping_factor), \
                str(conv_threshold), str(check_point)), verbose=5)
        except (ConfigParser.NoOptionError, StandardError), err:
            write_message("Exception: %s" % err, sys.stderr)
            raise Exception
        if method == "pagerank_classic":
            ref = construct_ref_array(cit, dict_of_ids, len_)
            use_ext_cit = ""
            try:
                use_ext_cit = config.get(function, "use_external_citations")
                write_message("Pagerank will use external citations: %s" \
                    %str(use_ext_cit), verbose=5)
            except (ConfigParser.NoOptionError, StandardError), err:
                write_message("%s" % err, verbose=2)
            if use_ext_cit == "yes":
                # external citations: read from file if configured,
                # otherwise count them from the reference tags in the DB
                try:
                    ext_citation_file = config.get(function, "ext_citation_file")
                    ext_links = get_external_links_from_file\
                        (ext_citation_file, ref, dict_of_ids)
                except (ConfigParser.NoOptionError, StandardError):
                    write_message("If you want to read the external citation data \
from file set up the ext_citation_file parameter in the config. file", verbose=3)
                    try:
                        reference_tag = config.get(function, "ext_reference_tag")
                        # the first 3 characters must be numeric
                        dummy = int(reference_tag[0:3])
                    except (ConfigParser.NoOptionError, StandardError):
                        write_message("You need to set up correctly the \
reference_tag in the cfg file", sys.stderr)
                        raise Exception
                    ext_links = get_external_links_from_db(ref, \
                        dict_of_ids, reference_tag)
                    avg = avg_ext_links_with_0(ext_links)
                    if avg < 1:
                        write_message("This method can't be ran. There is not enough \
information about the external citation. Hint: check the reference tag", sys.stderr)
                        raise Exception
                    avg_ext_links_without_0(ext_links)
                try:
                    alpha = float(config.get(function, "ext_alpha"))
                    beta = float(config.get(function, "ext_beta"))
                except (ConfigParser.NoOptionError, StandardError), err:
                    write_message("Exception: %s" % err, sys.stderr)
                    raise Exception
                dict_of_ranks = run_pagerank_ext(cit, dict_of_ids, ref, \
                    ext_links, conv_threshold, check_point, alpha, beta, dates)
            else:
                dict_of_ranks = run_pagerank(cit, dict_of_ids, len_, ref, \
                    damping_factor, conv_threshold, check_point, dates)
        elif method == "pagerank_time":
            try:
                time_decay = float(config.get(function, "time_decay"))
                write_message("Parameter: time_decay = %s" %str(time_decay), verbose=5)
            except (ConfigParser.NoOptionError, StandardError), err:
                write_message("Exception: %s" % err, sys.stderr)
                raise Exception
            date_coef = calculate_time_weights(len_, time_decay, dates)
            # loops would accumulate fake weight under time decay
            cit = remove_loops(cit, dates, dict_of_ids)
            ref = construct_ref_array(cit, dict_of_ids, len_)
            dict_of_ranks = run_pagerank_time(cit, dict_of_ids, len_, ref, \
                damping_factor, conv_threshold, check_point, date_coef, dates)
        else:
            write_message("Error: Unknown ranking method. \
Please check the ranking_method parameter in the config. file.", sys.stderr)
            raise Exception
    # optionally dump the first N ranks to a file
    try:
        filename_ranks = config.get(function, "output_ranks_to_filename")
        max_ranks = config.get(function, "output_rank_limit")
        if not max_ranks.isdigit():
            max_ranks = len_
        else:
            max_ranks = int(max_ranks)
            if max_ranks > len_:
                max_ranks = len_
        ranks = sort_weights(dict_of_ranks)
        write_message("Ranks: %s" % str(ranks), verbose=9)
        write_first_ranks_to_file(ranks, dict_of_ranks, \
            max_ranks, filename_ranks)
    except (ConfigParser.NoOptionError, StandardError):
        write_message("If you want the ranks to be printed in a file you have \
to set output_ranks_to_filename and output_rank_limit \
parameters in the configuration file", verbose=3)
    # persist the ranks for the search engine to use
    into_db(dict_of_ranks, rank_method_code)

Event Timeline