bibrank_citation_indexer.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, May 11, 12:46

bibrank_citation_indexer.py
View Options

	# -*- coding: utf-8 -*-
	##
	## $Id$
	##
	## This file is part of CDS Invenio.
	## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
	##
	## CDS Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## CDS Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	__revision__ = "$Id$"

	import time
	import sys
	import os
	import marshal
	import traceback
	from zlib import decompress, compress, error

	from invenio.dbquery import run_sql, serialize_via_marshal, \
	deserialize_via_marshal
	from invenio.search_engine import print_record, search_pattern
	from invenio.bibrecord import create_records, record_get_field_values, \
	record_get_field_value
	from invenio.bibformat_utils import parse_tag
	from invenio.bibtask import write_message, task_get_option

	class memoise:
	    """Cache the results of FUNCTION, keyed by its positional arguments.

	    Calling the wrapper with arguments it has seen before returns the
	    stored result without calling FUNCTION again. The arguments must be
	    hashable. The cache is never emptied.
	    """

	    def __init__(self, function):
	        # maps the tuple of positional arguments to the computed result
	        self.memo = {}
	        self.function = function

	    def __call__(self, *args):
	        """Return the result for ARGS, computing and storing it on first use."""
	        if args not in self.memo:
	            self.memo[args] = self.function(*args)
	        return self.memo[args]

	def get_recids_matching_query(pvalue, fvalue):
	    """Search for PVALUE in field FVALUE and return the matching recIDs as a list.

	    The search itself is delegated to search_pattern() with m='e'.
	    """
	    matches = search_pattern(p=pvalue, f=fvalue, m='e')
	    return list(matches)


	# wrap the function so that a repeated (pvalue, fvalue) query is run only once
	get_recids_matching_query = memoise(get_recids_matching_query)

	def get_citation_weight(rank_method_code, config):
	    """Return the dictionary used by the bibrank daemon for generating
	    the index of search results sorted by citation information.

	    RANK_METHOD_CODE is the name of the rank method in rnkMETHOD and CONFIG
	    the parsed configuration of that method. The records to (re)index are
	    either those modified since the last run, or the range given with the
	    "id" task option. Returns the dictionary produced by ref_analyzer(), or
	    {} when there is nothing to do.
	    """
	    begin_time = time.time()
	    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)
	    # A full run (option "quick" set to "no", i.e. -R) is needed from time
	    # to time: among the new records X there can be records cited by old
	    # records Y, and that is only detected by going through all of Y+X.
	    if task_get_option("quick") == "no":
	        last_update_time = "0000-00-00 00:00:00"
	    last_modified_records = get_last_modified_rec(last_update_time)
	    # the id option forces re-indexing a range even without new records
	    id_ranges = task_get_option("id")
	    if not (last_modified_records or id_ranges):
	        print("No new records added since last time this rank method was executed")
	        return {}

	    if id_ranges:
	        # Only the first range given is used.
	        first = id_ranges[0][0]
	        last = id_ranges[0][1]
	        # last + 1: the upper bound is part of the range, so that a range
	        # such as 5-5 selects record 5 instead of nothing.
	        # NOTE(review): assumes the option holds inclusive [first, last]
	        # pairs -- confirm against the task option parser.
	        updated_recid_list = list(range(first, last + 1))
	    else:
	        updated_recid_list = create_recordid_list(last_modified_records)

	    write_message("Last update " + str(last_update_time) +
	                  " records: " + str(len(last_modified_records)) +
	                  " updates: " + str(len(updated_recid_list)), sys.stderr)

	    # last_updated_result always returns [weights, citations, references]
	    result_intermediate = last_updated_result(rank_method_code, updated_recid_list)
	    citation_weight_dic_intermediate = result_intermediate[0]
	    citation_list_intermediate = result_intermediate[1]
	    reference_list_intermediate = result_intermediate[2]
	    citation_informations = get_citation_informations(updated_recid_list, config)
	    # dic is docid -> number of citations, like {1: 2, 2: 0, 3: 1}
	    dic = ref_analyzer(citation_informations,
	                       citation_weight_dic_intermediate,
	                       citation_list_intermediate,
	                       reference_list_intermediate,
	                       config, updated_recid_list)
	    end_time = time.time()
	    print("Total time of software:  " + str(end_time - begin_time))
	    return dic

	def get_bibrankmethod_lastupdate(rank_method_code):
	    """Return the last execution date of the bibrank method RANK_METHOD_CODE.

	    Returns "0000-00-00 00:00:00" when the method has no last_updated value
	    or when no row with that name exists in rnkMETHOD.
	    """
	    # pass the name as a query parameter so that quotes in it cannot break
	    # (or alter) the statement
	    rows = run_sql("select last_updated from rnkMETHOD where name=%s",
	                   (rank_method_code,))
	    if not rows or rows[0][0] is None:
	        return "0000-00-00 00:00:00"
	    return rows[0][0]

	def get_last_modified_rec(bibrank_method_lastupdate):
	    """Return the rows of record ids modified since BIBRANK_METHOD_LASTUPDATE.

	    The result is what run_sql() returns for the query: one row per record,
	    with the id in the first column, in ascending id order.
	    """
	    query = "SELECT id FROM bibrec WHERE modification_date >= %s ORDER BY id ASC"
	    # the date is passed as a query parameter instead of being pasted in
	    return run_sql(query, (bibrank_method_lastupdate,))

	def create_recordid_list(rec_ids):
	    """Return a new list holding the first column of every row in REC_IDS.

	    The order of the rows is kept, so rows in ascending id order give a
	    list in ascending id order.
	    """
	    return [entry[0] for entry in rec_ids]

	def create_record_tuple(list):
	    """Render the ids in LIST as a parenthesised, comma-separated string.

	    For example [1, 2, 3] gives '(1,2,3)' and an empty list gives '()'.
	    The ids appear in the order given.
	    """
	    if not len(list):
	        return '()'
	    pieces = [str(item) for item in list]
	    return '(' + ','.join(pieces) + ')'

	def last_updated_result(rank_method_code, recid_list):
	    """Return the [weights, citations, references] state to start indexing from.

	    When rnkMETHODDATA holds data for RANK_METHOD_CODE and both the
	    'citationdict' and 'reversedict' objects can be read from
	    rnkCITATIONDATA, the stored dictionaries are returned with the entries
	    of the records in RECID_LIST reset by get_initial_result(). In every
	    other case (nothing stored, empty citation dictionary, unreadable data)
	    the all-zero result of make_initial_result() is returned.
	    """
	    result = make_initial_result()
	    query = ("select relevance_data from rnkMETHOD, rnkMETHODDATA where "
	             "rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD and rnkMETHOD.Name = %s")
	    rows = run_sql(query, (rank_method_code,))
	    if not (rows and rows[0] and rows[0][0]):
	        return result

	    # We have to be prepared for corrupted data in any of the three blobs:
	    # zlib reports it with `error`, marshal with one of the others.
	    corrupted = (error, ValueError, EOFError, TypeError)
	    try:
	        dic = marshal.loads(decompress(rows[0][0]))
	    except corrupted:
	        return result

	    query = "select object_value from rnkCITATIONDATA where object_name='citationdict'"
	    cit_compressed = run_sql(query)
	    if not (cit_compressed and cit_compressed[0] and cit_compressed[0][0]):
	        return result
	    try:
	        cit = marshal.loads(decompress(cit_compressed[0][0]))
	    except corrupted:
	        return result
	    if not cit:
	        return result

	    query = "select object_value from rnkCITATIONDATA where object_name='reversedict'"
	    ref_compressed = run_sql(query)
	    if not (ref_compressed and ref_compressed[0] and ref_compressed[0][0]):
	        return result
	    try:
	        ref = marshal.loads(decompress(ref_compressed[0][0]))
	    except corrupted:
	        return result
	    return get_initial_result(dic, cit, ref, recid_list)

	def get_initial_result(dic, cit, ref, recid_list):
	    """Reset the citation data of the records in RECID_LIST so that it can
	    be recalculated later.

	    DIC maps recid -> citation weight, CIT maps recid -> list of records
	    citing it and REF maps recid -> list of records it cites. For every
	    record in RECID_LIST its weight becomes 0 and both of its lists become
	    empty; the record is also removed from the lists of the records it used
	    to cite (decreasing their weight) and from the lists of the records
	    that used to cite it. The three dictionaries are modified in place and
	    returned as [dic, cit, ref].
	    """
	    for recid in recid_list:
	        # Remember who cited this record *before* its list is emptied;
	        # otherwise there is nothing left to clean up in `ref` below.
	        former_citers = cit.get(recid) or []
	        dic[recid] = 0
	        cit[recid] = []
	        for cited in ref.get(recid) or []:
	            if cited in cit and recid in cit[cited]:
	                cit[cited].remove(recid)
	                if cited in dic:
	                    dic[cited] -= 1
	        for citer in former_citers:
	            if citer in ref and recid in ref[citer]:
	                ref[citer].remove(recid)
	        ref[recid] = []
	    return [dic, cit, ref]

	def make_initial_result():
	    """Build the empty starting state for every record id found in bibrec.

	    Returns [dic, cit, ref]: dic maps each recID to zero, cit and ref map
	    each recID to its own empty list.
	    """
	    recids = [row[0] for row in run_sql("select id from bibrec")]
	    weights = dict((recid, 0) for recid in recids)
	    citations = dict((recid, []) for recid in recids)
	    references = dict((recid, []) for recid in recids)
	    return [weights, citations, references]

	def get_citation_informations(recid_list, config):
	    """Collect the citation-related field values of the records in RECID_LIST.

	    The tags to read are taken from the section of CONFIG named by its
	    ("rank_method", "function") option. Returns a list of four dictionaries,
	    each keyed by recid:
	      [0] the record's own report numbers (primary followed by additional)
	      [1] the values of the reference number tag of the record
	      [2] the values of the reference tag of the record
	      [3] the first value of the publication info tag of the record
	    A record without values for a tag has no key in dictionaries 1-3.
	    NB: this analyses new or changed records only; see "ref_analyzer" for more.
	    """
	    begin_time = os.times()[4]
	    d_reports_numbers = {}
	    d_references_report_numbers = {}
	    d_references_s = {}
	    d_records_s = {}

	    function = config.get("rank_method", "function")
	    # parse_tag results are indexed as [0]=tag, [1]=ind1, [2]=ind2, [3]=code
	    p_record_pri_number_tag = parse_tag(
	        config.get(function, "publication_primary_number_tag"))
	    p_record_add_number_tag = parse_tag(
	        config.get(function, "publication_aditional_number_tag"))
	    p_reference_number_tag = parse_tag(
	        config.get(function, "publication_reference_number_tag"))
	    p_reference_tag = parse_tag(
	        config.get(function, "publication_reference_tag"))
	    p_record_publication_info_tag = parse_tag(
	        config.get(function, "publication_info_tag"))

	    def field_values(rec, parsed_tag):
	        # Always take the subfield code from the *parsed* tag: indexing the
	        # raw tag string instead picks an arbitrary character of it.
	        return record_get_field_values(rec, parsed_tag[0],
	                                       ind1=parsed_tag[1],
	                                       ind2=parsed_tag[2],
	                                       code=parsed_tag[3])

	    for recid in recid_list:
	        xml = print_record(int(recid), 'xm')
	        # the record is the first element of each create_records() entry
	        recs = [parsed[0] for parsed in create_records(xml)]
	        l_report_numbers = []
	        for rec in recs:
	            pri_report_number = field_values(rec, p_record_pri_number_tag)
	            add_report_numbers = field_values(rec, p_record_add_number_tag)
	            if pri_report_number:
	                l_report_numbers.extend(pri_report_number)
	            if add_report_numbers:
	                l_report_numbers.extend(add_report_numbers)
	            d_reports_numbers[recid] = l_report_numbers

	            reference_report_number = field_values(rec, p_reference_number_tag)
	            if reference_report_number:
	                d_references_report_numbers[recid] = reference_report_number

	            references_s = field_values(rec, p_reference_tag)
	            if references_s:
	                d_references_s[recid] = references_s

	            record_s = field_values(rec, p_record_publication_info_tag)
	            if record_s:
	                d_records_s[recid] = record_s[0]

	    citation_informations = [d_reports_numbers,
	                             d_references_report_numbers,
	                             d_references_s,
	                             d_records_s]
	    end_time = os.times()[4]
	    print("Execution time for generating citation informations by parsing xml contents:  "
	          + str(end_time - begin_time))
	    return citation_informations

	def _selfcite_author_names(recid):
	    """Return the set of non-empty author names (100, 700, 720 $a) of RECID."""
	    names = set()
	    for parsed in create_records(print_record(int(recid), 'xm')):
	        rec = parsed[0]
	        names.add(record_get_field_value(rec, "100", "", "", "a"))
	        names.update(record_get_field_values(rec, "700", "", "", "a"))
	        names.update(record_get_field_values(rec, "720", "", "", "a"))
	    # A missing author field yields an empty value; it must never count as
	    # an author shared by two records.
	    names.discard('')
	    names.discard(None)
	    return names


	def get_self_citations(new_record_list, citationdic, initial_selfcitdict, config):
	    """Check which items have been cited by one of their own authors.

	    For every record k in NEW_RECORD_LIST the records citing it are taken
	    from CITATIONDIC (k -> list of citing records). Each citing record that
	    shares at least one author name with k is added to selfcites[k].
	    Selfcites starts as INITIAL_SELFCITDICT, which is updated in place and
	    returned. CONFIG is not used.
	    """
	    selfcites = initial_selfcitdict
	    # recid -> set of author names, so that a record citing many of the new
	    # records is fetched and parsed only once
	    author_cache = {}

	    def authors_of(recid):
	        if recid not in author_cache:
	            author_cache[recid] = _selfcite_author_names(recid)
	        return author_cache[recid]

	    done = 0  # progress counter, just for debugging
	    for k in new_record_list:
	        done += 1
	        if done % 100 == 0 and task_get_option('verbose') >= 3:
	            write_message("Done " + str(done) + " records", sys.stderr)
	        # the "x-cites-this" list
	        citers = citationdic.get(k)
	        if not citers:
	            continue
	        authors = authors_of(k)
	        if not authors:
	            continue
	        for c in citers:
	            if not authors.intersection(authors_of(c)):
	                continue
	            known = selfcites.get(k)
	            if not known:
	                # no entry yet, or an empty one
	                selfcites[k] = [c]
	            elif c not in known:
	                # add only if not there already
	                known.append(c)
	    return selfcites

	def _add_citers_for_authors(author_cited_in, authors, citers):
	    """Merge CITERS into the cited-in list of every non-empty name in AUTHORS."""
	    for a in authors:
	        if not a:
	            # an empty name is not an author; never create a key for it
	            continue
	        if a in author_cited_in:
	            known = author_cited_in[a]
	            # add all elements of citers that are not there already
	            for citer in citers:
	                if citer not in known:
	                    known.append(citer)
	        else:
	            # Store a copy: sharing the list would make later appends for
	            # this author leak into the citation dictionary and into the
	            # lists of other authors.
	            author_cited_in[a] = list(citers)


	def get_author_citations(updated_redic_list, citedbydict, initial_author_dict):
	    """Traverse citedbydict in order to build the "which author is quoted
	    where" dictionary.

	    The keys of the result are author names. An entry like
	    "Apollinaire"->[1,2,3] means Apollinaire is cited in records 1, 2 and 3.
	    Input: CITEDBYDICT (record -> list of records citing it),
	    UPDATED_REDIC_LIST = the records to be searched, INITIAL_AUTHOR_DICT =
	    the dictionary read from the database.
	    Output: INITIAL_AUTHOR_DICT, updated in place.
	    """
	    author_cited_in = initial_author_dict
	    if not citedbydict:
	        return author_cited_in

	    write_message("Checking records referred to in new records", sys.stderr)
	    for u in updated_redic_list:
	        if u in citedbydict:
	            # make sure it is a list, not None
	            these_cite_k = citedbydict[u] or []
	            recs = [parsed[0] for parsed
	                    in create_records(print_record(int(u), 'xm'))]
	            _add_citers_for_authors(author_cited_in,
	                                    get_authors_in_records(recs),
	                                    these_cite_k)

	    # go through the dictionary again: all keys, but look at a record only
	    # if it is cited by one of the new records
	    write_message("Checking authors in new records", sys.stderr)
	    updated = set(updated_redic_list)
	    for k in list(citedbydict.keys()):
	        these_cite_k = citedbydict[k] or []
	        if updated.intersection(these_cite_k):
	            # fetch the authors of the cited record k itself
	            recs = [parsed[0] for parsed
	                    in create_records(print_record(int(k), 'xm'))]
	            _add_citers_for_authors(author_cited_in,
	                                    get_authors_in_records(recs),
	                                    these_cite_k)

	    return author_cited_in


	def get_authors_in_records(recs):
	    """Return the author names found in the records RECS.

	    For each record the values of 720 $a, then 700 $a, then the value of
	    100 $a are collected, in that order. The names of all records are
	    accumulated into one new list; an empty 100 $a value is left out.
	    """
	    authors = []
	    for rec in recs:
	        # extend a list of our own, so that the lists handed out by the
	        # record accessors are never modified
	        authors.extend(record_get_field_values(rec, "720", "", "", "a"))
	        authors.extend(record_get_field_values(rec, "700", "", "", "a"))
	        author = record_get_field_value(rec, "100", "", "", "a")
	        if author:
	            authors.append(author)
	    return authors


	def _register_citation(result, citation_list, reference_list, citer, cited):
	    """Record that CITER cites CITED, creating missing entries and skipping
	    duplicates; the weight of CITED grows only when CITER is new for it."""
	    citers = citation_list.setdefault(cited, [])
	    if citer not in citers:
	        citers.append(citer)
	        result[cited] = result.get(cited, 0) + 1
	    cited_records = reference_list.setdefault(citer, [])
	    if cited not in cited_records:
	        cited_records.append(cited)


	def _drop_empty_lists(dic):
	    """Delete from DIC every key whose value is empty."""
	    for key in list(dic.keys()):
	        if not dic[key]:
	            del dic[key]


	def _log_dict_sample(label, dic):
	    """Write the first 100 entries of DIC and its size to stderr."""
	    sample = {}
	    # only the 100 first ones, to prevent a flood
	    for key in list(dic.keys())[0:100]:
	        sample[key] = dic[key]
	    write_message(label + str(sample), sys.stderr)
	    write_message("size: " + str(len(dic.keys())), sys.stderr)


	def ref_analyzer(citation_informations, initialresult, initial_citationlist,
	                 initial_referencelist, config, updated_rec_list):
	    """Analyze the citation informations and calculate the citation weight
	    and cited-by list dictionaries.

	    CITATION_INFORMATIONS is the four-dictionary list built by
	    get_citation_informations(). INITIALRESULT (recid -> weight),
	    INITIAL_CITATIONLIST (recid -> records citing it) and
	    INITIAL_REFERENCELIST (recid -> records it cites) are updated in place.
	    UPDATED_REC_LIST holds the records being (re)indexed. The citation,
	    reference, self-citation and author dictionaries are written to the
	    database; the weight dictionary is returned.
	    """
	    function = config.get("rank_method", "function")
	    # pubrefntag is probably 999C5r, pubreftag 999C5s
	    pubrefntag = config.get(function, "publication_reference_number_tag")
	    pubreftag = config.get(function, "publication_reference_tag")
	    citation_list = initial_citationlist
	    reference_list = initial_referencelist
	    result = initialresult
	    d_reports_numbers = citation_informations[0]
	    # e.g. 8 -> ['astro-ph/9889', 'hep-ph/768']: rec 8 has these in its bibliography
	    d_references_report_numbers = citation_informations[1]
	    # of type {77: ['Nucl. Phys. B 72 (1974) 461', 'blah blah'], 93: ['..'], ..}
	    d_references_s = citation_informations[2]
	    d_records_s = citation_informations[3]
	    verbose = task_get_option('verbose')

	    t1 = os.times()[4]
	    if verbose >= 1:
	        write_message("Phase 1: d_references_report_numbers", sys.stderr)
	    for recid, refnumbers in d_references_report_numbers.items():
	        for refnumber in refnumbers:
	            if not refnumber:
	                continue
	            # sanitise; str.replace returns a new string, so keep the result
	            p = refnumber.replace("\n", '')
	            if not p:
	                continue
	            # search for "hep-th/5644654" or such in existing records
	            rec_id = get_recids_matching_query(p, 'reportnumber')
	            if rec_id and rec_id[0]:
	                write_citer_cited(recid, rec_id[0])
	                remove_from_missing(p)
	                _register_citation(result, citation_list, reference_list,
	                                   recid, rec_id[0])
	            else:
	                # the reference we wanted was not found among our records:
	                # put it in the "missing" table
	                insert_into_missing(recid, p)

	    t2 = os.times()[4]
	    if verbose >= 1:
	        write_message("Phase 2: d_references_s", sys.stderr)
	    for recid, refss in d_references_s.items():
	        for refs in refss:
	            if refs:
	                rec_id = get_recids_matching_query(refs, 'publref')
	                if rec_id:
	                    _register_citation(result, citation_list, reference_list,
	                                       recid, rec_id[0])

	    t3 = os.times()[4]
	    if verbose >= 1:
	        write_message("Phase 3: d_reports_numbers", sys.stderr)
	    # here rec_id is the (updated) cited record and recid a record citing it
	    for rec_id, recnumbers in d_reports_numbers.items():
	        for recnumber in recnumbers:
	            if recnumber:
	                for recid in get_recids_matching_query(recnumber, pubrefntag):
	                    _register_citation(result, citation_list, reference_list,
	                                       recid, rec_id)

	    if verbose >= 1:
	        write_message("Phase 4: d_records_s", sys.stderr)
	    t4 = os.times()[4]
	    # here recid is the (updated) cited record and rec_id a record citing it
	    for recid, recs in d_records_s.items():
	        # keep only the part before the first "-"
	        p = recs.split("-", 1)[0]
	        for rec_id in get_recids_matching_query(p, pubreftag):
	            _register_citation(result, citation_list, reference_list,
	                               rec_id, recid)

	    if verbose >= 1:
	        write_message("Phase 5: reverse lists", sys.stderr)
	    # remove empty lists in citation and reference
	    _drop_empty_lists(citation_list)
	    _drop_empty_lists(reference_list)

	    if verbose >= 1:
	        write_message("Phase 6: self-citations", sys.stderr)
	    # get the initial self citation dict and add the new records to it
	    initial_self_dict = get_cit_dict("selfcitdict")
	    # selfdic is key k -> [v1, v2, ..] as built by get_self_citations()
	    selfdic = get_self_citations(updated_rec_list, citation_list,
	                                 initial_self_dict, config)
	    # create the reverse dictionary: every v maps to the keys listing it
	    selfcitedbydic = {}
	    for k, vlist in selfdic.items():
	        for v in vlist:
	            selfcitedbydic.setdefault(v, []).append(k)

	    if verbose >= 1:
	        write_message("Getting author citations", sys.stderr)
	    # get author citations for records in updated_rec_list
	    initial_author_dict = get_initial_author_dict()
	    authorcitdic = get_author_citations(updated_rec_list, citation_list,
	                                        initial_author_dict)

	    if verbose >= 3:
	        _log_dict_sample("citation_list (x is cited by y): ", citation_list)
	        _log_dict_sample("reference_list (x cites y): ", reference_list)
	        _log_dict_sample("selfcitedbydic (x is cited by y and one of the authors of x same as y's): ",
	                         selfcitedbydic)
	        _log_dict_sample("selfdic (x cites y and one of the authors of x same as y's): ",
	                         selfdic)
	        _log_dict_sample("authorcitdic (author is cited in recs): ", authorcitdic)
	    insert_cit_ref_list_intodb(citation_list, reference_list,
	                               selfcitedbydic, selfdic, authorcitdic)

	    t5 = os.times()[4]
	    print("\nExecution time for analyzing the citation information generating the dictionary: ")
	    print("checking ref number:  " + str(t2 - t1))
	    print("checking ref ypvt:  " + str(t3 - t2))
	    print("checking rec number:  " + str(t4 - t3))
	    print("checking rec ypvt:  " + str(t5 - t4))
	    print("total time of ref_analyze:  " + str(t5 - t1))
	    return result

	def get_decompressed_xml(xml):
	    """Decompress XML and hand the outcome to create_records().

	    Returns whatever create_records() returns for the decompressed content.
	    """
	    return create_records(decompress(xml))

	def insert_cit_ref_list_intodb(citation_dic, reference_dic, selfcbdic, selfdic, authorcitdic):
	    """Insert the reference and citation lists into the database.

	    REFERENCE_DIC, CITATION_DIC, SELFCBDIC and SELFDIC are stored with
	    insert_into_cit_db() under the names 'reversedict', 'citationdict',
	    'selfcitedbydict' and 'selfcitdict'. Every entry of AUTHORCITDIC
	    (author name -> list) is serialized and inserted into, or updated in,
	    the rnkAUTHORDATAR table. A failure for one author is reported on
	    stdout and does not stop the remaining authors.
	    """
	    insert_into_cit_db(reference_dic, "reversedict")
	    insert_into_cit_db(citation_dic, "citationdict")
	    insert_into_cit_db(selfcbdic, "selfcitedbydict")
	    insert_into_cit_db(selfdic, "selfcitdict")

	    # update author-citations.. but make sure the table exists
	    sql = ("CREATE TABLE IF NOT EXISTS rnkAUTHORDATAR (aterm varchar(50) default NULL, "
	           "hitlist longblob, UNIQUE KEY aterm (aterm))")
	    try:
	        run_sql(sql)
	    except Exception:
	        # best effort: if the table is really unusable, the writes below
	        # report it
	        write_message("Warning: could not create table rnkAUTHORDATAR", sys.stderr)

	    for author in list(authorcitdic.keys()):
	        lserarr = serialize_via_marshal(authorcitdic[author])
	        try:
	            aterm = author
	            if not isinstance(aterm, type(u'')):
	                # byte string: decode it; a name that is unicode already
	                # must not be decoded again
	                aterm = aterm.decode('utf-8')
	            # The name goes in as a query parameter, so quotes in it need
	            # no manual replacing.
	            ablob = run_sql("select hitlist from rnkAUTHORDATAR where aterm=%s",
	                            (aterm,))
	            if not ablob:
	                run_sql("insert into rnkAUTHORDATAR(aterm,hitlist) values (%s,%s)",
	                        (aterm, lserarr))
	            else:
	                run_sql("UPDATE rnkAUTHORDATAR SET hitlist = %s where aterm=%s",
	                        (lserarr, aterm))
	        except Exception:
	            # repr() keeps the report itself from failing on non-ASCII
	            # names or on the binary hitlist
	            print("Critical error: could not write rnkAUTHORDATAR into db. aterm="
	                  + repr(author) + " hitlist=" + repr(lserarr))
	            traceback.print_tb(sys.exc_info()[2])

	def insert_into_cit_db(dic, name):
	    """Serialize DIC and store it in rnkCITATIONDATA under object name NAME.

	    The value and the last_updated timestamp (current local time) are
	    written with a single UPDATE. Failures are reported on stdout together
	    with a traceback and are not raised.
	    NOTE(review): only an UPDATE is issued, so a row for NAME presumably
	    has to exist already -- confirm against the table setup.
	    """
	    date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
	    try:
	        s = serialize_via_marshal(dic)
	        print("size of " + name + " " + str(len(s)))
	        run_sql("UPDATE rnkCITATIONDATA SET object_value = %s, last_updated = %s "
	                "where object_name = %s", (s, date, name))
	    except Exception:
	        print("Critical error: could not write " + name + " into db")
	        traceback.print_tb(sys.exc_info()[2])


	def get_cit_dict(name):
	    """Get the citation dictionary stored under NAME in rnkCITATIONDATA.

	    Returns the decompressed, unmarshalled object, or an empty dictionary
	    when nothing is stored under NAME or when reading fails (the failure is
	    reported on stdout together with a traceback).
	    """
	    try:
	        rows = run_sql("select object_value from rnkCITATIONDATA where object_name=%s",
	                       (name,))
	        if rows and rows[0] and rows[0][0]:
	            return marshal.loads(decompress(rows[0][0]))
	    except Exception:
	        print("Critical error: could not read " + name + " from db")
	        traceback.print_tb(sys.exc_info()[2])
	    # always hand back a dictionary, never the raw query result
	    return {}

	def get_initial_author_dict():
	    """Read the author -> cited-in list dictionary from rnkAUTHORDATAR.

	    Returns a dictionary mapping each aterm to its deserialized hitlist, or
	    an empty dictionary when reading fails (the failure is reported on
	    stdout together with a traceback).
	    """
	    authors = {}
	    try:
	        for (a, h) in run_sql("select aterm,hitlist from rnkAUTHORDATAR"):
	            authors[a] = deserialize_via_marshal(h)
	    except Exception:
	        print("Critical error: could not read rnkAUTHORDATAR from db")
	        traceback.print_tb(sys.exc_info()[2])
	        return {}
	    return authors


	def insert_into_missing(recid, report):
	    """Put the pair (referring record RECID, publication string REPORT) into
	    the "we are missing these" table rnkCITATIONDATAEXT, unless it is there
	    already.

	    Errors raised by the insert are deliberately ignored.
	    """
	    # Both values are passed as query parameters, so quotes inside REPORT
	    # need no manual replacing.
	    wasalready = run_sql("select id_bibrec from rnkCITATIONDATAEXT "
	                         "where id_bibrec=%s and pubinfo=%s", (recid, report))
	    if not wasalready:
	        try:
	            run_sql("insert into rnkCITATIONDATAEXT(id_bibrec,pubinfo) values (%s,%s)",
	                    (recid, report))
	        except Exception:
	            # we should complain, but it can result in a million lines of
	            # warnings, so just pass
	            pass

	def remove_from_missing(report):
	    """Remove all recid-reference pairs for REPORT from the "missing" table
	    rnkCITATIONDATAEXT: probably the reference got into our collection.

	    Errors raised by the delete are deliberately ignored.
	    """
	    try:
	        # REPORT is passed as a query parameter, so quotes inside it need no
	        # manual replacing
	        run_sql("delete from rnkCITATIONDATAEXT where pubinfo=%s", (report,))
	    except Exception:
	        # we should complain, but it can result in a million lines of
	        # warnings, so just pass
	        pass


	def create_analysis_tables():
	    """Create the temporary tmpcit table and its three indexes.

	    Every statement is attempted on its own and failures are ignored, so
	    that an object which exists already does not prevent the creation of
	    the remaining ones.
	    """
	    statements = (
	        # ENGINE= replaces the obsolete TYPE= table option
	        "CREATE TABLE IF NOT EXISTS tmpcit (citer mediumint(10), cited mediumint(10)) ENGINE=MyISAM",
	        "CREATE UNIQUE INDEX citercited on tmpcit(citer,cited)",
	        "CREATE INDEX citer on tmpcit(citer)",
	        "CREATE INDEX cited on tmpcit(cited)",
	    )
	    for sql in statements:
	        try:
	            run_sql(sql)
	        except Exception:
	            # best effort, e.g. the index is there from an earlier run
	            pass

	def write_citer_cited(citer, cited):
	    """Write the pair (CITER, CITED) to the temporary tmpcit table.

	    Errors are deliberately ignored.
	    """
	    try:
	        run_sql("insert into tmpcit(citer,cited) values (%s,%s)", (citer, cited))
	    except Exception:
	        # best effort only: this table is a debugging aid
	        pass

bibrank_citation_indexer.pyNo OneTemporaryActions

File Metadata

bibrank_citation_indexer.pyView Options

Event Timeline

bibrank_citation_indexer.py
No OneTemporary
Actions

bibrank_citation_indexer.py
View Options