search_engine_summarizer.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Jun 30, 14:07

search_engine_summarizer.py
View Options

	# -*- coding: utf-8 -*-

	## This file is part of CDS Invenio.
	## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
	##
	## CDS Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## CDS Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""
	Search Engine Summarizer, producing summary formats such as citesummary.
	The main API is summarize_records().
	"""

	__lastupdated__ = """$Date$"""

	__revision__ = "$Id$"

	from invenio.config import CFG_INSPIRE_SITE
	from invenio.bibrank_citation_searcher import get_cited_by_list
	import search_engine
	import invenio.template
	websearch_templates = invenio.template.load('websearch')

	## CFG_CITESUMMARY_COLLECTIONS -- the collections a cite summary is
	## broken down by, as [label, search pattern] pairs.  An empty search
	## pattern means that the records being summarized are taken as they
	## are, without intersecting them with any collection.
	if CFG_INSPIRE_SITE:
	    CFG_CITESUMMARY_COLLECTIONS = [
	        ['All papers', 'collection:citeable'],
	        ['Published only', 'collection:citeable collection:published'],
	    ]
	else:
	    CFG_CITESUMMARY_COLLECTIONS = [
	        ['All papers', ''],
	        ['Published only', 'collection:article'],
	    ]

	## CFG_CITESUMMARY_FAME_THRESHOLDS -- the citation-count classes that
	## cite summary results are broken down into.  Each entry is a
	## (lowest count, highest count, label) triple; both bounds are
	## inclusive.
	CFG_CITESUMMARY_FAME_THRESHOLDS = [
	    (500, 1000000, 'Renowned papers (500+)'),
	    (250, 499, 'Famous papers (250-499)'),
	    (100, 249, 'Very well-known papers (100-249)'),
	    (50, 99, 'Well-known papers (50-99)'),
	    (10, 49, 'Known papers (10-49)'),
	    (1, 9, 'Less known papers (1-9)'),
	    (0, 0, 'Unknown papers (0)'),
	]

	def summarize_records(recids, of, ln, searchpattern="", searchfield="", req=None):
	    """Produce the summary report for records RECIDS in output format OF.

	    OF selects the report: 'hcs' writes the HTML cite summary to REQ
	    piece by piece and returns an empty string; 'xcs' returns the XML
	    cite summary as a string.  Any other value of OF produces nothing
	    and returns None.

	    LN is the language handed to the templates.  SEARCHPATTERN and
	    SEARCHFIELD are the search query that led to RECIDS, for instance
	    p='Smith, Paul' and f='author'; they are handed to the templates,
	    which use them for links.  REQ is the Apache/mod_python request
	    object; it is only touched for the 'hcs' format.
	    """
	    import search_engine

	    if of == 'xcs':
	        # XML cite summary: computed over RECIDS as a whole.
	        return print_citation_summary_xml(get_cited_by_list(recids))
	    if of != 'hcs':
	        return None

	    # From here on: HTML cite summary.
	    collections = CFG_CITESUMMARY_COLLECTIONS
	    templates = websearch_templates

	    # 1) prologue: restrict RECIDS to each collection and count them.
	    recids_in = {}
	    num_recs_in = {}
	    for coll, colldef in collections:
	        hits = recids
	        if colldef:
	            hits = recids & search_engine.search_pattern(p=colldef)
	        recids_in[coll] = hits
	        num_recs_in[coll] = len(hits)
	    req.write(templates.tmpl_citesummary_prologue(
	        num_recs_in, collections, searchpattern, searchfield, ln))

	    # 2) overview: total and average number of citations per collection.
	    citers_in = {}
	    total_cites_in = {}
	    avg_cites_in = {}
	    for coll, _colldef in collections:
	        citers_in[coll] = get_cited_by_list(recids_in[coll])
	        cite_sum = 0
	        for _recid, lciters in citers_in[coll]:
	            # a false-valued citer list counts as zero citations
	            cite_sum += len(lciters or ())
	        total_cites_in[coll] = cite_sum
	        if cite_sum == 0:
	            avg_cites_in[coll] = 0
	        else:
	            avg_cites_in[coll] = cite_sum * 1.0 / num_recs_in[coll]
	    req.write(templates.tmpl_citesummary_overview(
	        total_cites_in, avg_cites_in, collections, ln))

	    # 3) break down by fame: one row written per citation-count class.
	    for low, high, fame in CFG_CITESUMMARY_FAME_THRESHOLDS:
	        num_in_class = {}
	        for coll, _colldef in collections:
	            matches = 0
	            for _recid, lciters in citers_in[coll]:
	                if low <= len(lciters or ()) <= high:
	                    matches += 1
	            num_in_class[coll] = matches
	        req.write(templates.tmpl_citesummary_breakdown_by_fame(
	            num_in_class, low, high, fame, collections,
	            searchpattern, searchfield, ln))

	    # 4) epilogue.
	    req.write(templates.tmpl_citesummary_epilogue(ln))
	    return ''

	# For the citation summary, output format codes xcs/hcs (unless changed).
	def print_citation_summary_xml(citedbylist):
	    """Return the citation summary of CITEDBYLIST as an XML string.

	    Despite its name, this function does not print anything; the
	    caller is expected to output the returned string.

	    CITEDBYLIST is a sequence of (recid, citers) pairs, as accepted by
	    calculate_citations().  The result is one <citationsummary>
	    element whose 'records' attribute is len(CITEDBYLIST) and whose
	    'citations' attribute is the total number of citations.  It
	    contains one <citationclass> element, in the order of
	    CFG_CITESUMMARY_FAME_THRESHOLDS, for every class that has at least
	    one record; the <records> child of each holds str() of the list of
	    recids in that class.
	    """
	    alldict = calculate_citations(citedbylist)
	    reciddict = alldict['reciddict']
	    # Collect the pieces and join them once at the end.
	    parts = [
	        "<citationsummary records=\"" + str(len(citedbylist)),
	        "\" citations=\"" + str(alldict['totalcites']) + "\">",
	    ]
	    for low, high, name in CFG_CITESUMMARY_FAME_THRESHOLDS:
	        # 'in' replaces dict.has_key(), which does not exist in Python 3.
	        if name in reciddict:
	            parts.append("<citationclass>" + name)
	            parts.append("<records>" + str(reciddict[name]) + "</records>")
	            parts.append("</citationclass>\n")
	    parts.append("</citationsummary>")
	    return "".join(parts)

	def calculate_citations(citedbylist, thresholds=None):
	    """Sort records into citation-count classes and compute the totals.

	    CITEDBYLIST is a sequence of (recid, citers) pairs; a false-valued
	    CITERS (None or an empty container) counts as zero citations.
	    THRESHOLDS is an optional sequence of (low, high, name) triples
	    with inclusive bounds; when omitted, the module-wide
	    CFG_CITESUMMARY_FAME_THRESHOLDS is used.

	    Return a dictionary with the keys:
	      'records'    -- the number of entries in CITEDBYLIST
	      'totalcites' -- the sum of the citation counts of all entries
	      'avgcites'   -- totalcites / records as a float, or 0 when
	                      CITEDBYLIST is empty
	      'reciddict'  -- maps the name of every class that matched at
	                      least one record to the list of matching recids
	    """
	    if thresholds is None:
	        thresholds = CFG_CITESUMMARY_FAME_THRESHOLDS

	    totalcites = 0
	    reciddict = {}
	    for recid, cites in citedbylist:
	        numcites = 0
	        if cites:
	            numcites = len(cites)
	        totalcites += numcites
	        # A record is filed under every class whose range contains its
	        # count, so no 'break' after the first match.
	        for low, high, name in thresholds:
	            if low <= numcites <= high:
	                # setdefault replaces the dict.has_key() test, which
	                # does not exist in Python 3.
	                reciddict.setdefault(name, []).append(recid)

	    numrecords = len(citedbylist)
	    if numrecords:
	        avgcites = totalcites * 1.0 / numrecords
	    else:
	        # avoid dividing by zero for an empty input
	        avgcites = 0

	    return {
	        'records': numrecords,
	        'totalcites': totalcites,
	        'avgcites': avgcites,
	        'reciddict': reciddict,
	    }

search_engine_summarizer.pyNo OneTemporaryActions

File Metadata

search_engine_summarizer.pyView Options

Event Timeline

search_engine_summarizer.py
No OneTemporary
Actions

search_engine_summarizer.py
View Options