citation_indexer.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Sep 11, 13:20

citation_indexer.py
View Options

	# -- coding: utf-8 --
	##
	## This file is part of Invenio.
	## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	__revision__ = "$Id$"

	import re
	import time
	import os
	import sys
	import ConfigParser
	from itertools import islice
	from datetime import datetime
	from six import iteritems

	from invenio.legacy.dbquery import run_sql, serialize_via_marshal, \
	deserialize_via_marshal
	from invenio.modules.indexer.tokenizers.BibIndexJournalTokenizer import \
	CFG_JOURNAL_PUBINFO_STANDARD_FORM, \
	CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK
	from invenio.legacy.search_engine import search_pattern, search_unit
	from invenio.legacy.bibrecord import get_fieldvalues
	from invenio.modules.formatter.utils import parse_tag
	from invenio.modules.knowledge.api import get_kb_mappings
	from invenio.legacy.bibsched.bibtask import write_message, task_get_option, \
	task_update_progress, task_sleep_now_if_required, \
	task_get_task_param
	from invenio.ext.logging import register_exception
	from invenio.legacy.bibindex.engine import get_field_tags


	INTBITSET_OF_DELETED_RECORDS = search_unit(p='DELETED', f='980', m='a')

	re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK)

	def get_recids_matching_query(p, f, m='e'):
	"""Return set of recIDs matching query for pattern p in field f."""
	return search_pattern(p=p, f=f, m=m) - INTBITSET_OF_DELETED_RECORDS


	def get_citation_weight(rank_method_code, config, chunk_size=20000):
	"""return a dictionary which is used by bibrank daemon for generating
	the index of sorted research results by citation information
	"""
	begin_time = time.time()

	quick = task_get_option("quick") != "no"

	# id option forces re-indexing a certain range
	# even if there are no new recs
	if task_get_option("id"):
	# construct a range of records to index
	updated_recids = []
	for first, last in task_get_option("id"):
	updated_recids += range(first, last+1)
	if len(updated_recids) > 10000:
	str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
	else:
	str_updated_recids = str(updated_recids)
	write_message('Records to process: %s' % str_updated_recids)
	index_update_time = None
	else:
	bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
	if not quick:
	bibrank_update_time = "0000-00-00 00:00:00"
	write_message("bibrank: %s" % bibrank_update_time)
	index_update_time = get_bibindex_update_time()
	write_message("bibindex: %s" % index_update_time)
	if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
	index_update_time = "0000-00-00 00:00:00"
	updated_recids = get_modified_recs(bibrank_update_time,
	index_update_time)
	if len(updated_recids) > 10000:
	str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
	else:
	str_updated_recids = str(updated_recids)
	write_message("%s records to update" % str_updated_recids)

	if updated_recids:
	# result_intermediate should be warranted to exists!
	# but if the user entered a "-R" (do all) option, we need to
	# make an empty start set
	if quick:
	dicts = {
	'cites_weight': last_updated_result(rank_method_code),
	'cites': get_cit_dict("citationdict"),
	'refs': get_cit_dict("reversedict"),
	'selfcites': get_cit_dict("selfcitdict"),
	'selfrefs': get_cit_dict("selfcitedbydict"),
	'authorcites': get_initial_author_dict(),
	}
	else:
	dicts = {
	'cites_weight': {},
	'cites': {},
	'refs': {},
	'selfcites': {},
	'selfrefs': {},
	'authorcites': {},
	}

	# Process fully the updated records
	process_and_store(updated_recids, config, dicts, chunk_size, quick)

	end_time = time.time()
	write_message("Total time of get_citation_weight(): %.2f sec" % \
	(end_time - begin_time))
	task_update_progress("citation analysis done")
	cites_weight = dicts['cites_weight']
	else:
	cites_weight = {}
	write_message("No new records added since last time this " \
	"rank method was executed")

	return cites_weight, index_update_time


	def process_and_store(recids, config, dicts, chunk_size, quick):
	# Process recent records first
	# The older records were most likely added by the above steps
	# to be reprocessed so they only have minor changes
	recids_iter = iter(sorted(recids, reverse=True))

	# Split records to process into chunks so that we do not
	# fill up too much memory
	while True:
	task_sleep_now_if_required()
	chunk = list(islice(recids_iter, chunk_size))
	if not chunk:
	if not quick:
	store_dicts(dicts)
	break
	write_message("Processing chunk #%s to #%s" % (chunk[0], chunk[-1]))

	# dicts are modified in-place
	process_chunk(chunk, config, dicts)

	if quick:
	# Store partial result as it is just an update and not
	# a creation from scratch
	store_dicts(dicts)


	def process_chunk(recids, config, dicts):
	cites_weight = dicts['cites_weight']
	cites = dicts['cites']
	refs = dicts['refs']

	old_refs = {}
	for recid in recids:
	old_refs[recid] = set(refs.get(recid, []))

	old_cites = {}
	for recid in recids:
	old_cites[recid] = set(cites.get(recid, []))

	process_inner(recids, config, dicts)

	# Records cited by updated_recid_list
	# They can only loose references as added references
	# are already added to the dicts at this point
	for somerecid in recids:
	for recid in set(old_cites[somerecid]) - set(cites.get(somerecid, [])):
	refs[recid] = list(set(refs.get(recid, [])) - set([somerecid]))
	if not refs[recid]:
	del refs[recid]

	# Records referenced by updated_recid_list
	# They can only loose citations as added citations
	# are already added to the dicts at this point
	for somerecid in recids:
	for recid in set(old_refs[somerecid]) - set(refs.get(somerecid, [])):
	cites[recid] = list(set(cites.get(recid, [])) - set([somerecid]))
	cites_weight[recid] = len(cites[recid])
	if not cites[recid]:
	del cites[recid]
	del cites_weight[recid]


	def process_inner(recids, config, dicts, do_catchup=True):
	tags = get_tags_config(config)

	# call the procedure that does the hard work by reading fields of
	# citations and references in the updated_recid's (but nothing else)!
	write_message("Entering get_citation_informations", verbose=9)
	citation_informations = get_citation_informations(recids, tags,
	fetch_catchup_info=do_catchup)

	write_message("Entering ref_analyzer", verbose=9)
	# call the analyser that uses the citation_informations to really
	# search x-cites-y in the coll..
	return ref_analyzer(citation_informations,
	dicts,
	recids,
	tags,
	do_catchup=do_catchup)


	def get_bibrankmethod_lastupdate(rank_method_code):
	"""return the last excution date of bibrank method
	"""
	query = """SELECT DATE_FORMAT(last_updated, '%%Y-%%m-%%d %%H:%%i:%%s')
	FROM rnkMETHOD WHERE name =%s"""
	last_update_time = run_sql(query, [rank_method_code])
	try:
	r = last_update_time[0][0]
	except IndexError:
	r = "0000-00-00 00:00:00"

	return r


	def get_bibindex_update_time():
	try:
	# check indexing times of `journal' and `reportnumber`
	# indexes, and only fetch records which have been indexed
	sql = "SELECT DATE_FORMAT(MIN(last_updated), " \
	"'%%Y-%%m-%%d %%H:%%i:%%s') FROM idxINDEX WHERE name IN (%s,%s)"
	index_update_time = run_sql(sql, ('journal', 'reportnumber'), 1)[0][0]
	except IndexError:
	write_message("Not running citation indexer since journal/reportnumber"
	" indexes are not created yet.")
	index_update_time = "0000-00-00 00:00:00"

	return index_update_time


	def get_modified_recs(bibrank_method_lastupdate, indexes_lastupdate):
	"""Get records to be updated by bibrank indexing

	Return the list of records which have been modified between the last
	execution of bibrank method and the latest journal/report index updates.
	The result is expected to have ascending id order.
	"""
	query = """SELECT id FROM bibrec
	WHERE modification_date >= %s
	AND modification_date < %s
	ORDER BY id ASC"""
	records = run_sql(query, (bibrank_method_lastupdate, indexes_lastupdate))
	return [r[0] for r in records]


	def last_updated_result(rank_method_code):
	""" return the last value of dictionary in rnkMETHODDATA table if it
	exists and initialize the value of last updated records by zero,
	otherwise an initial dictionary with zero as value for all recids
	"""
	query = """SELECT relevance_data FROM rnkMETHOD, rnkMETHODDATA WHERE
	rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD
	AND rnkMETHOD.Name = '%s'""" % rank_method_code

	try:
	rdict = run_sql(query)[0][0]
	except IndexError:
	dic = {}
	else:
	dic = deserialize_via_marshal(rdict)

	return dic


	def format_journal(format_string, mappings):
	"""format the publ infostring according to the format"""

	def replace(char, data):
	return data.get(char, char)

	return ''.join(replace(c, mappings) for c in format_string)


	def get_tags_config(config):
	"""Fetch needs config from our config file"""
	# Probably "citation" unless this file gets renamed
	function = config.get("rank_method", "function")
	write_message("config function %s" % function, verbose=9)

	tags = {}

	# 037a: contains (often) the "hep-ph/0501084" tag of THIS record
	try:
	tag = config.get(function, "primary_report_number")
	except ConfigParser.NoOptionError:
	tags['record_pri_number'] = None
	else:
	tags['record_pri_number'] = tagify(parse_tag(tag))

	# 088a: additional short identifier for the record
	try:
	tag = config.get(function, "additional_report_number")
	except ConfigParser.NoOptionError:
	tags['record_add_number'] = None
	else:
	tags['record_add_number'] = tagify(parse_tag(tag))

	# 999C5r. this is in the reference list, refers to other records.
	# Looks like: hep-ph/0408002
	try:
	tag = config.get(function, "reference_via_report_number")
	except ConfigParser.NoOptionError:
	tags['refs_report_number'] = None
	else:
	tags['refs_report_number'] = tagify(parse_tag(tag))
	# 999C5s. this is in the reference list, refers to other records.
	# Looks like: Phys.Rev.,A21,78
	try:
	tag = config.get(function, "reference_via_pubinfo")
	except ConfigParser.NoOptionError:
	tags['refs_journal'] = None
	else:
	tags['refs_journal'] = tagify(parse_tag(tag))
	# 999C5a. this is in the reference list, refers to other records.
	# Looks like: 10.1007/BF03170733
	try:
	tag = config.get(function, "reference_via_doi")
	except ConfigParser.NoOptionError:
	tags['refs_doi'] = None
	else:
	tags['refs_doi'] = tagify(parse_tag(tag))

	# Fields needed to construct the journals for this record
	try:
	tag = {
	'pages': config.get(function, "pubinfo_journal_page"),
	'year': config.get(function, "pubinfo_journal_year"),
	'journal': config.get(function, "pubinfo_journal_title"),
	'volume': config.get(function, "pubinfo_journal_volume"),
	}
	except ConfigParser.NoOptionError:
	tags['publication'] = None
	else:
	tags['publication'] = {
	'pages': tagify(parse_tag(tag['pages'])),
	'year': tagify(parse_tag(tag['year'])),
	'journal': tagify(parse_tag(tag['journal'])),
	'volume': tagify(parse_tag(tag['volume'])),
	}

	# Fields needed to lookup the DOIs
	tags['doi'] = get_field_tags('doi')

	# 999C5s. A standardized way of writing a reference in the reference list.
	# Like: Nucl. Phys. B 710 (2000) 371
	try:
	tags['publication_format'] = config.get(function,
	"pubinfo_journal_format")
	except ConfigParser.NoOptionError:
	tags['publication_format'] = CFG_JOURNAL_PUBINFO_STANDARD_FORM

	# Print values of tags for debugging
	write_message("tag values: %r" % [tags], verbose=9)

	return tags


	def get_journal_info(recid, tags):
	record_info = []
	# TODO: handle recors with multiple journals
	tagsvalues = {} # we store the tags and their values here
	# like c->444 y->1999 p->"journal of foo",
	# v->20
	tmp = get_fieldvalues(recid, tags['publication']['journal'])
	if tmp:
	tagsvalues["p"] = tmp[0]
	tmp = get_fieldvalues(recid, tags['publication']['volume'])
	if tmp:
	tagsvalues["v"] = tmp[0]
	tmp = get_fieldvalues(recid, tags['publication']['year'])
	if tmp:
	tagsvalues["y"] = tmp[0]
	tmp = get_fieldvalues(recid, tags['publication']['pages'])
	if tmp:
	# if the page numbers have "x-y" take just x
	pages = tmp[0]
	hpos = pages.find("-")
	if hpos > 0:
	pages = pages[:hpos]
	tagsvalues["c"] = pages

	# check if we have the required data
	ok = True
	for c in tags['publication_format']:
	if c in ('p', 'v', 'y', 'c'):
	if c not in tagsvalues:
	ok = False

	if ok:
	publ = format_journal(tags['publication_format'], tagsvalues)
	record_info += [publ]

	alt_volume = get_alt_volume(tagsvalues['v'])
	if alt_volume:
	tagsvalues2 = tagsvalues.copy()
	tagsvalues2['v'] = alt_volume
	publ = format_journal(tags['publication_format'], tagsvalues2)
	record_info += [publ]

	# Add codens
	for coden in get_kb_mappings('CODENS',
	value=tagsvalues['p']):
	tagsvalues2 = tagsvalues.copy()
	tagsvalues2['p'] = coden['key']
	publ = format_journal(tags['publication_format'], tagsvalues2)
	record_info += [publ]

	return record_info


	def get_alt_volume(volume):
	alt_volume = None
	if re.match(ur'[a-zA-Z]\d+', volume, re.U\|re.I):
	alt_volume = volume[1:] + volume[0]
	elif re.match(ur'\d+[a-zA-Z]', volume, re.U\|re.I):
	alt_volume = volume[-1] + volume[:-1]
	return alt_volume


	def get_citation_informations(recid_list, tags, fetch_catchup_info=True):
	"""scans the collections searching references (999C5x -fields) and
	citations for items in the recid_list
	returns a 4 list of dictionaries that contains the citation information
	of cds records
	examples: [ {} {} {} {} ]
	[ {5: 'SUT-DP-92-70-5'},
	{ 93: ['astro-ph/9812088']},
	{ 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
	NB: stuff here is for analysing new or changed records.
	see "ref_analyzer" for more.
	"""
	begin_time = os.times()[4]

	records_info = {
	'report-numbers': {},
	'journals': {},
	'doi': {},
	}

	references_info = {
	'report-numbers': {},
	'journals': {},
	'doi': {},
	}

	# perform quick check to see if there are some records with
	# reference tags, because otherwise get.cit.inf would be slow even
	# if there is nothing to index:
	if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % tags['refs_journal'][0:2],
	(tags['refs_journal'], )) or \
	run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % tags['refs_report_number'][0:2],
	(tags['refs_report_number'], )):

	done = 0 # for status reporting
	for recid in recid_list:
	if done % 10 == 0:
	task_sleep_now_if_required()
	# in fact we can sleep any time here

	if done % 1000 == 0:
	mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
	write_message(mesg)
	task_update_progress(mesg)

	done += 1

	if recid in INTBITSET_OF_DELETED_RECORDS:
	# do not treat this record since it was deleted; we
	# skip it like this in case it was only soft-deleted
	# e.g. via bibedit (i.e. when collection tag 980 is
	# DELETED but other tags like report number or journal
	# publication info remained the same, so the calls to
	# get_fieldvalues() below would return old values)
	continue

	if tags['refs_report_number']:
	references_info['report-numbers'][recid] \
	= get_fieldvalues(recid,
	tags['refs_report_number'],
	sort=False)
	msg = "references_info['report-numbers'][%s] = %r" \
	% (recid, references_info['report-numbers'][recid])
	write_message(msg, verbose=9)
	if tags['refs_journal']:
	references_info['journals'][recid] = []
	for ref in get_fieldvalues(recid,
	tags['refs_journal'],
	sort=False):
	try:
	# Inspire specific parsing
	journal, volume, page = ref.split(',')
	except ValueError:
	pass
	else:
	alt_volume = get_alt_volume(volume)
	if alt_volume:
	alt_ref = ','.join([journal, alt_volume, page])
	references_info['journals'][recid] += [alt_ref]
	references_info['journals'][recid] += [ref]
	msg = "references_info['journals'][%s] = %r" \
	% (recid, references_info['journals'][recid])
	write_message(msg, verbose=9)
	if tags['refs_doi']:
	references_info['doi'][recid] \
	= get_fieldvalues(recid, tags['refs_doi'], sort=False)
	msg = "references_info['doi'][%s] = %r" \
	% (recid, references_info['doi'][recid])
	write_message(msg, verbose=9)

	if not fetch_catchup_info:
	# We do not need the extra info
	continue

	if tags['record_pri_number'] or tags['record_add_number']:
	records_info['report-numbers'][recid] = []

	if tags['record_pri_number']:
	records_info['report-numbers'][recid] \
	+= get_fieldvalues(recid,
	tags['record_pri_number'],
	sort=False)
	if tags['record_add_number']:
	records_info['report-numbers'][recid] \
	+= get_fieldvalues(recid,
	tags['record_add_number'],
	sort=False)

	msg = "records_info[%s]['report-numbers'] = %r" \
	% (recid, records_info['report-numbers'][recid])
	write_message(msg, verbose=9)

	if tags['doi']:
	records_info['doi'][recid] = []
	for tag in tags['doi']:
	records_info['doi'][recid] += get_fieldvalues(recid,
	tag,
	sort=False)
	msg = "records_info[%s]['doi'] = %r" \
	% (recid, records_info['doi'][recid])
	write_message(msg, verbose=9)

	# get a combination of
	# journal vol (year) pages
	if tags['publication']:
	records_info['journals'][recid] = get_journal_info(recid, tags)
	msg = "records_info[%s]['journals'] = %r" \
	% (recid, records_info['journals'][recid])
	write_message(msg, verbose=9)

	else:
	mesg = "Warning: there are no records with tag values for " \
	"%s or %s. Nothing to do." % \
	(tags['refs_journal'], tags['refs_report_number'])
	write_message(mesg)

	mesg = "get cit.inf done fully"
	write_message(mesg)
	task_update_progress(mesg)

	end_time = os.times()[4]
	write_message("Execution time for generating citation info "
	"from record: %.2f sec" % (end_time - begin_time))

	return records_info, references_info


	def standardize_report_number(report_number):
	# Remove category for arxiv papers
	report_number = re.sub(ur'(?:arXiv:)?(\d{4}\.\d{4}) \[[a-zA-Z\.-]+\]',
	ur'arXiv:\g<1>',
	report_number,
	re.I \| re.U)
	return report_number


	def ref_analyzer(citation_informations, dicts,
	updated_recids, tags, do_catchup=True):
	"""Analyze the citation informations and calculate the citation weight
	and cited by list dictionary.
	"""
	citations_weight = dicts['cites_weight']
	citations = dicts['cites']
	references = dicts['refs']
	selfcites = dicts['selfcites']
	selfrefs = dicts['selfrefs']
	authorcites = dicts['authorcites']

	def step(msg_prefix, recid, done, total):
	if done % 30 == 0:
	task_sleep_now_if_required()

	if done % 1000 == 0:
	mesg = "%s done %s of %s" % (msg_prefix, done, total)
	write_message(mesg)
	task_update_progress(mesg)

	write_message("Processing: %s" % recid, verbose=9)

	def add_to_dicts(citer, cited):
	# Make sure we don't add ourselves
	# Workaround till we know why we are adding ourselves.
	if citer == cited:
	return
	if cited not in citations_weight:
	citations_weight[cited] = 0
	# Citations and citations weight
	if citer not in citations.setdefault(cited, []):
	citations[cited].append(citer)
	citations_weight[cited] += 1
	# References
	if cited not in references.setdefault(citer, []):
	references[citer].append(cited)

	# dict of recid -> institute_give_publ_id
	records_info, references_info = citation_informations

	t1 = os.times()[4]

	write_message("Phase 0: temporarily remove changed records from " \
	"citation dictionaries; they will be filled later")
	if do_catchup:
	for somerecid in updated_recids:
	try:
	del citations[somerecid]
	except KeyError:
	pass

	for somerecid in updated_recids:
	try:
	del references[somerecid]
	except KeyError:
	pass

	# Try to find references based on 999C5r
	# e.g 8 -> ([astro-ph/9889],[hep-ph/768])
	# meaning: rec 8 contains these in bibliography
	write_message("Phase 1: Report numbers references")
	done = 0
	for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
	step("Report numbers references", thisrecid, done,
	len(references_info['report-numbers']))
	done += 1

	for refnumber in (r for r in refnumbers if r):
	field = 'reportnumber'
	refnumber = standardize_report_number(refnumber)
	# Search for "hep-th/5644654 or such" in existing records
	recids = get_recids_matching_query(p=refnumber, f=field)
	write_message("These match searching %s in %s: %s" % \
	(refnumber, field, list(recids)), verbose=9)

	if not recids:
	insert_into_missing(thisrecid, refnumber)
	else:
	remove_from_missing(refnumber)

	if len(recids) > 1:
	store_citation_warning('multiple-matches', refnumber)
	msg = "Whoops: record '%d' report number value '%s' " \
	"matches many records; taking only the first one. %s" % \
	(thisrecid, refnumber, repr(recids))
	write_message(msg, stream=sys.stderr)

	for recid in list(recids)[:1]: # take only the first one
	add_to_dicts(thisrecid, recid)

	mesg = "done fully"
	write_message(mesg)
	task_update_progress(mesg)

	t2 = os.times()[4]

	# Try to find references based on 999C5s
	# e.g. Phys.Rev.Lett. 53 (1986) 2285
	write_message("Phase 2: Journal references")
	done = 0
	for thisrecid, refs in iteritems(references_info['journals']):
	step("Journal references", thisrecid, done,
	len(references_info['journals']))
	done += 1

	for reference in (r for r in refs if r):
	p = reference
	field = 'journal'

	# check reference value to see whether it is well formed:
	if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
	store_citation_warning('not-well-formed', p)
	msg = "Whoops, record '%d' reference value '%s' " \
	"is not well formed; skipping it." % (thisrecid, p)
	write_message(msg, stream=sys.stderr)
	continue # skip this ill-formed value

	recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
	write_message("These match searching %s in %s: %s" \
	% (reference, field, list(recids)), verbose=9)

	if not recids:
	insert_into_missing(thisrecid, p)
	else:
	remove_from_missing(p)

	if len(recids) > 1:
	store_citation_warning('multiple-matches', p)
	msg = "Whoops: record '%d' reference value '%s' " \
	"matches many records; taking only the first one. %s" % \
	(thisrecid, p, repr(recids))
	write_message(msg, stream=sys.stderr)

	for recid in list(recids)[:1]: # take only the first one
	add_to_dicts(thisrecid, recid)

	mesg = "done fully"
	write_message(mesg)
	task_update_progress(mesg)

	t3 = os.times()[4]

	# Try to find references based on 999C5a
	# e.g. 10.1007/BF03170733
	write_message("Phase 3: DOI references")
	done = 0
	for thisrecid, refs in iteritems(references_info['doi']):
	step("DOI references", thisrecid, done, len(references_info['doi']))
	done += 1

	for reference in (r for r in refs if r):
	p = reference
	field = 'doi'

	recids = get_recids_matching_query(p, field)
	write_message("These match searching %s in %s: %s" \
	% (reference, field, list(recids)), verbose=9)

	if not recids:
	insert_into_missing(thisrecid, p)
	else:
	remove_from_missing(p)

	if len(recids) > 1:
	store_citation_warning('multiple-matches', p)
	msg = "Whoops: record '%d' DOI value '%s' " \
	"matches many records; taking only the first one. %s" % \
	(thisrecid, p, repr(recids))
	write_message(msg, stream=sys.stderr)

	for recid in list(recids)[:1]: # take only the first one
	add_to_dicts(thisrecid, recid)

	mesg = "done fully"
	write_message(mesg)
	task_update_progress(mesg)

	t4 = os.times()[4]

	# Search for stuff like CERN-TH-4859/87 in list of refs
	write_message("Phase 4: report numbers catchup")
	done = 0
	for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
	step("Report numbers catchup", thisrecid, done,
	len(records_info['report-numbers']))
	done += 1

	for reportcode in (r for r in reportcodes if r):
	if reportcode.startswith('arXiv'):
	std_reportcode = standardize_report_number(reportcode)
	report_pattern = r'^%s( \[[a-zA-Z.-]\])?' % \
	re.escape(std_reportcode)
	recids = get_recids_matching_query(report_pattern,
	tags['refs_report_number'],
	'r')
	else:
	recids = get_recids_matching_query(reportcode,
	tags['refs_report_number'],
	'e')
	for recid in recids:
	add_to_dicts(recid, thisrecid)

	mesg = "done fully"
	write_message(mesg)
	task_update_progress(mesg)

	# Find this record's pubinfo in other records' bibliography
	write_message("Phase 5: journals catchup")
	done = 0
	t5 = os.times()[4]
	for thisrecid, rec_journals in iteritems(records_info['journals']):
	step("Journals catchup", thisrecid, done,
	len(records_info['journals']))
	done += 1

	for journal in rec_journals:
	journal = journal.replace("\"", "")
	# Search the publication string like
	# Phys. Lett., B 482 (2000) 417 in 999C5s
	recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
	- INTBITSET_OF_DELETED_RECORDS
	write_message("These records match %s in %s: %s" \
	% (journal, tags['refs_journal'], list(recids)), verbose=9)

	for recid in recids:
	add_to_dicts(recid, thisrecid)

	mesg = "done fully"
	write_message(mesg)
	task_update_progress(mesg)

	write_message("Phase 6: DOI catchup")
	done = 0
	t6 = os.times()[4]
	for thisrecid, dois in iteritems(records_info['doi']):
	step("DOI catchup", thisrecid, done, len(records_info['doi']))
	done += 1

	for doi in dois:
	# Search the publication string like
	# Phys. Lett., B 482 (2000) 417 in 999C5a
	recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
	- INTBITSET_OF_DELETED_RECORDS
	write_message("These records match %s in %s: %s" \
	% (doi, tags['refs_doi'], list(recids)), verbose=9)

	for recid in recids:
	add_to_dicts(recid, thisrecid)

	mesg = "done fully"
	write_message(mesg)
	task_update_progress(mesg)

	write_message("Phase 7: remove empty lists from dicts")

	# Remove empty lists in citation and reference
	keys = citations.keys()
	for k in keys:
	if not citations[k]:
	del citations[k]

	keys = references.keys()
	for k in keys:
	if not references[k]:
	del references[k]

	if task_get_task_param('verbose') >= 3:
	# Print only X first to prevent flood
	write_message("citation_list (x is cited by y):")
	write_message(dict(islice(iteritems(citations), 10)))
	write_message("size: %s" % len(citations))
	write_message("reference_list (x cites y):")
	write_message(dict(islice(iteritems(references), 10)))
	write_message("size: %s" % len(references))
	write_message("selfcitedbydic (x is cited by y and one of the " \
	"authors of x same as y's):")
	write_message(dict(islice(iteritems(selfcites), 10)))
	write_message("size: %s" % len(selfcites))
	write_message("selfdic (x cites y and one of the authors of x " \
	"same as y's):")
	write_message(dict(islice(iteritems(selfrefs), 10)))
	write_message("size: %s" % len(selfrefs))
	write_message("authorcitdic (author is cited in recs):")
	write_message(dict(islice(iteritems(authorcites), 10)))
	write_message("size: %s" % len(authorcites))

	t7 = os.times()[4]

	write_message("Execution time for analyzing the citation information " \
	"generating the dictionary:")
	write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
	write_message("... checking ref journals: %.2f sec" % (t3-t2))
	write_message("... checking ref DOI: %.2f sec" % (t4-t3))
	write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
	write_message("... checking rec journals: %.2f sec" % (t6-t5))
	write_message("... checking rec DOI: %.2f sec" % (t7-t6))
	write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

	return citations_weight, citations, references, selfcites, \
	selfrefs, authorcites


	def store_dicts(dicts):
	"""Insert the reference and citation list into the database"""
	insert_into_cit_db(dicts['refs'], "reversedict")
	insert_into_cit_db(dicts['cites'], "citationdict")
	insert_into_cit_db(dicts['selfcites'], "selfcitedbydict")
	insert_into_cit_db(dicts['selfrefs'], "selfcitdict")


	def insert_into_cit_db(dic, name):
	"""Stores citation dictionary in the database"""
	ndate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
	s = serialize_via_marshal(dic)
	write_message("size of %s %s" % (name, len(s)))
	# check that this column really exists
	run_sql("""REPLACE INTO rnkCITATIONDATA(object_name, object_value,
	last_updated) VALUES (%s, %s, %s)""", (name, s, ndate))


	def get_cit_dict(name):
	"""get a named citation dict from the db"""
	cdict = run_sql("""SELECT object_value FROM rnkCITATIONDATA
	WHERE object_name = %s""", (name, ))

	if cdict and cdict[0] and cdict[0][0]:
	dict_from_db = deserialize_via_marshal(cdict[0][0])
	else:
	dict_from_db = {}

	return dict_from_db


	def get_initial_author_dict():
	"""read author->citedinlist dict from the db"""
	adict = {}
	try:
	ah = run_sql("SELECT aterm,hitlist FROM rnkAUTHORDATA")
	for (a, h) in ah:
	adict[a] = deserialize_via_marshal(h)
	return adict
	except:
	register_exception(prefix="could not read rnkAUTHORDATA",
	alert_admin=True)
	return {}


	def insert_into_missing(recid, report):
	"""put the referingrecordnum-publicationstring into
	the "we are missing these" table"""
	if len(report) >= 255:
	# Invalid report, it is too long
	# and does not fit in the database column
	# (currently varchar 255)
	return
	wasalready = run_sql("""SELECT id_bibrec
	FROM rnkCITATIONDATAEXT
	WHERE id_bibrec = %s
	AND extcitepubinfo = %s""",
	(recid, report))
	if not wasalready:
	run_sql("""INSERT INTO rnkCITATIONDATAEXT(id_bibrec, extcitepubinfo)
	VALUES (%s,%s)""", (recid, report))


	def remove_from_missing(report):
	"""remove the recid-ref -pairs from the "missing" table for report x: prob
	in the case ref got in our library collection"""
	run_sql("""DELETE FROM rnkCITATIONDATAEXT
	WHERE extcitepubinfo = %s""", (report,))


	def create_analysis_tables():
	"""temporary simple table + index"""
	sql1 = "CREATE TABLE IF NOT EXISTS tmpcit (citer mediumint(10), \
	cited mediumint(10)) TYPE=MyISAM"
	sql2 = "CREATE UNIQUE INDEX citercited ON tmpcit(citer, cited)"
	sql3 = "CREATE INDEX citer ON tmpcit(citer)"
	sql4 = "CREATE INDEX cited ON tmpcit(cited)"
	run_sql(sql1)
	run_sql(sql2)
	run_sql(sql3)
	run_sql(sql4)


	def write_citer_cited(citer, cited):
	"""write an entry to tmp table"""
	run_sql("INSERT INTO tmpcit(citer, cited) VALUES (%s,%s)", (citer, cited))


	def print_missing(num):
	"""
	Print the contents of rnkCITATIONDATAEXT table containing external
	records that were cited by NUM or more internal records.

	NUM is by default taken from the -E command line option.
	"""
	if not num:
	num = task_get_option("print-extcites")

	write_message("Listing external papers cited by %i or more \
	internal records:" % num)

	res = run_sql("""SELECT COUNT(id_bibrec), extcitepubinfo
	FROM rnkCITATIONDATAEXT
	GROUP BY extcitepubinfo HAVING COUNT(id_bibrec) >= %s
	ORDER BY COUNT(id_bibrec) DESC""", (num,))
	for (cnt, brec) in res:
	print str(cnt)+"\t"+brec

	write_message("Listing done.")


	def tagify(parsedtag):
	"""aux auf to make '100__a' out of ['100','','','a']"""
	tag = ""
	for t in parsedtag:
	if t == '':
	t = '_'
	tag += t
	return tag


	def store_citation_warning(warning_type, cit_info):
	r = run_sql("""SELECT 1 FROM rnkCITATIONDATAERR
	WHERE type = %s
	AND citinfo = %s""", (warning_type, cit_info))
	if not r:
	run_sql("""INSERT INTO rnkCITATIONDATAERR (type, citinfo)
	VALUES (%s, %s)""", (warning_type, cit_info))

citation_indexer.pyNo OneTemporaryActions

File Metadata

citation_indexer.pyView Options

Event Timeline

citation_indexer.py
No OneTemporary
Actions

citation_indexer.py
View Options