# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
__revision__ = "$Id$"
import re
import time
import os
import sys
import ConfigParser
from datetime import datetime
from itertools import islice
from invenio.intbitset import intbitset
from invenio.dbquery import run_sql, serialize_via_marshal
from invenio.bibindex_tokenizers.BibIndexJournalTokenizer import \
CFG_JOURNAL_PUBINFO_STANDARD_FORM, \
CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK
from invenio.redisutils import get_redis
from invenio.search_engine import search_pattern, \
search_unit, \
get_collection_reclist
from invenio.bibformat_utils import parse_tag
from invenio.bibknowledge import get_kb_mappings
from invenio.bibtask import write_message, task_get_option, \
task_update_progress, task_sleep_now_if_required, \
task_get_task_param
from invenio.bibindex_engine_utils import get_field_tags
from invenio.docextract_record import get_record
re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK \
= re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK)
def compute_weights():
sql = "SELECT citee, COUNT(citer) FROM rnkCITATIONDICT GROUP BY citee"
weights = {}
for citee, c in run_sql(sql):
weights[citee] = c
return weights
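# Illustration with hypothetical data: if rnkCITATIONDICT held the rows
# (citer=1, citee=2), (citer=3, citee=2) and (citer=1, citee=4), then
# compute_weights() would return {2: 2, 4: 1}, i.e. record 2 is cited
# twice and record 4 once.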
def recids_cache(collections, cache={}):
if 'valid_recids' not in cache:
cache['valid_recids'] = intbitset()
for coll in collections.split(','):
cache['valid_recids'] += get_collection_reclist(coll)
return cache['valid_recids']
def deleted_recids_cache(cache={}):
if 'deleted_records' not in cache:
cache['deleted_records'] = search_unit(p='DELETED', f='980', m='a')
return cache['deleted_records']
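# Note on the two cache helpers above: the mutable default argument
# (cache={}) is evaluated once, at function definition time, so it acts
# as a per-process memo; repeated calls reuse the first result instead
# of re-querying the database.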
def get_recids_matching_query(p, f, config, m='e'):
"""Return set of recIDs matching query for pattern p in field f.
@param p: pattern to search for
@type recID: unicode string
@param f: field to search in
@type recID: unicode string
@param config: bibrank configuration
@type recID: dict
@param m: type of matching (usually 'e' for exact or 'r' for regexp)
@type recID: string
"""
p = p.encode('utf-8')
f = f.encode('utf-8')
function = config.get("rank_method", "function")
collections = config.get(function, 'collections')
if collections:
ret = search_pattern(p=p, f=f, m=m) & recids_cache(collections)
else:
ret = search_pattern(p=p, f=f, m=m) - deleted_recids_cache()
return ret
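# Hypothetical usage sketch: resolve a reference string against the
# 'reportnumber' index with the default exact matching:
#     recids = get_recids_matching_query(u'hep-ph/0501084',
#                                        u'reportnumber', config)
# The result is an intbitset, restricted to the configured collections
# or, when no collections are configured, with deleted records removed.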
def get_citation_weight(rank_method_code, config, chunk_size=25000):
"""return a dictionary which is used by bibrank daemon for generating
the index of sorted research results by citation information
"""
quick = task_get_option("quick") != "no"
# id option forces re-indexing a certain range
# even if there are no new recs
if task_get_option("id"):
# construct a range of records to index
updated_recids = []
for first, last in task_get_option("id"):
updated_recids += range(first, last+1)
if len(updated_recids) > 10000:
str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
else:
str_updated_recids = str(updated_recids)
write_message('Records to process: %s' % str_updated_recids)
index_update_time = None
else:
bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
if not quick:
bibrank_update_time = "0000-00-00 00:00:00"
write_message("bibrank: %s" % bibrank_update_time)
index_update_time = get_bibindex_update_time()
write_message("bibindex: %s" % index_update_time)
if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
index_update_time = "0000-00-00 00:00:00"
updated_recids = get_modified_recs(bibrank_update_time,
index_update_time)
if len(updated_recids) > 10000:
str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
else:
str_updated_recids = str(updated_recids)
write_message("%s records to update" % str_updated_recids)
if updated_recids:
begin_time = time.time()
try:
function = config.get("rank_method", "function")
config.get(function, 'collections')
except ConfigParser.NoOptionError:
config.set(function, 'collections', None)
# Process fully the updated records
weights = process_and_store(updated_recids, config, chunk_size)
end_time = time.time()
write_message("Total time of get_citation_weight(): %.2f sec" %
(end_time - begin_time))
task_update_progress("citation analysis done")
else:
weights = None
write_message("No new records added since last time this "
"rank method was executed")
return weights, index_update_time
def process_and_store(recids, config, chunk_size):
    # Limit on the number of citations we may lose in one chunk
function = config.get("rank_method", "function")
citation_loss_limit = int(config.get(function, "citation_loss_limit"))
# If we have nothing to process
# Do not update the weights dictionary
modified = False
# Process recent records first
    # The older records were most likely re-queued for reprocessing
    # by earlier steps and thus only carry minor changes
recids_iter = iter(sorted(recids, reverse=True))
# Split records to process into chunks so that we do not
# fill up too much memory
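    # (itertools.islice pulls at most chunk_size record ids per pass;
    # once the iterator is exhausted it yields an empty list, which is
    # what terminates the loop below.)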
while True:
task_sleep_now_if_required()
chunk = list(islice(recids_iter, chunk_size))
if not chunk:
break
write_message("Processing chunk #%s to #%s" % (chunk[0], chunk[-1]))
# The core work
cites, refs = process_chunk(chunk, config)
# Check that we haven't lost too many citations
cites_diff = compute_dicts_diff(chunk, refs, cites)
write_message("Citations balance %s" % cites_diff)
if citation_loss_limit and cites_diff <= -citation_loss_limit:
raise Exception('Lost too many references, aborting')
# Store processed citations/references
store_dicts(chunk, refs, cites)
modified = True
# Compute new weights dictionary
if modified:
weights = compute_weights()
else:
weights = None
store_weights_cache(weights)
return weights
def store_weights_cache(weights):
"""Store into key/value store"""
redis = get_redis()
redis.set('citations_weights', serialize_via_marshal(weights))
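# Note: serialize_via_marshal comes from invenio.dbquery; readers of the
# 'citations_weights' key are presumably expected to decode the value
# with the matching deserialize_via_marshal helper from the same module.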
def process_chunk(recids, config):
tags = get_tags_config(config)
    # call the procedure that does the hard work of reading the citation
    # and reference fields of the updated records (but nothing else)
write_message("Entering get_citation_informations", verbose=9)
citation_informations = get_citation_informations(recids, tags, config)
write_message("Entering ref_analyzer", verbose=9)
    # call the analyser that uses the citation_informations to actually
    # resolve the x-cites-y relationships within the collection
return ref_analyzer(citation_informations,
recids,
tags,
config)
def get_bibrankmethod_lastupdate(rank_method_code):
"""Return the last excution date of bibrank method
"""
query = """SELECT DATE_FORMAT(last_updated, '%%Y-%%m-%%d %%H:%%i:%%s')
FROM rnkMETHOD WHERE name =%s"""
last_update_time = run_sql(query, [rank_method_code])
try:
r = last_update_time[0][0]
except IndexError:
r = "0000-00-00 00:00:00"
return r
def get_bibindex_update_time():
"""Return the last indexing date of the journals and report number indexes
"""
try:
        # check indexing times of the 'journal' and 'reportnumber'
# indexes, and only fetch records which have been indexed
sql = "SELECT DATE_FORMAT(MIN(last_updated), " \
"'%%Y-%%m-%%d %%H:%%i:%%s') FROM idxINDEX WHERE name IN (%s,%s)"
index_update_time = run_sql(sql, ('journal', 'reportnumber'), 1)[0][0]
except IndexError:
write_message("Not running citation indexer since journal/reportnumber"
" indexes are not created yet.")
index_update_time = "0000-00-00 00:00:00"
return index_update_time
def get_modified_recs(bibrank_method_lastupdate, indexes_lastupdate):
"""Get records to be updated by bibrank indexing
    Return the list of records which have been modified between the last
    execution of the bibrank method and the latest journal/report-number
    index updates.
The result is expected to have ascending id order.
"""
query = """SELECT id FROM bibrec
WHERE modification_date >= %s
AND modification_date < %s
ORDER BY id ASC"""
records = run_sql(query, (bibrank_method_lastupdate, indexes_lastupdate))
return [r[0] for r in records]
def format_journal(format_string, mappings):
"""format the publ infostring according to the format"""
def replace(char, data):
return data.get(char, char)
return ''.join(replace(c, mappings) for c in format_string)
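# Illustration, assuming the standard format string is 'p v (y) c'
# (journal title, volume, year, page): every character of the format
# string is looked up in the mappings and kept as-is when absent, so
#     format_journal('p v (y) c', {'p': 'Phys.Rev.Lett.', 'v': '53',
#                                  'y': '1986', 'c': '2285'})
# returns 'Phys.Rev.Lett. 53 (1986) 2285'.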
def get_tags_config(config):
"""Fetch needs config from our config file"""
# Probably "citation" unless this file gets renamed
function = config.get("rank_method", "function")
write_message("config function %s" % function, verbose=9)
tags = {}
# 037a: contains (often) the "hep-ph/0501084" tag of THIS record
try:
tag = config.get(function, "primary_report_number")
except ConfigParser.NoOptionError:
tags['record_pri_number'] = None
else:
tags['record_pri_number'] = tagify(parse_tag(tag))
# 088a: additional short identifier for the record
try:
tag = config.get(function, "additional_report_number")
except ConfigParser.NoOptionError:
tags['record_add_number'] = None
else:
tags['record_add_number'] = tagify(parse_tag(tag))
# 999C5r. this is in the reference list, refers to other records.
# Looks like: hep-ph/0408002
try:
tag = config.get(function, "reference_via_report_number")
except ConfigParser.NoOptionError:
tags['refs_report_number'] = None
else:
tags['refs_report_number'] = tagify(parse_tag(tag))
# 999C5s. this is in the reference list, refers to other records.
# Looks like: Phys.Rev.,A21,78
try:
tag = config.get(function, "reference_via_pubinfo")
except ConfigParser.NoOptionError:
tags['refs_journal'] = None
else:
tags['refs_journal'] = tagify(parse_tag(tag))
# 999C5a. this is in the reference list, refers to other records.
# Looks like: 10.1007/BF03170733
try:
tag = config.get(function, "reference_via_doi")
except ConfigParser.NoOptionError:
tags['refs_doi'] = None
else:
tags['refs_doi'] = tagify(parse_tag(tag))
# 999C50. this is in the reference list, refers to other records.
# Looks like: 1205
try:
tag = config.get(function, "reference_via_record_id")
except ConfigParser.NoOptionError:
tags['refs_record_id'] = None
else:
tags['refs_record_id'] = tagify(parse_tag(tag))
# 999C5i. this is in the reference list, refers to other records.
# Looks like: 9781439520031
try:
tag = config.get(function, "reference_via_isbn")
except ConfigParser.NoOptionError:
tags['refs_isbn'] = None
else:
tags['refs_isbn'] = tagify(parse_tag(tag))
# Fields needed to construct the journals for this record
try:
tag = {
'pages': config.get(function, "pubinfo_journal_page"),
'year': config.get(function, "pubinfo_journal_year"),
'journal': config.get(function, "pubinfo_journal_title"),
'volume': config.get(function, "pubinfo_journal_volume"),
}
except ConfigParser.NoOptionError:
tags['publication'] = None
else:
tags['publication'] = {
'pages': tagify(parse_tag(tag['pages'])),
'year': tagify(parse_tag(tag['year'])),
'journal': tagify(parse_tag(tag['journal'])),
'volume': tagify(parse_tag(tag['volume'])),
}
# Fields needed to lookup the DOIs
tags['doi'] = get_field_tags('doi')
# Fields needed to lookup the ISBN
tags['isbn'] = get_field_tags('isbn')
# 999C5s. A standardized way of writing a reference in the reference list.
# Like: Nucl. Phys. B 710 (2000) 371
try:
tags['publication_format'] = config.get(function,
"pubinfo_journal_format")
except ConfigParser.NoOptionError:
tags['publication_format'] = CFG_JOURNAL_PUBINFO_STANDARD_FORM
# Print values of tags for debugging
write_message("tag values: %r" % [tags], verbose=9)
return tags
def get_journal_info(record, tags):
"""Fetch journal info from given record"""
record_info = []
journals_fields = record.find_fields(tags['publication']['journal'][:5])
for field in journals_fields:
# we store the tags and their values here
# like c->444 y->1999 p->"journal of foo",
# v->20
tagsvalues = {}
try:
tmp = field.get_subfield_values(tags['publication']['journal'][5])[0]
except IndexError:
pass
else:
tagsvalues["p"] = tmp
try:
tmp = field.get_subfield_values(tags['publication']['volume'][5])[0]
except IndexError:
pass
else:
tagsvalues["v"] = tmp
try:
tmp = field.get_subfield_values(tags['publication']['year'][5])[0]
except IndexError:
pass
else:
tagsvalues["y"] = tmp
try:
tmp = field.get_subfield_values(tags['publication']['pages'][5])[0]
except IndexError:
pass
else:
# if the page numbers have "x-y" take just x
tagsvalues["c"] = tmp.split('-', 1)[0]
# check if we have the required data
ok = True
for c in tags['publication_format']:
if c in ('p', 'v', 'y', 'c'):
if c not in tagsvalues:
ok = False
if ok:
publ = format_journal(tags['publication_format'], tagsvalues)
record_info += [publ]
alt_volume = get_alt_volume(tagsvalues['v'])
if alt_volume:
tagsvalues2 = tagsvalues.copy()
tagsvalues2['v'] = alt_volume
publ = format_journal(tags['publication_format'], tagsvalues2)
record_info += [publ]
# Add codens
for coden in get_kb_mappings('CODENS',
value=tagsvalues['p']):
tagsvalues2 = tagsvalues.copy()
tagsvalues2['p'] = coden['key']
publ = format_journal(tags['publication_format'], tagsvalues2)
record_info += [publ]
return record_info
def get_alt_volume(volume):
"""Get alternate volume form
We handle the inversed volume letter bug
Some metadata is wrong which leads to journals with the volume letter
at the end.
e.g. Phys.Rev.,51B,1 instead of Phys.Rev.,B51,1
"""
alt_volume = None
if re.match(ur'[a-zA-Z]\d+', volume, re.U|re.I):
alt_volume = volume[1:] + volume[0]
elif re.match(ur'\d+[a-zA-Z]', volume, re.U|re.I):
alt_volume = volume[-1] + volume[:-1]
return alt_volume
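# Examples: get_alt_volume(u'B51') returns u'51B' (leading letter moved
# to the end), get_alt_volume(u'51B') returns u'B51' (trailing letter
# moved to the front), and get_alt_volume(u'51') returns None.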
def get_citation_informations(recid_list, tags, config,
fetch_catchup_info=True):
"""Scans the collections searching references (999C5x -fields) and
citations for items in the recid_list
returns a 4 list of dictionaries that contains the citation information
of cds records
examples: [ {} {} {} {} ]
[ {5: 'SUT-DP-92-70-5'},
{ 93: ['astro-ph/9812088']},
{ 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
NB: stuff here is for analysing new or changed records.
see "ref_analyzer" for more.
"""
begin_time = os.times()[4]
records_info = {
'report-numbers': {},
'journals': {},
'doi': {},
'hdl': {},
'isbn': {},
'record_id': {},
}
references_info = {
'report-numbers': {},
'journals': {},
'doi': {},
'record_id': {},
'isbn': {},
'hdl': {},
}
    # perform a quick check to see if there are some records with
    # reference tags, because otherwise gathering the citation
    # information would be slow even if there is nothing to index:
for done, recid in enumerate(recid_list):
if done % 10 == 0:
task_sleep_now_if_required()
if done % 50 == 0:
mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
write_message(mesg)
task_update_progress(mesg)
record = get_record(recid)
records_info['record_id'][recid] = [unicode(recid)]
function = config.get("rank_method", "function")
if config.get(function, 'collections'):
if recid not in recids_cache(config.get(function, 'collections')):
# do not treat this record since it is not in the collections
# we want to process
continue
elif recid in deleted_recids_cache():
# do not treat this record since it was deleted; we
# skip it like this in case it was only soft-deleted
# e.g. via bibedit (i.e. when collection tag 980 is
# DELETED but other tags like report number or journal
# publication info remained the same, so the calls to
# get_fieldvalues() below would return old values)
continue
if tags['refs_report_number']:
references_info['report-numbers'][recid] = [t.value for t in
record.find_subfields(tags['refs_report_number'])]
msg = "references_info['report-numbers'][%s] = %r" \
% (recid, references_info['report-numbers'][recid])
write_message(msg, verbose=9)
if tags['refs_journal']:
references_info['journals'][recid] = []
for ref in record.find_subfields(tags['refs_journal']):
try:
# Inspire specific parsing
journal, volume, page = ref.value.split(',')
except ValueError:
pass
else:
alt_volume = get_alt_volume(volume)
if alt_volume:
alt_ref = ','.join([journal, alt_volume, page])
references_info['journals'][recid] += [alt_ref]
references_info['journals'][recid] += [ref.value]
msg = "references_info['journals'][%s] = %r" \
% (recid, references_info['journals'][recid])
write_message(msg, verbose=9)
if tags['refs_doi']:
references = [t.value for t in
record.find_subfields(tags['refs_doi'])]
dois = []
hdls = []
for ref in references:
if ref.startswith("hdl:"):
hdls.append(ref[4:])
elif ref.startswith("doi:"):
dois.append(ref[4:])
else:
dois.append(ref)
references_info['doi'][recid] = dois
references_info['hdl'][recid] = hdls
msg = "references_info['doi'][%s] = %r" % (recid, dois)
write_message(msg, verbose=9)
msg = "references_info['hdl'][%s] = %r" % (recid, hdls)
write_message(msg, verbose=9)
if tags['refs_record_id']:
references_info['record_id'][recid] = [t.value for t in
record.find_subfields(tags['refs_record_id'])]
msg = "references_info['record_id'][%s] = %r" \
% (recid, references_info['record_id'][recid])
write_message(msg, verbose=9)
if tags['refs_isbn']:
references_info['isbn'][recid] = [t.value for t in
record.find_subfields(tags['refs_isbn'])]
msg = "references_info['isbn'][%s] = %r" \
% (recid, references_info['isbn'][recid])
write_message(msg, verbose=9)
if not fetch_catchup_info:
# We do not need the extra info
continue
if tags['record_pri_number'] or tags['record_add_number']:
records_info['report-numbers'][recid] = []
if tags['record_pri_number']:
records_info['report-numbers'][recid] += [t.value for t in
record.find_subfields(tags['record_pri_number'])]
if tags['record_add_number']:
records_info['report-numbers'][recid] += [t.value for t in
record.find_subfields(tags['record_add_number'])]
msg = "records_info[%s]['report-numbers'] = %r" \
% (recid, records_info['report-numbers'][recid])
write_message(msg, verbose=9)
if tags['doi']:
records_info['doi'][recid] = []
records_info['hdl'][recid] = []
for tag in tags['doi']:
for field in record.find_fields(tag[:5]):
if 'DOI' in field.get_subfield_values('2'):
dois = field.get_subfield_values('a')
records_info['doi'][recid].extend(dois)
elif 'HDL' in field.get_subfield_values('2'):
hdls = field.get_subfield_values('a')
records_info['hdl'][recid].extend(hdls)
msg = "records_info[%s]['doi'] = %r" \
% (recid, records_info['doi'][recid])
write_message(msg, verbose=9)
msg = "records_info[%s]['hdl'] = %r" \
% (recid, records_info['hdl'][recid])
write_message(msg, verbose=9)
if tags['isbn']:
records_info['isbn'][recid] = []
for tag in tags['isbn']:
values = [t.value for t in record.find_subfields(tag)]
records_info['isbn'][recid] += values
msg = "records_info[%s]['isbn'] = %r" \
% (recid, records_info['isbn'][recid])
write_message(msg, verbose=9)
# get a combination of
# journal vol (year) pages
if tags['publication']:
records_info['journals'][recid] = get_journal_info(record, tags)
msg = "records_info[%s]['journals'] = %r" \
% (recid, records_info['journals'][recid])
write_message(msg, verbose=9)
mesg = "get cit.inf done fully"
write_message(mesg)
task_update_progress(mesg)
end_time = os.times()[4]
write_message("Execution time for generating citation info "
"from record: %.2f sec" % (end_time - begin_time))
return records_info, references_info
def standardize_report_number(report_number):
"""Format the report number to a standard form.
Currently we:
* remove category for arxiv papers
"""
    # flags are embedded in the pattern as (?iu) because the fourth
    # positional argument of re.sub is `count`, not `flags`
    report_number = re.sub(ur'(?iu)(?:arXiv:)?(\d{4}\.\d{4}) \[[a-zA-Z\.-]+\]',
                           ur'arXiv:\g<1>',
                           report_number)
return report_number
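# Example: both standardize_report_number(u'0706.0001 [hep-th]') and
# standardize_report_number(u'arXiv:0706.0001 [hep-th]') return
# u'arXiv:0706.0001'; report numbers of any other shape pass through
# unchanged.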
def ref_analyzer(citation_informations, updated_recids, tags, config):
"""Analyze the citation informations and calculate the citation weight
and cited by list dictionary.
"""
citations = {}
for recid in updated_recids:
citations[recid] = set()
references = {}
for recid in updated_recids:
references[recid] = set()
def step(msg_prefix, recid, done, total):
if done % 30 == 0:
task_sleep_now_if_required()
if done % 1000 == 0:
mesg = "%s done %s of %s" % (msg_prefix, done, total)
write_message(mesg)
task_update_progress(mesg)
write_message("Processing: %s" % recid, verbose=9)
def add_to_cites(citer, citee):
# Make sure we don't add ourselves
# Workaround till we know why we are adding ourselves.
if citer == citee:
return
citations[citee].add(citer)
if citer in updated_recids:
references[citer].add(citee)
def add_to_refs(citer, citee):
# Make sure we don't add ourselves
# Workaround till we know why we are adding ourselves.
if citer == citee:
return
if citee in updated_recids:
citations[citee].add(citer)
references[citer].add(citee)
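    # Phases 1-6 below resolve this record's outgoing references and use
    # add_to_refs; the catchup phases 7-12 look for other records citing
    # this one and use add_to_cites. Both helpers skip self-citations.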
    # unpack the dictionaries built by get_citation_informations()
records_info, references_info = citation_informations
t1 = os.times()[4]
# Try to find references based on 999C5r
    # e.g. 8 -> ([astro-ph/9889], [hep-ph/768])
    # meaning: rec 8 contains these in its bibliography
write_message("Phase 1: Report numbers references")
done = 0
for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
step("Report numbers references", thisrecid, done,
len(references_info['report-numbers']))
done += 1
for refnumber in (r for r in refnumbers if r):
field = 'reportnumber'
refnumber = standardize_report_number(refnumber)
# Search for "hep-th/5644654 or such" in existing records
recids = get_recids_matching_query(p=refnumber,
f=field,
config=config)
write_message("These match searching %s in %s: %s" %
(refnumber, field, list(recids)), verbose=9)
if not recids:
insert_into_missing(thisrecid, refnumber)
else:
remove_from_missing(refnumber)
if len(recids) > 1:
store_citation_warning('multiple-matches', refnumber)
msg = "Whoops: record '%d' report number value '%s' " \
"matches many records; taking only the first one. %s" % \
(thisrecid, refnumber, repr(recids))
write_message(msg, stream=sys.stderr)
for recid in list(recids)[:1]: # take only the first one
add_to_refs(thisrecid, recid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
t2 = os.times()[4]
# Try to find references based on 999C5s
# e.g. Phys.Rev.Lett. 53 (1986) 2285
write_message("Phase 2: Journal references")
done = 0
for thisrecid, refs in references_info['journals'].iteritems():
step("Journal references", thisrecid, done,
len(references_info['journals']))
done += 1
for reference in (r for r in refs if r):
p = reference
field = 'journal'
# check reference value to see whether it is well formed:
if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
store_citation_warning('not-well-formed', p)
msg = "Whoops, record '%d' reference value '%s' " \
"is not well formed; skipping it." % (thisrecid, p)
write_message(msg, stream=sys.stderr)
continue # skip this ill-formed value
recids = get_recids_matching_query(p=p,
f=field,
config=config)
write_message("These match searching %s in %s: %s"
% (reference, field, list(recids)), verbose=9)
if not recids:
insert_into_missing(thisrecid, p)
else:
remove_from_missing(p)
if len(recids) > 1:
store_citation_warning('multiple-matches', p)
msg = "Whoops: record '%d' reference value '%s' " \
"matches many records; taking only the first one. %s" % \
(thisrecid, p, repr(recids))
write_message(msg, stream=sys.stderr)
for recid in list(recids)[:1]: # take only the first one
add_to_refs(thisrecid, recid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
t3 = os.times()[4]
# Try to find references based on 999C5a
# e.g. 10.1007/BF03170733
write_message("Phase 3: DOI references")
done = 0
for thisrecid, refs in references_info['doi'].iteritems():
step("DOI references", thisrecid, done, len(references_info['doi']))
done += 1
for reference in (r for r in refs if r):
p = reference
field = 'doi'
recids = get_recids_matching_query(p=p,
f=field,
config=config)
write_message("These match searching %s in %s: %s"
% (reference, field, list(recids)), verbose=9)
if not recids:
insert_into_missing(thisrecid, p)
else:
remove_from_missing(p)
if len(recids) > 1:
store_citation_warning('multiple-matches', p)
msg = "Whoops: record '%d' DOI value '%s' " \
"matches many records; taking only the first one. %s" % \
(thisrecid, p, repr(recids))
write_message(msg, stream=sys.stderr)
for recid in list(recids)[:1]: # take only the first one
add_to_refs(thisrecid, recid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
t4 = os.times()[4]
# Try to find references based on 999C5a (hdl references)
# e.g. 4263537/4000
write_message("Phase 4: HDL references")
done = 0
for thisrecid, refs in references_info['hdl'].iteritems():
step("HDL references", thisrecid, done, len(references_info['hdl']))
done += 1
for reference in (r for r in refs if r):
p = reference
field = 'hdl'
recids = get_recids_matching_query(p=p,
f=field,
config=config)
write_message("These match searching %s in %s: %s"
% (reference, field, list(recids)), verbose=9)
if not recids:
insert_into_missing(thisrecid, p)
else:
remove_from_missing(p)
if len(recids) > 1:
store_citation_warning('multiple-matches', p)
msg = "Whoops: record '%d' HDL value '%s' " \
"matches many records; taking only the first one. %s" % \
(thisrecid, p, repr(recids))
write_message(msg, stream=sys.stderr)
for recid in list(recids)[:1]: # take only the first one
add_to_refs(thisrecid, recid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
t5 = os.times()[4]
# Try to find references based on 999C50
# e.g. 1244
write_message("Phase 5: Record ID references")
done = 0
for thisrecid, refs in references_info['record_id'].iteritems():
step("Record ID references", thisrecid, done, len(references_info['record_id']))
done += 1
field = "001"
for recid in (r for r in refs if r):
valid = get_recids_matching_query(p=recid, f=field, config=config)
write_message("These match searching %s in %s: %s"
% (recid, field, list(valid)), verbose=9)
if valid:
add_to_refs(thisrecid, valid[0])
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
t6 = os.times()[4]
# Try to find references based on 999C5i
# e.g. 978-3-942171-73-1
write_message("Phase 6: ISBN references")
done = 0
for thisrecid, refs in references_info['isbn'].iteritems():
step("ISBN references", thisrecid, done, len(references_info['isbn']))
done += 1
for reference in (r for r in refs if r):
p = reference
field = 'isbn'
recids = get_recids_matching_query(p=p,
f=field,
config=config)
write_message("These match searching %s in %s: %s"
% (reference, field, list(recids)), verbose=9)
if not recids:
insert_into_missing(thisrecid, p)
else:
remove_from_missing(p)
if len(recids) > 1:
store_citation_warning('multiple-matches', p)
msg = "Whoops: record '%d' ISBN value '%s' " \
"matches many records; taking only the first one. %s" % \
(thisrecid, p, repr(recids))
write_message(msg, stream=sys.stderr)
for recid in list(recids)[:1]: # take only the first one
add_to_refs(thisrecid, recid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
t7 = os.times()[4]
# Search for stuff like CERN-TH-4859/87 in list of refs
write_message("Phase 7: report numbers catchup")
done = 0
for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
step("Report numbers catchup", thisrecid, done,
len(records_info['report-numbers']))
done += 1
for reportcode in (r for r in reportcodes if r):
if reportcode.startswith('arXiv'):
std_reportcode = standardize_report_number(reportcode)
report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
re.escape(std_reportcode)
recids = get_recids_matching_query(p=report_pattern,
f=tags['refs_report_number'],
m='r',
config=config)
else:
recids = get_recids_matching_query(p=reportcode,
f=tags['refs_report_number'],
config=config)
for recid in recids:
add_to_cites(recid, thisrecid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
# Find this record's pubinfo in other records' bibliography
write_message("Phase 8: journals catchup")
done = 0
t8 = os.times()[4]
for thisrecid, rec_journals in records_info['journals'].iteritems():
step("Journals catchup", thisrecid, done,
len(records_info['journals']))
done += 1
for journal in rec_journals:
journal = journal.replace("\"", "")
# Search the publication string like
# Phys. Lett., B 482 (2000) 417 in 999C5s
recids = get_recids_matching_query(p=journal,
f=tags['refs_journal'],
config=config)
write_message("These records match %s in %s: %s"
% (journal, tags['refs_journal'], list(recids)), verbose=9)
for recid in recids:
add_to_cites(recid, thisrecid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
write_message("Phase 9: DOI catchup")
done = 0
t9 = os.times()[4]
for thisrecid, dois in records_info['doi'].iteritems():
step("DOI catchup", thisrecid, done, len(records_info['doi']))
done += 1
for doi in dois:
recids = get_recids_matching_query(p=doi,
f=tags['refs_doi'],
config=config)
write_message("These records match %s in %s: %s"
% (doi, tags['refs_doi'], list(recids)), verbose=9)
for recid in recids:
add_to_cites(recid, thisrecid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
write_message("Phase 10: HDL catchup")
done = 0
t10 = os.times()[4]
for thisrecid, hdls in records_info['hdl'].iteritems():
step("HDL catchup", thisrecid, done, len(records_info['hdl']))
done += 1
for hdl in hdls:
recids = get_recids_matching_query(p=hdl,
f=tags['refs_doi'],
config=config)
write_message("These records match %s in %s: %s"
% (hdl, tags['refs_doi'], list(recids)), verbose=9)
for recid in recids:
add_to_cites(recid, thisrecid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
write_message("Phase 11: ISBN catchup")
done = 0
t11 = os.times()[4]
for thisrecid, isbns in records_info['isbn'].iteritems():
step("ISBN catchup", thisrecid, done, len(records_info['isbn']))
done += 1
for isbn in isbns:
recids = get_recids_matching_query(p=isbn,
f=tags['refs_isbn'],
config=config)
write_message("These records match %s in %s: %s"
% (isbn, tags['refs_isbn'], list(recids)), verbose=9)
for recid in recids:
                add_to_cites(recid, thisrecid)
    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)
    write_message("Phase 12: Record ID catchup")
done = 0
t12 = os.times()[4]
for thisrecid, record_ids in records_info['record_id'].iteritems():
step("Record ID catchup", thisrecid, done, len(records_info['record_id']))
done += 1
for record_id in record_ids:
recids = get_recids_matching_query(p=record_id,
f=tags['refs_record_id'],
config=config)
write_message("These records match %s in %s: %s"
% (record_id, tags['refs_record_id'], list(recids)), verbose=9)
for recid in recids:
add_to_cites(recid, thisrecid)
mesg = "done fully"
write_message(mesg)
task_update_progress(mesg)
if task_get_task_param('verbose') >= 3:
        # Print only the first 10 entries of each to prevent flood
write_message("citation_list (x is cited by y):")
write_message(dict(islice(citations.iteritems(), 10)))
write_message("size: %s" % len(citations))
write_message("reference_list (x cites y):")
write_message(dict(islice(references.iteritems(), 10)))
write_message("size: %s" % len(references))
t13 = os.times()[4]
write_message("Execution time for analyzing the citation information "
"generating the dictionary:")
write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
write_message("... checking ref journals: %.2f sec" % (t3-t2))
write_message("... checking ref DOI: %.2f sec" % (t4-t3))
write_message("... checking ref HDL: %.2f sec" % (t5-t4))
write_message("... checking ref Record ID: %.2f sec" % (t6-t5))
write_message("... checking ref ISBN: %.2f sec" % (t7-t6))
write_message("... checking rec report numbers: %.2f sec" % (t8-t7))
write_message("... checking rec journals: %.2f sec" % (t9-t8))
write_message("... checking rec DOI: %.2f sec" % (t10-t9))
write_message("... checking rec HDL: %.2f sec" % (t11-t10))
write_message("... checking rec ISBN: %.2f sec" % (t12-t11))
write_message("... checking rec Record ID: %.2f sec" % (t13-t12))
write_message("... total time of ref_analyze: %.2f sec" % (t13-t1))
return citations, references
def compute_refs_diff(recid, new_refs):
"""
    Given a set of references for a record, return the net number of
    references added to it. The value can be negative, which means the
    record lost references.
"""
old_refs = set(row[0] for row in run_sql("""SELECT citee
FROM rnkCITATIONDICT
WHERE citer = %s""", [recid]))
refs_to_add = new_refs - old_refs
refs_to_delete = old_refs - new_refs
return len(refs_to_add) - len(refs_to_delete)
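# Worked example with hypothetical sets: if the stored references of a
# record are {1, 2, 3} and the new set is {2, 3, 4, 5}, two references
# are added and one is deleted, so compute_refs_diff returns 2 - 1 = 1.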
def compute_cites_diff(recid, new_cites):
"""
This function does the same thing as compute_refs_diff but with citations.
"""
old_cites = set(row[0] for row in run_sql("""SELECT citer
FROM rnkCITATIONDICT
WHERE citee = %s""", [recid]))
cites_to_add = new_cites - old_cites
cites_to_delete = old_cites - new_cites
return len(cites_to_add) - len(cites_to_delete)
def compute_dicts_diff(recids, refs, cites):
"""
    Given the new dictionaries for references and citations, compute the
    net number of citations added or removed by comparing them to those
    currently stored in the database.
"""
cites_diff = 0
for recid in recids:
cites_diff += compute_refs_diff(recid, refs[recid])
cites_diff += compute_cites_diff(recid, cites[recid])
return cites_diff
def store_dicts(recids, refs, cites):
"""Insert the reference and citation list into the database"""
for recid in recids:
replace_refs(recid, refs[recid])
replace_cites(recid, cites[recid])
def replace_refs(recid, new_refs):
"""
Given a set of references, replaces the references of given recid
in the database.
The changes are logged into rnkCITATIONLOG.
"""
old_refs = set(row[0] for row in run_sql("""SELECT citee
FROM rnkCITATIONDICT
WHERE citer = %s""", [recid]))
refs_to_add = new_refs - old_refs
refs_to_delete = old_refs - new_refs
for ref in refs_to_add:
write_message('adding ref %s %s' % (recid, ref), verbose=1)
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
run_sql("""INSERT INTO rnkCITATIONDICT (citer, citee, last_updated)
VALUES (%s, %s, %s)""", (recid, ref, now))
run_sql("""INSERT INTO rnkCITATIONLOG (citer, citee, type, action_date)
VALUES (%s, %s, %s, %s)""", (recid, ref, 'added', now))
for ref in refs_to_delete:
write_message('deleting ref %s %s' % (recid, ref), verbose=1)
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
run_sql("""DELETE FROM rnkCITATIONDICT
WHERE citer = %s and citee = %s""", (recid, ref))
run_sql("""INSERT INTO rnkCITATIONLOG (citer, citee, type, action_date)
VALUES (%s, %s, %s, %s)""", (recid, ref, 'removed', now))
def replace_cites(recid, new_cites):
"""
Given a set of citations, replaces the citations of given recid
in the database.
The changes are logged into rnkCITATIONLOG.
See @replace_refs
"""
old_cites = set(row[0] for row in run_sql("""SELECT citer
FROM rnkCITATIONDICT
WHERE citee = %s""", [recid]))
cites_to_add = new_cites - old_cites
cites_to_delete = old_cites - new_cites
for cite in cites_to_add:
write_message('adding cite %s %s' % (recid, cite), verbose=1)
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
run_sql("""INSERT INTO rnkCITATIONDICT (citee, citer, last_updated)
VALUES (%s, %s, %s)""", (recid, cite, now))
run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date)
VALUES (%s, %s, %s, %s)""", (recid, cite, 'added', now))
for cite in cites_to_delete:
write_message('deleting cite %s %s' % (recid, cite), verbose=1)
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
run_sql("""DELETE FROM rnkCITATIONDICT
WHERE citee = %s and citer = %s""", (recid, cite))
run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date)
VALUES (%s, %s, %s, %s)""", (recid, cite, 'removed', now))
def insert_into_missing(recid, report):
"""Mark reference string as missing.
    If a reference is a report number / journal / DOI but we do not have
    the corresponding record in the database, we mark that particular
    reference string as missing by adding a row to rnkCITATIONDATAEXT.
    The recid is the record that contains the reference string.
"""
if len(report) >= 255:
# Invalid report, it is too long
# and does not fit in the database column
# (currently varchar 255)
return
wasalready = run_sql("""SELECT id_bibrec
FROM rnkCITATIONDATAEXT
WHERE id_bibrec = %s
AND extcitepubinfo = %s""",
(recid, report))
if not wasalready:
run_sql("""INSERT INTO rnkCITATIONDATAEXT(id_bibrec, extcitepubinfo)
VALUES (%s,%s)""", (recid, report))
def remove_from_missing(report):
"""Remove the reference string from the missing table
See @insert_into_missing"""
run_sql("""DELETE FROM rnkCITATIONDATAEXT
WHERE extcitepubinfo = %s""", (report,))
def print_missing(num):
"""
Print the contents of rnkCITATIONDATAEXT table containing external
records that were cited by NUM or more internal records.
NUM is by default taken from the -E command line option.
"""
if not num:
num = task_get_option("print-extcites")
write_message("Listing external papers cited by %i or more \
internal records:" % num)
res = run_sql("""SELECT COUNT(id_bibrec), extcitepubinfo
FROM rnkCITATIONDATAEXT
GROUP BY extcitepubinfo HAVING COUNT(id_bibrec) >= %s
ORDER BY COUNT(id_bibrec) DESC""", (num,))
for cnt, brec in res:
print str(cnt), "\t", brec
write_message("Listing done.")
def tagify(parsedtag):
"""aux auf to make '100__a' out of ['100','','','a']"""
tag = ""
for t in parsedtag:
if t == '':
t = '_'
tag += t
return tag
def store_citation_warning(warning_type, cit_info):
"""Store citation indexing warnings in the database
If we encounter a problem during the citation indexing, such as multiple
results for a report number, we store a warning in rnkCITATIONDATAERR
"""
r = run_sql("""SELECT 1 FROM rnkCITATIONDATAERR
WHERE type = %s
AND citinfo = %s""", (warning_type, cit_info))
if not r:
run_sql("""INSERT INTO rnkCITATIONDATAERR (type, citinfo)
VALUES (%s, %s)""", (warning_type, cit_info))
