bibauthorid_prob_matrix.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Thu, Nov 7, 09:51

bibauthorid_prob_matrix.py
View Options

	# -- coding: utf-8 --
	##
	## This file is part of Invenio.
	## Copyright (C) 2011, 2012 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.


	import gc
	import invenio.bibauthorid_config as bconfig
	from invenio.bibauthorid_comparison import compare_bibrefrecs
	from invenio.bibauthorid_comparison import clear_all_caches as clear_comparison_caches
	from invenio.bibauthorid_backinterface import get_modified_papers_before
	from invenio.bibauthorid_general_utils import bibauthor_print \
	, update_status \
	, update_status_final \
	, is_eq

	#import pyximport; pyximport.install()
	from invenio.bibauthorid_bib_matrix import Bib_matrix

	if bconfig.DEBUG_CHECKS:
	def _debug_is_eq_v(vl1, vl2):
	if isinstance(vl1, str) and isinstance(vl2, str):
	return vl1 == vl2

	if isinstance(vl1, tuple) and isinstance(vl2, tuple):
	return is_eq(vl1[0], vl2[0]) and is_eq(vl1[1], vl2[1])

	return False


	class ProbabilityMatrix(object):
	'''
	This class contains and maintains the comparison
	between all virtual authors. It is able to write
	and read from the database and update the results.
	'''
	def __init__(self, name):
	self._bib_matrix = Bib_matrix(name)

	def load(self, load_map=True, load_matrix=True):
	update_status(0., "Loading probability matrix...")
	self._bib_matrix.load()
	update_status_final("Probability matrix loaded.")

	def store(self):
	update_status(0., "Saving probability matrix...")
	self._bib_matrix.store()
	update_status_final("Probability matrix saved.")

	def __getitem__(self, bibs):
	return self._bib_matrix[bibs[0], bibs[1]]

	def getitem_numeric(self, bibs):
	return self._bib_matrix.getitem_numeric(bibs)


	def __get_up_to_date_bibs(self, bib_matrix):
	return frozenset(get_modified_papers_before(
	bib_matrix.get_keys(),
	bib_matrix.creation_time))

	def is_up_to_date(self, cluster_set):
	return self.__get_up_to_date_bibs(self._bib_matrix) >= frozenset(cluster_set.all_bibs())

	def recalculate(self, cluster_set):
	'''
	Constructs probability matrix. If use_cache is true, it will
	try to load old computations from the database. If save cache
	is true it will save the current results into the database.
	@param cluster_set: A cluster set object, used to initialize
	the matrix.
	'''
	last_cleaned = 0
	self._bib_matrix.store()
	try:
	old_matrix = Bib_matrix(self._bib_matrix.name+'copy')
	old_matrix.duplicate_existing(self._bib_matrix.name, self._bib_matrix.name+'copy')
	old_matrix.load()
	cached_bibs = self.__get_up_to_date_bibs(old_matrix)
	have_cached_bibs = bool(cached_bibs)
	except IOError:
	old_matrix.destroy()
	cached_bibs = None
	have_cached_bibs = False

	self._bib_matrix.destroy()
	self._bib_matrix = Bib_matrix(cluster_set.last_name, cluster_set=cluster_set)

	ncl = cluster_set.num_all_bibs
	expected = ((ncl * (ncl - 1)) / 2)
	if expected == 0:
	expected = 1

	try:
	cur_calc, opti, prints_counter = 0, 0, 0
	for cl1 in cluster_set.clusters:

	if cur_calc+opti - prints_counter > 100000 or cur_calc == 0:
	update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
	prints_counter = cur_calc+opti

	# #clean caches
	if cur_calc - last_cleaned > 20000000:
	gc.collect()
	# clear_comparison_caches()
	last_cleaned = cur_calc

	for cl2 in cluster_set.clusters:
	if id(cl1) < id(cl2) and not cl1.hates(cl2):
	for bib1 in cl1.bibs:
	for bib2 in cl2.bibs:
	if have_cached_bibs:
	try:
	val = old_matrix[bib1, bib2]
	opti += 1
	if bconfig.DEBUG_CHECKS:
	assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
	except KeyError:
	cur_calc += 1
	val = compare_bibrefrecs(bib1, bib2)
	if not val:
	cur_calc += 1
	val = compare_bibrefrecs(bib1, bib2)
	else:
	cur_calc += 1
	val = compare_bibrefrecs(bib1, bib2)
	self._bib_matrix[bib1, bib2] = val

	except Exception, e:
	raise Exception("""Error happened in prob_matrix.recalculate with
	val:%s
	original_exception: %s
	"""%(str(val),str(e)))

	clear_comparison_caches()
	update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))


	def prepare_matirx(cluster_set, force):
	if bconfig.DEBUG_CHECKS:
	assert cluster_set._debug_test_hate_relation()
	assert cluster_set._debug_duplicated_recs()

	matr = ProbabilityMatrix(cluster_set.last_name)
	matr.load(load_map=True, load_matrix=False)
	if not force and matr.is_up_to_date(cluster_set):
	bibauthor_print("Cluster %s is up-to-date and therefore will not be computed."
	% cluster_set.last_name)
	return False

	matr.load(load_map=False, load_matrix=True)
	matr.recalculate(cluster_set)
	matr.store()
	return True

bibauthorid_prob_matrix.pyNo OneTemporaryActions

File Metadata

bibauthorid_prob_matrix.pyView Options

Event Timeline

bibauthorid_prob_matrix.py
No OneTemporary
Actions

bibauthorid_prob_matrix.py
View Options