cluster_set.py (F91058176, attached to R3600 invenio-infoscience)
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
from itertools import chain, groupby, izip, cycle
from operator import itemgetter
import cPickle

from invenio.bibauthorid_matrix_optimization import maximized_mapping
from invenio.bibauthorid_backinterface import save_cluster
from invenio.bibauthorid_backinterface import get_all_papers_of_pids
from invenio.bibauthorid_backinterface import get_bib10x, get_bib70x
from invenio.bibauthorid_backinterface import get_all_modified_names_from_personid
from invenio.bibauthorid_backinterface import get_signatures_from_bibrefs
from invenio.bibauthorid_name_utils import generate_last_name_cluster_str
from invenio.bibauthorid_general_utils import bibauthor_print
#python2.4 compatibility
from invenio.bibauthorid_general_utils import bai_all as all

class Blob(object):
    def __init__(self, personid_records):
        '''
        @param personid_records:
            A list of tuples: (personid, bibrefrec, flag).
            Notice that all bibrefrecs should be the same
            since the Blob represents only one bibrefrec.
        '''
        self.bib = personid_records[0][1]
        assert all(p[1] == self.bib for p in personid_records), \
            "All cluster sets should share the bibrefrec"
        self.claimed = set()
        self.assigned = set()
        self.rejected = set()
        for pid, _, flag in personid_records:
            if flag > 1:
                self.claimed.add(pid)
            elif flag >= -1:
                self.assigned.add(pid)
            else:
                self.rejected.add(pid)
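
# Illustrative sketch (editorial addition, not part of the original module):
# how Blob.__init__ buckets the flag of each personid row, using made-up ids.
#
#   sig = (100, 123, 5)                  # a hypothetical bibrefrec
#   b = Blob([(42, sig, 2),              # flag > 1        -> claimed
#             (43, sig, 0),              # -1 <= flag <= 1 -> assigned
#             (44, sig, -2)])            # flag < -1       -> rejected
#   # b.bib == sig
#   # b.claimed == set([42]); b.assigned == set([43]); b.rejected == set([44])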

def create_blobs_by_pids(pids):
    '''
    Returns a list of blobs by a given set of personids.
    Blob is an object which describes all information
    for a bibrefrec in the personid table.
    @type pids: iterable of integers
    '''
    all_bibs = get_all_papers_of_pids(pids)
    all_bibs = ((x[0], (int(x[1]), x[2], x[3]), x[4]) for x in all_bibs)
    bibs_dict = groupby(sorted(all_bibs, key=itemgetter(1)), key=itemgetter(1))
    blobs = [Blob(list(bibs)) for _, bibs in bibs_dict]

    return blobs
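
# Editorial note: get_all_papers_of_pids is assumed to yield rows of the form
# (personid, table, ref, rec, flag); the generator expression above reshapes
# each row into (personid, bibrefrec, flag) with bibrefrec = (int(table), ref, rec),
# so the rows can be grouped by signature and handed to Blob.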

def group_blobs(blobs):
    '''
    Separates the blobs into two groups
    of objects - those with claims and
    those without.
    '''
    # created from blobs, which are claimed
    # [(bibrefrec, personid)]
    union = []

    # created from blobs, which are not claimed
    # [(bibrefrec, personid/None, [personid])]
    independent = []

    for blob in blobs:
        assert len(blob.claimed) + len(blob.assigned) == 1, \
            "Each blob must have exactly one associated signature"
        if len(blob.claimed) > 0:
            union.append((blob.bib, list(blob.claimed)[0]))
        else:
            independent.append((blob.bib, list(blob.assigned)[0], list(blob.rejected)))

    return (union, independent)
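
# Illustrative sketch (hypothetical data): a claimed blob ends up in `union`,
# an unclaimed one in `independent`, carrying its rejected personids along.
#
#   b1 = Blob([(7, (100, 1, 10), 2)])                         # claimed by pid 7
#   b2 = Blob([(8, (100, 2, 11), 0), (9, (100, 2, 11), -2)])  # assigned to 8, rejected by 9
#   union, independent = group_blobs([b1, b2])
#   # union       == [((100, 1, 10), 7)]
#   # independent == [((100, 2, 11), 8, [9])]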

class ClusterSet(object):
    class Cluster(object):
        def __init__(self, bibs, hate=None):
            # hate is a symmetrical relation
            self.bibs = set(bibs)
            if hate:
                self.hate = set(hate)
            else:
                self.hate = set([])
            self.personid = None

        def hates(self, other):
            return other in self.hate

        def quarrel(self, cl2):
            self.hate.add(cl2)
            cl2.hate.add(self)

        def _debug_test_hate_relation(self):
            for cl2 in self.hate:
                if not self.hates(cl2) or not cl2.hates(self):
                    return False
            return True
    def __init__(self):
        self.clusters = []
        self.update_bibs()
        self.num_all_bibs = None
        self.last_name = None

    def update_bibs(self):
        self.num_all_bibs = sum(len(cl.bibs) for cl in self.clusters)

    def all_bibs(self):
        return chain.from_iterable(cl.bibs for cl in self.clusters)
    def create_skeleton(self, personids, last_name):
        blobs = create_blobs_by_pids(personids)
        self.last_name = last_name

        union, independent = group_blobs(blobs)

        union_clusters = {}
        for uni in union:
            union_clusters[uni[1]] = union_clusters.get(uni[1], []) + [uni[0]]

        cluster_dict = dict((personid, self.Cluster(bibs))
                            for personid, bibs in union_clusters.items())
        self.clusters = cluster_dict.values()

        for i, cl in enumerate(self.clusters):
            cl.hate = set(chain(self.clusters[:i], self.clusters[i + 1:]))

        for ind in independent:
            bad_clusters = [cluster_dict[i] for i in ind[2] if i in cluster_dict]
            cl = self.Cluster([ind[0]], bad_clusters)
            for bcl in bad_clusters:
                bcl.hate.add(cl)
            self.clusters.append(cl)

        self.update_bibs()
        return self
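
    # Editorial note: after create_skeleton every claimed cluster hates every
    # other claimed cluster (claims must never be merged), while each unclaimed
    # signature becomes a singleton cluster that hates exactly the clusters of
    # the personids which explicitly rejected it.
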
    # Creates a cluster set, ignoring the claims and the
    # rejected papers.
    def create_pure(self, personids, last_name):
        blobs = create_blobs_by_pids(personids)
        self.last_name = last_name

        self.clusters = [self.Cluster((blob.bib,)) for blob in blobs]
        self.update_bibs()
        return self
    # no longer used
    def create_body(self, blobs):
        union, independent = group_blobs(blobs)

        arranged_clusters = {}
        for cls in chain(union, independent):
            arranged_clusters[cls[1]] = arranged_clusters.get(cls[1], []) + [cls[0]]

        for pid, bibs in arranged_clusters.items():
            cl = self.Cluster(bibs)
            cl.personid = pid
            self.clusters.append(cl)

        self.update_bibs()
        return self
    def create_from_mark(self, bibrefs, last_name):
        bibrecrefs = get_signatures_from_bibrefs(bibrefs)
        self.clusters = [ClusterSet.Cluster([bib]) for bib in bibrecrefs]
        self.last_name = last_name
        self.update_bibs()
        return self
    # a *very* slow function checking whether the hate relation is still symmetrical
    def _debug_test_hate_relation(self):
        for cl1 in self.clusters:
            if not cl1._debug_test_hate_relation():
                return False
        return True
    # similar to the function above
    def _debug_duplicated_recs(self, mapping=None):
        for cl in self.clusters:
            if mapping:
                setty = set(mapping[x][2] for x in cl.bibs)
            else:
                setty = set(x[2] for x in cl.bibs)

            if len(cl.bibs) != len(setty):
                return False
        return True
    # No longer used but it might be handy.
    @staticmethod
    def match_cluster_sets(cs1, cs2):
        """
        This function tries to generate the best matching
        between cs1 and cs2 according to the shared bibrefrecs.
        It returns a dictionary with keys, clusters in cs1,
        and values, clusters in cs2.
        @param and type of cs1 and cs2: cluster_set
        @return: dictionary with the matching clusters.
        @return type: { cluster : cluster }
        """
        matr = [[len(cl1.bibs & cl2.bibs) for cl2 in cs2.clusters]
                for cl1 in cs1.clusters]
        mapping = maximized_mapping(matr)
        return dict((cs1.clusters[mappy[0]], cs2.clusters[mappy[1]])
                    for mappy in mapping)
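
    # Illustrative sketch (hypothetical cluster sets cs_a and cs_b): matching two
    # partitions of the same signatures by overlap, delegating to maximized_mapping.
    #
    #   best = ClusterSet.match_cluster_sets(cs_a, cs_b)
    #   # best maps each cluster of cs_a to the cs_b cluster sharing
    #   # the largest number of bibrefrecs with it
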
    def store(self):
        '''
        Stores the cluster set in a special table.
        This is used to store the results of
        tortoise/wedge in a table and later merge them
        with personid.
        '''
        named_clusters = (("%s.%d" % (self.last_name, idx), cl)
                          for idx, cl in enumerate(self.clusters))
        map(save_cluster, named_clusters)

def delayed_create_from_mark(bibrefs, last_name):
    def ret():
        return ClusterSet().create_from_mark(bibrefs, last_name)
    return ret
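
# Editorial note: the delayed_* helpers return zero-argument closures instead of
# ready ClusterSets, so the expensive database work is postponed until a worker
# actually calls the closure. A minimal sketch with a hypothetical `my_bibrefs`:
#
#   build = delayed_create_from_mark(set(my_bibrefs), 'smith')
#   ...
#   cs = build()    # the ClusterSet is materialized only here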

def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    # { name -> [(table, bibref)] }
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s' % str(limit_to_surnames))
    name_bucket = {}
    if limit_to_surnames:
        limit_to_surnames = set([generate_last_name_cluster_str(s) for s in limit_to_surnames])

    for tab, ref, name in chain(izip(cycle((100,)), *izip(*get_bib10x())),
                                izip(cycle((700,)), *izip(*get_bib70x()))):
        name = generate_last_name_cluster_str(name)
        if limit_to_surnames and name not in limit_to_surnames:
            continue
        name_bucket[name] = name_bucket.get(name, []) + [(tab, ref)]

    bibauthor_print('Delayed_cluster_set_from_marktables going to get %s signatures....' % str(len(name_bucket)))

    all_refs = ((name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_bucket.items())
    all_refs = sorted(all_refs, key=itemgetter(2))

    return ([delayed_create_from_mark(set(refs), name) for name, refs, _ in all_refs],
            map(itemgetter(0), all_refs),
            map(itemgetter(2), all_refs))

def create_lastname_list_from_personid(last_modification):
    '''
    This function generates a dictionary from a last name
    to the list of personids which have this last name.
    '''
    # ((personid, [full Name1], Nbibs) ... )
    all_names = get_all_modified_names_from_personid(last_modification)

    # ((personid, last_name, Nbibs) ... )
    all_names = ((row[0], generate_last_name_cluster_str(iter(row[1]).next()), row[2])
                 for row in all_names)

    # { (last_name, [(personid)... ], Nbibs) ... }
    all_names = groupby(sorted(all_names, key=itemgetter(1)), key=itemgetter(1))
    all_names = ((key, list(data)) for key, data in all_names)
    all_names = ((key, map(itemgetter(0), data), sum(x[2] for x in data))
                 for key, data in all_names)

    return all_names

def delayed_create(create_f, pids, lname):
    def ret():
        return create_f(ClusterSet(), pids, lname)
    return ret

def delayed_cluster_sets_from_personid(pure, last_modification=None):
    names = create_lastname_list_from_personid(last_modification)
    names = sorted(names, key=itemgetter(2))

    if pure:
        create = ClusterSet.create_pure
    else:
        create = ClusterSet.create_skeleton

    return ([delayed_create(create, name[1], name[0]) for name in names],
            map(itemgetter(0), names),
            map(itemgetter(2), names))
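
# Illustrative sketch (editorial addition): both delayed_cluster_sets_* functions
# return a parallel triple (lazy builders, last names, sizes), sorted by size,
# so callers can schedule the smallest cluster sets first.
#
#   builders, lnames, sizes = delayed_cluster_sets_from_personid(pure=True)
#   for build, lname, size in zip(builders, lnames, sizes):
#       cs = build()
#       cs.store()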