Page MenuHomec4science

dbc_cluster_file_reader.py
No OneTemporary

File Metadata

Created
Thu, Jul 10, 06:18

dbc_cluster_file_reader.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
" module containing class dbcClusterFileReader, daugther of ClusterFileReader"
import operator
##########################################################################
from cluster_file_reader import ClusterFileReader
from cluster import Cluster
from sequence import Sequence
from cluster_collection import ClusterCollection
from sequence_collection import SequenceCollection
##########################################################################
class dbcClusterFileReader(ClusterFileReader):
    """Reader for the cluster file produced by the dbc software.

    Unlike cd-hit output, dbc output does not provide a cluster head (a
    sequence chosen to represent the cluster), so the head is selected as
    the most frequent sequence of each cluster.  Because the final
    clusterID is the seqID of the cluster head, clusters are first stored
    under a temporary ID (the cluster number found in the file), which is
    replaced once the whole file has been read and the heads are known.
    """
    # ------------------------------------------------------------------ #
    #                     Constructors/Destructors                       #
    # ------------------------------------------------------------------ #
    def __init__(self, filename):
        """Initialise the parent reader with *filename* and an empty
        temporary cluster dictionary self.d_clust:
            - key   : cluster number read from the file (temporary ID)
            - value : dict mapping a sequence string to
                      [abundance, seqID, sampleID] within that cluster
        """
        ClusterFileReader.__init__(self, filename)
        # Members ---------------------- #
        # temporary per-cluster abundance bookkeeping (see docstring)
        self.d_clust = {}

    def __del__(self):
        """__del__: nothing to release."""
        pass

    # ------------------------------------------------------------------ #
    #                              Methods                               #
    # ------------------------------------------------------------------ #
    # public:
    def incrementTempDictionary(self, num_clust, sequence):
        """Increment by 1 the abundance of *sequence* inside cluster
        *num_clust*, registering the sequence on first sight.

        Returns 0 (kept for backward compatibility with callers).
        """
        # value layout: [count, seqID, sampleID]
        if sequence.seq in self.d_clust[num_clust]:
            self.d_clust[num_clust][sequence.seq][0] += 1
        else:
            self.d_clust[num_clust][sequence.seq] = [1, sequence.seqID, sequence.seq_sample]
        return 0

    def read(self, key_string, min_clust_size=200):
        """Read the whole cluster file and return a ClusterCollection.

        *key_string* is the marker separating the seqID from the sampleID
        inside a sequence name.  Clusters smaller than *min_clust_size*
        are discarded; this can only be decided after the entire file has
        been read, which is why filtering happens at the end.
        """
        cluster_collection = ClusterCollection()
        # the context manager guarantees the file is closed even if
        # parsing raises part-way through (the original leaked the handle)
        with open(self.filename, 'r') as clust_file:
            # skip the header line
            clust_file.readline()
            for clust_line in clust_file:
                split_clust_line = clust_line.replace('\n', '').split('\t')
                # -> ['seqnum', 'num_clust', 'seqID', 'sequence']
                # cluster 0 gathers the non-clusterized sequences: skip it
                if split_clust_line[1] == '0':
                    continue
                num_clust = split_clust_line[1]
                # first time this cluster number is seen: register it
                if num_clust not in self.d_clust:
                    self.d_clust[num_clust] = {}
                    # instantiate a cluster under its temporary name
                    cluster = Cluster(num_clust)
                    cluster_collection.addCluster(cluster)
                # the sampleID is stored in the sequence name after key_string
                sampleID = key_string + split_clust_line[2].split(key_string)[1]
                seqID = split_clust_line[2].replace('>', '').split(key_string)[0]
                # the newline was already stripped above, no second replace needed
                seq = split_clust_line[3]
                # build the sequence and tag it with the temporary clusterID
                sequence = Sequence(seqID, sampleID, seq)
                sequence.seq_cluster_id = num_clust
                self.sequence_collection.addSequence(sequence)
                # complete the list of sample ids if necessary
                if sampleID not in self.list_sample_id:
                    self.list_sample_id.append(sampleID)
                # count this sequence occurrence within its cluster
                self.incrementTempDictionary(num_clust, sequence)
                # update per-sample abundance and total cluster size
                current_cluster = cluster_collection.getCluster(num_clust)
                self.incrementAbundancePerSample(current_cluster, sampleID)
        # sort the list of sample ids
        self.list_sample_id.sort()
        # only now are the final cluster sizes known: filter the clusters
        for num_clust in self.d_clust:
            if cluster_collection.cluster_collection[num_clust].size < min_clust_size:
                # drop every sequence belonging to this undersized cluster
                temp_list_sequence_to_remove = [
                    seqID for seqID in self.sequence_collection.dict
                    if self.sequence_collection.dict[seqID].seq_cluster_id == num_clust
                ]
                self.sequence_collection.removeSequences(temp_list_sequence_to_remove)
                # delete the cluster itself
                del cluster_collection.cluster_collection[num_clust]
            else:
                # the most frequent sequence becomes the cluster head; its
                # seqID replaces the temporary cluster ID everywhere
                cluster_head = self.getClusterHead(num_clust)
                cluster_collection.modifyClusterHead(num_clust, cluster_head)
                self.sequence_collection.modifyClusterID(num_clust, cluster_head.seqID)
        return cluster_collection

    def getClusterHead(self, num_clust):
        """Return the Sequence object of the most abundant sequence of
        cluster *num_clust* (ties broken by comparing the full
        [count, seqID, sampleID] value, as in the original code).
        """
        # .items() replaces the Python-2-only .iteritems(): identical
        # behaviour under max(), but works on Python 3 as well
        most_abundant_seq_info = max(self.d_clust[num_clust].items(),
                                     key=operator.itemgetter(1))
        # [1] is the [count, seqID, sampleID] value; [1][1] is the seqID
        cluster_head = self.sequence_collection.getSequence(most_abundant_seq_info[1][1])
        return cluster_head
##########################################################################
if __name__ == '__main__':
    # Manual smoke test.  The filename belongs to the constructor, not to
    # read(): the original call passed the path as key_string and a third
    # positional argument, which raises a TypeError.  '-201' is the
    # key_string separating seqID from sampleID in sequence names.
    test = dbcClusterFileReader(
        '/home/aline/spe_repository_aa/project/class_diagram/input_files/R2_dbc.clusters')
    test.read('-201')

Event Timeline