Page MenuHomec4science

dbc_cluster_file_reader.py
No OneTemporary

File Metadata

Created
Thu, Jul 10, 06:18

dbc_cluster_file_reader.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
" module containing class dbcClusterFileReader, daugther of ClusterFileReader"
import operator
##########################################################################
from cluster_file_reader import ClusterFileReader
from cluster import Cluster
from sequence import Sequence
from cluster_collection import ClusterCollection
from sequence_collection import SequenceCollection
##########################################################################
class dbcClusterFileReader(ClusterFileReader):
    """Reader for the cluster file produced by the dbc software.

    Unlike cd-hit output, dbc output does not provide a cluster head (a
    sequence chosen to represent the cluster), so the head is selected as
    the most frequent sequence of each cluster.  Because the final
    clusterID is the seqID of the cluster head, clusters are first stored
    under a temporary ID (the cluster number found in the file), which is
    replaced once the whole file has been read and the heads are known.
    """
    # ------------------------------------------------------------------ #
    #                     Constructors/Destructors                       #
    # ------------------------------------------------------------------ #
    def __init__(self, filename):
        """Initialise the parent reader with *filename* and an empty
        temporary cluster dictionary self.d_clust:
            - key   : cluster number read from the file (temporary ID)
            - value : dict mapping a sequence string to
                      [abundance, seqID, sampleID] within that cluster
        """
        ClusterFileReader.__init__(self, filename)
        # Members ---------------------- #
        # temporary per-cluster abundance bookkeeping (see docstring)
        self.d_clust = {}

    def __del__(self):
        """__del__: nothing to release."""
        pass

    # ------------------------------------------------------------------ #
    #                              Methods                               #
    # ------------------------------------------------------------------ #
    # public:
    def incrementTempDictionary(self, num_clust, sequence):
        """Increment by 1 the abundance of *sequence* inside cluster
        *num_clust*, registering the sequence on first sight.

        Returns 0 (kept for backward compatibility with callers).
        """
        # value layout: [count, seqID, sampleID]
        if sequence.seq in self.d_clust[num_clust]:
            self.d_clust[num_clust][sequence.seq][0] += 1
        else:
            self.d_clust[num_clust][sequence.seq] = [1, sequence.seqID, sequence.seq_sample]
        return 0

    def read(self, key_string, min_clust_size=200):
        """Read the whole cluster file and return a ClusterCollection.

        *key_string* is the marker separating the seqID from the sampleID
        inside a sequence name.  Clusters smaller than *min_clust_size*
        are discarded; this can only be decided after the entire file has
        been read, which is why filtering happens at the end.
        """
        cluster_collection = ClusterCollection()
        # the context manager guarantees the file is closed even if
        # parsing raises part-way through (the original leaked the handle)
        with open(self.filename, 'r') as clust_file:
            # skip the header line
            clust_file.readline()
            for clust_line in clust_file:
                split_clust_line = clust_line.replace('\n', '').split('\t')
                # -> ['seqnum', 'num_clust', 'seqID', 'sequence']
                # cluster 0 gathers the non-clusterized sequences: skip it
                if split_clust_line[1] == '0':
                    continue
                num_clust = split_clust_line[1]
                # first time this cluster number is seen: register it
                if num_clust not in self.d_clust:
                    self.d_clust[num_clust] = {}
                    # instantiate a cluster under its temporary name
                    cluster = Cluster(num_clust)
                    cluster_collection.addCluster(cluster)
                # the sampleID is stored in the sequence name after key_string
                sampleID = key_string + split_clust_line[2].split(key_string)[1]
                seqID = split_clust_line[2].replace('>', '').split(key_string)[0]
                # the newline was already stripped above, no second replace needed
                seq = split_clust_line[3]
                # build the sequence and tag it with the temporary clusterID
                sequence = Sequence(seqID, sampleID, seq)
                sequence.seq_cluster_id = num_clust
                self.sequence_collection.addSequence(sequence)
                # complete the list of sample ids if necessary
                if sampleID not in self.list_sample_id:
                    self.list_sample_id.append(sampleID)
                # count this sequence occurrence within its cluster
                self.incrementTempDictionary(num_clust, sequence)
                # update per-sample abundance and total cluster size
                current_cluster = cluster_collection.getCluster(num_clust)
                self.incrementAbundancePerSample(current_cluster, sampleID)
        # sort the list of sample ids
        self.list_sample_id.sort()
        # only now are the final cluster sizes known: filter the clusters
        for num_clust in self.d_clust:
            if cluster_collection.cluster_collection[num_clust].size < min_clust_size:
                # drop every sequence belonging to this undersized cluster
                temp_list_sequence_to_remove = [
                    seqID for seqID in self.sequence_collection.dict
                    if self.sequence_collection.dict[seqID].seq_cluster_id == num_clust
                ]
                self.sequence_collection.removeSequences(temp_list_sequence_to_remove)
                # delete the cluster itself
                del cluster_collection.cluster_collection[num_clust]
            else:
                # the most frequent sequence becomes the cluster head; its
                # seqID replaces the temporary cluster ID everywhere
                cluster_head = self.getClusterHead(num_clust)
                cluster_collection.modifyClusterHead(num_clust, cluster_head)
                self.sequence_collection.modifyClusterID(num_clust, cluster_head.seqID)
        return cluster_collection

    def getClusterHead(self, num_clust):
        """Return the Sequence object of the most abundant sequence of
        cluster *num_clust* (ties broken by comparing the full
        [count, seqID, sampleID] value, as in the original code).
        """
        # .items() replaces the Python-2-only .iteritems(): identical
        # behaviour under max(), but works on Python 3 as well
        most_abundant_seq_info = max(self.d_clust[num_clust].items(),
                                     key=operator.itemgetter(1))
        # [1] is the [count, seqID, sampleID] value; [1][1] is the seqID
        cluster_head = self.sequence_collection.getSequence(most_abundant_seq_info[1][1])
        return cluster_head
##########################################################################
if __name__ == '__main__':
    # Manual smoke test.  The filename belongs to the constructor, not to
    # read(): the original call passed the path as key_string and a third
    # positional argument, which raises a TypeError.  '-201' is the
    # key_string separating seqID from sampleID in sequence names.
    test = dbcClusterFileReader(
        '/home/aline/spe_repository_aa/project/class_diagram/input_files/R2_dbc.clusters')
    test.read('-201')

Event Timeline