Page MenuHomec4science

cd_hit_cluster_file_reader.py
No OneTemporary

File Metadata

Created
Sun, Jul 13, 20:33

cd_hit_cluster_file_reader.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
" module containing cdHitClusterFileReader, daugther of ClusterFileReader"
##########################################################################
from cluster_file_reader import ClusterFileReader
from cluster import Cluster
from sequence import Sequence
from cluster_collection import ClusterCollection
from sequence_collection import SequenceCollection
##########################################################################
class cdHitClusterFileReader(ClusterFileReader):
"""
class cdHitClusterFileReader: this object extract and store the information given by the cd-hit software output
"""
# ------------------------------------------------------------------ #
# Constructors/Destructors #
# ------------------------------------------------------------------ #
def __init__(self,filename):
"""__init__: the initialization of cdHitClusterFileReader is the same as the mother ClusterFileReader"""
ClusterFileReader.__init__(self,filename)
def __del__(self):
"""__del__: not implemented """
pass
# ------------------------------------------------------------------ #
# Methods #
# ------------------------------------------------------------------ #
# public:
def read(self, key_string, min_clust_size = 200):
"""read: get the information in the output of the software cd-hit, that is the size and the cluster head of the cluster larger thant min_clust_size and store it in a object of type ClusterCollection. It also collect all the sequences and their affiliation to a particular cluster and store them into the member sequence_collection """
# initialize the two objects where the information will be stored
cluster_collection = ClusterCollection()
# open the cluster files
clust_file = open(self.filename,'r')
# and read it line by line
clust_line = clust_file.readline()
while clust_line:
# cluster information is preceded by '>Cluster' + the cluster number
if clust_line[0:3]=='>Cl':
# extract the cluster number
num_clust = clust_line.split(' ')[1].replace('\n','')
# the number of sequences of the cluster is initialized to zero
nb_seq = 0
# the sequences of the cluster will be stored in a temporary list
temp_list_of_sequences = []
clust_line = clust_file.readline()
# as long as it is not a new cluster
while clust_line[0:3]!= '>Cl'and clust_line:
# the the sequence ID and the sample they belong to are extracted from the sequence name
seqID = clust_line.split(', >')[1].split('... ')[0].replace('>','').split(key_string)[0]
sampleID = key_string + clust_line.split(', >')[1].split('... ')[0].split(key_string)[1]
# the sequences per se are not printed in cd-hit output files, in the future the sequences can be fetch in the former input file (type fasta)
seq = ''
# this new sequence is instanciated and added to the list of sequences
sequence = Sequence(seqID,sampleID, seq)
temp_list_of_sequences.append(sequence)
# the different sampleID are stored in a member list
if sampleID not in self.list_sample_id :
self.list_sample_id.append(sampleID)
# the cluster head is given by this symbole '*'
if clust_line.split('... ')[1].replace('\n','')== '*':
cluster_head_id = seqID
# the number of sequences in the cluster is incremented
nb_seq += 1
#print clust_line
#print nb_seq
clust_line = clust_file.readline()
# once the end of the cluster is reached,
# if it is larger than the min_clust_size threshold, the new cluster is instanciated and the size information is updated
if nb_seq >= min_clust_size:
cluster = Cluster(cluster_head_id)
# the new cluster is added to the cluster collection
cluster_collection.addCluster(cluster)
# store the membership of the sequences of the cluster by adding their cluster_head_id
# and store these sequences in the sequence collection
for sequence in temp_list_of_sequences :
sequence.seq_cluster_id = cluster_head_id
self.sequence_collection.addSequence(sequence)
self.incrementAbundancePerSample(cluster,sequence.seq_sample)
# if the line begining by '>Cl' is not at the begining of the file
else :
print 'probleme de capture du fichier clstr'
clust_line = clust_file.readline()
clust_file.close()
print 'size ',cluster_collection.cluster_collection['M01312_138_000000000-AN8YW_1_1112_16352_8661'].size
# the list of samples is sorted
self.list_sample_id.sort()
print len(cluster_collection.cluster_collection), ' clusters have been recovered. '
print 'The total number of sequences found in these clusters is ', len(self.sequence_collection.dict)
print 'size ',cluster_collection.cluster_collection['M01312_138_000000000-AN8YW_1_1112_16352_8661'].size
return cluster_collection
##########################################################################
if __name__ == '__main__':
test = cdHitClusterFileReader()
test.read(None,'/home/aline/spe_repository_aa/project/class_diagram/input_files/R2_trim.contigs.good_all.clust.sort_200.clstr','-201')

Event Timeline