File Metadata

Created: Sun, Jul 13, 20:33

cd_hit_cluster_file_reader.py
View Options

	#!/usr/bin/env python
	# -- coding: utf-8 --

	" module containing cdHitClusterFileReader, daugther of ClusterFileReader"


	##########################################################################
	from cluster_file_reader import ClusterFileReader
	from cluster import Cluster
	from sequence import Sequence
	from cluster_collection import ClusterCollection
	from sequence_collection import SequenceCollection
	##########################################################################


	class cdHitClusterFileReader(ClusterFileReader):

	"""
	class cdHitClusterFileReader: this object extract and store the information given by the cd-hit software output
	"""

	# ------------------------------------------------------------------ #
	# Constructors/Destructors #
	# ------------------------------------------------------------------ #

	def __init__(self,filename):
	"""__init__: the initialization of cdHitClusterFileReader is the same as the mother ClusterFileReader"""

	ClusterFileReader.__init__(self,filename)


	def __del__(self):
	"""__del__: not implemented """
	pass

	# ------------------------------------------------------------------ #
	# Methods #
	# ------------------------------------------------------------------ #

	# public:

	def read(self, key_string, min_clust_size = 200):
	"""read: get the information in the output of the software cd-hit, that is the size and the cluster head of the cluster larger thant min_clust_size and store it in a object of type ClusterCollection. It also collect all the sequences and their affiliation to a particular cluster and store them into the member sequence_collection """

	# initialize the two objects where the information will be stored
	cluster_collection = ClusterCollection()


	# open the cluster files
	clust_file = open(self.filename,'r')

	# and read it line by line
	clust_line = clust_file.readline()
	while clust_line:

	# cluster information is preceded by '>Cluster' + the cluster number
	if clust_line[0:3]=='>Cl':

	# extract the cluster number
	num_clust = clust_line.split(' ')[1].replace('\n','')

	# the number of sequences of the cluster is initialized to zero
	nb_seq = 0

	# the sequences of the cluster will be stored in a temporary list
	temp_list_of_sequences = []

	clust_line = clust_file.readline()

	# as long as it is not a new cluster
	while clust_line[0:3]!= '>Cl'and clust_line:

	# the the sequence ID and the sample they belong to are extracted from the sequence name
	seqID = clust_line.split(', >')[1].split('... ')[0].replace('>','').split(key_string)[0]

	sampleID = key_string + clust_line.split(', >')[1].split('... ')[0].split(key_string)[1]

	# the sequences per se are not printed in cd-hit output files, in the future the sequences can be fetch in the former input file (type fasta)
	seq = ''

	# this new sequence is instanciated and added to the list of sequences
	sequence = Sequence(seqID,sampleID, seq)
	temp_list_of_sequences.append(sequence)

	# the different sampleID are stored in a member list
	if sampleID not in self.list_sample_id :
	self.list_sample_id.append(sampleID)

	# the cluster head is given by this symbole '*'
	if clust_line.split('... ')[1].replace('\n','')== '*':
	cluster_head_id = seqID

	# the number of sequences in the cluster is incremented

	nb_seq += 1
	#print clust_line
	#print nb_seq
	clust_line = clust_file.readline()

	# once the end of the cluster is reached,
	# if it is larger than the min_clust_size threshold, the new cluster is instanciated and the size information is updated
	if nb_seq >= min_clust_size:
	cluster = Cluster(cluster_head_id)


	# the new cluster is added to the cluster collection
	cluster_collection.addCluster(cluster)

	# store the membership of the sequences of the cluster by adding their cluster_head_id
	# and store these sequences in the sequence collection
	for sequence in temp_list_of_sequences :
	sequence.seq_cluster_id = cluster_head_id
	self.sequence_collection.addSequence(sequence)
	self.incrementAbundancePerSample(cluster,sequence.seq_sample)

	# if the line begining by '>Cl' is not at the begining of the file
	else :
	print 'probleme de capture du fichier clstr'
	clust_line = clust_file.readline()

	clust_file.close()
	print 'size ',cluster_collection.cluster_collection['M01312_138_000000000-AN8YW_1_1112_16352_8661'].size
	# the list of samples is sorted
	self.list_sample_id.sort()


	print len(cluster_collection.cluster_collection), ' clusters have been recovered. '
	print 'The total number of sequences found in these clusters is ', len(self.sequence_collection.dict)
	print 'size ',cluster_collection.cluster_collection['M01312_138_000000000-AN8YW_1_1112_16352_8661'].size
	return cluster_collection


	##########################################################################


	if __name__ == '__main__':
	test = cdHitClusterFileReader()
	test.read(None,'/home/aline/spe_repository_aa/project/class_diagram/input_files/R2_trim.contigs.good_all.clust.sort_200.clstr','-201')

cd_hit_cluster_file_reader.py
No OneTemporary
Actions

File Metadata

cd_hit_cluster_file_reader.py
View Options

Event Timeline

cd_hit_cluster_file_reader.pyNo OneTemporaryActions

File Metadata

cd_hit_cluster_file_reader.pyView Options

Event Timeline

cd_hit_cluster_file_reader.py
No OneTemporary
Actions

cd_hit_cluster_file_reader.py
View Options