clustering_process_fct.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Jul 13, 20:12

clustering_process_fct.py
View Options

	#!/usr/bin/env python
	# -- coding: utf-8 --

	"This file contain the clustering_process functions, it take 9 arguments as input"

	########################################################
	from dbc_cluster_file_reader import dbcClusterFileReader
	from cd_hit_cluster_file_reader import cdHitClusterFileReader
	from taxonomic_database_reader import TaxonomicDatabaseReader
	from blast_file_reader import BlastFileReader
	from reader import FileReader
	from fasta_reader import FastaReader
	import subprocess
	import os
	import fnmatch
	########################################################


	def clustering_process(clustering_software,key_string, reference_fasta_filename, input_fasta_file,cluster_filename, query_fasta_file, blast_file, output_table_file_name,dbtype,min_cluster_size):

	"""clustering_process: The clustering process will choose between two specific clustering reader according to the cluster_software (input), the cluster information will be stored in the cluster_collection object. Then a taxonomic collection will be generated by the taxonomic_reader. The cluster heads sequences will be compared with the reference database to assign a taxonomy (kingdom, class, ..., species) to the clusters. Finally, if an output file name was provided, a table of the evolution of clusters through the samples is created and written in this output file """

	# give a name for the indexed database that will be created by blast
	path, name = os.path.split(reference_fasta_filename)
	database_output_file = str(path + os.path.splitext(reference_fasta_filename)[0])

	# choice of the clustering reader

	if clustering_software == 'dbc':
	cluster_reader = dbcClusterFileReader(cluster_filename)

	elif clustering_software =='cd-hit' or clustering_software == 'cd_hit':
	cluster_reader = cdHitClusterFileReader(cluster_filename)

	else :
	print 'Error : the clustering software name is not valid, please enter in option -c : dbc or cd_hit.'


	# use the cluster reader to extract the clusters from cluster file
	print "extracting cluster information"
	cluster_collection = cluster_reader.read(key_string,min_cluster_size)

	# get the list of cluster head names
	cluster_head_filter = cluster_collection.cluster_collection.keys()

	print "Collecting taxonomic information from the reference database"
	# use the taxonomic_reader to build a taxonomic database from the given reference file
	taxonomic_reader = TaxonomicDatabaseReader(reference_fasta_filename)
	taxonomic_reader.read(None)

	print 'extract cluster head sequences from input fasta file'
	# use a reader to get only the cluster heads sequences from the former input file
	cluster_head_fasta_builder = FastaReader(input_fasta_file)

	# the cluster_head_filter is the list of the cluster head sequences names
	cluster_head_sequences = cluster_head_fasta_builder.read(cluster_head_filter,key_string)

	print 'build the cluster head fasta file'
	# the function build_fasta will build a fasta file with the cluster head sequences collected above
	cluster_head_fasta_builder.build_fasta(query_fasta_file)

	print 'making blastn database'

	# call blastn program to build the required reference database
	# blastn: 2.2.31+

	res1 = subprocess.call(['makeblastdb', '-in' ,reference_fasta_filename, '-input_type', 'fasta', '-dbtype', dbtype, '-title',path, '-out', database_output_file])

	# call blastn program to compare the sequences with the reference database

	print 'blast cluster head sequences against the database'
	res2 = subprocess.call(['blastn', '-query', query_fasta_file, '-db', database_output_file ,'-out', blast_file ,'-num_alignments' ,'1', '-num_descriptions' ,'1'])

	# use the blast_reader to extract the results of blastn, ie assign
	print 'Assign taxonomic match to clusters'
	blast_reader = BlastFileReader(blast_file)

	# assign taxonomy with {seqID : refID, similarity%} as parameter
	blast_reader.read(None)
	cluster_collection.assignTaxonomy(blast_reader.dict,taxonomic_reader.taxonomy_collection)

	# optionally create a cluster evolution table (evolution of the number of sequences in the cluster through the samples), here keeping only the clusters bigger than 4000
	if output_table_file_name != None :

	cluster_collection.buildClusterTables(cluster_reader.list_sample_id,output_table_file_name,4000)


	return cluster_collection

clustering_process_fct.pyNo OneTemporaryActions

File Metadata

clustering_process_fct.pyView Options

Event Timeline

clustering_process_fct.py
No OneTemporary
Actions

clustering_process_fct.py
View Options