Page MenuHomec4science

clustering_process_fct.py
No OneTemporary

File Metadata

Created
Sun, Jul 13, 20:12

clustering_process_fct.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"This file contain the clustering_process functions, it take 9 arguments as input"
########################################################
from dbc_cluster_file_reader import dbcClusterFileReader
from cd_hit_cluster_file_reader import cdHitClusterFileReader
from taxonomic_database_reader import TaxonomicDatabaseReader
from blast_file_reader import BlastFileReader
from reader import FileReader
from fasta_reader import FastaReader
import subprocess
import os
import fnmatch
########################################################
def clustering_process(clustering_software,key_string, reference_fasta_filename, input_fasta_file,cluster_filename, query_fasta_file, blast_file, output_table_file_name,dbtype,min_cluster_size):
"""clustering_process: The clustering process will choose between two specific clustering reader according to the cluster_software (input), the cluster information will be stored in the cluster_collection object. Then a taxonomic collection will be generated by the taxonomic_reader. The cluster heads sequences will be compared with the reference database to assign a taxonomy (kingdom, class, ..., species) to the clusters. Finally, if an output file name was provided, a table of the evolution of clusters through the samples is created and written in this output file """
# give a name for the indexed database that will be created by blast
path, name = os.path.split(reference_fasta_filename)
database_output_file = str(path + os.path.splitext(reference_fasta_filename)[0])
# choice of the clustering reader
if clustering_software == 'dbc':
cluster_reader = dbcClusterFileReader(cluster_filename)
elif clustering_software =='cd-hit' or clustering_software == 'cd_hit':
cluster_reader = cdHitClusterFileReader(cluster_filename)
else :
print 'Error : the clustering software name is not valid, please enter in option -c : dbc or cd_hit.'
# use the cluster reader to extract the clusters from cluster file
print "extracting cluster information"
cluster_collection = cluster_reader.read(key_string,min_cluster_size)
# get the list of cluster head names
cluster_head_filter = cluster_collection.cluster_collection.keys()
print "Collecting taxonomic information from the reference database"
# use the taxonomic_reader to build a taxonomic database from the given reference file
taxonomic_reader = TaxonomicDatabaseReader(reference_fasta_filename)
taxonomic_reader.read(None)
print 'extract cluster head sequences from input fasta file'
# use a reader to get only the cluster heads sequences from the former input file
cluster_head_fasta_builder = FastaReader(input_fasta_file)
# the cluster_head_filter is the list of the cluster head sequences names
cluster_head_sequences = cluster_head_fasta_builder.read(cluster_head_filter,key_string)
print 'build the cluster head fasta file'
# the function build_fasta will build a fasta file with the cluster head sequences collected above
cluster_head_fasta_builder.build_fasta(query_fasta_file)
print 'making blastn database'
# call blastn program to build the required reference database
# blastn: 2.2.31+
res1 = subprocess.call(['makeblastdb', '-in' ,reference_fasta_filename, '-input_type', 'fasta', '-dbtype', dbtype, '-title',path, '-out', database_output_file])
# call blastn program to compare the sequences with the reference database
print 'blast cluster head sequences against the database'
res2 = subprocess.call(['blastn', '-query', query_fasta_file, '-db', database_output_file ,'-out', blast_file ,'-num_alignments' ,'1', '-num_descriptions' ,'1'])
# use the blast_reader to extract the results of blastn, ie assign
print 'Assign taxonomic match to clusters'
blast_reader = BlastFileReader(blast_file)
# assign taxonomy with {seqID : refID, similarity%} as parameter
blast_reader.read(None)
cluster_collection.assignTaxonomy(blast_reader.dict,taxonomic_reader.taxonomy_collection)
# optionally create a cluster evolution table (evolution of the number of sequences in the cluster through the samples), here keeping only the clusters bigger than 4000
if output_table_file_name != None :
cluster_collection.buildClusterTables(cluster_reader.list_sample_id,output_table_file_name,4000)
return cluster_collection

Event Timeline