Page MenuHomec4science

main.py
No OneTemporary

File Metadata

Created
Sat, Jul 12, 20:50
#!/usr/bin/env python
# -*- coding: utf-8 -*-
" This program can perform two different processes depending on the chosen option (-o): 'clustering' takes as input a file of clustered sequences and a reference database, and compares a representative of each cluster to the reference database to assign a taxonomy to the sequences of the clusters; an evolution of the clusters through the different samples can also be output. 'metadata' takes as input a metadata file and stores the information it contains if it corresponds to a database table."
##########################################################################
import argparse
import os
from clustering_process_fct import clustering_process
from metadata_reader import metadataReader
from config_file_reader import ConfigFileReader
from test_fct import check_clustering
##########################################################################
""" The arguments are parsed in the main and are dispatched to the appropriate function/object."""
# command line for clustering#
##############################
# /home/aline/spe_repository_aa/project/class_diagram/skeleton/main.py -f '/home/aline/spe_repository_aa/project/class_diagram/input_files/config_clustering_cd_hit.txt'
# command test
################
#/home/aline/spe_repository_aa/project/class_diagram/skeleton/main.py -f /home/aline/spe_repository_aa/project/class_diagram/Test_files/config_clustering_dbc_test.txt
#/home/aline/spe_repository_aa/project/class_diagram/skeleton/main.py -f /home/aline/spe_repository_aa/project/class_diagram/Test_files/config_clustering_cd_hit_test.txt
#/home/aline/spe_repository_aa/project/class_diagram/skeleton/main.py -f /home/aline/spe_repository_aa/project/class_diagram/Test_files/config_metadata_test.txt
# command line for metadata#
############################
#./main.py -o 'metadata' -m '/home/aline/spe_repository_aa/project/class_diagram/skeleton/Data/performances.csv' -a '/home/aline/spe_repository_aa/project/class_diagram/skeleton/My_database/my_database.db'
############################## inputs ####################################
# Stage 1: look only for -f/--config.
# The remaining options are not registered yet, so parse_known_args() is used:
# parse_args() would abort here with "unrecognized arguments" as soon as
# -o, -m, -a, ... appear on the command line.
# add_help = False leaves -h/--help to the full parser built below.
config_parser = argparse.ArgumentParser(add_help = False)
config_parser.add_argument('-f', '--config', help = 'configuration file', required = False)
args, _remaining_argv = config_parser.parse_known_args()
use_config_file = args.config is not None

# Stage 2: pick the object that will receive the option declarations.
if use_config_file:
    # change the parser if a configuration file is provided
    parser = ConfigFileReader(args.config)
else:
    # plain command line: inherit -f/--config from the pre-parser
    parser = argparse.ArgumentParser(description = 'Program that process and store molecular sequences and related metadata in a sqlite3 database', parents = [config_parser])

# Main choice of option
parser.add_argument('-o', '--option', help = 'choose between clustering (clustering), or store metadata (metadata)', required = True)
# arguments for clustering
parser.add_argument('-k', '--key_string', help = 'string marking the begining of sampleID in sequences names', required = False)
parser.add_argument('-cs', '--cluster_soft', help = 'name of the clustering software, should be dbc or cd_hit', required = False)
parser.add_argument('-r', '--ref_fasta', help = 'name of the reference fasta file', required = False)
parser.add_argument('-d', '--database_name', help = 'output', required = False)
parser.add_argument('-i', '--input_fasta', help = 'the initial fasta file containing the sequences ', required = False)
parser.add_argument('-cf', '--cluster_file', help = 'the output of the clustering algorithm', required = False)
parser.add_argument('-q', '--query_fasta', help = 'the file containing only the sequences that are cluster heads', required = False)
parser.add_argument('-b', '--blast_file', help = 'the output of the blastn algorithm', required = False)
parser.add_argument('-t', '--table_file', help = 'the name of the output file table to be created', required = False)
parser.add_argument('-e', '--test', help = 'option set to True for the test mode', required = False)
parser.add_argument('-s', '--seq_type', help = 'type of sequences, can be DNA, RNA or proteins', required = False)
parser.add_argument('-st', '--store', help = 'true to store the clusters in the database', required = False)
parser.add_argument('-p', '--min_clust_size', help = 'set the minimum cluster size', required = False)
# arguments for metadata storage in the database
# (the help text states ',' because that is the separator the script splits on)
parser.add_argument('-m', '--metadata', help = 'a list of metadata files containing the information we want to store in the database, separated with a ,', required = False)
parser.add_argument('-a', '--my_db', help = 'name of the database', required = False)
############################## arguments ##############################
if use_config_file:
    # the configuration file is read once every option has been declared
    parser.read()
args = parser.parse_args()
# choix du type de sequence
sequence_type = args.seq_type
if sequence_type == 'DNA' or sequence_type == 'RNA':
dbtype = 'nucl'
elif sequence_type =='protein' :
dbtype = 'prot'
else :
dbtype = ''
print 'dbtype :', dbtype
# database name
my_db_name = args.my_db
# test mode
test_mode = args.test
# choice of option
# Dispatch on -o/--option (case-insensitive): 'clustering' runs the clustering
# pipeline, 'metadata' stores metadata files; anything else prints an error.
selected_option = args.option.lower()
##################
### CLUSTERING ###
##################
if selected_option == 'clustering':
    clustering_software = args.cluster_soft
    # string used to recognize the beginning of the sampleID in the sequence names
    key_string = args.key_string
    # reference fasta file (-r) the cluster heads are compared against
    reference_filename = args.ref_fasta
    # the fasta file used to do the clustering
    input_fasta_file = args.input_fasta
    cluster_filename = args.cluster_file
    # file with only the cluster head sequences
    query_fasta_file = args.query_fasta
    # result of the comparison with the reference database
    blast_file = args.blast_file
    # output file name
    output_table_file_name = args.table_file
    # minimal cluster size: a missing (None) or non-numeric value means "no minimum"
    try:
        min_clust_size = int(args.min_clust_size)
    except (TypeError, ValueError):
        min_clust_size = None
    # launch the clustering
    cluster_collection = clustering_process(clustering_software, key_string, reference_filename, input_fasta_file, cluster_filename, query_fasta_file, blast_file, output_table_file_name, dbtype, min_clust_size)
    # store the clustering results in the local database only when -st is 'true';
    # -st is optional, so guard against None before calling lower()
    if args.store is not None and args.store.lower() == 'true':
        cluster_collection.storeInDatabase(my_db_name)
    # NOTE(review): test_mode is the raw option value, so any non-empty string
    # (including 'False') enables the check - confirm this is intended.
    if test_mode:
        check_clustering(clustering_software, cluster_collection)
##################
### METADATA ###
##################
elif selected_option == 'metadata':
    # -m is optional for the parser but mandatory for this branch
    if not args.metadata:
        raise SystemExit('error, the metadata option requires -m/--metadata.')
    # metadata can be a ',' separated list of metadata files;
    # surrounding blanks and empty entries (e.g. a trailing ',') are dropped
    metadata_file_list = [name.strip() for name in args.metadata.split(',') if name.strip()]
    # a metadata reader is created for each file
    for metadata_file in metadata_file_list:
        metadata_reader = metadataReader(metadata_file, my_db_name)
        metadata_reader.read(None, None) # possible here to add a filter or keystring to select metadata
else:
    print('error, unvalid choice of option: -o must be either clustering or metadata.')

Event Timeline