Page MenuHomec4science

EMSequenceApplication.cpp
No OneTemporary

File Metadata

Created
Thu, May 30, 22:49

EMSequenceApplication.cpp

#include <EMSequenceApplication.hpp>
#include <EMSequence.hpp>
#include <iostream>
#include <string>
#include <utility> // std::move()
#include <stdexcept> // std::invalid_argument
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp> // boost::split()
#include <Matrix2D.hpp>
#include <dna_utility.hpp>
namespace po = boost::program_options ;
template<class T>
std::ostream& operator << (std::ostream& stream,
const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
EMSequenceApplication::EMSequenceApplication(int argn, char** argv)
: file_seq(""), files_motif(""),
n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false),
n_threads(0), seed(""), runnable(true)
{
// parse command line options and set the fields
this->parseOptions(argn, argv) ;
}
int EMSequenceApplication::run()
{ if(this->runnable)
{ EMSequence* em(nullptr) ;
// data
Matrix2D<int> data(this->file_seq) ;
// seeds motifs randomly
if(this->files_motif == "")
{ em = new EMSequence(std::move(data),
this->n_class,
this->n_iter,
this->n_shift,
this->flip,
this->bckg_class,
this->seed,
this->n_threads) ;
}
// seeds motifs with the given matrices
else
{ // model
std::vector<std::string> motif_paths ;
boost::split(motif_paths, this->files_motif, [](char c){return c == ',';}) ;
// this->n_class = motif_paths.size() + this->bckg_class ;
size_t model_ncol = data.get_ncol() - this->n_shift + 1 ;
// add the given motif, random motifs (if needed) and
// background class (if needed)
Matrix3D<double> model = this->init_model(model_ncol,
data,
motif_paths) ;
em = new EMSequence(std::move(data),
std::move(model),
this->n_iter,
this->flip,
this->n_threads) ;
}
// classify
em->classify() ;
std::cout << em->get_post_prob() << std::endl ;
// clean
delete em ;
em = nullptr ;
return EXIT_SUCCESS ;
}
else
{ return EXIT_FAILURE ; }
}
void EMSequenceApplication::parseOptions(int argn, char** argv)
{
// no option to parse
if(argv == nullptr)
{ std::string message = "no options to parse!" ;
throw std::invalid_argument(message) ;
}
// help messages
std::string desc_msg = "\n"
"EMSequence is a probabilistic partitioning algorithm that \n"
"sofetly assigns sequences to classes given their motif content \n"
"The assignment probabilities are returned through stdout.\n\n" ;
std::string opt_help_msg = "Produces this help message." ;
std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n "
"by default 0 (no parallelization)." ;
std::string opt_seq_msg = "The path to the file containing the sequences" ;
std::string opt_motifs_msg = "A coma separated list of path to files containing the initial motifs\n"
"values. The motifs should be probability matrices in horizontal format.\n"
"If the motifs are too short after accounting for shifting, extra\n"
"columns with uniform probabilities will be added on each side. The\n"
"given number of classes (--class) should at least be the number of\n"
"initial motifs. If the number of classes is bigger than the number of"
"given motifs, the remaining classes are initialised randomly\n." ;
std::string opt_iter_msg = "The number of iterations." ;
std::string opt_class_msg = "The number of classes to find." ;
std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n"
"the data. By default, shifting is disabled (equivalent to\n"
"--shift 1)." ;
std::string opt_flip_msg = "Enables flipping to realign the data.";
std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n"
"contains the sequence background probabilities at each position\n"
"and is never updated." ;
std::string opt_seed_msg = "A value to seed the random number generator.";
// option parser
boost::program_options::variables_map vm ;
boost::program_options::options_description desc(desc_msg) ;
std::string seeding_tmp ;
desc.add_options()
("help,h", opt_help_msg.c_str())
("seq", po::value<std::string>(&(this->file_seq)), opt_seq_msg.c_str())
("motifs", po::value<std::string>(&(this->files_motif)), opt_motifs_msg.c_str())
("iter,i", po::value<size_t>(&(this->n_iter)), opt_iter_msg.c_str())
("class,c", po::value<size_t>(&(this->n_class)), opt_class_msg.c_str())
("shift,s", po::value<size_t>(&(this->n_shift)), opt_shift_msg.c_str())
("flip", opt_flip_msg.c_str())
("bgclass", opt_bckg_msg.c_str())
("seed", po::value<std::string>(&(this->seed)), opt_seed_msg.c_str())
("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;
// parse
try
{ po::store(po::parse_command_line(argn, argv, desc), vm) ;
po::notify(vm) ;
}
catch(std::invalid_argument& e)
{ std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
throw std::invalid_argument(msg) ;
}
catch(...)
{ throw std::invalid_argument("An unknown error occured while parsing the options") ; }
bool help = vm.count("help") ;
// checks unproper option settings
if(this->file_seq == "" and
(not help))
{ std::string msg("Error! No data were given (--seq)!") ;
throw std::invalid_argument(msg) ;
}
// no iter given -> 1 iter
if(this->n_iter == 0)
{ this->n_iter = 1 ; }
// no shift class given -> 1 class
if(this->n_class == 0)
{ this->n_class = 1 ; }
// no shift given, value of 1 -> no shift
if(this->n_shift == 0)
{ this->n_shift = 1 ; }
// set flip
if(vm.count("flip"))
{ this->flip = true ; }
// set background class
if(vm.count("bgclass"))
{ this->bckg_class = true ; }
// help invoked, run() cannot be invoked
if(help)
{ std::cout << desc << std::endl ;
this->runnable = false ;
return ;
}
// everything fine, run() can be called
else
{ this->runnable = true ;
return ;
}
}
Matrix3D<double> EMSequenceApplication::init_model(size_t model_len,
const Matrix2D<int>& data,
const std::vector<std::string>& motif_paths) const
{
int n_class_given = motif_paths.size() ;
int n_class_bckg = this->bckg_class ;
int n_class_rand = this->n_class - n_class_given - n_class_bckg ;
// number of classes should at least be number of motifs
if(n_class_given > (int)this->n_class)
{ char msg[4096] ;
sprintf(msg, "Error! number of class given (--class %zu) should at "
"least be equal to number of motifs (--motifs %d)",
this->n_class, n_class_given) ;
throw std::invalid_argument(msg) ;
}
// check if there is room for a background class
if((int)this->n_class < n_class_given+this->bckg_class)
{ char msg[4096] ;
sprintf(msg, "Error! no class left to add a background "
"class (--bgclass) with the given motifs (--motifs) (--class %zu)",
this->n_class) ;
throw std::invalid_argument(msg) ;
}
// init empty model
Matrix3D<double> model(this->n_class,
model_len,
4,
0.25) ;
// add given motifs
for(size_t i=0; i<motif_paths.size(); i++)
{ Matrix2D<double> matrix(motif_paths[i]) ;
// motif is too big for this shift
if(matrix.get_ncol() > model_len)
{ char msg[4096] ;
sprintf(msg,
"Error! In %s, motif column number is bigger "
"than data column number - shift + 1 "
"(%zu > %zu - %zu + 1)",
motif_paths[i].c_str(),
matrix.get_ncol(),
data.get_ncol(),
this->n_shift) ;
throw std::invalid_argument(msg) ;
}
// insert motif in middle of matrix
else
{ // size_t j_model = this->n_shift / 2 ;
size_t j_model = (model_len - matrix.get_ncol()) / 2 ;
for(size_t j_mat=0, j_mod=j_model; j_mat<matrix.get_ncol(); j_mat++, j_mod++)
{ for(size_t k=0; k<4; k++)
{ model(i,j_mod,k) = matrix(k,j_mat) ; }
}
}
}
// add random motifs and background class
// delegate this to EMSequence constructor
// (ensure that it is done properly)
if(n_class_rand > 0)
{ // initialise randomly
EMSequence em(data,
n_class_rand,
this->n_iter,
this->n_shift,
this->flip,
this->bckg_class,
this->seed,
this->n_threads) ;
Matrix3D<double> model_rand = em.get_sequence_models() ;
// copy them into model
for(int i_rand=0, i_mod=n_class_given; i_rand<n_class_rand; i_rand++, i_mod++)
{ for(int j=0; j<(int)model_len; j++)
{ for(int k=0; k<4; k++)
{ model(i_mod,j,k) = model_rand(i_rand,j,k) ; }
}
}
}
return model ;
}
int main(int argn, char** argv)
{ EMSequenceApplication app(argn, argv) ;
return app.run() ;
}

Event Timeline