No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Jul 19, 21:13

View Options

	diff --git a/src/Applications/EMConsensusSequenceApplication.cpp b/src/Applications/EMConsensusSequenceApplication.cpp
	index 05ccddd..391036a 100644
	--- a/src/Applications/EMConsensusSequenceApplication.cpp
	+++ b/src/Applications/EMConsensusSequenceApplication.cpp
	@@ -1,187 +1,271 @@

	#include <EMConsensusSequenceApplication.hpp>
	#include <EMConsensusSequence.hpp>

	#include <iostream>
	#include <string>
	#include <vector>
	#include <utility> // std::move()
	#include <stdexcept> // std::invalid_argument
	#include <boost/program_options.hpp>

	#include <Matrix2D.hpp>
	#include <Matrix4D.hpp>
	#include <matrix_utility.hpp>
	+#include <KmersStatistics.hpp> // kmer::compute_kmer_pvalue()
	+#include <sorting_utility.hpp> // order()
	+

	namespace po = boost::program_options ;


	EMConsensusSequenceApplication::EMConsensusSequenceApplication(int argn, char** argv)
	: file_consseq(""), file_filter(""), file_out(""),
	n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false),
	n_threads(0), seed(""), runnable(true)
	{
	// parse command line options and set the fields
	this->parseOptions(argn, argv) ;
	}

	int EMConsensusSequenceApplication::run()
	{ if(this->runnable)
	{ EMConsensusSequence* em(nullptr) ;

	// row filter
	std::vector<size_t> filter ;
	if(this->file_filter != "")
	{ // it is a column vector, easier to use the Matrix2D interface
	// to read it rather than coding a function for :)
	filter = Matrix2D<size_t>(this->file_filter).get_data() ;
	std::sort(filter.begin(), filter.end()) ;
	}

	// data
	Matrix3D<double> data ;
	data.load(this->file_consseq) ;
	// filter out some rows if needed
	if(filter.size())
	{ data = filter_rows(filter, data) ; }

	// seeds motifs randomly
	- em = new EMConsensusSequence(std::move(data),
	- this->n_class,
	- this->n_iter,
	- this->n_shift,
	- this->flip,
	- this->bckg_class,
	- this->seed,
	- this->n_threads) ;
	+ if(this->seed != "")
	+ { em = new EMConsensusSequence(std::move(data),
	+ this->n_class,
	+ this->n_iter,
	+ this->n_shift,
	+ this->flip,
	+ this->bckg_class,
	+ this->seed,
	+ this->n_threads) ;
	+ }
	+ // seeds from enriched kmers
	+ else
	+ { size_t model_ncol = data.get_dim()[1] - this->n_shift + 1 ;
	+
	+ Matrix3D<double> model = this->init_model_kmer(model_ncol,
	+ data) ;
	+ em = new EMConsensusSequence(std::move(data),
	+ std::move(model),
	+ this->n_iter,
	+ this->flip,
	+ this->n_threads) ;
	+ }

	// classify
	em->classify() ;
	em->get_post_prob().save(this->file_out) ;

	// clean
	delete em ;
	em = nullptr ;

	return EXIT_SUCCESS ;
	}
	else
	{ return EXIT_FAILURE ; }
	}

	void EMConsensusSequenceApplication::parseOptions(int argn, char** argv)
	{
	// no option to parse
	if(argv == nullptr)
	{ std::string message = "no options to parse!" ;
	throw std::invalid_argument(message) ;
	}

	// help messages
	std::string desc_msg = "\n"
	"EMConsensusSequence is a probabilistic partitioning algorithm that \n"
	"sofetly assigns consensus sequences to classes given their motif\n"
	"content.\n"
	"The assignment probabilities are written in binary format as a 4D "
	"matrix.\n\n" ;
	std::string opt_help_msg = "Produces this help message." ;
	std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n "
	"by default 0 (no parallelization)." ;
	std::string opt_consseq_msg = "The path to the file containing the consensus sequences" ;
	std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n"
	"indices of rows to filter out in the data." ;
	std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n"
	"in binary format." ;
	std::string opt_iter_msg = "The number of iterations." ;
	std::string opt_class_msg = "The number of classes to find." ;
	std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n"
	"the data. By default, shifting is disabled (equivalent to\n"
	"--shift 1)." ;
	std::string opt_flip_msg = "Enables flipping to realign the data.";
	std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n"
	"contains the sequence background probabilities at each position\n"
	"and is never updated." ;
	std::string opt_seed_msg = "A value to seed the random number generator.";

	// option parser
	boost::program_options::variables_map vm ;
	boost::program_options::options_description desc(desc_msg) ;

	std::string seeding_tmp ;

	desc.add_options()
	("help,h", opt_help_msg.c_str())

	("consseq", po::value<std::string>(&(this->file_consseq)), opt_consseq_msg.c_str())
	("filter", po::value<std::string>(&(this->file_filter)), opt_filter_msg.c_str())

	("out", po::value<std::string>(&(this->file_out)), opt_file_out_msg.c_str())

	("iter,i", po::value<size_t>(&(this->n_iter)), opt_iter_msg.c_str())
	("class,c", po::value<size_t>(&(this->n_class)), opt_class_msg.c_str())
	("shift,s", po::value<size_t>(&(this->n_shift)), opt_shift_msg.c_str())
	("flip", opt_flip_msg.c_str())
	("bgclass", opt_bckg_msg.c_str())

	("seed", po::value<std::string>(&(this->seed)), opt_seed_msg.c_str())
	("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;

	// parse
	try
	{ po::store(po::parse_command_line(argn, argv, desc), vm) ;
	po::notify(vm) ;
	}
	catch(std::invalid_argument& e)
	{ std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
	throw std::invalid_argument(msg) ;
	}
	catch(...)
	{ throw std::invalid_argument("An unknown error occured while parsing the options") ; }

	bool help = vm.count("help") ;

	// checks unproper option settings
	if(this->file_consseq == "" and
	(not help))
	{ std::string msg("Error! No data were given (--seq)!") ;
	throw std::invalid_argument(msg) ;
	}
	if(this->file_out == "" and
	(not help))
	{ std::string msg("Error! No output file given (--out)!") ;
	throw std::invalid_argument(msg) ;
	}

	// no iter given -> 1 iter
	if(this->n_iter == 0)
	{ this->n_iter = 1 ; }
	// no shift class given -> 1 class
	if(this->n_class == 0)
	{ this->n_class = 1 ; }
	// no shift given, value of 1 -> no shift
	if(this->n_shift == 0)
	{ this->n_shift = 1 ; }
	// set flip
	if(vm.count("flip"))
	{ this->flip = true ; }
	// set background class
	if(vm.count("bgclass"))
	{ this->bckg_class = true ; }
	// help invoked, run() cannot be invoked
	if(help)
	{ std::cout << desc << std::endl ;
	this->runnable = false ;
	return ;
	}
	// everything fine, run() can be called
	else
	{ this->runnable = true ;
	return ;
	}
	}

	+Matrix3D<double> EMConsensusSequenceApplication::init_model_kmer(size_t l_model,
	+ const Matrix3D<double>& data) const
	+{
	+ // leave space for 2N's on each side
	+ size_t l_kmer = l_model ;
	+ size_t n_n = 0 ; // so far, 0 N's added
	+ if(l_model > 4)
	+ { n_n = 2 ; // 2 N's on each side
	+ l_kmer -= (2*n_n) ;
	+ }
	+
	+ // compute the pvalue associated to each kmer
	+ auto kmers_pvalues = kmers::compute_kmer_pvalue(data, l_kmer) ;
	+
	+ // sort kmers by ascending pvalue
	+ std::vector<size_t> index = order(kmers_pvalues.second, true) ;
	+ // get most significant
	+ std::vector<std::string> kmers(this->n_class) ;
	+ for(size_t i=0; i<this->n_class; i++)
	+ { size_t idx = index[i] ;
	+ kmers[i] = kmers_pvalues.first[idx] ;
	+ std::cerr << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ;
	+ }
	+ // turn to motifs
	+ double p_base = 0.7 ; // the prob of the base matching these of the kmer
	+ double p_nbase = 0.1 ; // the prob of the bases not matching these of the kmer
	+ double p_n = 0.25 ; // the prob of N
	+ // only N's for now
	+ Matrix3D<double> model(this->n_class,
	+ l_model,
	+ 4,
	+ p_n) ;
	+ for(size_t i=0; i<kmers.size(); i++)
	+ { for(size_t j_kmer=0, j_model=n_n; j_kmer<l_kmer; j_kmer++)
	+ { // A
	+ if(kmers[i][j_kmer] == '0')
	+ { model(i,j_model, 0) = p_base ;
	+ model(i,j_model, 1) = p_nbase ;
	+ model(i,j_model, 2) = p_nbase ;
	+ model(i,j_model, 3) = p_nbase ;
	+ }
	+ // C
	+ else if(kmers[i][j_kmer] == '1')
	+ { model(i,j_model, 0) = p_nbase ;
	+ model(i,j_model, 1) = p_base ;
	+ model(i,j_model, 2) = p_nbase ;
	+ model(i,j_model, 3) = p_nbase ;
	+ }
	+ // G
	+ else if(kmers[i][j_kmer] == '2')
	+ { model(i,j_model, 0) = p_nbase ;
	+ model(i,j_model, 1) = p_nbase ;
	+ model(i,j_model, 2) = p_base ;
	+ model(i,j_model, 3) = p_nbase ;
	+ }
	+ // T
	+ else if(kmers[i][j_kmer] == '3')
	+ { model(i,j_model, 0) = p_nbase ;
	+ model(i,j_model, 1) = p_nbase ;
	+ model(i,j_model, 2) = p_nbase ;
	+ model(i,j_model, 3) = p_base ;
	+ }
	+ }
	+ }
	+ return model ;
	+}
	+
	int main(int argn, char** argv)
	{ EMConsensusSequenceApplication app(argn, argv) ;
	return app.run() ;
	}

	diff --git a/src/Applications/EMConsensusSequenceApplication.hpp b/src/Applications/EMConsensusSequenceApplication.hpp
	index f356f4d..7064fee 100644
	--- a/src/Applications/EMConsensusSequenceApplication.hpp
	+++ b/src/Applications/EMConsensusSequenceApplication.hpp
	@@ -1,117 +1,143 @@
	#ifndef EMCONSENSUSSEQUENCEAPPLICATION_HPP
	#define EMCONSENSUSSEQUENCEAPPLICATION_HPP

	#include <ApplicationInterface.hpp>

	#include <string>
	#include <vector>

	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>

	/*!
	* \brief The EMConsensusSequenceApplication class is a wrapper around an
	* EMConsensusSequence instance creating an autonomous application to classify
	* consensus sequences by directly passing all the options and parameters from
	* the command line.
	*/
	class EMConsensusSequenceApplication: public ApplicationInterface
	{
	public:
	EMConsensusSequenceApplication() = delete ;
	EMConsensusSequenceApplication(const EMConsensusSequenceApplication& app) = delete ;
	/*!
	* \brief Constructs an object from the command line
	* options.
	* \param argn the number of options passed to the
	* main() function.
	* \param argv the vector of options passed to the
	* main() function.
	*/
	EMConsensusSequenceApplication(int argn, char** argv) ;

	/*!
	* \brief Runs the application. The data are classified
	* using the given settings and the posterior probability
	* matrix is returned through the stdout.
	* The matrix is a 4D matrix with dimensions :
	* regions, class, shift flip.
	* \return an exit code EXIT_SUCCESS or EXIT_FAILURE
	* to return to the OS.
	*/
	virtual int run() override ;

	private:
	/*!
	* \brief Parses the program command line options and
	* sets the object field accordingly.
	* If the help option is detected, the "runnable"
	* field is set to false and subsequent calls to
	* run() will produce nothing.
	* \param argn the number of options passed to the
	* main() function.
	* \param argv the vector of options passed to the
	* main() function.
	* \throw std::invalid_argument if an error is found
	* in the program options.
	*/
	void parseOptions(int argn, char** argv) ;

	+ /*!
	+ * \brief Initialise the class models from the
	+ * most significantly enriched kmers in the data.
	+ * The kmers are turned into probability matrices
	+ * using the following rules :
	+ * 1) the probability of a base matching a base at
	+ * a given position of the kmer is set to 0.7, all
	+ * other to 0.1
	+ * 2) if the model length is bigger than 4, two
	+ * N's are added on each side of the central kmer
	+ * core (prob are 0.25, 0,25, 0.25, 0.25).
	+ * \param l_model the length of the model.
	+ * \param data the sequences in integer format
	+ * (A:0, C:1, G:2, T:3).
	+ * Its dimensions are the following :
	+ * 1st the number of sequences
	+ * 2nd the length of the sequences.
	+ * \return The models initialised.
	+ * Its dimensions are the following :
	+ * 1st the number of classes
	+ * 2nd the length of the model
	+ * 3rd 4 for A, C, G, T
	+ */
	+ Matrix3D<double> init_model_kmer(size_t l_model,
	+ const Matrix3D<double>& data) const ;
	+
	/*!
	* \brief the paths to the file containing the
	* consensus sequence data.
	*/
	std::string file_consseq ;

	/*!
	* \brief the path to the file containing the
	* (0-based) index of the rows to filter out
	* from the data.
	*/
	std::string file_filter ;

	/*!
	* \brief the path to the file in which the probability
	* matrix will be saved.
	*/
	std::string file_out ;

	/*!
	* \brief the number of classes to partition the data into.
	*/
	size_t n_class ;
	/*!
	* \brief the number of iterations allowed.
	*/
	size_t n_iter ;
	/*!
	* \brief the shifting freedom.
	*/
	size_t n_shift ;
	/*!
	* \brief whether flipping freedom is allowed.
	*/
	bool flip ;
	/*!
	* \brief whether a constant class to model the
	* sequence background should be added. This
	* class has the sequence background probabilities
	* at each position.
	*/
	bool bckg_class ;
	/*!
	* \brief the number of threads.
	*/
	size_t n_threads ;
	/*!
	* \brief a seed to initialise the random number generator.
	*/
	std::string seed ;
	/*!
	* \brief a flag indicating whether the core of run() can be
	* run or not.
	*/
	bool runnable ;
	} ;

	#endif // EMCONSENSUSSEQUENCEAPPLICATION_HPP
	diff --git a/src/Applications/EMSequenceApplication.cpp b/src/Applications/EMSequenceApplication.cpp
	index 1b8ab5f..3d38eb4 100644
	--- a/src/Applications/EMSequenceApplication.cpp
	+++ b/src/Applications/EMSequenceApplication.cpp
	@@ -1,295 +1,377 @@

	#include <EMSequenceApplication.hpp>
	#include <EMSequence.hpp>

	#include <iostream>
	#include <string>
	#include <utility> // std::move()
	#include <stdexcept> // std::invalid_argument
	#include <boost/program_options.hpp>
	#include <boost/algorithm/string.hpp> // boost::split()

	#include <Matrix2D.hpp>
	#include <Matrix4D.hpp>
	#include <matrix_utility.hpp> // filter()
	-
	+#include <KmersStatistics.hpp> // kmer::compute_kmer_pvalue()
	+#include <sorting_utility.hpp> // order()

	namespace po = boost::program_options ;


	EMSequenceApplication::EMSequenceApplication(int argn, char** argv)
	: file_seq(""), files_motif(""), file_filter(""), file_out(""),
	n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false),
	n_threads(0), seed(""), runnable(true)
	{
	// parse command line options and set the fields
	this->parseOptions(argn, argv) ;
	}

	int EMSequenceApplication::run()
	{ if(this->runnable)
	{ EMSequence* em(nullptr) ;

	// data
	Matrix2D<int> data(this->file_seq) ;

	// filter out some rows if needed
	std::vector<size_t> filter ;
	if(this->file_filter != "")
	{ // it is a column vector, easier to use the Matrix2D interface
	// to read it rather than coding a function for :)
	filter = Matrix2D<size_t>(this->file_filter).get_data() ;
	std::sort(filter.begin(), filter.end()) ;
	data = filter_rows(filter, data) ;
	}

	// seeds motifs randomly
	- if(this->files_motif == "")
	+ if(this->files_motif == "" and
	+ this->seed != "")
	{ em = new EMSequence(std::move(data),
	this->n_class,
	this->n_iter,
	this->n_shift,
	this->flip,
	this->bckg_class,
	this->seed,
	this->n_threads) ;
	}
	// seeds motifs with the given matrices
	- else
	+ else if(this->files_motif != "")
	{ // model
	std::vector<std::string> motif_paths ;
	boost::split(motif_paths, this->files_motif, [](char c){return c == ',';}) ;
	// this->n_class = motif_paths.size() + this->bckg_class ;
	size_t model_ncol = data.get_ncol() - this->n_shift + 1 ;

	// add the given motif, random motifs (if needed) and
	// background class (if needed)
	+
	Matrix3D<double> model = this->init_model(model_ncol,
	data,
	motif_paths) ;

	em = new EMSequence(std::move(data),
	std::move(model),
	this->n_iter,
	this->flip,
	this->n_threads) ;
	}
	+ // seeds from enriched kmers
	+ else
	+ { size_t model_ncol = data.get_ncol() - this->n_shift + 1 ;

	+ Matrix3D<double> model = this->init_model_kmer(model_ncol,
	+ data) ;
	+ em = new EMSequence(std::move(data),
	+ std::move(model),
	+ this->n_iter,
	+ this->flip,
	+ this->n_threads) ;
	+ }
	// classify
	em->classify() ;
	em->get_post_prob().save(this->file_out) ;

	// clean
	delete em ;
	em = nullptr ;

	return EXIT_SUCCESS ;
	}
	else
	{ return EXIT_FAILURE ; }
	}

	void EMSequenceApplication::parseOptions(int argn, char** argv)
	{
	// no option to parse
	if(argv == nullptr)
	{ std::string message = "no options to parse!" ;
	throw std::invalid_argument(message) ;
	}

	// help messages
	std::string desc_msg = "\n"
	"EMSequence is a probabilistic partitioning algorithm that \n"
	"sofetly assigns sequences to classes given their motif content.\n"
	"The assignment probabilities are written in binary format as a 4D "
	"matrix.\n\n" ;
	std::string opt_help_msg = "Produces this help message." ;
	std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n "
	"by default 0 (no parallelization)." ;
	std::string opt_seq_msg = "The path to the file containing the sequences" ;
	std::string opt_motifs_msg = "A coma separated list of path to files containing the initial motifs\n"
	"values. The motifs should be probability matrices in horizontal format.\n"
	"If the motifs are too short after accounting for shifting, extra\n"
	"columns with uniform probabilities will be added on each side. The\n"
	"given number of classes (--class) should at least be the number of\n"
	"initial motifs. If the number of classes is bigger than the number of"
	"given motifs, the remaining classes are initialised randomly\n." ;
	std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n"
	"indices of rows to filter out in the data." ;
	std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n"
	"in binary format." ;
	std::string opt_iter_msg = "The number of iterations." ;
	std::string opt_class_msg = "The number of classes to find." ;
	std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n"
	"the data. By default, shifting is disabled (equivalent to\n"
	"--shift 1)." ;
	std::string opt_flip_msg = "Enables flipping to realign the data.";
	std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n"
	"contains the sequence background probabilities at each position\n"
	"and is never updated." ;
	std::string opt_seed_msg = "A value to seed the random number generator.";

	// option parser
	boost::program_options::variables_map vm ;
	boost::program_options::options_description desc(desc_msg) ;

	std::string seeding_tmp ;

	desc.add_options()
	("help,h", opt_help_msg.c_str())

	("seq", po::value<std::string>(&(this->file_seq)), opt_seq_msg.c_str())
	("motifs", po::value<std::string>(&(this->files_motif)), opt_motifs_msg.c_str())
	("filter", po::value<std::string>(&(this->file_filter)), opt_filter_msg.c_str())
	("out", po::value<std::string>(&(this->file_out)), opt_file_out_msg.c_str())

	("iter,i", po::value<size_t>(&(this->n_iter)), opt_iter_msg.c_str())
	("class,c", po::value<size_t>(&(this->n_class)), opt_class_msg.c_str())
	("shift,s", po::value<size_t>(&(this->n_shift)), opt_shift_msg.c_str())
	("flip", opt_flip_msg.c_str())
	("bgclass", opt_bckg_msg.c_str())

	("seed", po::value<std::string>(&(this->seed)), opt_seed_msg.c_str())
	("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;

	// parse
	try
	{ po::store(po::parse_command_line(argn, argv, desc), vm) ;
	po::notify(vm) ;
	}
	catch(std::invalid_argument& e)
	{ std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
	throw std::invalid_argument(msg) ;
	}
	catch(...)
	{ throw std::invalid_argument("An unknown error occured while parsing the options") ; }

	bool help = vm.count("help") ;

	// checks unproper option settings
	if(this->file_seq == "" and
	(not help))
	{ std::string msg("Error! No data were given (--seq)!") ;
	throw std::invalid_argument(msg) ;
	}
	if(this->file_out == "" and
	(not help))
	{ std::string msg("Error! No output file given (--out)!") ;
	throw std::invalid_argument(msg) ;
	}

	// no iter given -> 1 iter
	if(this->n_iter == 0)
	{ this->n_iter = 1 ; }
	// no shift class given -> 1 class
	if(this->n_class == 0)
	{ this->n_class = 1 ; }
	// no shift given, value of 1 -> no shift
	if(this->n_shift == 0)
	{ this->n_shift = 1 ; }
	// set flip
	if(vm.count("flip"))
	{ this->flip = true ; }
	// set background class
	if(vm.count("bgclass"))
	{ this->bckg_class = true ; }
	// help invoked, run() cannot be invoked
	if(help)
	{ std::cout << desc << std::endl ;
	this->runnable = false ;
	return ;
	}
	// everything fine, run() can be called
	else
	{ this->runnable = true ;
	return ;
	}
	}

	-Matrix3D<double> EMSequenceApplication::init_model(size_t model_len,
	+
	+Matrix3D<double> EMSequenceApplication::init_model(size_t l_model,
	const Matrix2D<int>& data,
	const std::vector<std::string>& motif_paths) const
	{
	int n_class_given = motif_paths.size() ;
	int n_class_bckg = this->bckg_class ;
	int n_class_rand = this->n_class - n_class_given - n_class_bckg ;

	// number of classes should at least be number of motifs
	if(n_class_given > (int)this->n_class)
	{ char msg[4096] ;
	sprintf(msg, "Error! number of class given (--class %zu) should at "
	"least be equal to number of motifs (--motifs %d)",
	this->n_class, n_class_given) ;
	throw std::invalid_argument(msg) ;
	}
	// check if there is room for a background class
	if((int)this->n_class < n_class_given+this->bckg_class)
	{ char msg[4096] ;
	sprintf(msg, "Error! no class left to add a background "
	"class (--bgclass) with the given motifs (--motifs) (--class %zu)",
	this->n_class) ;
	throw std::invalid_argument(msg) ;
	}

	// init empty model
	Matrix3D<double> model(this->n_class,
	- model_len,
	+ l_model,
	4,
	0.25) ;
	// add given motifs
	for(size_t i=0; i<motif_paths.size(); i++)
	{ Matrix2D<double> matrix(motif_paths[i]) ;
	// motif is too big for this shift
	- if(matrix.get_ncol() > model_len)
	+ if(matrix.get_ncol() > l_model)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! In %s, motif column number is bigger "
	"than data column number - shift + 1 "
	"(%zu > %zu - %zu + 1)",
	motif_paths[i].c_str(),
	matrix.get_ncol(),
	data.get_ncol(),
	this->n_shift) ;
	throw std::invalid_argument(msg) ;
	}
	// insert motif in middle of matrix
	else
	{ // size_t j_model = this->n_shift / 2 ;
	- size_t j_model = (model_len - matrix.get_ncol()) / 2 ;
	+ size_t j_model = (l_model - matrix.get_ncol()) / 2 ;
	for(size_t j_mat=0, j_mod=j_model; j_mat<matrix.get_ncol(); j_mat++, j_mod++)
	{ for(size_t k=0; k<4; k++)
	{ model(i,j_mod,k) = matrix(k,j_mat) ; }
	}
	}
	}

	// add random motifs and background class
	// delegate this to EMSequence constructor
	// (ensure that it is done properly)
	if(n_class_rand > 0)
	{ // initialise randomly
	EMSequence em(data,
	n_class_rand,
	this->n_iter,
	this->n_shift,
	this->flip,
	this->bckg_class,
	this->seed,
	this->n_threads) ;
	Matrix3D<double> model_rand = em.get_sequence_models() ;
	// copy them into model
	for(int i_rand=0, i_mod=n_class_given; i_rand<n_class_rand; i_rand++, i_mod++)
	- { for(int j=0; j<(int)model_len; j++)
	+ { for(int j=0; j<(int)l_model; j++)
	{ for(int k=0; k<4; k++)
	{ model(i_mod,j,k) = model_rand(i_rand,j,k) ; }
	}
	}
	}
	return model ;
	}

	+Matrix3D<double> EMSequenceApplication::init_model_kmer(size_t l_model,
	+ const Matrix2D<int>& data) const
	+{
	+ // leave space for 2N's on each side
	+ size_t l_kmer = l_model ;
	+ size_t n_n = 0 ; // so far, 0 N's added
	+ if(l_model > 4)
	+ { n_n = 2 ; // 2 N's on each side
	+ l_kmer -= (2*n_n) ;
	+ }
	+
	+ // compute the pvalue associated to each kmer
	+ auto kmers_pvalues = kmers::compute_kmer_pvalue(data, l_kmer) ;
	+
	+ // sort kmers by ascending pvalue
	+ std::vector<size_t> index = order(kmers_pvalues.second, true) ;
	+ // get most significant
	+ std::vector<std::string> kmers(this->n_class) ;
	+ for(size_t i=0; i<this->n_class; i++)
	+ { size_t idx = index[i] ;
	+ kmers[i] = kmers_pvalues.first[idx] ;
	+ std::cerr << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ;
	+ }
	+ // turn to motifs
	+ double p_base = 0.7 ; // the prob of the base matching these of the kmer
	+ double p_nbase = 0.1 ; // the prob of the bases not matching these of the kmer
	+ double p_n = 0.25 ; // the prob of N
	+ // only N's for now
	+ Matrix3D<double> model(this->n_class,
	+ l_model,
	+ 4,
	+ p_n) ;
	+ for(size_t i=0; i<kmers.size(); i++)
	+ { for(size_t j_kmer=0, j_model=n_n; j_kmer<l_kmer; j_kmer++)
	+ { // A
	+ if(kmers[i][j_kmer] == '0')
	+ { model(i,j_model, 0) = p_base ;
	+ model(i,j_model, 1) = p_nbase ;
	+ model(i,j_model, 2) = p_nbase ;
	+ model(i,j_model, 3) = p_nbase ;
	+ }
	+ // C
	+ else if(kmers[i][j_kmer] == '1')
	+ { model(i,j_model, 0) = p_nbase ;
	+ model(i,j_model, 1) = p_base ;
	+ model(i,j_model, 2) = p_nbase ;
	+ model(i,j_model, 3) = p_nbase ;
	+ }
	+ // G
	+ else if(kmers[i][j_kmer] == '2')
	+ { model(i,j_model, 0) = p_nbase ;
	+ model(i,j_model, 1) = p_nbase ;
	+ model(i,j_model, 2) = p_base ;
	+ model(i,j_model, 3) = p_nbase ;
	+ }
	+ // T
	+ else if(kmers[i][j_kmer] == '3')
	+ { model(i,j_model, 0) = p_nbase ;
	+ model(i,j_model, 1) = p_nbase ;
	+ model(i,j_model, 2) = p_nbase ;
	+ model(i,j_model, 3) = p_base ;
	+ }
	+ }
	+ }
	+ return model ;
	+}
	+
	int main(int argn, char** argv)
	{ EMSequenceApplication app(argn, argv) ;
	return app.run() ;
	}

	diff --git a/src/Applications/EMSequenceApplication.hpp b/src/Applications/EMSequenceApplication.hpp
	index 1047b58..cf72af7 100644
	--- a/src/Applications/EMSequenceApplication.hpp
	+++ b/src/Applications/EMSequenceApplication.hpp
	@@ -1,141 +1,171 @@
	#ifndef EMSEQUENCEAPPLICATION_HPP
	#define EMSEQUENCEAPPLICATION_HPP

	#include <ApplicationInterface.hpp>

	#include <string>
	#include <vector>

	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>

	/*!
	* \brief The EMSequenceApplication class is a wrapper around an EMSequence
	* instance creating an autonomous application to classify sequences by directly
	* passing all the options and parameters from the command line.
	*/
	class EMSequenceApplication: public ApplicationInterface
	{
	public:
	EMSequenceApplication() = delete ;
	EMSequenceApplication(const EMSequenceApplication& app) = delete ;
	/*!
	* \brief Constructs an object from the command line
	* options.
	* \param argn the number of options passed to the
	* main() function.
	* \param argv the vector of options passed to the
	* main() function.
	*/
	EMSequenceApplication(int argn, char** argv) ;

	/*!
	* \brief Runs the application. The data are classified
	* using the given settings and the posterior probability
	* matrix is returned through the stdout.
	* The matrix is a 4D matrix with dimensions :
	* regions, class, shift flip.
	* \return an exit code EXIT_SUCCESS or EXIT_FAILURE
	* to return to the OS.
	*/
	virtual int run() override ;

	private:
	/*!
	* \brief Parses the program command line options and
	* sets the object field accordingly.
	* If the help option is detected, the "runnable"
	* field is set to false and subsequent calls to
	* run() will produce nothing.
	* \param argn the number of options passed to the
	* main() function.
	* \param argv the vector of options passed to the
	* main() function.
	* \throw std::invalid_argument if an error is found
	* in the program options.
	*/
	void parseOptions(int argn, char** argv) ;

	+ /*!
	+ * \brief Initialise the class models from the
	+ * most significantly enriched kmers in the data.
	+ * The kmers are turned into probability matrices
	+ * using the following rules :
	+ * 1) the probability of a base matching a base at
	+ * a given position of the kmer is set to 0.7, all
	+ * other to 0.1
	+ * 2) if the model length is bigger than 4, two
	+ * N's are added on each side of the central kmer
	+ * core (prob are 0.25, 0,25, 0.25, 0.25).
	+ * \param l_model the length of the model.
	+ * \param data the sequences in integer format
	+ * (A:0, C:1, G:2, T:3).
	+ * Its dimensions are the following :
	+ * 1st the number of sequences
	+ * 2nd the length of the sequences.
	+ * \return The models initialised.
	+ * Its dimensions are the following :
	+ * 1st the number of classes
	+ * 2nd the length of the model
	+ * 3rd 4 for A, C, G, T
	+ */
	+ Matrix3D<double> init_model_kmer(size_t l_model,
	+ const Matrix2D<int>& data) const ;
	+
	/*!
	* \brief Initialise the class models if matrices
	* are given as initial class motifs.
	* If the given class motifs are shorter than the
	* model after accounting for shifting, extra columns
	* with uniform probabilities will be added on each
	* side.
	* If the number of classes is higher than the
	* number of given motifs, extra classes will be
	* initialised randomly.A background class is included
	* if needed.
	- * \param model_len the number of positions (columns)
	+ * \param l_model the number of positions (columns)
	* of the model to initialise.
	* \param data the sequence matrix, in integer format.
	* \param motif_paths the paths to the files containing
	* the probability matrices to use to initialise the
	* class motifs.
	- * \return
	+ * \return The models initialised.
	+ * Its dimensions are the following :
	+ * 1st the number of classes
	+ * 2nd the length of the model
	+ * 3rd 4 for A, C, G, T
	*/
	- Matrix3D<double> init_model(size_t model_len,
	+ Matrix3D<double> init_model(size_t l_model,
	const Matrix2D<int>& data,
	const std::vector<std::string>& motif_paths) const ;

	/*!
	* \brief the paths to the file containing the sequence
	* data.
	*/
	std::string file_seq ;
	/*!
	* \brief a coma separated list of files containing the
	* initial motif matrices.
	*/
	std::string files_motif ;
	/*!
	* \brief the path to the file containing the
	* (0-based) index of the rows to filter out
	* from the data.
	*/
	std::string file_filter ;
	/*!
	* \brief the path to the file in which the probability
	* matrix will be saved.
	*/
	std::string file_out ;
	/*!
	* \brief the number of classes to partition the data into.
	*/
	size_t n_class ;
	/*!
	* \brief the number of iterations allowed.
	*/
	size_t n_iter ;
	/*!
	* \brief the shifting freedom.
	*/
	size_t n_shift ;
	/*!
	* \brief whether flipping freedom is allowed.
	*/
	bool flip ;
	/*!
	* \brief whether a constant class to model the
	* sequence background should be added. This
	* class has the sequence background probabilities
	* at each position.
	*/
	bool bckg_class ;
	/*!
	* \brief the number of threads.
	*/
	size_t n_threads ;
	/*!
	* \brief a seed to initialise the random number generator.
	*/
	std::string seed ;
	/*!
	* \brief a flag indicating whether the core of run() can be
	* run or not.
	*/
	bool runnable ;
	} ;

	#endif // EMSEQUENCEAPPLICATION_HPP
	diff --git a/src/Clustering/ConsensusSequenceLayer.cpp b/src/Clustering/ConsensusSequenceLayer.cpp
	index 05c7e1e..a513f38 100644
	--- a/src/Clustering/ConsensusSequenceLayer.cpp
	+++ b/src/Clustering/ConsensusSequenceLayer.cpp
	@@ -1,371 +1,385 @@
	#include <ConsensusSequenceLayer.hpp>

	#include <vector>
	#include <future> // std::promise, std::future
	#include <stdexcept> // std::invalid_argument
	#include <cmath> // log()
	#include <functional> // std::bind(), std::ref()

	#include <ThreadPool.hpp>
	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>
	#include <Matrix4D.hpp>
	#include <dna_utility.hpp> // dna::base_composition()


	double ConsensusSequenceLayer::score_consensus_subseq(const Matrix3D<double>& cons_seq,
	size_t row,
	size_t start,
	const Matrix2D<double>& model)
	{ size_t ncol = cons_seq.get_dim()[1] ;
	size_t dim3 = cons_seq.get_dim()[2] ;

	if(start > ncol - model.get_nrow())
	{ char msg[4096] ;
	sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
	start, ncol - model.get_nrow()) ;
	throw std::invalid_argument(msg) ;
	}
	else if(model.get_nrow() > ncol)
	{ char msg[4096] ;
	sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
	model.get_nrow(), ncol) ;
	throw std::invalid_argument(msg) ;
	}
	else if(model.get_ncol() != 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
	model.get_ncol()) ;
	throw std::invalid_argument(msg) ;
	}
	else if(dim3 != 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
	dim3) ;
	throw std::invalid_argument(msg) ;
	}

	size_t from = start ;
	size_t to = from + model.get_nrow() ; // will score [from, to)


	double ll = 0 ;

	for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
	{ double sum = 0. ;
	for(size_t k=0; k<dim3; k++)
	{ sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ; }
	ll += log(sum) ;
	}

	return ll ;
	}


	ConsensusSequenceLayer::ConsensusSequenceLayer(const Matrix3D<double>& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool bckg_class)
	: Data3DLayer(data, n_class, n_shift, flip,bckg_class)
	{
	this->n_category = 4 ;

	// initialise the empty model
	this->model = Matrix3D<double>(this->n_class,
	this->l_model,
	this->n_category,
	0.) ;
	// background class
	if(this->bckg_class)
	{ this->create_bckg_class() ; }
	}

	ConsensusSequenceLayer::ConsensusSequenceLayer(Matrix3D<double>&& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool bckg_class)
	: Data3DLayer(std::move(data), n_class, n_shift, flip, bckg_class)
	{
	this->n_category = 4 ;

	// initialise the empty model
	this->model = Matrix3D<double>(this->n_class,
	this->l_model,
	this->n_category,
	0.) ;

	// background class
	if(this->bckg_class)
	{ this->create_bckg_class() ; }
	}

	+ConsensusSequenceLayer::ConsensusSequenceLayer(const Matrix3D<double>& data,
	+ const Matrix3D<double>& model,
	+ bool flip,
	+ bool bckg_class)
	+ : Data3DLayer(data, model,flip, bckg_class)
	+{}
	+
	+ConsensusSequenceLayer::ConsensusSequenceLayer(Matrix3D<double>&& data,
	+ Matrix3D<double>&& model,
	+ bool flip,
	+ bool bckg_class)
	+ : Data3DLayer(std::move(data), std::move(model),flip, bckg_class)
	+{}
	+
	ConsensusSequenceLayer::~ConsensusSequenceLayer()
	{ ; }

	void ConsensusSequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
	vector_d& loglikelihood_max,
	ThreadPool* threads) const
	{
	// dimension checks
	this->check_loglikelihood_dim(loglikelihood) ;
	this->check_loglikelihood_max_dim(loglikelihood_max) ;

	// compute the prob model and the prob reverse-complement model
	std::vector<Matrix2D<double>> model(this->n_class,
	Matrix2D<double>(this->l_model,
	this->n_category,
	0.)) ;
	std::vector<Matrix2D<double>> model_rev = model ;
	for(size_t i=0; i<this->n_class; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ // forward
	model[i](j,k) = this->model(i,j,k) ;
	// reverse
	model_rev[i](this->l_model-j-1,this->n_category-k-1)
	= this->model(i,j,k) ;
	}
	}
	}

	// don't parallelize
	if(threads == nullptr)
	{ std::promise<bool> promise ;
	std::future<bool> future = promise.get_future() ;
	this->compute_loglikelihoods_routine(0, this->n_dim1,
	loglikelihood,
	loglikelihood_max,
	model,
	model_rev,
	promise) ;
	future.get() ;
	}
	// parallelize
	else
	{ size_t n_threads = threads->getNThread() ;
	// compute the slices on which each thread will work
	std::vector<std::pair<size_t,size_t>> slices =
	ThreadPool::split_range(0, this->n_dim1, n_threads) ;

	// get promises and futures
	// the function run by the threads will simply fill the promise with
	// "true" to indicate that they are done
	std::vector<std::promise<bool>> promises(n_threads) ;
	std::vector<std::future<bool>> futures(n_threads) ;
	for(size_t i=0; i<n_threads; i++)
	{ futures[i] = promises[i].get_future() ; }
	// distribute work to threads
	// -------------------------- threads start --------------------------
	for(size_t i=0; i<n_threads; i++)
	{ auto slice = slices[i] ;
	threads->addJob(std::move(
	std::bind(&ConsensusSequenceLayer::compute_loglikelihoods_routine,
	this,
	slice.first,
	slice.second,
	std::ref(loglikelihood),
	std::ref(loglikelihood_max),
	std::ref(model),
	std::ref(model_rev),
	std::ref(promises[i])))) ;
	}
	// wait until all threads are done working
	for(auto& future : futures)
	{ future.get() ; }
	// -------------------------- threads stop ---------------------------
	}
	}

	void ConsensusSequenceLayer::compute_loglikelihoods_routine(size_t from,
	size_t to,
	Matrix4D<double>& loglikelihood,
	vector_d& loglikelihood_max,
	const std::vector<Matrix2D<double>>& model,
	const std::vector<Matrix2D<double>>& model_rev,
	std::promise<bool>& done) const
	{
	// compute log likelihood
	for(size_t i=from; i<to; i++)
	{
	// set max to min possible value
	loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;

	for(size_t j=0; j<this->n_class; j++)
	{
	for(size_t s=0; s<this->n_shift; s++)
	{ // forward strand
	{ double ll_fw = score_consensus_subseq(this->data, i, s, model[j]) ;
	loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
	// keep track of max per row
	if(ll_fw > loglikelihood_max[i])
	{ loglikelihood_max[i] = ll_fw ; }

	}
	// reverse
	if(this->flip)
	{ double ll_rev = score_consensus_subseq(this->data, i, s, model_rev[j]) ;
	loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
	// keep track of max per row
	if(ll_rev > loglikelihood_max[i])
	{ loglikelihood_max[i] = ll_rev ; }
	}
	}
	}
	}
	done.set_value(true) ;
	}

	void ConsensusSequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
	ThreadPool* threads)
	{ // don't parallelize
	if(threads == nullptr)
	{ std::promise<Matrix3D<double>> promise ;
	std::future<Matrix3D<double>> future = promise.get_future() ;
	this->update_model_routine(0,
	this->n_dim1,
	posterior_prob,
	promise) ;
	// this->model = future.get() ;
	auto model = future.get() ;
	size_t n_class_to_update = this->n_class - this->bckg_class ;
	for(size_t i=0; i<n_class_to_update; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ this->model(i,j,k) = model(i,j,k) ; }
	}
	}
	}
	// parallelize
	else
	{ size_t n_threads = threads->getNThread() ;
	// compute the slices on which each thread will work
	std::vector<std::pair<size_t,size_t>> slices =
	ThreadPool::split_range(0, this->n_dim1, n_threads) ;

	// get promises and futures
	// the function run by the threads will simply fill the promise with
	// "true" to indicate that they are done
	std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
	std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
	for(size_t i=0; i<n_threads; i++)
	{ futures[i] = promises[i].get_future() ; }
	// distribute work to threads
	// -------------------------- threads start --------------------------
	for(size_t i=0; i<n_threads; i++)
	{ auto slice = slices[i] ;
	threads->addJob(std::move(
	std::bind(&ConsensusSequenceLayer::update_model_routine,
	this,
	slice.first,
	slice.second,
	std::ref(posterior_prob),
	std::ref(promises[i])))) ;
	}
	// reinitialise the model
	size_t n_class_to_update = this->n_class - this->bckg_class ;
	for(size_t i=0; i<n_class_to_update; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ this->model(i,j,k) = 0. ; }
	}
	}
	// wait until all threads are done working
	// and update the model
	for(auto& future : futures)
	{ Matrix3D<double> model_part = future.get() ;
	// for(size_t i=0; i<this->n_class; i++)
	for(size_t i=0; i<n_class_to_update; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ this->model(i,j,k) += model_part(i,j,k) ; }
	}
	}
	}
	// -------------------------- threads stop ---------------------------
	}
	// make sure to have no 0 values
	for(size_t i=0; i<this->n_class; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ this->model(i,j,k) =
	std::max(this->model(i,j,k), ConsensusSequenceLayer::p_min) ;
	}
	}
	}
	// normalize to get probs
	for(size_t i=0; i<this->n_class; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ double sum = 0. ;
	for(size_t k=0; k<this->n_category; k++)
	{ sum += this->model(i,j,k) ; }
	for(size_t k=0; k<this->n_category; k++)
	{ double p = this->model(i,j,k) / sum ;
	this->model(i,j,k) = p ;
	}
	}
	}
	}

	void ConsensusSequenceLayer::update_model_routine(size_t from,
	size_t to,
	const Matrix4D<double>& posterior_prob,
	std::promise<Matrix3D<double>>& promise) const
	{ // dimension checks
	this->check_posterior_prob_dim(posterior_prob) ;

	Matrix3D<double> model(this->n_class,
	this->l_model,
	this->n_category,
	0.) ;

	size_t n_class_to_update = this->n_class - this->bckg_class ;

	for(size_t k=0; k < n_class_to_update; k++)
	{ for(size_t s=0; s<this->n_shift; s++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ // base prob on fw and rv strand
	vector_d base_prob_fw(this->n_category, 0.) ;
	vector_d base_prob_rv(this->n_category, 0.) ;
	for(size_t i=from; i<to; i++)
	{ for(size_t base=0, base_rv=3; base<this->n_dim3; base++, base_rv--)
	{ // --------------- forward ---------------
	{ base_prob_fw[base] +=
	this->data(i,s+j,base) *
	posterior_prob(i,k,s,ConsensusSequenceLayer::FORWARD) ;
	}
	// --------------- reverse ---------------
	if(this->flip)
	{
	base_prob_rv[base_rv] +=
	this->data(i,s+j,base) *
	posterior_prob(i,k,s,ConsensusSequenceLayer::REVERSE) ;
	}
	}
	}
	// update this position of the model
	for(size_t i=0,i_rev=base_prob_fw.size()-1; i<base_prob_fw.size(); i++,i_rev--)
	{ // --------------- forward ---------------
	{ model(k,j,i) += base_prob_fw[i] ; }
	// --------------- reverse ---------------
	if(this->flip)
	{ model(k,this->l_model-j-1,i) += base_prob_rv[i] ; }
	}
	}
	}
	}
	promise.set_value(model) ;
	}

	void ConsensusSequenceLayer::create_bckg_class()
	{ // sequence composition
	std::vector<double> base_comp =
	dna::base_composition(this->data,
	this->flip) ;
	// set last class as background class
	for(size_t i=0; i<this->n_category; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ this->model(this->n_class-1,j,i) = base_comp[i] ; }
	}
	}
	diff --git a/src/Clustering/ConsensusSequenceLayer.hpp b/src/Clustering/ConsensusSequenceLayer.hpp
	index c0a18b8..b2bdd47 100644
	--- a/src/Clustering/ConsensusSequenceLayer.hpp
	+++ b/src/Clustering/ConsensusSequenceLayer.hpp
	@@ -1,202 +1,249 @@
	#ifndef CONSENSUSSEQUENCELAYER_HPP
	#define CONSENSUSSEQUENCELAYER_HPP

	#include <Data3DLayer.hpp>

	#include <vector>
	#include <future> // std::promise

	#include <ThreadPool.hpp>
	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>
	#include <Matrix4D.hpp>


	typedef std::vector<double> vector_d ;


	class ConsensusSequenceLayer : public Data3DLayer
	{
	public:
	/*!
	* \brief Computes the log-likelihood of the sub-
	* sequence - stored in a given row (1st dimension) -
	* and starting at the offset <col> (2nd dimension) in
	* the given sequence matrix.
	* The subsequence length is determined by the model
	* lenght.
	* \param cons_seq a probability matrix containing
	* the consensus sequence. Its dimensions are :
	* 1st the different sequences
	* 2nd the sequences length
	* 3rd 4 for A,C,G,T.
	* \param row the row containing the sequence of
	* interest.
	* \param col the index at which the sub-sequence
	* is starting (1st index inside the subsequence
	* of interest).
	* \param model a model containing the probability
	* model.
	* \return the log-likelihood of the sub-sequence
	* given the model.
	* \throw std::invalid_argument if 1) the offset is
	* invalid, 2) the sequence and the model have
	* incompatible dimensions or 3) the model 2n dimension
	* is not 4 (A,C,G,T).
	*/
	static double score_consensus_subseq(const Matrix3D<double>& cons_seq,
	size_t row,
	size_t col,
	const Matrix2D<double>& model) ;
	public:
	/*!
	* \brief Constructs an object with the
	* given data and an empty (0 values)
	* model.
	* \param data the data.
	* \param n_class the number of classes
	* of the model.
	* \param n_shift the number of shift
	* states of the model.
	* \param flip whether flipping is allowed.
	* \param bckg_class should the last class
	* of the model is constant and models the
	* background.
	*/
	ConsensusSequenceLayer(const Matrix3D<double>& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool bckg_class) ;

	/*!
	* \brief Constructs an object with the
	* given data and an empty (0 values)
	* model.
	* \param data the data.
	* \param n_class the number of classes
	* of the model.
	* \param n_shift the number of shift
	* states of the model.
	* \param flip whether flipping is allowed.
	* \param bckg_class should the last class
	* of the model is constant and models the
	* background.
	*/
	ConsensusSequenceLayer(Matrix3D<double>&& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool bckg_class) ;

	+ /*!
	+ * \brief Construct an object with the
	+ * given data and model.
	+ * The shifting freedom is set to (data number
	+ * of columns) - (the model 2nd dimension)
	+ * + 1.
	+ * \param data the data.
	+ * \param model the model with the following
	+ * dimensions :
	+ * dim1 the number of classes
	+ * dim2 the model length
	+ * dim3 4 (A,C,G,T)
	+ * \param flip whether flipping is allowed.
	+ * \param bckg_class should the last class
	+ * of the model is constant and models the
	+ * background.
	+ */
	+ ConsensusSequenceLayer(const Matrix3D<double>& data,
	+ const Matrix3D<double>& model,
	+ bool flip,
	+ bool bckg_class) ;
	+
	+ /*!
	+ * \brief Construct an object with the
	+ * given data and model.
	+ * The shifting freedom is set to (data number
	+ * of columns) - (the model 2nd dimension)
	+ * + 1.
	+ * \param data the data. The sequences
	+ * should be stored as integer values :
	+ * A:0, C:1, G:2, T:3, else:5.
	+ * \param model the model with the following
	+ * dimensions :
	+ * dim1 the number of classes
	+ * dim2 the model length
	+ * dim3 4 (A,C,G,T)
	+ * \param flip whether flipping is allowed.
	+ * \param bckg_class should the last class
	+ * of the model is constant and models the
	+ * background.
	+ */
	+ ConsensusSequenceLayer(Matrix3D<double>&& data,
	+ Matrix3D<double>&& model,
	+ bool flip,
	+ bool bckg_class) ;
	+
	+
	/*!
	* \brief Destructor.
	*/
	virtual ~ConsensusSequenceLayer() ;

	/*!
	* \brief Computes the log likelihood of the data
	* given the current model parameters.
	* \param logliklihood a matrix to store the
	* results. It should have the following dimensions :
	* 1st : same as the data number of row
	* 2nd : same as the model number of classes
	* 3rd : same as the number of shifts
	* 4th : same as the number of flip states
	* \param loglikelihood_max a vector containing the
	* max value for each row of loglikelihood.
	* Its length should be equal to the data row number.
	* \throw std::invalid_argument if the dimensions are
	* incorrect.
	*/
	virtual void compute_loglikelihoods(Matrix4D<double>& loglikelihood,
	vector_d& loglikelihood_max,
	ThreadPool* threads=nullptr) const override ;

	/*!
	* \brief Updates the model given the posterior
	* probabilities (the probabilities of each row
	* in the data to be assigned to each class,
	* for each shift and flip state).
	* 0 values inside the model are forbidden.
	* The lowest possible value inside the model
	* is SequenceLayer::p_min.
	* \param posterior_prob the data assignment probabilities to
	* the different classes.
	*/
	virtual void update_model(const Matrix4D<double>& posterior_prob,
	ThreadPool* threads=nullptr) override ;

	protected:
	/*!
	* \brief The routine that effectively performs the
	* loglikelihood computations.
	* \param from the index of the first row of the data
	* to considered.
	* \param to the index of the past last row of the data
	* to considered.
	* \param loglikelihood a matrix to store the
	* results. It should have the following dimensions :
	* 1st : same as the data number of row
	* 2nd : same as the model number of classes
	* 3rd : same as the number of shifts
	* 4th : same as the number of flip states
	* \param loglikelihood_max a vector containing the
	* max value for each row of log_likelihood.
	* Its length should be equal to the data row number.
	* \param model_log a vector containing the matrices with
	* the log values of the model for each class.
	* \param model_log_rev a vector containing the matrices with
	* the log values of the reverse strand model for each class
	* (the 1st position in the model becomes the last in the
	* reverse model and probabilities are swapped A<->T and C<->G).
	* \param done a promise to be filled when the routine
	* is done running.
	*/
	void compute_loglikelihoods_routine(size_t from,
	size_t to,
	Matrix4D<double>& loglikelihood,
	vector_d& loglikelihood_max,
	const std::vector<Matrix2D<double>>& model_log,
	const std::vector<Matrix2D<double>>& model_log_rev,
	std::promise<bool>& done) const ;

	/*!
	* \brief The routine that effectively update the model.
	* 0 values inside the model are forbidden.
	* The lowest possible value inside the model
	* is SequenceLayer::p_min.
	* \param from the index of the first row of the
	* posterior probabilities to considered.
	* \param to the index of the past last row of the
	* posterior probabilities to considered.
	* \param posterior_prob the data assignment probabilities
	* to the different classes.
	* \param
	* \param done a promise containing the partial model
	* computed from the given data slice. If several routines
	* work together at updating the model, they need to be
	* summed and normalized (by the column sum) to get the
	* final model.
	*/
	void update_model_routine(size_t from,
	size_t to,
	const Matrix4D<double>& posterior_prob,
	std::promise<Matrix3D<double>>& done) const ;

	/*!
	* \brief Sets the last class as background class with the
	* mean base probababilities of the data at each position.
	*/
	void create_bckg_class() ;


	protected:




	} ;


	#endif // CONSENSUSSEQUENCELAYER_HPP
	diff --git a/src/Clustering/Data3DLayer.cpp b/src/Clustering/Data3DLayer.cpp
	index cbd6c1a..7bb0bcf 100644
	--- a/src/Clustering/Data3DLayer.cpp
	+++ b/src/Clustering/Data3DLayer.cpp
	@@ -1,128 +1,172 @@
	#include <Data3DLayer.hpp>

	#include <Matrix3D.hpp>
	#include <Matrix4D.hpp>

	Data3DLayer::Data3DLayer()
	: DataLayer()
	{}

	Data3DLayer::Data3DLayer(const Matrix3D<double>& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool bckg_class)
	: DataLayer(n_class, n_shift, flip, bckg_class),
	data(data),
	n_dim1(this->data.get_dim()[0]),
	n_dim2(this->data.get_dim()[1]),
	n_dim3(this->data.get_dim()[2])
	{
	this->l_model = this->n_dim2 - this->n_shift + 1 ;

	// models cannot be initialise here
	// as the number of categories depend
	// on the exact class
	}

	Data3DLayer::Data3DLayer(Matrix3D<double>&& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool bckg_class)
	: DataLayer(n_class, n_shift, flip, bckg_class),
	data(data),
	n_dim1(this->data.get_dim()[0]),
	n_dim2(this->data.get_dim()[1]),
	n_dim3(this->data.get_dim()[2])
	{
	this->l_model = this->n_dim2 - this->n_shift + 1 ;

	// models cannot be initialise here
	// as the number of categories depend
	// on the exact class
	}

	+Data3DLayer::Data3DLayer(const Matrix3D<double>& data,
	+ const Matrix3D<double>& model,
	+ bool flip,
	+ bool bckg_class)
	+ : DataLayer(model, flip, bckg_class),
	+ data(data),
	+ n_dim1(this->data.get_dim()[0]),
	+ n_dim2(this->data.get_dim()[1]),
	+ n_dim3(this->data.get_dim()[2])
	+{
	+ this->n_shift = this->n_dim2 - this->l_model + 1 ;
	+
	+ // check if model is not too long
	+ if(this->n_dim2 < this->l_model)
	+ { char msg[4096] ;
	+ sprintf(msg,
	+ "Error! model is longer than data : %zu / %zu",
	+ this->l_model, this->n_dim2) ;
	+ throw std::invalid_argument(msg) ;
	+ }
	+}
	+
	+Data3DLayer::Data3DLayer(Matrix3D<double>&& data,
	+ Matrix3D<double>&& model,
	+ bool flip,
	+ bool bckg_class)
	+ : DataLayer(std::move(model), flip, bckg_class),
	+ data(data),
	+ n_dim1(this->data.get_dim()[0]),
	+ n_dim2(this->data.get_dim()[1]),
	+ n_dim3(this->data.get_dim()[2])
	+{
	+ this->n_shift = this->n_dim2 - this->l_model + 1 ;
	+
	+ // check if model is not too long
	+ if(this->n_dim2 < this->l_model)
	+ { char msg[4096] ;
	+ sprintf(msg,
	+ "Error! model is longer than data : %zu / %zu",
	+ this->l_model, this->n_dim2) ;
	+ throw std::invalid_argument(msg) ;
	+ }
	+}
	+
	Data3DLayer::~Data3DLayer()
	{}

	void Data3DLayer::check_loglikelihood_dim(const Matrix4D<double>& loglikelihood) const
	{ if(loglikelihood.get_dim()[0] != this->n_dim1)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! loglikelihood matrix 1st dimension is not "
	"equal to data 1st dimension : %zu / %zu",
	loglikelihood.get_dim()[0], this->n_dim1) ;
	throw std::invalid_argument(msg) ;
	}
	else if(loglikelihood.get_dim()[1] != this->n_class)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! loglikelihood matrix 2nd dimension is not "
	"equal to model class number : %zu / %zu",
	loglikelihood.get_dim()[1], this->n_class) ;
	throw std::invalid_argument(msg) ;
	}
	else if(loglikelihood.get_dim()[2] != this->n_shift)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! loglikelihood matrix 3rd dimension is not "
	"equal to model shift state number : %zu / %zu",
	loglikelihood.get_dim()[2], this->n_shift) ;
	throw std::invalid_argument(msg) ;
	}
	else if(loglikelihood.get_dim()[3] != this->n_flip)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! loglikelihood matrix 4th dimension is not "
	"equal to model flip state number : %zu / %zu",
	loglikelihood.get_dim()[3], this->n_flip) ;
	throw std::invalid_argument(msg) ;
	}
	}

	void Data3DLayer::check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const
	{ if(loglikelihood_max.size() != this->n_dim1)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! loglikelihood_max length is not "
	"equal to data 1st dimension : %zu / %zu",
	loglikelihood_max.size(), this->n_dim1) ;
	throw std::invalid_argument(msg) ;
	}
	}

	void Data3DLayer::check_posterior_prob_dim(const Matrix4D<double>& posterior_prob) const
	{ if(posterior_prob.get_dim()[0] != this->n_dim1)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! posterior_prob matrix 1st dimension is not "
	"equal to data 1st dimension : %zu / %zu",
	posterior_prob.get_dim()[0], this->n_dim1) ;
	throw std::invalid_argument(msg) ;
	}
	else if(posterior_prob.get_dim()[1] != this->n_class)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! posterior_prob matrix 2nd dimension is not "
	"equal to model class number : %zu / %zu",
	posterior_prob.get_dim()[1], this->n_class) ;
	throw std::invalid_argument(msg) ;
	}
	else if(posterior_prob.get_dim()[2] != this->n_shift)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! posterior_prob matrix 3rd dimension is not "
	"equal to model shift state number : %zu / %zu",
	posterior_prob.get_dim()[2], this->n_shift) ;
	throw std::invalid_argument(msg) ;
	}
	else if(posterior_prob.get_dim()[3] != this->n_flip)
	{ char msg[4096] ;
	sprintf(msg,
	"Error! posterior_prob matrix 4th dimension is not "
	"equal to model flip state number : %zu / %zu",
	posterior_prob.get_dim()[3], this->n_flip) ;
	throw std::invalid_argument(msg) ;
	}
	}
	diff --git a/src/Clustering/Data3DLayer.hpp b/src/Clustering/Data3DLayer.hpp
	index 19ffc09..86f0503 100644
	--- a/src/Clustering/Data3DLayer.hpp
	+++ b/src/Clustering/Data3DLayer.hpp
	@@ -1,135 +1,169 @@
	#ifndef DATA3DLAYER_HPP
	#define DATA3DLAYER_HPP

	#include <DataLayer.hpp>

	#include <Matrix3D.hpp>
	#include <Matrix4D.hpp>

	typedef std::vector<double> vector_d ;

	class Data3DLayer : public DataLayer
	{
	public:
	/*!
	* \brief Constructs an empty object.
	*/
	Data3DLayer() ;

	/*!
	* \brief Constructs an object with the
	* given data.
	* An empty model is not initialised yet
	* as the model number of categories
	* depends on the final class.
	* \param data the data.
	* \param n_class the number of classes
	* of the model.
	* \param n_shift the number of shift
	* states of the model.
	* \param flip whether flipping is allowed.
	* \param bckg_class should the last class
	* of the model is constant and models the
	* background.
	*/
	Data3DLayer(const Matrix3D<double>& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool bckg_class) ;

	/*!
	* \brief Constructs an object with the
	* given data.
	* An empty model is not initialised yet
	* as the model number of categories
	* depends on the final class.
	* \param data the data.
	* \param n_class the number of classes
	* of the model.
	* \param n_shift the number of shift
	* states of the model.
	* \param flip whether flipping is allowed.
	* \param bckg_class should the last class
	* of the model is constant and models the
	* background.
	*/
	Data3DLayer(Matrix3D<double>&& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool bckg_class) ;

	+ /*!
	+ * \brief Constructs an object with the
	+ * given data and model.
	+ * The model dimensions set the number of
	+ * classes and the shifting freedom.
	+ * \param data the data.
	+ * \param the model.
	+ * \param flip whether flipping is allowed.
	+ * \param bckg_class should the last class
	+ * of the model is constant and models the
	+ * background.
	+ */
	+ Data3DLayer(const Matrix3D<double>& data,
	+ const Matrix3D<double>& model,
	+ bool flip,
	+ bool bckg_class) ;
	+
	+ /*!
	+ * \brief Constructs an object with the
	+ * given data and model.
	+ * The model dimensions set the number of
	+ * classes and the shifting freedom.
	+ * \param data the data.
	+ * \param the model.
	+ * \param flip whether flipping is allowed.
	+ * \param bckg_class should the last class
	+ * of the model is constant and models the
	+ * background.
	+ */
	+ Data3DLayer(Matrix3D<double>&& data,
	+ Matrix3D<double>&& model,
	+ bool flip,
	+ bool bckg_class) ;
	+
	/*!
	* \brief Destructor.
	*/
	virtual ~Data3DLayer() ;


	protected:
	/*!
	* \brief Checks the argument has compatible
	* dimensions with the data and models. If this is
	* not the case, throw a std::invalid_argument with
	* a relevant message.
	* \param logliklihood a matrix to store the
	* results. It should have the following dimensions :
	* 1st : same as the data row number
	* 2nd : same as the model class number
	* 3rd : same as the shift state number
	* 4th : same as the flip state number
	* \throw std::invalid_argument if the dimensions are
	* incorrect.
	*/
	virtual void check_loglikelihood_dim(const Matrix4D<double>& loglikelihood) const override;

	/*!
	* \brief Checks that the argument has compatible
	* dimensions with the data and models. If this is
	* not the case, throw a std::invalid_argument with
	* a relevant message.
	* \param loglikelihood_max a vector containing the
	* max value for each row of log_likelihood.
	* It should have a length equal to the number of
	* the data row number.
	* \throw std::invalid_argument if the dimensions are
	* incorrect.
	*/
	virtual void check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const override ;

	/*!
	* \brief Checks the argument has compatible
	* dimensions with the data and models. If this is
	* not the case, throw a std::invalid_argument with
	* a relevant message.
	* \param posterior_prob a matrix to store the
	* results. It should have the following dimensions :
	* 1st : same as the data row number
	* 2nd : same as the model class number
	* 3rd : same as the shift state number
	* 4th : same as the flip state number
	* \throw std::invalid_argument if the dimensions are
	* incorrect.
	*/
	virtual void check_posterior_prob_dim(const Matrix4D<double>& posterior_prob) const override ;

	/*!
	* \brief the data.
	*/
	Matrix3D<double> data ;
	/*!
	* \brief The size of data 1st dimension.
	*/
	size_t n_dim1 ;
	/*!
	* \brief The size of data 2nd dimension.
	*/
	size_t n_dim2 ;
	/*!
	* \brief The size of data 3rd dimension.
	*/
	size_t n_dim3 ;

	} ;

	#endif // DATA3DLAYER_HPP
	diff --git a/src/Clustering/EMConsensusSequence.cpp b/src/Clustering/EMConsensusSequence.cpp
	index 1dd3343..ea97f85 100644
	--- a/src/Clustering/EMConsensusSequence.cpp
	+++ b/src/Clustering/EMConsensusSequence.cpp
	@@ -1,294 +1,355 @@
	#include <EMConsensusSequence.hpp>

	#include <string>
	#include <vector>
	#include <future> // std::promise, std::future
	#include <utility> // std::pair, std::move()
	#include <functional> // std::bind(), std::ref()

	#include <ConsensusSequenceLayer.hpp> // SequenceLayer
	#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
	#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
	#include <ThreadPool.hpp> // ThreadPool
	#include <dna_utility.hpp> // dna::base_composition()


	EMConsensusSequence::EMConsensusSequence(const Matrix3D<double>& seq_matrix,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed,
	size_t n_threads)
	: EMBase(seq_matrix.get_dim()[0],
	seq_matrix.get_dim()[1],
	n_class,
	n_iter,
	n_shift,
	flip,
	n_threads),
	loglikelihood_max(n_row, 0.),
	cseq_layer(nullptr)
	{
	this->loglikelihood_max = vector_d(n_row, 0.) ;

	// initialise post prob randomly
	// getRandomGenerator(seed) ;
	this->set_post_prob_random(seed) ;

	// data and models
	this->cseq_layer = new ConsensusSequenceLayer(seq_matrix,
	this->n_class,
	this->n_shift,
	this->flip,
	bckg_class) ;

	// intialise the models with the post prob
	this->cseq_layer->update_model(this->post_prob,
	this->threads) ;
	}

	EMConsensusSequence::EMConsensusSequence(Matrix3D<double>&& seq_matrix,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed,
	size_t n_threads)
	: EMBase(seq_matrix.get_dim()[0],
	seq_matrix.get_dim()[1],
	n_class,
	n_iter,
	n_shift,
	flip,
	n_threads),
	loglikelihood_max(n_row, 0.),
	cseq_layer(nullptr)
	{
	this->loglikelihood_max = vector_d(n_row, 0.) ;

	// initialise post prob randomly
	// getRandomGenerator(seed) ;
	this->set_post_prob_random(seed) ;

	// data and models
	this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix),
	this->n_class,
	this->n_shift,
	this->flip,
	bckg_class) ;

	// intialise the models with the post prob
	this->cseq_layer->update_model(this->post_prob,
	this->threads) ;
	}

	+EMConsensusSequence::EMConsensusSequence(const Matrix3D<double>& seq_matrix,
	+ const Matrix3D<double>& motifs,
	+ size_t n_iter,
	+ bool flip,
	+ bool bckg_class,
	+ size_t n_threads)
	+ : EMBase(seq_matrix.get_dim()[0],
	+ seq_matrix.get_dim()[1],
	+ motifs.get_dim()[0],
	+ n_iter,
	+ seq_matrix.get_dim()[1] - motifs.get_dim()[1] + 1,
	+ flip,
	+ n_threads),
	+ loglikelihood_max(n_row, 0.),
	+ cseq_layer(nullptr)
	+{
	+
	+ this->loglikelihood_max = vector_d(n_row, 0.) ;
	+
	+ // data and models
	+ // background motif (if any) is the last of the given motifs
	+ this->cseq_layer = new ConsensusSequenceLayer(seq_matrix,
	+ motifs,
	+ this->flip,
	+ bckg_class) ;
	+
	+ // intialise the class prob uniformly
	+ this->set_state_prob_uniform() ;
	+}
	+
	+EMConsensusSequence::EMConsensusSequence(Matrix3D<double>&& seq_matrix,
	+ Matrix3D<double>&& motifs,
	+ size_t n_iter,
	+ bool flip,
	+ bool bckg_class,
	+ size_t n_threads)
	+ : EMBase(seq_matrix.get_dim()[0],
	+ seq_matrix.get_dim()[1],
	+ motifs.get_dim()[0],
	+ n_iter,
	+ seq_matrix.get_dim()[1] - motifs.get_dim()[1] + 1,
	+ flip,
	+ n_threads),
	+ loglikelihood_max(n_row, 0.),
	+ cseq_layer(nullptr)
	+{
	+
	+ this->loglikelihood_max = vector_d(n_row, 0.) ;
	+
	+ // data and models
	+ // background motif (if any) is the last of the given motifs
	+ this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix),
	+ std::move(motifs),
	+ this->flip,
	+ bckg_class) ;
	+
	+ // intialise the class prob uniformly
	+ this->set_state_prob_uniform() ;
	+}
	+
	+
	EMConsensusSequence::~EMConsensusSequence()
	{ if(this->cseq_layer != nullptr)
	{ delete this->cseq_layer ;
	this->cseq_layer = nullptr ;
	}
	if(this->threads != nullptr)
	{ this->threads->join() ;
	delete this->threads ;
	this->threads = nullptr ;
	}
	}

	Matrix3D<double> EMConsensusSequence::get_sequence_models() const
	{ return this->cseq_layer->get_model() ; }

	EMConsensusSequence::exit_codes EMConsensusSequence::classify()
	{
	size_t bar_update_n = this->n_iter ;
	ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;

	// optimize the partition
	for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
	{ // E-step
	this->compute_loglikelihood() ;
	this->compute_post_prob() ;
	// M-step
	this->compute_class_prob() ;
	this->update_models() ;
	this->center_post_state_prob() ;
	bar.update() ;
	}
	bar.update() ; std::cerr << std::endl ;
	return EMConsensusSequence::exit_codes::ITER_MAX ;
	}

	void EMConsensusSequence::compute_loglikelihood()
	{ // compute the loglikelihood
	this->cseq_layer->compute_loglikelihoods(this->loglikelihood,
	this->loglikelihood_max,
	this->threads) ;
	// rescale the values
	// don't parallelize
	if(this->threads == nullptr)
	{ std::promise<bool> promise ;
	std::future<bool> future = promise.get_future() ;
	this->compute_loglikelihood_routine(0,
	this->n_row,
	promise) ;
	future.get() ;
	}
	// parallelize
	else
	{ size_t n_threads = this->threads->getNThread() ;

	// compute the slices on which each thread will work
	std::vector<std::pair<size_t,size_t>> slices =
	ThreadPool::split_range(0, this->n_row,n_threads) ;

	// get promises and futures
	std::vector<std::promise<bool>> promises(n_threads) ;
	std::vector<std::future<bool>> futures(n_threads) ;
	for(size_t i=0; i<n_threads; i++)
	{ futures[i] = promises[i].get_future() ; }

	// distribute work to threads
	// -------------------------- threads start --------------------------
	for(size_t i=0; i<n_threads; i++)
	{ auto slice = slices[i] ;
	this->threads->addJob(std::move(
	std::bind(&EMConsensusSequence::compute_loglikelihood_routine,
	this,
	slice.first,
	slice.second,
	std::ref(promises[i])))) ;
	}
	// wait until all threads are done working
	for(auto& future : futures)
	{ future.get() ; }
	// -------------------------- threads stop ---------------------------
	}
	}

	void EMConsensusSequence::compute_loglikelihood_routine(size_t from,
	size_t to,
	std::promise<bool>& done)
	{
	// rescale the values
	for(size_t i=from; i<to; i++)
	{ for(size_t j=0; j<this->n_class; j++)
	{ for(size_t k=0; k<this->n_shift; k++)
	{ for(size_t l=0; l<this->n_flip; l++)
	{ this->loglikelihood(i,j,k,l) =
	std::max(this->loglikelihood(i,j,k,l) -
	this->loglikelihood_max[i],
	ConsensusSequenceLayer::p_min_log) ;
	}
	}
	}
	}
	done.set_value(true) ;
	}

	void EMConsensusSequence::compute_post_prob()
	{ // don't parallelize
	if(this->threads == nullptr)
	{ std::promise<vector_d> promise ;
	std::future<vector_d> future = promise.get_future() ;
	this->compute_post_prob_routine(0, this->n_row, promise) ;
	// compute the sum of post prob and the per class sum of post prob
	// from the partial results computed on each slice
	this->post_prob_tot = 0. ;
	this->post_prob_colsum = future.get() ;
	for(const auto& prob : this->post_prob_colsum)
	{ this->post_prob_tot += prob ; }
	}
	// parallelize
	else
	{ size_t n_threads = this->threads->getNThread() ;

	// compute the slices on which each thread will work
	std::vector<std::pair<size_t,size_t>> slices =
	ThreadPool::split_range(0, this->n_row,n_threads) ;

	// get promises and futures
	// the function run by the threads will compute
	// the partial sum per class of post_prob for the given slice
	// this should be used to compute the complete sum of post_prob
	// and the complete sum per class of post_prob
	std::vector<std::promise<vector_d>> promises(n_threads) ;
	std::vector<std::future<vector_d>> futures(n_threads) ;
	for(size_t i=0; i<n_threads; i++)
	{ futures[i] = promises[i].get_future() ; }

	// distribute work to threads
	// -------------------------- threads start --------------------------
	for(size_t i=0; i<n_threads; i++)
	{ auto slice = slices[i] ;
	this->threads->addJob(std::move(
	std::bind(&EMConsensusSequence::compute_post_prob_routine,
	this,
	slice.first,
	slice.second,
	std::ref(promises[i])))) ;
	}
	// wait until all threads are done working
	// compute the sum of post prob and the per class sum of post prob
	// from the partial results computed on each slice
	this->post_prob_tot = 0. ;
	this->post_prob_colsum = vector_d(this->n_class, 0.) ;
	for(auto& future : futures)
	{ auto probs = future.get() ;
	for(size_t i=0; i<this->n_class; i++)
	{ double prob = probs[i] ;
	this->post_prob_colsum[i] += prob ;
	this->post_prob_tot += prob ;
	}
	}
	// -------------------------- threads stop ---------------------------
	}
	}


	void EMConsensusSequence::compute_post_prob_routine(size_t from,
	size_t to,
	std::promise<vector_d>& post_prob_colsum)
	{ vector_d colsums(this->n_class, 0.) ;

	// reset grand total
	// this->post_prob_tot = 0 ;
	// this->post_prob_colsum = vector_d(n_class, 0) ;

	// post prob
	for(size_t i=from; i<to; i++)
	{ // reset row sum to 0
	this->post_prob_rowsum[i] = 0. ;
	for(size_t n_class=0; n_class<this->n_class; n_class++)
	{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
	{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
	{
	double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
	this->post_state_prob(n_class,n_shift,n_flip) ;
	this->post_prob(i,n_class,n_shift,n_flip) = p ;
	this->post_prob_rowsum[i] += p ;
	}
	}
	}
	// normalize
	for(size_t n_class=0; n_class<this->n_class; n_class++)
	{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
	{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
	{
	double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
	this->post_prob_rowsum[i],
	ConsensusSequenceLayer::p_min) ;
	this->post_prob(i,n_class,n_shift,n_flip) = p ;
	colsums[n_class] += p ;
	}
	}
	}
	}
	post_prob_colsum.set_value(colsums) ;
	}

	void EMConsensusSequence::update_models()
	{ this->cseq_layer->update_model(this->post_prob,
	this->threads) ;
	}
	diff --git a/src/Clustering/EMConsensusSequence.hpp b/src/Clustering/EMConsensusSequence.hpp
	index 01ff753..72959ea 100644
	--- a/src/Clustering/EMConsensusSequence.hpp
	+++ b/src/Clustering/EMConsensusSequence.hpp
	@@ -1,199 +1,279 @@
	#ifndef EMCONSENSUSSEQUENCE_HPP
	#define EMCONSENSUSSEQUENCE_HPP

	#include <EMBase.hpp>

	#include <vector>
	#include <string>
	#include <future> // std::promise

	#include <Matrix3D.hpp>
	#include <ConsensusSequenceLayer.hpp>


	typedef std::vector<double> vector_d ;


	class EMConsensusSequence : public EMBase
	{
	public:
	/*!
	* \brief Constructs an object to partition the
	* given consensus sequences (rows) according to
	* their motif content.
	* The sequences models are initialised randomly.
	* \param sequence_matrix a matrix containing the
	* consensus sequences in a probability matrix.
	* Its dimensions are :
	* 1st the number of consensus sequences
	* 2nd the length of the consensus sequences
	* 3rd 4 for A,C,G,T
	* The sums over the 1st and 2nd dimensions should
	* be 1. The overall sum of the matrix values should
	* be the st dimension.
	* \param n_class the number of region classes
	* to search.
	* \param n_iter the number of optimization iterations.
	* \param n_shift the number of shift states allowed.
	* \param flip whether flipping is allowed.
	* \param bckg_class the last class is used to model the background
	* by setting all its parameters, at all positions, to the
	* background base probabilties. Since the background is constant,
	* this class will never be updated.
	* \param seed a seed to initialise the random number
	* generator.
	* \param n_threads the number of parallel threads
	* to run the computations. 0 means no parallel
	* computing, everything is run on the main thread.
	*/
	EMConsensusSequence(const Matrix3D<double>& sequence_matrix,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed="",
	size_t n_threads=0) ;
	/*!
	* \brief Constructs an object to partition the
	* given consensus sequences (rows) according to
	* their motif
	* content.
	* The sequences models are initialised randomly.
	* \param sequence_matrix a matrix containing the
	* consensus sequences in a probability matrix.
	* Its dimensions are :
	* 1st the number of consensus sequences
	* 2nd the length of the consensus sequences
	* 3rd 4 for A,C,G,T
	* The sums over the 1st and 2nd dimensions should
	* be 1. The overall sum of the matrix values should
	* be the st dimension.
	* \param n_class the number of region classes
	* to search.
	* \param n_iter the number of optimization iterations.
	* \param n_shift the number of shift states allowed.
	* \param flip whether flipping is allowed.
	* \param bckg_class the last class is used to model the background
	* by setting all its parameters, at all positions, to the
	* background base probabilties. Since the background is constant,
	* this class will never be updated.
	* \param seed a seed to initialise the random number
	* generator.
	* \param n_threads the number of parallel threads
	* to run the computations. 0 means no parallel
	* computing, everything is run on the main thread.
	*/
	EMConsensusSequence(Matrix3D<double>&& sequence_matrix,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed="",
	size_t n_threads=0) ;

	+ /*!
	+ * \brief Constructs an object to partition the
	+ * given consensus sequences (rows) according to
	+ * their motif content.
	+ * The sequences class models are initialised using
	+ * the given motifs. The class probabilities are
	+ * initialised uniformelly.
	+ * The shifting freedom is set to (data 2n dimension)
	+ * - (the model 2nd dimension) + 1.
	+ * \param a matrix containing the consensus sequences
	+ * in a probability matrix. Its dimensions are :
	+ * 1st the number of consensus sequences
	+ * 2nd the length of the consensus sequences
	+ * 3rd 4 for A,C,G,T
	+ * The sums over the 1st and 2nd dimensions should
	+ * be 1. The overall sum of the matrix values should
	+ * be the st dimension.
	+ * \param motifs a matrix containing the different initial
	+ * class models with the following dimensions :
	+ * dim1 the number of classes
	+ * dim2 the model length
	+ * dim3 4 for A,C,G,T
	+ * \param n_class the number of region classes
	+ * to search.
	+ * \param n_iter the number of optimization iterations.
	+ * \param flip whether flipping is allowed.
	+ * \param bckg_class indicates that the last class in the
	+ * given motifs is used to model the background and it
	+ * should never be updated.
	+ * \param n_threads the number of parallel threads
	+ * to run the computations. 0 means no parallel
	+ * computing, everything is run on the main thread.
	+ */
	+ EMConsensusSequence(const Matrix3D<double>& sequence_matrix,
	+ const Matrix3D<double>& motifs,
	+ size_t n_iter,
	+ bool flip,
	+ bool bckg_class,
	+ size_t n_threads=0) ;
	+
	+ /*!
	+ * \brief Constructs an object to partition the
	+ * given consensus sequences (rows) according to
	+ * their motif content.
	+ * The sequences class models are initialised using
	+ * the given motifs. The class probabilities are
	+ * initialised uniformelly.
	+ * The shifting freedom is set to (data 2n dimension)
	+ * - (the model 2nd dimension) + 1.
	+ * \param a matrix containing the consensus sequences
	+ * in a probability matrix. Its dimensions are :
	+ * 1st the number of consensus sequences
	+ * 2nd the length of the consensus sequences
	+ * 3rd 4 for A,C,G,T
	+ * The sums over the 1st and 2nd dimensions should
	+ * be 1. The overall sum of the matrix values should
	+ * be the st dimension.
	+ * \param motifs a matrix containing the different initial
	+ * class models with the following dimensions :
	+ * dim1 the number of classes
	+ * dim2 the model length
	+ * dim3 4 for A,C,G,T
	+ * \param n_class the number of region classes
	+ * to search.
	+ * \param n_iter the number of optimization iterations.
	+ * \param flip whether flipping is allowed.
	+ * \param bckg_class indicates that the last class in the
	+ * given motifs is used to model the background and it
	+ * should never be updated.
	+ * \param n_threads the number of parallel threads
	+ * to run the computations. 0 means no parallel
	+ * computing, everything is run on the main thread.
	+ */
	+ EMConsensusSequence(Matrix3D<double>&& sequence_matrix,
	+ Matrix3D<double>&& motifs,
	+ size_t n_iter,
	+ bool flip,
	+ bool bckg_class,
	+ size_t n_threads=0) ;
	+
	EMConsensusSequence(const EMConsensusSequence& other) = delete ;

	/*!
	* \brief Destructor.
	*/
	virtual ~EMConsensusSequence() override ;

	/*!
	* \brief Returns the class sequence model.
	* \return the class sequence model.
	*/
	Matrix3D<double> get_sequence_models() const ;

	/*!
	* \brief Runs the sequence model optimization and
	* the data classification.
	* \return a code indicating how the optimization
	* ended.
	*/
	virtual EMConsensusSequence::exit_codes classify() override ;

	private:

	/*!
	* \brief Computes the data log likelihood given the
	* current models, for all layers and the joint
	* likelihood for each state (the sum of the layer
	* likelihoods for all layers, for a given state).
	* To avoid numerical issues when computing posterior
	* probabilities, the lowest possible value authorized
	* as log likelihood is ConsensusSequenceLayer::p_min_log.
	* Any value below is replaced by this one.
	*/
	virtual void compute_loglikelihood() override ;

	/*!
	* \brief This is a routine of compute_loglikelihood().
	* This method rescales the loglikelihood values by
	* substacting to each value the maximum loglikelihood
	* value found in the same data row.
	* To avoid numerical issues when computing posterior
	* probabilities, the lowest possible value authorized
	* as log likelihood is ConsensusSequenceLayer::p_min_log.
	* Any value below is replaced by this one.
	* \param from the index of the first row
	* in the data to consider.
	* \param to the index of the past last row
	* in the data to consider.
	* \param done a promise to fill when the method
	* is done.
	*/
	void compute_loglikelihood_routine(size_t from,
	size_t to,
	std::promise<bool>& done) ;

	/*!
	* \brief Computes the data posterior probabilties.
	* To avoid numerical issues the lowest possible
	* value authorized as posterior probability is
	* ConsensusSequenceLayer::p_min. Any value below
	* is replaced by this one.
	*/
	virtual void compute_post_prob() override ;

	/*!
	* \brief The routine that effectively computes
	* the posterior probabilties.
	* To avoid numerical issues the lowest possible
	* value authorized as posterior probability is
	* ConsensusSequenceLayer::p_min. Any value below
	* is replaced by this one.
	* \param from the index of the first row
	* in the data to consider.
	* \param to the index of the past last row
	* in the data to consider.
	* \param done the partial column (over the classes)
	* sum of posterior probabilities. If several routines
	* are running together, the colsums are retrieved by
	* summing up the vectors together.
	*/
	void compute_post_prob_routine(size_t from,
	size_t to,
	std::promise<vector_d>& post_prob_colsum) ;

	/*!
	* \brief Update the data models for all layers, given
	* the current posterior and class probabilities.
	*/
	virtual void update_models() override ;

	/*!
	* \brief the max loglikelihood value for
	* each data row.
	*/
	std::vector<double> loglikelihood_max ;
	/*!
	* \brief A pointer to the object managing
	* the data and their model.
	*/
	ConsensusSequenceLayer* cseq_layer ;

	} ;

	#endif // EMCONSENSUSSEQUENCE_HPP
	diff --git a/src/Statistics/KmersStatistics.cpp b/src/Statistics/KmersStatistics.cpp
	index 58533a8..686c103 100644
	--- a/src/Statistics/KmersStatistics.cpp
	+++ b/src/Statistics/KmersStatistics.cpp
	@@ -1,230 +1,314 @@
	#include <KmersStatistics.hpp>

	#include <string>
	#include <vector>
	#include <unordered_map>

	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>
	#include <Statistics.hpp> // poisson_cdf()
	#include <dna_utility.hpp> // dna::consensus_to_kmer()

	Matrix2D<int> kmers::compute_mononucleotide(size_t k,
	const std::vector<std::string>& kmers,
	const std::vector<int>& counts)
	{
	Matrix2D<int> comp(k, 4, 0) ;

	for(size_t i=0; i<kmers.size(); i++)
	{
	std::string kmer = kmers[i] ;
	int count = counts[i] ;

	for (size_t j=0; j<k ; j++)
	{ int base = int(kmer[j]) - int('0') ;
	comp(j, base) += count ;
	}
	}
	return(comp);
	}

	Matrix3D<int> kmers::compute_dinucleotide(size_t k,
	const std::vector<std::string>& kmers,
	const std::vector<int>& counts)
	{
	Matrix3D<int> comp2(4, k-1, 4) ;

	for(size_t i=0; i<kmers.size(); i++)
	{ std::string kmer = kmers[i] ;
	int count = counts[i] ;
	for (size_t j=0; j<k-1; j++)
	{ int base1 = int(kmer[j]) - int('0') ;
	int base2 = int(kmer[j+1]) - int('0') ;
	comp2(base1, j, base2) += count ;
	}
	}
	return(comp2);
	}

	std::vector<double> kmers::compute_exp_values(size_t k,
	const std::vector<std::string>& kmers,
	const Matrix2D<int>& comp1,
	const Matrix3D<int>& comp2)
	{ std::vector<double> expected(kmers.size()) ;

	for(size_t i=0; i<kmers.size(); i++)
	{ std::string kmer = kmers[i] ;
	double exp = comp1(0, int(kmer[0]) - int('0')) ;

	for(size_t j=0; j<k-1; j++)
	{ int base1 = int(kmer[j]) - int('0') ;
	int base2 = int(kmer[j+1]) - int('0') ;
	exp *= ((double)comp2(base1, j, base2) /
	(double)comp1(j+1, base2)) ;
	}

	expected[i] = exp ;
	}
	return expected;
	}


	std::vector<double> kmers::compute_pvalues(const std::vector<int>& counts,
	const std::vector<double>& expected)
	{
	std::vector<double> pvalues(counts.size()) ;

	for(size_t i=0; i<counts.size(); i++)
	{
	pvalues[i] = poisson_cdf((double)counts[i],
	expected[i],
	false) ;
	}
	return pvalues ;
	}

	+std::unordered_map<std::string, int>
	+kmers::compute_kmer_count(const Matrix2D<int>& sequences,
	+ size_t length)
	+{ std::unordered_map<std::string, int> kmer_count ;
	+
	+ size_t n_row = sequences.get_dim()[0] ;
	+ size_t n_shift = sequences.get_dim()[1] - length + 1 ;
	+
	+ for(size_t i=0; i<n_row; i++)
	+ { for(size_t s=0; s<n_shift; s++)
	+ { // get kmer
	+ std::string kmer(length, '0') ;
	+ bool n_found = false ;
	+ for(size_t j_seq=s, j_kmer=0; j_seq<s+length; j_seq++, j_kmer++)
	+ { // check for N's
	+ if(sequences(i, j_seq) == dna::char_to_int('N'))
	+ { n_found = true ;
	+ break ;
	+ }
	+ kmer[j_kmer] = std::to_string(sequences(i, j_seq))[0] ;
	+ }
	+
	+ // found N -> skip this one
	+ if(n_found)
	+ { continue ; }
	+
	+ // insert
	+ auto iter = kmer_count.find(kmer) ;
	+ if(iter == kmer_count.end())
	+ { kmer_count.emplace(kmer, 1) ; }
	+ // update
	+ else
	+ { iter->second += 1 ; }
	+ }
	+ }
	+ return kmer_count ;
	+}
	+
	std::unordered_map<std::string, int>
	kmers::compute_kmer_count(const Matrix3D<double>& consensus_sequence,
	size_t length)
	{ std::unordered_map<std::string, int> kmer_count ;

	size_t n_row = consensus_sequence.get_dim()[0] ;
	size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;

	for(size_t i=0; i<n_row; i++)
	{ for(size_t s=0; s<n_shift; s++)
	{ // get kmer
	std::string kmer = dna::consensus_to_kmer(consensus_sequence,
	i,
	s,
	s+length) ;
	// insert
	auto iter = kmer_count.find(kmer) ;
	if(iter == kmer_count.end())
	{ kmer_count.emplace(kmer, 1) ; }
	// update
	else
	{ iter->second += 1 ; }
	}
	}
	return kmer_count ;
	}

	+std::pair<std::vector<std::string>, std::vector<double>>
	+kmers::compute_kmer_pvalue(const Matrix2D<int>& sequences,
	+ size_t k)
	+{
	+
	+ // get kmer count
	+ std::unordered_map<std::string, int> kmer_count =
	+ kmers::compute_kmer_count(sequences, k) ;
	+
	+ // turn to vectors
	+ std::vector<std::string> kmers(kmer_count.size()) ;
	+ size_t i=0 ;
	+ for(const auto& iter : kmer_count)
	+ { kmers[i] = iter.first ;
	+ i++ ;
	+ }
	+ std::sort(kmers.begin(), kmers.end()) ;
	+ std::vector<int> counts(kmer_count.size()) ;
	+ i=0 ;
	+ for(const auto& kmer : kmers)
	+ { counts[i] = kmer_count[kmer] ;
	+ i++ ;
	+ }
	+ kmer_count.clear() ;
	+
	+ // get mononucleotide composition in kmer
	+ // this is a probability matrix modeling the kmer
	+ Matrix2D<int> comp1 = kmers::compute_mononucleotide(k, kmers, counts) ;
	+
	+ // get dinucleotide composition in kmer
	+ // this is a probability matrix modeling the kmer
	+ Matrix3D<int> comp2 = kmers::compute_dinucleotide(k, kmers, counts) ;
	+
	+ // get expected number of occurence for each kmer
	+ std::vector<double> expected = kmers::compute_exp_values(k,
	+ kmers,
	+ comp1,
	+ comp2) ;
	+
	+ // compute the pvalue associated to each kmer
	+ std::vector<double> pvalues = kmers::compute_pvalues(counts,
	+ expected) ;
	+
	+ return std::make_pair(kmers, pvalues) ;
	+}
	+
	std::pair<std::vector<std::string>, std::vector<double>>
	kmers::compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
	size_t k)
	{

	// get kmer count
	std::unordered_map<std::string, int> kmer_count =
	kmers::compute_kmer_count(consensus_sequences, k) ;

	/*
	consensus_sequences.get_dim() ;
	std::unordered_map<std::string, int> kmer_count ;
	// ANN
	kmer_count["000"] = 1 ;
	kmer_count["001"] = 2 ;
	kmer_count["002"] = 3 ;
	kmer_count["003"] = 4 ;
	kmer_count["010"] = 5 ;
	kmer_count["011"] = 6 ;
	kmer_count["012"] = 7 ;
	kmer_count["013"] = 8 ;
	kmer_count["020"] = 9 ;
	kmer_count["021"] = 10 ;
	kmer_count["022"] = 11 ;
	kmer_count["023"] = 12 ;
	kmer_count["030"] = 13 ;
	kmer_count["031"] = 14 ;
	kmer_count["032"] = 15 ;
	kmer_count["033"] = 16 ;
	// CNN
	kmer_count["100"] = 1 ;
	kmer_count["101"] = 2 ;
	kmer_count["102"] = 3 ;
	kmer_count["103"] = 4 ;
	kmer_count["110"] = 5 ;
	kmer_count["111"] = 6 ;
	kmer_count["112"] = 7 ;
	kmer_count["113"] = 8 ;
	kmer_count["120"] = 9 ;
	kmer_count["121"] = 10 ;
	kmer_count["122"] = 11 ;
	kmer_count["123"] = 12 ;
	kmer_count["130"] = 13 ;
	kmer_count["131"] = 14 ;
	kmer_count["132"] = 15 ;
	kmer_count["133"] = 16 ;
	// GNN
	kmer_count["200"] = 1 ;
	kmer_count["201"] = 2 ;
	kmer_count["202"] = 3 ;
	kmer_count["203"] = 4 ;
	kmer_count["210"] = 5 ;
	kmer_count["211"] = 6 ;
	kmer_count["212"] = 7 ;
	kmer_count["213"] = 8 ;
	kmer_count["220"] = 9 ;
	kmer_count["221"] = 10 ;
	kmer_count["222"] = 11 ;
	kmer_count["223"] = 12 ;
	kmer_count["230"] = 13 ;
	kmer_count["231"] = 14 ;
	kmer_count["232"] = 15 ;
	kmer_count["233"] = 16 ;
	// TNN
	kmer_count["300"] = 1 ;
	kmer_count["301"] = 2 ;
	kmer_count["302"] = 3 ;
	kmer_count["303"] = 4 ;
	kmer_count["310"] = 5 ;
	kmer_count["311"] = 6 ;
	kmer_count["312"] = 7 ;
	kmer_count["313"] = 8 ;
	kmer_count["320"] = 9 ;
	kmer_count["321"] = 10 ;
	kmer_count["322"] = 11 ;
	kmer_count["323"] = 12 ;
	kmer_count["330"] = 13 ;
	kmer_count["331"] = 14 ;
	kmer_count["332"] = 15 ;
	kmer_count["333"] = 16 ;
	*/

	// turn to vectors
	std::vector<std::string> kmers(kmer_count.size()) ;
	size_t i=0 ;
	for(const auto& iter : kmer_count)
	{ kmers[i] = iter.first ;
	i++ ;
	}
	std::sort(kmers.begin(), kmers.end()) ;
	std::vector<int> counts(kmer_count.size()) ;
	i=0 ;
	for(const auto& kmer : kmers)
	{ counts[i] = kmer_count[kmer] ;
	i++ ;
	}
	kmer_count.clear() ;

	// get mononucleotide composition in kmer
	// this is a probability matrix modeling the kmer
	Matrix2D<int> comp1 = kmers::compute_mononucleotide(k, kmers, counts) ;

	// get dinucleotide composition in kmer
	// this is a probability matrix modeling the kmer
	Matrix3D<int> comp2 = kmers::compute_dinucleotide(k, kmers, counts) ;

	// get expected number of occurence for each kmer
	std::vector<double> expected = kmers::compute_exp_values(k,
	kmers,
	comp1,
	comp2) ;

	// compute the pvalue associated to each kmer
	std::vector<double> pvalues = kmers::compute_pvalues(counts,
	expected) ;

	return std::make_pair(kmers, pvalues) ;
	}
	diff --git a/src/Statistics/KmersStatistics.hpp b/src/Statistics/KmersStatistics.hpp
	index fcc4f60..b6c39ef 100644
	--- a/src/Statistics/KmersStatistics.hpp
	+++ b/src/Statistics/KmersStatistics.hpp
	@@ -1,133 +1,185 @@
	#ifndef KMER_STATISTICS_HPP
	#define KMER_STATISTICS_HPP

	#include <string>
	#include <vector>
	#include <unordered_map>

	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>


	namespace kmers {

	+ /*!
	+ * \brief Creates a table of all kmer found in
	+ * a set of consensus sequences and return an
	+ * associated pvalue for each one of them.
	+ * The pvalue is computed as 1 - p(count, exp)
	+ * where p is the CDF of a Poisson distribution
	+ * with <count> being the number of time a given
	+ * kmer is observed in the data and <exp> its
	+ * expected number of occurences. The expected
	+ * number of occurences is computed from the
	+ * observed di-nucleotide occurences in the given
	+ * data.
	+ * \param sequences the sequences in integer
	+ * format (A:0, C:1, G:2, T:3). It has the
	+ * following dimensions :
	+ * 1st the number of sequences
	+ * 2n the length of the sequences
	+ * \param k the kmer length.
	+ * \return a pair of vectors containing the kmers
	+ * in integer format (A:0, C:1, G:2, T:3) and
	+ * their number of occurences (in the same order).
	+ */
	+ std::pair<std::vector<std::string>, std::vector<double>>
	+ compute_kmer_pvalue(const Matrix2D<int>& sequences,
	+ size_t k) ;
	+
	/*!
	* \brief Creates a table of all kmer found in
	* a set of consensus sequences and return an
	* associated pvalue for each one of them.
	* To transform a consensus sequence into a kmer,
	* the major base at each position is considered
	* only.
	* The pvalue is computed as 1 - p(count, exp)
	* where p is the CDF of a Poisson distribution
	* with <count> being the number of time a given
	* kmer is observed in the data and <exp> its
	* expected number of occurences. The expected
	* number of occurences is computed from the
	* observed di-nucleotide occurences in the given
	* data.
	- * \param consensus_sequences
	+ * \param consensus_sequence a matrix containing the
	+ * consensus sequences as a probability matrix. It has
	+ * the following dimensions :
	+ * 1st the number of consensus sequences
	+ * 2nd the length of the consensus sequences
	+ * 3rd 4 for A, C, G, T
	+ * The summing over the 1st and 2nd dimensions should
	+ * give 1s.
	* \param k the kmer length.
	* \return a pair of vectors containing the kmers
	* in integer format (A:0, C:1, G:2, T:3) and
	* their number of occurences (in the same order).
	*/
	std::pair<std::vector<std::string>, std::vector<double>>
	compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
	size_t k) ;

	+ /*!
	+ * \brief Computes the number of occurences of each kmer
	+ * appearing in a set of consensus sequences.
	+ * To transform a consensus sequence into a kmer,
	+ * the major base at each position is considered
	+ * only.
	+ * \param sequences the sequences in integer
	+ * format (A:0, C:1, G:2, T:3). It has the
	+ * following dimensions :
	+ * 1st the number of sequences
	+ * 2n the length of the sequences
	+ * \param k the kmer length.
	+ * \return a map with the kmer found as keys and their
	+ * number of occurences as value.
	+ */
	+ std::unordered_map<std::string, int>
	+ compute_kmer_count(const Matrix2D<int>& sequences,
	+ size_t k) ;
	+
	/*!
	* \brief Computes the number of occurences of each kmer
	* appearing in a set of consensus sequences.
	* To transform a consensus sequence into a kmer,
	* the major base at each position is considered
	* only.
	* \param consensus_sequence a matrix containing the
	* consensus sequences as a probability matrix. It has
	* the following dimensions :
	* 1st the number of consensus sequences
	* 2nd the length of the consensus sequences
	* 3rd 4 for A, C, G, T
	* The summing over the 1st and 2nd dimensions should
	* give 1s.
	* \param k the kmer length.
	* \return a map with the kmer found as keys and their
	* number of occurences as value.
	*/
	std::unordered_map<std::string, int>
	compute_kmer_count(const Matrix3D<double>& consensus_sequence,
	size_t k) ;

	/*!
	* \brief Compute the pvalue of a set of kmers given their
	* number of occurences and their expected number of
	* occurences.
	* \param counts the number of occurences of each kmer.
	* \param expected the expected number of occurences of
	* each kmer.
	* \return the pvalue of each kmer.
	*/
	std::vector<double> compute_pvalues(const std::vector<int>& counts,
	const std::vector<double>& expected) ;

	/*!
	* \brief Computes the number of expected occurences of a set
	* of kmers given the mononucleotide and dinucleotide composition
	* of the kmers.
	* \param k the length of the kmers.
	* \param kmers the kmers in integer format (A:0, C:1, G:2, T:3)
	* \param comp1 a matrix with the number of counts of each
	* base at each position in the kmers. It has the following
	* dimensions :
	* 1st dim k
	* 2nd dim 4 for A, C, G, T
	* \param comp2 a matrix containing the number of counts of
	* each pair of dinucleotide in the kmers. It has the
	* following dimensions :
	* 1st 4 for A, C, G, T (1st base of the dinucleotide)
	* 2nd k-1
	* 3rd 4 for A, C, G, T (2n base of the dinucleotide)
	* \return the expected number of occurences for each
	* kmer.
	*/
	std::vector<double> compute_exp_values(size_t k,
	const std::vector<std::string>& kmers,
	const Matrix2D<int>& comp1,
	const Matrix3D<int>& comp2) ;

	/*!
	* \brief Computes the number of occurences of each dinucleotide
	* pair at each position of a given set of kmers.
	* \param k the length of the kmers.
	* \param kmers the kmers in integer format (A:0, C:1, G:2, T:3).
	* \param counts the number of occurences of each kmer.
	* \return a matrix containing the number of counts of
	* each pair of dinucleotide in the kmers. It has the
	* following dimensions :
	* 1st 4 for A, C, G, T (1st base of the dinucleotide)
	* 2nd k-1
	* 3rd 4 for A, C, G, T (2n base of the dinucleotide)
	*/
	Matrix3D<int> compute_dinucleotide(size_t k,
	const std::vector<std::string>& kmers,
	const std::vector<int>& counts) ;

	/*!
	* \brief Computes the number of occurences of each nucleotide
	* at each position of a given set of kmers.
	* \param k the length of the kmers.
	* \param kmers the kmers in integer format (A:0, C:1, G:2, T:3).
	* \param counts the number of occurences of each kmer
	* \return a matrix with the number of counts of each
	* base at each position in the kmers. It has the following
	* dimensions :
	* 1st dim k
	* 2nd dim 4 for A, C, G, T
	*/
	Matrix2D<int> compute_mononucleotide(size_t k,
	const std::vector<std::string>& kmers,
	const std::vector<int>& counts) ;

	}

	#endif // KMER_STATISTICS_HPP
	diff --git a/src/main_test.cpp b/src/main_test.cpp
	index a827475..5a9e265 100644
	--- a/src/main_test.cpp
	+++ b/src/main_test.cpp
	@@ -1,713 +1,404 @@
	#include <iostream>
	#include <string>
	#include <vector>
	#include <utility>
	#include <cmath>
	#include <unordered_map>

	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>
	#include <Matrix4D.hpp>
	#include <ConsensusSequenceModelComputer.hpp>
	#include <EMConsensusSequence.hpp>
	#include <SequenceModelComputer.hpp>
	#include <EMSequence.hpp>
	#include <dna_utility.hpp>
	#include <Statistics.hpp>
	#include <sorting_utility.hpp>

	#include <boost/math/distributions.hpp>

	#include <KmersStatistics.hpp>

	template<class T>
	std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
	{ for(const auto& x : v)
	{ stream << x << " " ; }
	return stream ;
	}

	template<class T, class U>
	std::ostream& operator << (std::ostream& stream, const std::unordered_map<T,U>& m)
	{ for(const auto& bucket : m)
	{ stream << bucket.first << " " << bucket.second << std::endl ; }
	return stream ;
	}


	Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
	const Matrix4D<double>& prob,
	size_t class_k)
	{
	size_t n_seq = seq.get_nrow() ;
	size_t l_seq = seq.get_ncol() ;
	// size_t n_class = prob.get_dim()[1] ;
	size_t n_shift = prob.get_dim()[2] ;
	size_t n_flip = prob.get_dim()[3] ;
	bool flip = n_flip == 2 ;
	size_t l_model = l_seq - n_shift + 1 ;

	Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;

	// aggregate
	for(size_t i=0; i<n_seq; i++)
	{ std::cerr << "i " << i << std::endl ;
	for(size_t s=0; s<n_shift; s++)
	{ //std::cerr << "s " << s << std::endl ;
	double tot = 0. ;
	// aggregate
	for(size_t j=0; j<l_model; j++)
	{ // std::cerr << "j " << j << std::endl ;
	// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
	int base = seq(i,s+j) ;
	// N should participate to A,C,G,T equivalently -> same as doing nothing
	if(base == dna::char_to_int('N'))
	{ continue ; }
	int base_rv = 4 - base - 1 ;
	// --------- forward ---------
	// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
	cons_seq(i,j,base) += prob(i,class_k,s,0) ;
	tot += prob(i,class_k,s,0) ;

	// --------- reverse ---------
	if(flip)
	{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
	cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ;
	tot += prob(i,class_k,s,1) ;
	}
	}
	}
	}

	// normalize
	for(size_t i=0; i<n_seq; i++)
	{ for(size_t j=0; j<l_model; j++)
	{ double tot = 0. ;
	for(size_t k=0; k<4; k++)
	{ tot += cons_seq(i,j,k) ; }
	for(size_t k=0; k<4; k++)
	{ cons_seq(i,j,k) /= tot ; }
	}
	}

	return cons_seq ;
	}

	Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
	{
	size_t n_row = seq.get_nrow() ;
	size_t n_col = seq.get_ncol() ;

	Matrix2D<int> iseq(n_row, n_col, 0.) ;

	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_col; j++)
	{ iseq(i,j) = dna::char_to_int(seq(i,j)) ; }
	}
	return iseq ;
	}

	Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
	{
	size_t n_row = seq.get_nrow() ;
	size_t n_col = seq.get_ncol() ;
	size_t n_dim3 = 4 ; // A, C, G, T

	int base_n = dna::char_to_int('N') ;

	Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;


	double prob = 1. - 3*(1e-100) ; // p(base) - p(!base)

	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_col; j++)
	{ int base = seq(i,j) ;
	// N
	if(base == base_n)
	{ cseq(i,j,0) = 0.25 ;
	cseq(i,j,1) = 0.25 ;
	cseq(i,j,2) = 0.25 ;
	cseq(i,j,3) = 0.25 ;
	}
	// A C G T
	else
	{ cseq(i,j, base) = prob ; }
	}
	}
	return cseq ;
	}

	double score_consensus_subseq(const Matrix3D<double>& cons_seq,
	size_t row,
	size_t start,
	const Matrix2D<double>& model)
	{ size_t ncol = cons_seq.get_dim()[1] ;
	// size_t dim1 = cons_seq.get_dim()[0] ;
	// size_t dim2 = cons_seq.get_dim()[1] ;
	size_t dim3 = cons_seq.get_dim()[2] ;

	if(start > ncol - model.get_nrow())
	{ char msg[4096] ;
	sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
	start, ncol - model.get_nrow()) ;
	throw std::invalid_argument(msg) ;
	}
	else if(model.get_nrow() > ncol)
	{ char msg[4096] ;
	sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
	model.get_nrow(), ncol) ;
	throw std::invalid_argument(msg) ;
	}
	else if(model.get_ncol() != 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
	model.get_ncol()) ;
	throw std::invalid_argument(msg) ;
	}
	else if(dim3 != 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
	dim3) ;
	throw std::invalid_argument(msg) ;
	}

	/*

	double ll = 0 ;
	for(size_t j_seq=from, j_mod=0; j_seq<to; j_seq++, j_mod++)
	{ for(size_t k=0; k<dim3; k++)
	{ std::cerr << ll
	<< " *= "
	<< "("
	<< model(j_mod,k)
	<< " * "
	<< cons_seq(row,j_seq,k)
	<< ")"
	<< std::endl ;
	ll = model(j_mod,k) cons_seq(row,j_seq,k) ;
	}
	}*/

	size_t from = start ;
	size_t to = from + model.get_nrow() ; // will score [from, to)

	double ll = 0. ;

	for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
	{ double sum = 0. ;
	for(size_t k=0; k<dim3; k++)
	{ std::cerr << sum
	<< "+= ("
	<< cons_seq(row, j_seq, k)
	<< " * "
	<< model(j_mod, k)
	<< ")"
	<< std::endl ;
	sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ;
	}
	std::cerr << sum << std::endl ;
	std::cerr << "---------------" << std::endl ;
	ll += log(sum) ;
	}

	return ll ;
	}


	Matrix2D<double> test_consensus(const std::string& file,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed,
	size_t n_thread)
	{ size_t n_flip = flip + 1 ;

	Matrix2D<int> seq(file) ;
	// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
	Matrix3D<double> cseq = sequence_to_consensus(seq) ;

	// partition
	EMConsensusSequence em(cseq,
	n_class,
	n_iter,
	n_shift,
	flip,
	bckg_class,
	seed,
	n_thread) ;
	em.classify() ;

	// get post prob
	Matrix4D<double> prob = em.get_post_prob() ;

	// compute models
	ConsensusSequenceModelComputer mc(std::move(cseq),
	prob,
	bckg_class,
	n_thread) ;

	Matrix2D<double> model = mc.get_model() ;

	// compute the class prob
	size_t n_row = prob.get_dim()[0] ;
	std::vector<double> class_prob(n_class, 0.) ;
	double p_tot = 0. ;
	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_class; j++)
	{ for(size_t k=0; k<n_shift; k++)
	{ for(size_t l=0; l<n_flip; l++)
	{ class_prob[j] += prob(i,j,k,l) ;
	p_tot += prob(i,j,k,l) ;
	}
	}
	}
	}
	for(auto& prob : class_prob)
	{ prob /= p_tot ; }
	// create a matrix containing the class prob in the 1st
	// column and the model in the remaining columns
	Matrix2D<double> model_final(model.get_nrow(),
	model.get_ncol() + 1) ;
	size_t i_class = 0 ;
	for(size_t i=0; i<model_final.get_nrow(); i++)
	{ if((i != 0) and (i % 4 == 0))
	{ i_class++ ; }
	model_final(i,0) = class_prob[i_class] ;
	}
	// fill the remaining with the model parameters
	for(size_t i=0; i<model.get_nrow(); i++)
	{ for(size_t j=0; j<model.get_ncol(); j++)
	{ model_final(i,j+1) = model(i,j) ; }
	}
	return model_final ;
	}

	Matrix2D<double> test_sequence(const std::string& file,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed,
	size_t n_thread)
	{ size_t n_flip = flip + 1 ;

	Matrix2D<int> seq(file) ;
	// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;

	// partition
	EMSequence em(seq,
	n_class,
	n_iter,
	n_shift,
	flip,
	bckg_class,
	seed,
	n_thread) ;
	em.classify() ;

	// get post prob
	Matrix4D<double> prob = em.get_post_prob() ;

	// compute models
	SequenceModelComputer mc(std::move(seq),
	prob,
	bckg_class,
	n_thread) ;

	Matrix2D<double> model = mc.get_model() ;

	// compute the class prob
	size_t n_row = prob.get_dim()[0] ;
	std::vector<double> class_prob(n_class, 0.) ;
	double p_tot = 0. ;
	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_class; j++)
	{ for(size_t k=0; k<n_shift; k++)
	{ for(size_t l=0; l<n_flip; l++)
	{ class_prob[j] += prob(i,j,k,l) ;
	p_tot += prob(i,j,k,l) ;
	}
	}
	}
	}
	for(auto& prob : class_prob)
	{ prob /= p_tot ; }
	// create a matrix containing the class prob in the 1st
	// column and the model in the remaining columns
	Matrix2D<double> model_final(model.get_nrow(),
	model.get_ncol() + 1) ;
	size_t i_class = 0 ;
	for(size_t i=0; i<model_final.get_nrow(); i++)
	{ if((i != 0) and (i % 4 == 0))
	{ i_class++ ; }
	model_final(i,0) = class_prob[i_class] ;
	}
	// fill the remaining with the model parameters
	for(size_t i=0; i<model.get_nrow(); i++)
	{ for(size_t j=0; j<model.get_ncol(); j++)
	{ model_final(i,j+1) = model(i,j) ; }
	}
	return model_final ;
	}

	-Matrix2D<double> test_aggregate(const std::string& file)
	-{ Matrix2D<int> seq(file) ;
	- size_t nrow = seq.get_nrow() ;
	- size_t ncol = seq.get_ncol() ;

	- Matrix2D<double> model_final(4, ncol+1, 1e-100) ;

	- for(size_t i=0; i<4; i++)
	- { model_final(i,0) = 1. ; }
	-
	- std::vector<double> sum(ncol+1, 0.) ;
	-
	- for(size_t i=0; i<nrow; i++)
	- { for(size_t j=0; j<ncol; j++)
	- { int base = seq(i,j) ;
	- if(base == dna::char_to_int('N'))
	- { continue ; }
	- model_final(base, j+1) += 1. ;
	- sum[j+1] += 1. ;
	- }
	- }
	-
	- for(size_t i=0; i<model_final.get_nrow(); i++)
	- { for(size_t j=1; j<model_final.get_ncol()+1; j++)
	- { model_final(i,j) /= sum[j] ; }
	- }
	-
	-
	- return model_final ;
	-}
	-
	-
	-std::unordered_map<std::string, int> get_kmer_count(const Matrix3D<double>& consensus_sequence,
	- size_t length)
	-{ std::unordered_map<std::string, int> kmer_count ;
	-
	- size_t n_row = consensus_sequence.get_dim()[0] ;
	- size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
	-
	- for(size_t i=0; i<n_row; i++)
	- { for(size_t s=0; s<n_shift; s++)
	- { // get kmer
	- std::string kmer = dna::consensus_to_kmer(consensus_sequence,
	- i,
	- s,
	- s+length) ;
	- // insert
	- auto iter = kmer_count.find(kmer) ;
	- if(iter == kmer_count.end())
	- { kmer_count.emplace(kmer, 1) ; }
	- // update
	- else
	- { iter->second += 1 ; }
	- }
	- }
	- return kmer_count ;
	-}
	-
	-Matrix2D<int> compute_mononucleotide(size_t k,
	- const std::vector<std::string>& kmers,
	- const std::vector<int>& counts)
	-{
	- Matrix2D<int> comp(k, 4, 0) ;
	-
	- for(size_t i=0; i<kmers.size(); i++)
	- {
	- std::string kmer = kmers[i] ;
	- int count = counts[i] ;
	-
	- for (size_t j=0; j<k ; j++)
	- { int base = int(kmer[j]) - int('0') ;
	- comp(j, base) += count ;
	- }
	- }
	- return(comp);
	-}
	-
	-Matrix3D<int> compute_dinucleotide(size_t k,
	- const std::vector<std::string>& kmers,
	- const std::vector<int>& counts)
	-{
	- Matrix3D<int> comp2(4, k-1, 4) ;
	-
	- for(size_t i=0; i<kmers.size(); i++)
	- { std::string kmer = kmers[i] ;
	- int count = counts[i] ;
	- for (size_t j=0; j<k-1; j++)
	- { int base1 = int(kmer[j]) - int('0') ;
	- int base2 = int(kmer[j+1]) - int('0') ;
	- comp2(base1, j, base2) += count ;
	- }
	- }
	- return(comp2);
	-}
	-
	-std::vector<double> compute_exp_values(size_t k,
	- const std::vector<std::string>& kmers,
	- const Matrix2D<int>& comp1,
	- const Matrix3D<int>& comp2)
	-{ std::vector<double> expected(kmers.size()) ;
	-
	- for(size_t i=0; i<kmers.size(); i++)
	- { std::string kmer = kmers[i] ;
	- double exp = comp1(0, int(kmer[0]) - int('0')) ;
	-
	- for(size_t j=0; j<k-1; j++)
	- { int base1 = int(kmer[j]) - int('0') ;
	- int base2 = int(kmer[j+1]) - int('0') ;
	- exp *= ((double)comp2(base1, j, base2) /
	- (double)comp1(j+1, base2)) ;
	- }
	-
	- expected[i] = exp ;
	- }
	- return expected;
	-}
	-
	-std::vector<double> compute_pvalues(const std::vector<int>& counts,
	- const std::vector<double>& expected)
	-{
	- std::vector<double> pvalues(counts.size()) ;
	-
	- for(size_t i=0; i<counts.size(); i++)
	- {
	- pvalues[i] = poisson_cdf((double)counts[i],
	- expected[i],
	- false) ;
	- }
	- return pvalues ;
	-}
	-
	-std::pair<std::vector<std::string>, std::vector<double>>
	-compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
	- size_t k)
	-{
	-
	- // get kmer count
	- std::unordered_map<std::string, int> kmer_count = get_kmer_count(consensus_sequences,
	- k) ;
	-
	- /*
	- consensus_sequences.get_dim() ;
	- std::unordered_map<std::string, int> kmer_count ;
	- // ANN
	- kmer_count["000"] = 1 ;
	- kmer_count["001"] = 2 ;
	- kmer_count["002"] = 3 ;
	- kmer_count["003"] = 4 ;
	- kmer_count["010"] = 5 ;
	- kmer_count["011"] = 6 ;
	- kmer_count["012"] = 7 ;
	- kmer_count["013"] = 8 ;
	- kmer_count["020"] = 9 ;
	- kmer_count["021"] = 10 ;
	- kmer_count["022"] = 11 ;
	- kmer_count["023"] = 12 ;
	- kmer_count["030"] = 13 ;
	- kmer_count["031"] = 14 ;
	- kmer_count["032"] = 15 ;
	- kmer_count["033"] = 16 ;
	- // CNN
	- kmer_count["100"] = 1 ;
	- kmer_count["101"] = 2 ;
	- kmer_count["102"] = 3 ;
	- kmer_count["103"] = 4 ;
	- kmer_count["110"] = 5 ;
	- kmer_count["111"] = 6 ;
	- kmer_count["112"] = 7 ;
	- kmer_count["113"] = 8 ;
	- kmer_count["120"] = 9 ;
	- kmer_count["121"] = 10 ;
	- kmer_count["122"] = 11 ;
	- kmer_count["123"] = 12 ;
	- kmer_count["130"] = 13 ;
	- kmer_count["131"] = 14 ;
	- kmer_count["132"] = 15 ;
	- kmer_count["133"] = 16 ;
	- // GNN
	- kmer_count["200"] = 1 ;
	- kmer_count["201"] = 2 ;
	- kmer_count["202"] = 3 ;
	- kmer_count["203"] = 4 ;
	- kmer_count["210"] = 5 ;
	- kmer_count["211"] = 6 ;
	- kmer_count["212"] = 7 ;
	- kmer_count["213"] = 8 ;
	- kmer_count["220"] = 9 ;
	- kmer_count["221"] = 10 ;
	- kmer_count["222"] = 11 ;
	- kmer_count["223"] = 12 ;
	- kmer_count["230"] = 13 ;
	- kmer_count["231"] = 14 ;
	- kmer_count["232"] = 15 ;
	- kmer_count["233"] = 16 ;
	- // TNN
	- kmer_count["300"] = 1 ;
	- kmer_count["301"] = 2 ;
	- kmer_count["302"] = 3 ;
	- kmer_count["303"] = 4 ;
	- kmer_count["310"] = 5 ;
	- kmer_count["311"] = 6 ;
	- kmer_count["312"] = 7 ;
	- kmer_count["313"] = 8 ;
	- kmer_count["320"] = 9 ;
	- kmer_count["321"] = 10 ;
	- kmer_count["322"] = 11 ;
	- kmer_count["323"] = 12 ;
	- kmer_count["330"] = 13 ;
	- kmer_count["331"] = 14 ;
	- kmer_count["332"] = 15 ;
	- kmer_count["333"] = 16 ;
	- */
	-
	- // turn to vectors
	- std::vector<std::string> kmers(kmer_count.size()) ;
	- size_t i=0 ;
	- for(const auto& iter : kmer_count)
	- { kmers[i] = iter.first ;
	- i++ ;
	- }
	- std::sort(kmers.begin(), kmers.end()) ;
	- std::vector<int> counts(kmer_count.size()) ;
	- i=0 ;
	- for(const auto& kmer : kmers)
	- { counts[i] = kmer_count[kmer] ;
	- i++ ;
	- }
	- kmer_count.clear() ;
	-
	- // get mononucleotide composition in kmer
	- // this is a probability matrix modeling the kmer
	- Matrix2D<int> comp1 = compute_mononucleotide(k, kmers, counts) ;
	-
	- // get dinucleotide composition in kmer
	- // this is a probability matrix modeling the kmer
	- Matrix3D<int> comp2 = compute_dinucleotide(k, kmers, counts) ;
	-
	- // get expected number of occurence for each kmer
	- std::vector<double> expected = compute_exp_values(k,
	- kmers,
	- comp1,
	- comp2) ;
	-
	- // compute the pvalue associated to each kmer
	- std::vector<double> pvalues = compute_pvalues(counts,
	- expected) ;
	-
	- return std::make_pair(kmers, pvalues) ;
	-}

	int main()
	{
	- // Matrix3D<double> mat(1,5,4,0.) ;
	- // mat(0,0,0) = 1. ; mat(0,1,1) = 1. ; mat(0,2,2) = 1. ; mat(0,3,3) = 1. ; mat(0,4,2) = 1. ;
	-
	-
	-
	std::string file = "/local/groux/scATAC-seq/data/"
	"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
	- Matrix3D<double> mat = sequence_to_consensus(Matrix2D<int>(file)) ;
	-
	-
	- size_t k = 12 ;
	- auto kmers_pvalues = kmers::compute_kmer_pvalue(mat, k) ;
	-
	- // for(size_t i=0; i<kmers_pvalues.first.size(); i++)
	- // { std::cerr << kmers_pvalues.first[i] << " " << kmers_pvalues.second[i] << std::endl ; }
	-
	- std::vector<size_t> index = order(kmers_pvalues.second, true) ;
	-
	- for(size_t i=0 ; i<10; i++)
	- { size_t idx = index[i] ;
	- std::cout << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ;
	- }
	+ sequence_to_consensus(Matrix2D<int>(file)).save("sp1_motifs_10e-7_consensussequences.mat3D") ;

	/*
	size_t n_class = 2 ;
	- size_t n_iter = 50 ;sequence_to_consensus
	+ size_t n_iter = 50 ;
	size_t n_shift = 1 ;
	bool flip = true ;
	bool bckg_class = false ;
	std::string seed = "" ;
	size_t n_thread = 1 ;

	-
	- std::string file = "/local/groux/scATAC-seq/data/"
	- "10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
	- */
	-
	- // std::string file = "/local/groux/scATAC-seq/test.mat" ;
	- // std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
	- // std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
	- // std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
	-
	- /*
	Matrix2D<double> model_final = test_consensus(file,
	n_class,
	n_iter,
	n_shift,
	flip,
	bckg_class,
	seed,
	n_thread) ;
	-
	*/
	+
	/*
	Matrix2D<double> model_final = test_sequence(file,
	n_class,
	n_iter,
	n_shift,
	flip,
	bckg_class,
	seed,
	n_thread) ;
	*/


	/*
	Matrix2D<double> model_final = test_aggregate(file) ;
	*/

	// std::cout << model_final << std::endl ;

	-
	- /*
	- double p0 = 0 ;
	- // double l0 = log(p0) ;
	-
	- double p1 = 1. - (3.*p0) ;
	- // double l1 = log(p1) ;
	-
	- // Matrix3D<double> cseq(1, 4, 4) ;
	- // cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
	- // cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
	- // cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
	- // cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;
	-
	- Matrix2D<int> seq(1, 4, -1) ;
	- seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
	- Matrix3D<double> cseq = sequence_to_consensus(seq) ;
	-
	- std::cout << cseq << std::endl << std::endl ;
	-
	- Matrix2D<double> mod(4, 4, -1) ;
	- mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
	- mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
	- mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
	- mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;
	-
	- std::cout << mod << std::endl ;
	-
	- std::cout << score_consensus_subseq(cseq, 0,0,mod)
	- << std::endl ;
	- */
	-
	return EXIT_SUCCESS ;
	}

No OneTemporaryActions

File Metadata

View Options

Event Timeline

No OneTemporary
Actions