EMSequence.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, May 7, 23:42

EMSequence.cpp
View Options

	#include <EMSequence.hpp>

	#include <string>
	#include <vector>
	#include <future> // std::promise, std::future
	#include <utility> // std::pair, std::move()
	#include <functional> // std::bind(), std::ref()
	#include <cmath> // exp()

	#include <SequenceLayer.hpp> // SequenceLayer
	#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
	#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
	#include <ThreadPool.hpp> // ThreadPool
	#include <dna_utility.hpp> // dna::base_composition()


	Matrix2D<double> EMSequence::generate_bckg_motif(const Matrix2D<int>& sequence_matrix,
	size_t motif_length,
	bool reverse_compl)
	{ // sequence composition
	std::vector<double> base_comp =
	dna::base_composition(sequence_matrix,
	reverse_compl) ;
	// create a motif
	Matrix2D<double> bckg_motif(4,motif_length) ;
	for(size_t i=0; i<bckg_motif.get_nrow(); i++)
	{ for(size_t j=0; j<bckg_motif.get_ncol(); j++)
	{ bckg_motif(i,j) = base_comp[i] ; }
	}

	return bckg_motif ;
	}

	EMSequence::EMSequence(const Matrix2D<int>& seq_matrix,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed,
	size_t n_threads)
	: EMBase(seq_matrix.get_nrow(),
	seq_matrix.get_ncol(),
	n_class,
	n_iter,
	n_shift,
	flip,
	n_threads),
	loglikelihood_max(n_row, 0.),
	seq_layer(nullptr)
	{
	this->loglikelihood_max = vector_d(n_row, 0.) ;

	// initialise post prob randomly
	// getRandomGenerator(seed) ;
	this->set_post_prob_random(seed) ;

	// data and models
	this->seq_layer = new SequenceLayer(seq_matrix,
	this->n_class,
	this->n_shift,
	this->flip,
	bckg_class) ;

	// intialise the models with the post prob
	this->seq_layer->update_model(this->post_prob,
	this->threads) ;

	// compute background and
	// overwrite last class as background class
	if(bckg_class)
	{ size_t motif_len = seq_matrix.get_ncol() - this->n_shift + 1 ;
	Matrix2D<double> bckg_motif = EMSequence::generate_bckg_motif(seq_matrix,
	motif_len,
	this->flip) ;
	this->seq_layer->set_class(this->n_class-1,
	bckg_motif) ;
	}
	}

	EMSequence::EMSequence(Matrix2D<int>&& seq_matrix,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed,
	size_t n_threads)
	: EMBase(seq_matrix.get_nrow(),
	seq_matrix.get_ncol(),
	n_class,
	n_iter,
	n_shift,
	flip,
	n_threads),
	loglikelihood_max(n_row, 0.),
	seq_layer(nullptr)
	{
	this->loglikelihood_max = vector_d(n_row, 0.) ;

	// initialise post prob randomly
	// getRandomGenerator(seed) ;
	this->set_post_prob_random(seed) ;

	// compute background before giving data to
	// SequenceLayer
	Matrix2D<double> bckg_motif ;
	if(bckg_class)
	{ size_t motif_len = seq_matrix.get_ncol() - this->n_shift + 1 ;
	bckg_motif = EMSequence::generate_bckg_motif(seq_matrix,
	motif_len,
	this->flip) ;
	}
	// data and models
	this->seq_layer = new SequenceLayer(std::move(seq_matrix),
	this->n_class,
	this->n_shift,
	this->flip,
	bckg_class) ;

	// intialise the models with the post prob
	this->seq_layer->update_model(this->post_prob,
	this->threads) ;
	// overwrite last class as background class
	if(bckg_class)
	{ this->seq_layer->set_class(this->n_class-1,
	bckg_motif) ;
	}
	}

	EMSequence::EMSequence(const Matrix2D<int>& seq_matrix,
	const Matrix3D<double>& motifs,
	size_t n_iter,
	bool flip,
	bool bckg_class,
	size_t n_threads)
	: EMBase(seq_matrix.get_nrow(),
	seq_matrix.get_ncol(),
	motifs.get_dim()[0],
	n_iter,
	seq_matrix.get_ncol() - motifs.get_dim()[1] + 1,
	flip,
	n_threads),
	loglikelihood_max(n_row, 0.),
	seq_layer(nullptr)
	{

	this->loglikelihood_max = vector_d(n_row, 0.) ;

	// data and models
	// background motif (if any) is the last of the given motifs
	this->seq_layer = new SequenceLayer(seq_matrix,
	motifs,
	this->flip,
	bckg_class) ;

	// intialise the class prob uniformly
	this->set_state_prob_uniform() ;
	}

	EMSequence::EMSequence(Matrix2D<int>&& seq_matrix,
	Matrix3D<double>&& motifs,
	size_t n_iter,
	bool flip,
	bool bckg_class,
	size_t n_threads)
	: EMBase(seq_matrix.get_nrow(),
	seq_matrix.get_ncol(),
	motifs.get_dim()[0],
	n_iter,
	seq_matrix.get_ncol() - motifs.get_dim()[1] + 1,
	flip,
	n_threads),
	loglikelihood_max(n_row, 0.),
	seq_layer(nullptr)
	{

	this->loglikelihood_max = vector_d(n_row, 0.) ;

	// data and models
	// background motif (if any) is the last of the given motifs
	this->seq_layer = new SequenceLayer(std::move(seq_matrix),
	std::move(motifs),
	this->flip,
	bckg_class) ;

	// intialise the class prob uniformly
	this->set_state_prob_uniform() ;
	}


	EMSequence::~EMSequence()
	{ if(this->seq_layer == nullptr)
	{ delete this->seq_layer ;
	this->seq_layer = nullptr ;
	}
	}

	Matrix3D<double> EMSequence::get_sequence_models() const
	{ return seq_layer->get_model() ; }

	EMSequence::exit_codes EMSequence::classify()
	{
	size_t bar_update_n = this->n_iter ;
	ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;

	// optimize the partition
	for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
	{ // E-step
	this->compute_loglikelihood() ;
	this->compute_post_prob() ;
	// M-step
	this->compute_class_prob() ;
	this->update_models() ;
	this->center_post_state_prob() ;
	bar.update() ;
	}
	bar.update() ; std::cerr << std::endl ;
	return EMSequence::exit_codes::ITER_MAX ;
	}

	void EMSequence::compute_loglikelihood()
	{ // compute the loglikelihood
	this->seq_layer->compute_loglikelihoods(this->loglikelihood,
	this->loglikelihood_max,
	this->threads) ;
	// rescale the values
	// don't parallelize
	if(this->threads == nullptr)
	{ std::promise<bool> promise ;
	std::future<bool> future = promise.get_future() ;
	this->compute_loglikelihood_routine(0,
	this->n_row,
	promise) ;
	future.get() ;
	}
	// parallelize
	else
	{ size_t n_threads = this->threads->getNThread() ;

	// compute the slices on which each thread will work
	std::vector<std::pair<size_t,size_t>> slices =
	ThreadPool::split_range(0, this->n_row,n_threads) ;

	// get promises and futures
	std::vector<std::promise<bool>> promises(n_threads) ;
	std::vector<std::future<bool>> futures(n_threads) ;
	for(size_t i=0; i<n_threads; i++)
	{ futures[i] = promises[i].get_future() ; }

	// distribute work to threads
	// -------------------------- threads start --------------------------
	for(size_t i=0; i<n_threads; i++)
	{ auto slice = slices[i] ;
	this->threads->addJob(std::move(
	std::bind(&EMSequence::compute_loglikelihood_routine,
	this,
	slice.first,
	slice.second,
	std::ref(promises[i])))) ;
	}
	// wait until all threads are done working
	for(auto& future : futures)
	{ future.get() ; }
	// -------------------------- threads stop ---------------------------
	}
	}

	void EMSequence::compute_loglikelihood_routine(size_t from,
	size_t to,
	std::promise<bool>& done)
	{
	// rescale the values
	for(size_t i=from; i<to; i++)
	{ for(size_t j=0; j<this->n_class; j++)
	{ for(size_t k=0; k<this->n_shift; k++)
	{ for(size_t l=0; l<this->n_flip; l++)
	{ this->loglikelihood(i,j,k,l) =
	(this->loglikelihood(i,j,k,l) -
	this->loglikelihood_max[i]) ;
	}
	}
	}
	}
	done.set_value(true) ;
	}

	void EMSequence::compute_post_prob()
	{ // don't parallelize
	if(this->threads == nullptr)
	{ std::promise<vector_d> promise ;
	std::future<vector_d> future = promise.get_future() ;
	this->compute_post_prob_routine(0, this->n_row, promise) ;
	// compute the sum of post prob and the per class sum of post prob
	// from the partial results computed on each slice
	this->post_prob_tot = 0. ;
	this->post_prob_colsum = future.get() ;
	for(const auto& prob : this->post_prob_colsum)
	{ this->post_prob_tot += prob ; }
	}
	// parallelize
	else
	{ size_t n_threads = this->threads->getNThread() ;

	// compute the slices on which each thread will work
	std::vector<std::pair<size_t,size_t>> slices =
	ThreadPool::split_range(0, this->n_row,n_threads) ;

	// get promises and futures
	// the function run by the threads will compute
	// the partial sum per class of post_prob for the given slice
	// this should be used to compute the complete sum of post_prob
	// and the complete sum per class of post_prob
	std::vector<std::promise<vector_d>> promises(n_threads) ;
	std::vector<std::future<vector_d>> futures(n_threads) ;
	for(size_t i=0; i<n_threads; i++)
	{ futures[i] = promises[i].get_future() ; }

	// distribute work to threads
	// -------------------------- threads start --------------------------
	for(size_t i=0; i<n_threads; i++)
	{ auto slice = slices[i] ;
	this->threads->addJob(std::move(
	std::bind(&EMSequence::compute_post_prob_routine,
	this,
	slice.first,
	slice.second,
	std::ref(promises[i])))) ;
	}
	// wait until all threads are done working
	// compute the sum of post prob and the per class sum of post prob
	// from the partial results computed on each slice
	this->post_prob_tot = 0. ;
	this->post_prob_colsum = vector_d(this->n_class, 0.) ;
	for(auto& future : futures)
	{ auto probs = future.get() ;
	for(size_t i=0; i<this->n_class; i++)
	{ double prob = probs[i] ;
	this->post_prob_colsum[i] += prob ;
	this->post_prob_tot += prob ;
	}
	}
	// -------------------------- threads stop ---------------------------
	}
	}


	void EMSequence::compute_post_prob_routine(size_t from,
	size_t to,
	std::promise<vector_d>& post_prob_colsum)
	{ vector_d colsums(this->n_class, 0.) ;

	// reset grand total
	// this->post_prob_tot = 0 ;
	// this->post_prob_colsum = vector_d(n_class, 0) ;

	// post prob
	for(size_t i=from; i<to; i++)
	{ // reset row sum to 0
	this->post_prob_rowsum[i] = 0. ;
	for(size_t n_class=0; n_class<this->n_class; n_class++)
	{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
	{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
	{
	double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
	this->post_state_prob(n_class,n_shift,n_flip) ;
	this->post_prob(i,n_class,n_shift,n_flip) = p ;
	this->post_prob_rowsum[i] += p ;
	}
	}
	}
	// normalize
	for(size_t n_class=0; n_class<this->n_class; n_class++)
	{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
	{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
	{
	double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
	this->post_prob_rowsum[i],
	SequenceLayer::p_min) ;
	this->post_prob(i,n_class,n_shift,n_flip) = p ;
	colsums[n_class] += p ;
	}
	}
	}
	}
	post_prob_colsum.set_value(colsums) ;
	}

	void EMSequence::update_models()
	{ this->seq_layer->update_model(this->post_prob,
	this->threads) ;
	}

EMSequence.cppNo OneTemporaryActions

File Metadata

EMSequence.cppView Options

Event Timeline

EMSequence.cpp
No OneTemporary
Actions

EMSequence.cpp
View Options