SequenceLayer.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Apr 28, 18:12

SequenceLayer.cpp
View Options

	#include <SequenceLayer.hpp>
	#include <stdexcept> // std::invalid_argument
	#include <limits> // numeric_limits
	#include <cmath> // log(), pow()
	#include <vector>
	#include <algorithm> // std::max_element()

	#include <Statistics.hpp> // beta_pmf()
	#include <Random.hpp> // rand_real_uniform(), rand_int_uniform()
	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>
	#include <Matrix4D.hpp>
	#include <dna_utility.hpp>


	double SequenceLayer::score_subseq(const Matrix2D<int>& seq,
	size_t row,
	size_t start,
	const Matrix2D<double>& model_log)
	{
	if(start > seq.get_ncol() - model_log.get_nrow())
	{ char msg[4096] ;
	sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
	start, seq.get_ncol() - model_log.get_nrow()) ;
	throw std::invalid_argument(msg) ;
	}
	else if(model_log.get_nrow() > seq.get_ncol())
	{ char msg[4096] ;
	sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
	model_log.get_nrow(), seq.get_ncol()) ;
	throw std::invalid_argument(msg) ;
	}
	else if(model_log.get_ncol() != 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
	model_log.get_ncol()) ;
	throw std::invalid_argument(msg) ;
	}

	size_t from = start ;
	size_t to = from + model_log.get_nrow() ; // will score [from, to)

	int n_code = dna::char_to_int('N') ;
	double ll = 0 ;
	for(size_t i=from, j=0; i<to; i++, j++)
	{ int base = seq(row,i) ;
	// N char -> get max score
	if(base == n_code)
	{ std::vector<double> row = model_log.get_row(j) ;
	ll += *(std::max_element(std::begin(row),
	std::end(row))) ;
	}
	// A,C,G,T -> get its score
	else
	{ ll += model_log(j,base) ; }
	}
	return ll ;
	}

	SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
	size_t n_class,
	size_t n_shift,
	bool flip,
	bool last_class_cst)
	: DataLayer(data, n_class, n_shift, flip),
	last_class_cst(last_class_cst)
	{ this->n_category = 4 ;
	// initialise the empty model
	this->model = Matrix3D<double>(this->n_class,
	this->l_model,
	this->n_category,
	0.) ;
	}

	SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
	const Matrix3D<double>& model,
	bool flip,
	bool last_class_cst)
	: DataLayer(data, model,flip),
	last_class_cst(last_class_cst)
	{}

	SequenceLayer::~SequenceLayer()
	{}

	void SequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
	vector_d& loglikelihood_max,
	ThreadPool* threads) const
	{
	// dimension checks
	this->check_loglikelihood_dim(loglikelihood) ;
	this->check_loglikelihood_max_dim(loglikelihood_max) ;

	// compute the log prob model and the log prob reverse-complement model
	std::vector<Matrix2D<double>> model_log(this->n_class,
	Matrix2D<double>(this->l_model,
	this->n_category,
	0.)) ;
	std::vector<Matrix2D<double>> model_log_rev = model_log ;
	/*
	Matrix3D<double> model_log(this->n_class,
	this->l_model,
	this->n_category,
	0.) ;
	Matrix3D<double> model_log_rev = model_log ;
	*/
	for(size_t i=0; i<this->n_class; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ // forward
	model_log[i](j,k) = log(this->model(i,j,k)) ;
	// reverse
	model_log_rev[i](this->l_model-j-1,this->n_category-k-1)
	= log(this->model(i,j,k)) ;
	}
	}
	}

	// don't parallelize
	if(threads == nullptr)
	{ std::promise<bool> promise ;
	std::future<bool> future = promise.get_future() ;
	this->compute_loglikelihoods_routine(0, this->n_row,
	loglikelihood,
	loglikelihood_max,
	model_log,
	model_log_rev,
	promise) ;
	future.get() ;
	}
	// parallelize
	else
	{ size_t n_threads = threads->getNThread() ;
	// compute the slices on which each thread will work
	std::vector<std::pair<size_t,size_t>> slices =
	ThreadPool::split_range(0, this->n_row, n_threads) ;

	// get promises and futures
	// the function run by the threads will simply fill the promise with
	// "true" to indicate that they are done
	std::vector<std::promise<bool>> promises(n_threads) ;
	std::vector<std::future<bool>> futures(n_threads) ;
	for(size_t i=0; i<n_threads; i++)
	{ futures[i] = promises[i].get_future() ; }
	// distribute work to threads
	// -------------------------- threads start --------------------------
	for(size_t i=0; i<n_threads; i++)
	{ auto slice = slices[i] ;
	threads->addJob(std::move(
	std::bind(&SequenceLayer::compute_loglikelihoods_routine,
	this,
	slice.first,
	slice.second,
	std::ref(loglikelihood),
	std::ref(loglikelihood_max),
	std::ref(model_log),
	std::ref(model_log_rev),
	std::ref(promises[i])))) ;
	}
	// wait until all threads are done working
	for(auto& future : futures)
	{ future.get() ; }
	// -------------------------- threads stop ---------------------------
	}
	}

	void SequenceLayer::compute_loglikelihoods_routine(size_t from,
	size_t to,
	Matrix4D<double>& loglikelihood,
	vector_d& loglikelihood_max,
	const std::vector<Matrix2D<double>>& model_log,
	const std::vector<Matrix2D<double>>& model_log_rev,
	std::promise<bool>& done) const
	{
	// compute log likelihood
	for(size_t i=from; i<to; i++)
	{
	// set max to min possible value
	loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;

	for(size_t j=0; j<this->n_class; j++)
	{
	for(size_t s=0; s<this->n_shift; s++)
	{ // forward strand
	{ double ll_fw = score_subseq(this->data, i, s, model_log[j]) ;
	loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
	// keep track of max per row
	if(ll_fw > loglikelihood_max[i])
	{ loglikelihood_max[i] = ll_fw ; }

	}
	// reverse
	if(this->flip)
	{ double ll_rev = score_subseq(this->data, i, s, model_log_rev[j]) ;
	loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
	// keep track of max per row
	if(ll_rev > loglikelihood_max[i])
	{ loglikelihood_max[i] = ll_rev ; }
	}
	}
	}
	}
	done.set_value(true) ;
	}

	void SequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
	ThreadPool* threads)
	{ // don't parallelize
	if(threads == nullptr)
	{ std::promise<Matrix3D<double>> promise ;
	std::future<Matrix3D<double>> future = promise.get_future() ;
	this->update_model_routine(0,
	this->n_row,
	posterior_prob,
	promise) ;
	// this->model = future.get() ;
	auto model = future.get() ;
	size_t n_class_to_update = this->n_class - this->last_class_cst ;
	for(size_t i=0; i<n_class_to_update; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ this->model(i,j,k) = model(i,j,k) ; }
	}
	}
	}
	// parallelize
	else
	{ size_t n_threads = threads->getNThread() ;
	// compute the slices on which each thread will work
	std::vector<std::pair<size_t,size_t>> slices =
	ThreadPool::split_range(0, this->n_row, n_threads) ;

	// get promises and futures
	// the function run by the threads will simply fill the promise with
	// "true" to indicate that they are done
	std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
	std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
	for(size_t i=0; i<n_threads; i++)
	{ futures[i] = promises[i].get_future() ; }
	// distribute work to threads
	// -------------------------- threads start --------------------------
	for(size_t i=0; i<n_threads; i++)
	{ auto slice = slices[i] ;
	threads->addJob(std::move(
	std::bind(&SequenceLayer::update_model_routine,
	this,
	slice.first,
	slice.second,
	std::ref(posterior_prob),
	std::ref(promises[i])))) ;
	}
	// reinitialise the model
	/*
	this->model = Matrix3D<double>(this->n_class,
	this->l_model,
	this->n_category,
	0.) ;
	*/
	size_t n_class_to_update = this->n_class - this->last_class_cst ;
	for(size_t i=0; i<n_class_to_update; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ this->model(i,j,k) = 0. ; }
	}
	}
	// wait until all threads are done working
	// and update the model
	for(auto& future : futures)
	{ Matrix3D<double> model_part = future.get() ;
	for(size_t i=0; i<n_class_to_update; i++)
	// for(size_t i=0; i<this->n_class; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ this->model(i,j,k) += model_part(i,j,k) ; }
	}
	}
	}
	// -------------------------- threads stop ---------------------------
	}
	// make sure to have no 0 values
	for(size_t i=0; i<this->n_class; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ for(size_t k=0; k<this->n_category; k++)
	{ this->model(i,j,k) =
	std::max(this->model(i,j,k), SequenceLayer::p_min) ;
	}
	}
	}
	// normalize to get probs
	for(size_t i=0; i<this->n_class; i++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ double sum = 0. ;
	for(size_t k=0; k<this->n_category; k++)
	{ sum += this->model(i,j,k) ; }
	for(size_t k=0; k<this->n_category; k++)
	{ double p = this->model(i,j,k) / sum ;
	this->model(i,j,k) = p ;
	/*
	this->model(i,j,k) =
	std::max(p, SequenceLayer::p_min) ;
	*/
	}
	}
	}
	}


	void SequenceLayer::update_model_routine(size_t from,
	size_t to,
	const Matrix4D<double>& posterior_prob,
	std::promise<Matrix3D<double>>& promise) const
	{ // dimension checks
	this->check_posterior_prob_dim(posterior_prob) ;

	Matrix3D<double> model(this->n_class,
	this->l_model,
	this->n_category,
	0.) ;

	// the int code of A, C, G, T, N
	static int a_code = dna::char_to_int('A') ;
	static int c_code = dna::char_to_int('C') ;
	static int g_code = dna::char_to_int('G') ;
	static int t_code = dna::char_to_int('T') ;
	static int n_code = dna::char_to_int('N') ;
	// the int code of the reverse complement of A, C, G, T
	static int a_code_r = dna::char_to_int('A', true) ;
	static int c_code_r = dna::char_to_int('C', true) ;
	static int g_code_r = dna::char_to_int('G', true) ;
	static int t_code_r = dna::char_to_int('T', true) ;

	size_t n_class_to_update = this->n_class - this->last_class_cst ;

	for(size_t k=0; k < n_class_to_update; k++)
	// for(size_t k=0; k < this->n_class; k++)
	{ for(size_t s=0; s<this->n_shift; s++)
	{ for(size_t j=0; j<this->l_model; j++)
	{ // base prob on fw and rv strand
	vector_d base_prob_fw(this->n_category, 0.) ;
	vector_d base_prob_rv(this->n_category, 0.) ;
	for(size_t i=from; i<to; i++)
	{ int base = this->data(i,s+j) ;
	int base_rev = this->n_category - base - 1 ;
	// N
	if(base == n_code)
	{ // --------------- forward ---------------
	{ base_prob_fw[a_code] +=
	posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
	base_prob_fw[c_code] +=
	posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
	base_prob_fw[g_code] +=
	posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
	base_prob_fw[t_code] +=
	posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
	}
	// --------------- reverse ---------------
	if(this->flip)
	{ base_prob_rv[a_code_r] +=
	posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
	base_prob_rv[c_code_r] +=
	posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
	base_prob_rv[g_code_r] +=
	posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
	base_prob_rv[t_code_r] +=
	posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
	}
	}
	// A, C, G, T
	else
	{ { base_prob_fw[base] +=
	posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
	}
	// --------------- reverse ---------------
	if(this->flip)
	{ base_prob_rv[base_rev] +=
	posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
	}
	}
	}
	// update this position of the model
	for(size_t i=0,i_rev=base_prob_fw.size()-1; i<base_prob_fw.size(); i++,i_rev--)
	{ // --------------- forward ---------------
	{ model(k,j,i) += base_prob_fw[i] ; }
	// --------------- reverse ---------------
	if(this->flip)
	{ model(k,this->l_model-j-1,i) += base_prob_rv[i] ; }
	}
	}
	}
	}
	promise.set_value(model) ;
	}

	void SequenceLayer::set_class(size_t i, const Matrix2D<double>& motif)
	{ // check dimensions
	if(motif.get_nrow() != this->n_category)
	{ char msg[4096] ;
	sprintf(msg, "Error! the given class model is incompatible "
	"with the SequenceLayer (%zu rows instead of %zu)",
	motif.get_nrow(), this->n_category) ;
	throw std::invalid_argument(msg) ;
	}
	else if(motif.get_ncol() != this->l_model)
	{ char msg[4096] ;
	sprintf(msg, "Error! the given class model is incompatible "
	"with the SequenceLayer (%zu columns instead of %zu)",
	motif.get_ncol(), this->l_model) ;
	throw std::invalid_argument(msg) ;
	}

	for(size_t j=0; j<motif.get_ncol(); j++)
	{ for(size_t k=0; k<motif.get_nrow(); k++)
	{ this->model(i,j,k) = motif(k,j) ; }
	}
	}

SequenceLayer.cppNo OneTemporaryActions

File Metadata

SequenceLayer.cppView Options

Event Timeline

SequenceLayer.cpp
No OneTemporary
Actions

SequenceLayer.cpp
View Options