Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F75790051
SequenceLayer.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Aug 4, 07:15
Size
17 KB
Mime Type
text/x-c
Expires
Tue, Aug 6, 07:15 (2 d)
Engine
blob
Format
Raw Data
Handle
19608905
Attached To
R8820 scATAC-seq
SequenceLayer.cpp
View Options
#include <SequenceLayer.hpp>
#include <stdexcept> // std::invalid_argument
#include <limits> // numeric_limits
#include <cmath> // log(), pow()
#include <vector>
#include <algorithm> // std::max_element()
#include <Statistics.hpp> // beta_pmf()
#include <Random.hpp> // rand_real_uniform(), rand_int_uniform()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp>
double SequenceLayer::score_subseq(const Matrix2D<int>& seq,
size_t row,
size_t start,
const Matrix2D<double>& model_log)
{
if(start > seq.get_ncol() - model_log.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, seq.get_ncol() - model_log.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model_log.get_nrow() > seq.get_ncol())
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model_log.get_nrow(), seq.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(model_log.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model_log.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
size_t from = start ;
size_t to = from + model_log.get_nrow() ; // will score [from, to)
int n_code = dna::char_to_int('N') ;
double ll = 0 ;
for(size_t i=from, j=0; i<to; i++, j++)
{ int base = seq(row,i) ;
// N char -> get max score
if(base == n_code)
{ std::vector<double> row = model_log.get_row(j) ;
ll += *(std::max_element(std::begin(row),
std::end(row))) ;
}
// A,C,G,T -> get its score
else
{ ll += model_log(j,base) ; }
}
return ll ;
}
SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool last_class_cst)
: DataLayer(data, n_class, n_shift, flip),
last_class_cst(last_class_cst)
{ this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
}
SequenceLayer::SequenceLayer(Matrix2D<int>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool last_class_cst)
: DataLayer(std::move(data), n_class, n_shift, flip),
last_class_cst(last_class_cst)
{ this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
}
SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
const Matrix3D<double>& model,
bool flip,
bool last_class_cst)
: DataLayer(data, model,flip),
last_class_cst(last_class_cst)
{}
SequenceLayer::SequenceLayer(Matrix2D<int>&& data,
Matrix3D<double>&& model,
bool flip,
bool last_class_cst)
: DataLayer(std::move(data), std::move(model),flip),
last_class_cst(last_class_cst)
{}
SequenceLayer::~SequenceLayer()
{}
void SequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads) const
{
// dimension checks
this->check_loglikelihood_dim(loglikelihood) ;
this->check_loglikelihood_max_dim(loglikelihood_max) ;
// compute the log prob model and the log prob reverse-complement model
std::vector<Matrix2D<double>> model_log(this->n_class,
Matrix2D<double>(this->l_model,
this->n_category,
0.)) ;
std::vector<Matrix2D<double>> model_log_rev = model_log ;
/*
Matrix3D<double> model_log(this->n_class,
this->l_model,
this->n_category,
0.) ;
Matrix3D<double> model_log_rev = model_log ;
*/
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ // forward
model_log[i](j,k) = log(this->model(i,j,k)) ;
// reverse
model_log_rev[i](this->l_model-j-1,this->n_category-k-1)
= log(this->model(i,j,k)) ;
}
}
}
// don't parallelize
if(threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihoods_routine(0, this->n_row,
loglikelihood,
loglikelihood_max,
model_log,
model_log_rev,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&SequenceLayer::compute_loglikelihoods_routine,
this,
slice.first,
slice.second,
std::ref(loglikelihood),
std::ref(loglikelihood_max),
std::ref(model_log),
std::ref(model_log_rev),
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void SequenceLayer::compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model_log,
const std::vector<Matrix2D<double>>& model_log_rev,
std::promise<bool>& done) const
{
// compute log likelihood
for(size_t i=from; i<to; i++)
{
// set max to min possible value
loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
for(size_t j=0; j<this->n_class; j++)
{
for(size_t s=0; s<this->n_shift; s++)
{ // forward strand
{ double ll_fw = score_subseq(this->data, i, s, model_log[j]) ;
loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// keep track of max per row
if(ll_fw > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_fw ; }
}
// reverse
if(this->flip)
{ double ll_rev = score_subseq(this->data, i, s, model_log_rev[j]) ;
loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// keep track of max per row
if(ll_rev > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_rev ; }
}
}
}
}
done.set_value(true) ;
}
void SequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads)
{ // don't parallelize
if(threads == nullptr)
{ std::promise<Matrix3D<double>> promise ;
std::future<Matrix3D<double>> future = promise.get_future() ;
this->update_model_routine(0,
this->n_row,
posterior_prob,
promise) ;
// this->model = future.get() ;
auto model = future.get() ;
size_t n_class_to_update = this->n_class - this->last_class_cst ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = model(i,j,k) ; }
}
}
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&SequenceLayer::update_model_routine,
this,
slice.first,
slice.second,
std::ref(posterior_prob),
std::ref(promises[i])))) ;
}
// reinitialise the model
/*
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
*/
size_t n_class_to_update = this->n_class - this->last_class_cst ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = 0. ; }
}
}
// wait until all threads are done working
// and update the model
for(auto& future : futures)
{ Matrix3D<double> model_part = future.get() ;
for(size_t i=0; i<n_class_to_update; i++)
// for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) += model_part(i,j,k) ; }
}
}
}
// -------------------------- threads stop ---------------------------
}
// make sure to have no 0 values
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) =
std::max(this->model(i,j,k), SequenceLayer::p_min) ;
}
}
}
// normalize to get probs
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ double sum = 0. ;
for(size_t k=0; k<this->n_category; k++)
{ sum += this->model(i,j,k) ; }
for(size_t k=0; k<this->n_category; k++)
{ double p = this->model(i,j,k) / sum ;
this->model(i,j,k) = p ;
/*
this->model(i,j,k) =
std::max(p, SequenceLayer::p_min) ;
*/
}
}
}
}
void SequenceLayer::update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& promise) const
{ // dimension checks
this->check_posterior_prob_dim(posterior_prob) ;
Matrix3D<double> model(this->n_class,
this->l_model,
this->n_category,
0.) ;
// the int code of A, C, G, T, N
static int a_code = dna::char_to_int('A') ;
static int c_code = dna::char_to_int('C') ;
static int g_code = dna::char_to_int('G') ;
static int t_code = dna::char_to_int('T') ;
static int n_code = dna::char_to_int('N') ;
// the int code of the reverse complement of A, C, G, T
static int a_code_r = dna::char_to_int('A', true) ;
static int c_code_r = dna::char_to_int('C', true) ;
static int g_code_r = dna::char_to_int('G', true) ;
static int t_code_r = dna::char_to_int('T', true) ;
size_t n_class_to_update = this->n_class - this->last_class_cst ;
for(size_t k=0; k < n_class_to_update; k++)
// for(size_t k=0; k < this->n_class; k++)
{ for(size_t s=0; s<this->n_shift; s++)
{ for(size_t j=0; j<this->l_model; j++)
{ // base prob on fw and rv strand
vector_d base_prob_fw(this->n_category, 0.) ;
vector_d base_prob_rv(this->n_category, 0.) ;
for(size_t i=from; i<to; i++)
{ int base = this->data(i,s+j) ;
int base_rev = this->n_category - base - 1 ;
// N
if(base == n_code)
{ // --------------- forward ---------------
{ base_prob_fw[a_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[c_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[g_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[t_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
{ base_prob_rv[a_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[c_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[g_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[t_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
}
}
// A, C, G, T
else
{ { base_prob_fw[base] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
{ base_prob_rv[base_rev] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
}
}
}
// update this position of the model
for(size_t i=0,i_rev=base_prob_fw.size()-1; i<base_prob_fw.size(); i++,i_rev--)
{ // --------------- forward ---------------
{ model(k,j,i) += base_prob_fw[i] ; }
// --------------- reverse ---------------
if(this->flip)
{ model(k,this->l_model-j-1,i) += base_prob_rv[i] ; }
}
}
}
}
promise.set_value(model) ;
}
void SequenceLayer::set_class(size_t i, const Matrix2D<double>& motif)
{ // check dimensions
if(motif.get_nrow() != this->n_category)
{ char msg[4096] ;
sprintf(msg, "Error! the given class model is incompatible "
"with the SequenceLayer (%zu rows instead of %zu)",
motif.get_nrow(), this->n_category) ;
throw std::invalid_argument(msg) ;
}
else if(motif.get_ncol() != this->l_model)
{ char msg[4096] ;
sprintf(msg, "Error! the given class model is incompatible "
"with the SequenceLayer (%zu columns instead of %zu)",
motif.get_ncol(), this->l_model) ;
throw std::invalid_argument(msg) ;
}
for(size_t j=0; j<motif.get_ncol(); j++)
{ for(size_t k=0; k<motif.get_nrow(); k++)
{ this->model(i,j,k) = motif(k,j) ; }
}
}
Event Timeline
Log In to Comment