Page MenuHomec4science

ClassSequenceDataCreator.cpp
No OneTemporary

File Metadata

Created
Wed, May 15, 08:28

ClassSequenceDataCreator.cpp

#include <ClassSequenceDataCreator.hpp>
#include <string>
#include <vector>
#include <stdexcept> // std::invalid_argument
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp> // char_to_int()
ClassSequenceDataCreator::ClassSequenceDataCreator(const std::string& bed_file_path,
const std::string& fasta_file_path,
const std::string& prob_file_path,
int from,
int to,
size_t class_k)
: bed_file_path(bed_file_path),
fasta_file_path(fasta_file_path),
prob_file_path(prob_file_path),
from(from),
to(to),
class_k(class_k-1)
{ ; }
ClassSequenceDataCreator::~ClassSequenceDataCreator()
{ ; }
Matrix3D<double> ClassSequenceDataCreator::create_matrix()
{
// load posterior prob
Matrix4D<double> prob ;
prob.load(this->prob_file_path) ;
// prob.load(this->file_prob) ;
size_t n_row = prob.get_dim()[0] ;
size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
bool flip = n_flip == 2 ;
// compute class prob
std::vector<double> prob_colsum(n_class, 0.) ;
double tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ double p = prob(i,j,k,l) ;
prob_colsum[j] += p ;
tot += p ;
}
}
}
}
for(auto& p : prob_colsum)
{ p /= tot ; }
// class of interest (1 -> 0-based)
if(this->class_k > n_class)
{ char msg[4096] ;
sprintf(msg, "Error! invalid class of interest (%zu and %zu found)",
this->class_k, n_class) ;
throw std::invalid_argument(msg) ;
}
// realigning the data using the same shift as the partition
// will "trim" the matrix -> create a wider matrix such that
// the realigned matrix will be of the desired wide (original
// from and to parameters).;
int ext = n_shift - 1 ;
int ext_r = ext / 2 ;
int ext_l = ext - ext_r ;
SequenceMatrixCreator mc(this->bed_file_path,
this->fasta_file_path,
this->from - ext_l,
this->to + ext_r) ;
Matrix2D<int> data = mc.create_matrix() ;
// check the compatibility with the prob matrix
if(data.get_nrow() != prob.get_dim()[0])
{ char msg[4096] ;
sprintf(msg, "Error! the number of references in the BED file is not equal to the"
"1st dimension of the probability matrix (%zu, %zu).",
data.get_nrow(), prob.get_dim()[0]) ;
throw(std::invalid_argument(msg)) ;
}
else if(data.get_nrow() < prob.get_dim()[1])
{ char msg[4096] ;
sprintf(msg, "Error! the number of references in the BED file is smaller than"
"the number of classes in the probability matrix (%zu, %zu).",
data.get_nrow(), prob.get_dim()[1]) ;
throw(std::invalid_argument(msg)) ;
}
else if(data.get_ncol() < prob.get_dim()[2])
{ char msg[4096] ;
sprintf(msg, "Error! the data matrix has fewer columns (bins) than the"
"the shifting freedom in the probability matrix (%zu, %zu).",
data.get_ncol(), prob.get_dim()[2]) ;
throw(std::invalid_argument(msg)) ;
}
// realign matrix
size_t n_seq = data.get_nrow() ;
size_t n_col2 = this->to - this->from + 1 ;
Matrix3D<double> data2(n_seq, n_col2, 4, 0.) ;
// aggregate
for(size_t i=0; i<n_seq; i++)
{ // std::cerr << "i " << i << std::endl ;
for(size_t s=0; s<n_shift; s++)
{ //std::cerr << "s " << s << std::endl ;
// aggregate
for(size_t j=0; j<n_col2; j++)
{ // std::cerr << "j " << j << std::endl ;
// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
int base = data(i,s+j) ;
// N should participate to A,C,G,T equivalently -> same as doing nothing
if(base == dna::char_to_int('N'))
{ continue ; }
int base_rv = 4 - base - 1 ;
// --------- forward ---------
// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
data2(i,j,base) += prob(i,class_k,s,0) ;
// --------- reverse ---------
if(flip)
{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
data2(i,j,base_rv) += prob(i,class_k,s,1) ;
}
}
}
}
// clean memory
data = Matrix2D<int>() ;
prob = Matrix4D<double>() ;
// normalize
for(size_t i=0; i<n_seq; i++)
{ for(size_t j=0; j<n_col2; j++)
{ double tot = 0. ;
for(size_t k=0; k<4; k++)
{ tot += data2(i,j,k) ; }
for(size_t k=0; k<4; k++)
{ data2(i,j,k) /= tot ; }
}
}
return data2 ;
}

Event Timeline