Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F62749493
ClassSequenceDataCreator.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, May 15, 08:28
Size
5 KB
Mime Type
text/x-c
Expires
Fri, May 17, 08:28 (2 d)
Engine
blob
Format
Raw Data
Handle
17683631
Attached To
R8820 scATAC-seq
ClassSequenceDataCreator.cpp
View Options
#include <ClassSequenceDataCreator.hpp>
#include <string>
#include <vector>
#include <stdexcept> // std::invalid_argument
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp> // char_to_int()
ClassSequenceDataCreator::ClassSequenceDataCreator(const std::string& bed_file_path,
const std::string& fasta_file_path,
const std::string& prob_file_path,
int from,
int to,
size_t class_k)
: bed_file_path(bed_file_path),
fasta_file_path(fasta_file_path),
prob_file_path(prob_file_path),
from(from),
to(to),
class_k(class_k-1)
{ ; }
ClassSequenceDataCreator::~ClassSequenceDataCreator()
{ ; }
Matrix3D<double> ClassSequenceDataCreator::create_matrix()
{
// load posterior prob
Matrix4D<double> prob ;
prob.load(this->prob_file_path) ;
// prob.load(this->file_prob) ;
size_t n_row = prob.get_dim()[0] ;
size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
bool flip = n_flip == 2 ;
// compute class prob
std::vector<double> prob_colsum(n_class, 0.) ;
double tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ double p = prob(i,j,k,l) ;
prob_colsum[j] += p ;
tot += p ;
}
}
}
}
for(auto& p : prob_colsum)
{ p /= tot ; }
// class of interest (1 -> 0-based)
if(this->class_k > n_class)
{ char msg[4096] ;
sprintf(msg, "Error! invalid class of interest (%zu and %zu found)",
this->class_k, n_class) ;
throw std::invalid_argument(msg) ;
}
// realigning the data using the same shift as the partition
// will "trim" the matrix -> create a wider matrix such that
// the realigned matrix will be of the desired wide (original
// from and to parameters).;
int ext = n_shift - 1 ;
int ext_r = ext / 2 ;
int ext_l = ext - ext_r ;
SequenceMatrixCreator mc(this->bed_file_path,
this->fasta_file_path,
this->from - ext_l,
this->to + ext_r) ;
Matrix2D<int> data = mc.create_matrix() ;
// check the compatibility with the prob matrix
if(data.get_nrow() != prob.get_dim()[0])
{ char msg[4096] ;
sprintf(msg, "Error! the number of references in the BED file is not equal to the"
"1st dimension of the probability matrix (%zu, %zu).",
data.get_nrow(), prob.get_dim()[0]) ;
throw(std::invalid_argument(msg)) ;
}
else if(data.get_nrow() < prob.get_dim()[1])
{ char msg[4096] ;
sprintf(msg, "Error! the number of references in the BED file is smaller than"
"the number of classes in the probability matrix (%zu, %zu).",
data.get_nrow(), prob.get_dim()[1]) ;
throw(std::invalid_argument(msg)) ;
}
else if(data.get_ncol() < prob.get_dim()[2])
{ char msg[4096] ;
sprintf(msg, "Error! the data matrix has fewer columns (bins) than the"
"the shifting freedom in the probability matrix (%zu, %zu).",
data.get_ncol(), prob.get_dim()[2]) ;
throw(std::invalid_argument(msg)) ;
}
// realign matrix
size_t n_seq = data.get_nrow() ;
size_t n_col2 = this->to - this->from + 1 ;
Matrix3D<double> data2(n_seq, n_col2, 4, 0.) ;
// aggregate
for(size_t i=0; i<n_seq; i++)
{ // std::cerr << "i " << i << std::endl ;
for(size_t s=0; s<n_shift; s++)
{ //std::cerr << "s " << s << std::endl ;
// aggregate
for(size_t j=0; j<n_col2; j++)
{ // std::cerr << "j " << j << std::endl ;
// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
int base = data(i,s+j) ;
// N should participate to A,C,G,T equivalently -> same as doing nothing
if(base == dna::char_to_int('N'))
{ continue ; }
int base_rv = 4 - base - 1 ;
// --------- forward ---------
// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
data2(i,j,base) += prob(i,class_k,s,0) ;
// --------- reverse ---------
if(flip)
{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
data2(i,j,base_rv) += prob(i,class_k,s,1) ;
}
}
}
}
// clean memory
data = Matrix2D<int>() ;
prob = Matrix4D<double>() ;
// normalize
for(size_t i=0; i<n_seq; i++)
{ for(size_t j=0; j<n_col2; j++)
{ double tot = 0. ;
for(size_t k=0; k<4; k++)
{ tot += data2(i,j,k) ; }
for(size_t k=0; k<4; k++)
{ data2(i,j,k) /= tot ; }
}
}
return data2 ;
}
Event Timeline
Log In to Comment