Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F59998971
main_test.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Apr 26, 16:47
Size
22 KB
Mime Type
application/octet-stream
Expires
Sun, Apr 28, 16:47 (2 d)
Engine
blob
Format
Raw Data
Handle
17284027
Attached To
R8820 scATAC-seq
main_test.cpp
View Options
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <cmath>
#include <unordered_map>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ConsensusSequenceModelComputer.hpp>
#include <EMConsensusSequence.hpp>
#include <SequenceModelComputer.hpp>
#include <EMSequence.hpp>
#include <dna_utility.hpp>
#include <Statistics.hpp>
#include <sorting_utility.hpp>
#include <boost/math/distributions.hpp>
#include <KmersStatistics.hpp>
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
template<class T, class U>
std::ostream& operator << (std::ostream& stream, const std::unordered_map<T,U>& m)
{ for(const auto& bucket : m)
{ stream << bucket.first << " " << bucket.second << std::endl ; }
return stream ;
}
Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
const Matrix4D<double>& prob,
size_t class_k)
{
size_t n_seq = seq.get_nrow() ;
size_t l_seq = seq.get_ncol() ;
// size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
bool flip = n_flip == 2 ;
size_t l_model = l_seq - n_shift + 1 ;
Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;
// aggregate
for(size_t i=0; i<n_seq; i++)
{ std::cerr << "i " << i << std::endl ;
for(size_t s=0; s<n_shift; s++)
{ //std::cerr << "s " << s << std::endl ;
double tot = 0. ;
// aggregate
for(size_t j=0; j<l_model; j++)
{ // std::cerr << "j " << j << std::endl ;
// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
int base = seq(i,s+j) ;
// N should participate to A,C,G,T equivalently -> same as doing nothing
if(base == dna::char_to_int('N'))
{ continue ; }
int base_rv = 4 - base - 1 ;
// --------- forward ---------
// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
cons_seq(i,j,base) += prob(i,class_k,s,0) ;
tot += prob(i,class_k,s,0) ;
// --------- reverse ---------
if(flip)
{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ;
tot += prob(i,class_k,s,1) ;
}
}
}
}
// normalize
for(size_t i=0; i<n_seq; i++)
{ for(size_t j=0; j<l_model; j++)
{ double tot = 0. ;
for(size_t k=0; k<4; k++)
{ tot += cons_seq(i,j,k) ; }
for(size_t k=0; k<4; k++)
{ cons_seq(i,j,k) /= tot ; }
}
}
return cons_seq ;
}
Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
Matrix2D<int> iseq(n_row, n_col, 0.) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ iseq(i,j) = dna::char_to_int(seq(i,j)) ; }
}
return iseq ;
}
Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
size_t n_dim3 = 4 ; // A, C, G, T
int base_n = dna::char_to_int('N') ;
Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;
double prob = 1. - 3*(1e-100) ; // p(base) - p(!base)
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ int base = seq(i,j) ;
// N
if(base == base_n)
{ cseq(i,j,0) = 0.25 ;
cseq(i,j,1) = 0.25 ;
cseq(i,j,2) = 0.25 ;
cseq(i,j,3) = 0.25 ;
}
// A C G T
else
{ cseq(i,j, base) = prob ; }
}
}
return cseq ;
}
double score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t start,
const Matrix2D<double>& model)
{ size_t ncol = cons_seq.get_dim()[1] ;
// size_t dim1 = cons_seq.get_dim()[0] ;
// size_t dim2 = cons_seq.get_dim()[1] ;
size_t dim3 = cons_seq.get_dim()[2] ;
if(start > ncol - model.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, ncol - model.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_nrow() > ncol)
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model.get_nrow(), ncol) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(dim3 != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
dim3) ;
throw std::invalid_argument(msg) ;
}
/*
double ll = 0 ;
for(size_t j_seq=from, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ for(size_t k=0; k<dim3; k++)
{ std::cerr << ll
<< " *= "
<< "("
<< model(j_mod,k)
<< " * "
<< cons_seq(row,j_seq,k)
<< ")"
<< std::endl ;
ll *= model(j_mod,k) * cons_seq(row,j_seq,k) ;
}
}*/
size_t from = start ;
size_t to = from + model.get_nrow() ; // will score [from, to)
double ll = 0. ;
for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ double sum = 0. ;
for(size_t k=0; k<dim3; k++)
{ std::cerr << sum
<< "+= ("
<< cons_seq(row, j_seq, k)
<< " * "
<< model(j_mod, k)
<< ")"
<< std::endl ;
sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ;
}
std::cerr << sum << std::endl ;
std::cerr << "---------------" << std::endl ;
ll += log(sum) ;
}
return ll ;
}
Matrix2D<double> test_consensus(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
// partition
EMConsensusSequence em(cseq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
// compute models
ConsensusSequenceModelComputer mc(std::move(cseq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_sequence(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
// partition
EMSequence em(seq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
// compute models
SequenceModelComputer mc(std::move(seq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_aggregate(const std::string& file)
{ Matrix2D<int> seq(file) ;
size_t nrow = seq.get_nrow() ;
size_t ncol = seq.get_ncol() ;
Matrix2D<double> model_final(4, ncol+1, 1e-100) ;
for(size_t i=0; i<4; i++)
{ model_final(i,0) = 1. ; }
std::vector<double> sum(ncol+1, 0.) ;
for(size_t i=0; i<nrow; i++)
{ for(size_t j=0; j<ncol; j++)
{ int base = seq(i,j) ;
if(base == dna::char_to_int('N'))
{ continue ; }
model_final(base, j+1) += 1. ;
sum[j+1] += 1. ;
}
}
for(size_t i=0; i<model_final.get_nrow(); i++)
{ for(size_t j=1; j<model_final.get_ncol()+1; j++)
{ model_final(i,j) /= sum[j] ; }
}
return model_final ;
}
std::unordered_map<std::string, int> get_kmer_count(const Matrix3D<double>& consensus_sequence,
size_t length)
{ std::unordered_map<std::string, int> kmer_count ;
size_t n_row = consensus_sequence.get_dim()[0] ;
size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
for(size_t i=0; i<n_row; i++)
{ for(size_t s=0; s<n_shift; s++)
{ // get kmer
std::string kmer = dna::consensus_to_kmer(consensus_sequence,
i,
s,
s+length) ;
// insert
auto iter = kmer_count.find(kmer) ;
if(iter == kmer_count.end())
{ kmer_count.emplace(kmer, 1) ; }
// update
else
{ iter->second += 1 ; }
}
}
return kmer_count ;
}
Matrix2D<int> compute_mononucleotide(size_t k,
const std::vector<std::string>& kmers,
const std::vector<int>& counts)
{
Matrix2D<int> comp(k, 4, 0) ;
for(size_t i=0; i<kmers.size(); i++)
{
std::string kmer = kmers[i] ;
int count = counts[i] ;
for (size_t j=0; j<k ; j++)
{ int base = int(kmer[j]) - int('0') ;
comp(j, base) += count ;
}
}
return(comp);
}
Matrix3D<int> compute_dinucleotide(size_t k,
const std::vector<std::string>& kmers,
const std::vector<int>& counts)
{
Matrix3D<int> comp2(4, k-1, 4) ;
for(size_t i=0; i<kmers.size(); i++)
{ std::string kmer = kmers[i] ;
int count = counts[i] ;
for (size_t j=0; j<k-1; j++)
{ int base1 = int(kmer[j]) - int('0') ;
int base2 = int(kmer[j+1]) - int('0') ;
comp2(base1, j, base2) += count ;
}
}
return(comp2);
}
std::vector<double> compute_exp_values(size_t k,
const std::vector<std::string>& kmers,
const Matrix2D<int>& comp1,
const Matrix3D<int>& comp2)
{ std::vector<double> expected(kmers.size()) ;
for(size_t i=0; i<kmers.size(); i++)
{ std::string kmer = kmers[i] ;
double exp = comp1(0, int(kmer[0]) - int('0')) ;
for(size_t j=0; j<k-1; j++)
{ int base1 = int(kmer[j]) - int('0') ;
int base2 = int(kmer[j+1]) - int('0') ;
exp *= ((double)comp2(base1, j, base2) /
(double)comp1(j+1, base2)) ;
}
expected[i] = exp ;
}
return expected;
}
std::vector<double> compute_pvalues(const std::vector<int>& counts,
const std::vector<double>& expected)
{
std::vector<double> pvalues(counts.size()) ;
for(size_t i=0; i<counts.size(); i++)
{
pvalues[i] = poisson_cdf((double)counts[i],
expected[i],
false) ;
}
return pvalues ;
}
std::pair<std::vector<std::string>, std::vector<double>>
compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
size_t k)
{
// get kmer count
std::unordered_map<std::string, int> kmer_count = get_kmer_count(consensus_sequences,
k) ;
/*
consensus_sequences.get_dim() ;
std::unordered_map<std::string, int> kmer_count ;
// ANN
kmer_count["000"] = 1 ;
kmer_count["001"] = 2 ;
kmer_count["002"] = 3 ;
kmer_count["003"] = 4 ;
kmer_count["010"] = 5 ;
kmer_count["011"] = 6 ;
kmer_count["012"] = 7 ;
kmer_count["013"] = 8 ;
kmer_count["020"] = 9 ;
kmer_count["021"] = 10 ;
kmer_count["022"] = 11 ;
kmer_count["023"] = 12 ;
kmer_count["030"] = 13 ;
kmer_count["031"] = 14 ;
kmer_count["032"] = 15 ;
kmer_count["033"] = 16 ;
// CNN
kmer_count["100"] = 1 ;
kmer_count["101"] = 2 ;
kmer_count["102"] = 3 ;
kmer_count["103"] = 4 ;
kmer_count["110"] = 5 ;
kmer_count["111"] = 6 ;
kmer_count["112"] = 7 ;
kmer_count["113"] = 8 ;
kmer_count["120"] = 9 ;
kmer_count["121"] = 10 ;
kmer_count["122"] = 11 ;
kmer_count["123"] = 12 ;
kmer_count["130"] = 13 ;
kmer_count["131"] = 14 ;
kmer_count["132"] = 15 ;
kmer_count["133"] = 16 ;
// GNN
kmer_count["200"] = 1 ;
kmer_count["201"] = 2 ;
kmer_count["202"] = 3 ;
kmer_count["203"] = 4 ;
kmer_count["210"] = 5 ;
kmer_count["211"] = 6 ;
kmer_count["212"] = 7 ;
kmer_count["213"] = 8 ;
kmer_count["220"] = 9 ;
kmer_count["221"] = 10 ;
kmer_count["222"] = 11 ;
kmer_count["223"] = 12 ;
kmer_count["230"] = 13 ;
kmer_count["231"] = 14 ;
kmer_count["232"] = 15 ;
kmer_count["233"] = 16 ;
// TNN
kmer_count["300"] = 1 ;
kmer_count["301"] = 2 ;
kmer_count["302"] = 3 ;
kmer_count["303"] = 4 ;
kmer_count["310"] = 5 ;
kmer_count["311"] = 6 ;
kmer_count["312"] = 7 ;
kmer_count["313"] = 8 ;
kmer_count["320"] = 9 ;
kmer_count["321"] = 10 ;
kmer_count["322"] = 11 ;
kmer_count["323"] = 12 ;
kmer_count["330"] = 13 ;
kmer_count["331"] = 14 ;
kmer_count["332"] = 15 ;
kmer_count["333"] = 16 ;
*/
// turn to vectors
std::vector<std::string> kmers(kmer_count.size()) ;
size_t i=0 ;
for(const auto& iter : kmer_count)
{ kmers[i] = iter.first ;
i++ ;
}
std::sort(kmers.begin(), kmers.end()) ;
std::vector<int> counts(kmer_count.size()) ;
i=0 ;
for(const auto& kmer : kmers)
{ counts[i] = kmer_count[kmer] ;
i++ ;
}
kmer_count.clear() ;
// get mononucleotide composition in kmer
// this is a probability matrix modeling the kmer
Matrix2D<int> comp1 = compute_mononucleotide(k, kmers, counts) ;
// get dinucleotide composition in kmer
// this is a probability matrix modeling the kmer
Matrix3D<int> comp2 = compute_dinucleotide(k, kmers, counts) ;
// get expected number of occurence for each kmer
std::vector<double> expected = compute_exp_values(k,
kmers,
comp1,
comp2) ;
// compute the pvalue associated to each kmer
std::vector<double> pvalues = compute_pvalues(counts,
expected) ;
return std::make_pair(kmers, pvalues) ;
}
int main()
{
// Matrix3D<double> mat(1,5,4,0.) ;
// mat(0,0,0) = 1. ; mat(0,1,1) = 1. ; mat(0,2,2) = 1. ; mat(0,3,3) = 1. ; mat(0,4,2) = 1. ;
std::string file = "/local/groux/scATAC-seq/data/"
"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
Matrix3D<double> mat = sequence_to_consensus(Matrix2D<int>(file)) ;
size_t k = 12 ;
auto kmers_pvalues = kmers::compute_kmer_pvalue(mat, k) ;
// for(size_t i=0; i<kmers_pvalues.first.size(); i++)
// { std::cerr << kmers_pvalues.first[i] << " " << kmers_pvalues.second[i] << std::endl ; }
std::vector<size_t> index = order(kmers_pvalues.second, true) ;
for(size_t i=0 ; i<10; i++)
{ size_t idx = index[i] ;
std::cout << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ;
}
/*
size_t n_class = 2 ;
size_t n_iter = 50 ;sequence_to_consensus
size_t n_shift = 1 ;
bool flip = true ;
bool bckg_class = false ;
std::string seed = "" ;
size_t n_thread = 1 ;
std::string file = "/local/groux/scATAC-seq/data/"
"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
*/
// std::string file = "/local/groux/scATAC-seq/test.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
/*
Matrix2D<double> model_final = test_consensus(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
*/
/*
Matrix2D<double> model_final = test_sequence(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
*/
/*
Matrix2D<double> model_final = test_aggregate(file) ;
*/
// std::cout << model_final << std::endl ;
/*
double p0 = 0 ;
// double l0 = log(p0) ;
double p1 = 1. - (3.*p0) ;
// double l1 = log(p1) ;
// Matrix3D<double> cseq(1, 4, 4) ;
// cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
// cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
// cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
// cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;
Matrix2D<int> seq(1, 4, -1) ;
seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
std::cout << cseq << std::endl << std::endl ;
Matrix2D<double> mod(4, 4, -1) ;
mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;
std::cout << mod << std::endl ;
std::cout << score_consensus_subseq(cseq, 0,0,mod)
<< std::endl ;
*/
return EXIT_SUCCESS ;
}
Event Timeline
Log In to Comment