Page MenuHomec4science

main_test.cpp
No OneTemporary

File Metadata

Created
Fri, Apr 26, 16:47

main_test.cpp

#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <cmath>
#include <unordered_map>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ConsensusSequenceModelComputer.hpp>
#include <EMConsensusSequence.hpp>
#include <SequenceModelComputer.hpp>
#include <EMSequence.hpp>
#include <dna_utility.hpp>
#include <Statistics.hpp>
#include <sorting_utility.hpp>
#include <boost/math/distributions.hpp>
#include <KmersStatistics.hpp>
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
template<class T, class U>
std::ostream& operator << (std::ostream& stream, const std::unordered_map<T,U>& m)
{ for(const auto& bucket : m)
{ stream << bucket.first << " " << bucket.second << std::endl ; }
return stream ;
}
Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
const Matrix4D<double>& prob,
size_t class_k)
{
size_t n_seq = seq.get_nrow() ;
size_t l_seq = seq.get_ncol() ;
// size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
bool flip = n_flip == 2 ;
size_t l_model = l_seq - n_shift + 1 ;
Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;
// aggregate
for(size_t i=0; i<n_seq; i++)
{ std::cerr << "i " << i << std::endl ;
for(size_t s=0; s<n_shift; s++)
{ //std::cerr << "s " << s << std::endl ;
double tot = 0. ;
// aggregate
for(size_t j=0; j<l_model; j++)
{ // std::cerr << "j " << j << std::endl ;
// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
int base = seq(i,s+j) ;
// N should participate to A,C,G,T equivalently -> same as doing nothing
if(base == dna::char_to_int('N'))
{ continue ; }
int base_rv = 4 - base - 1 ;
// --------- forward ---------
// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
cons_seq(i,j,base) += prob(i,class_k,s,0) ;
tot += prob(i,class_k,s,0) ;
// --------- reverse ---------
if(flip)
{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ;
tot += prob(i,class_k,s,1) ;
}
}
}
}
// normalize
for(size_t i=0; i<n_seq; i++)
{ for(size_t j=0; j<l_model; j++)
{ double tot = 0. ;
for(size_t k=0; k<4; k++)
{ tot += cons_seq(i,j,k) ; }
for(size_t k=0; k<4; k++)
{ cons_seq(i,j,k) /= tot ; }
}
}
return cons_seq ;
}
Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
Matrix2D<int> iseq(n_row, n_col, 0.) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ iseq(i,j) = dna::char_to_int(seq(i,j)) ; }
}
return iseq ;
}
Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
size_t n_dim3 = 4 ; // A, C, G, T
int base_n = dna::char_to_int('N') ;
Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;
double prob = 1. - 3*(1e-100) ; // p(base) - p(!base)
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ int base = seq(i,j) ;
// N
if(base == base_n)
{ cseq(i,j,0) = 0.25 ;
cseq(i,j,1) = 0.25 ;
cseq(i,j,2) = 0.25 ;
cseq(i,j,3) = 0.25 ;
}
// A C G T
else
{ cseq(i,j, base) = prob ; }
}
}
return cseq ;
}
double score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t start,
const Matrix2D<double>& model)
{ size_t ncol = cons_seq.get_dim()[1] ;
// size_t dim1 = cons_seq.get_dim()[0] ;
// size_t dim2 = cons_seq.get_dim()[1] ;
size_t dim3 = cons_seq.get_dim()[2] ;
if(start > ncol - model.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, ncol - model.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_nrow() > ncol)
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model.get_nrow(), ncol) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(dim3 != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
dim3) ;
throw std::invalid_argument(msg) ;
}
/*
double ll = 0 ;
for(size_t j_seq=from, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ for(size_t k=0; k<dim3; k++)
{ std::cerr << ll
<< " *= "
<< "("
<< model(j_mod,k)
<< " * "
<< cons_seq(row,j_seq,k)
<< ")"
<< std::endl ;
ll *= model(j_mod,k) * cons_seq(row,j_seq,k) ;
}
}*/
size_t from = start ;
size_t to = from + model.get_nrow() ; // will score [from, to)
double ll = 0. ;
for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ double sum = 0. ;
for(size_t k=0; k<dim3; k++)
{ std::cerr << sum
<< "+= ("
<< cons_seq(row, j_seq, k)
<< " * "
<< model(j_mod, k)
<< ")"
<< std::endl ;
sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ;
}
std::cerr << sum << std::endl ;
std::cerr << "---------------" << std::endl ;
ll += log(sum) ;
}
return ll ;
}
Matrix2D<double> test_consensus(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
// partition
EMConsensusSequence em(cseq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
// compute models
ConsensusSequenceModelComputer mc(std::move(cseq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_sequence(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
// partition
EMSequence em(seq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
// compute models
SequenceModelComputer mc(std::move(seq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_aggregate(const std::string& file)
{ Matrix2D<int> seq(file) ;
size_t nrow = seq.get_nrow() ;
size_t ncol = seq.get_ncol() ;
Matrix2D<double> model_final(4, ncol+1, 1e-100) ;
for(size_t i=0; i<4; i++)
{ model_final(i,0) = 1. ; }
std::vector<double> sum(ncol+1, 0.) ;
for(size_t i=0; i<nrow; i++)
{ for(size_t j=0; j<ncol; j++)
{ int base = seq(i,j) ;
if(base == dna::char_to_int('N'))
{ continue ; }
model_final(base, j+1) += 1. ;
sum[j+1] += 1. ;
}
}
for(size_t i=0; i<model_final.get_nrow(); i++)
{ for(size_t j=1; j<model_final.get_ncol()+1; j++)
{ model_final(i,j) /= sum[j] ; }
}
return model_final ;
}
std::unordered_map<std::string, int> get_kmer_count(const Matrix3D<double>& consensus_sequence,
size_t length)
{ std::unordered_map<std::string, int> kmer_count ;
size_t n_row = consensus_sequence.get_dim()[0] ;
size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
for(size_t i=0; i<n_row; i++)
{ for(size_t s=0; s<n_shift; s++)
{ // get kmer
std::string kmer = dna::consensus_to_kmer(consensus_sequence,
i,
s,
s+length) ;
// insert
auto iter = kmer_count.find(kmer) ;
if(iter == kmer_count.end())
{ kmer_count.emplace(kmer, 1) ; }
// update
else
{ iter->second += 1 ; }
}
}
return kmer_count ;
}
Matrix2D<int> compute_mononucleotide(size_t k,
const std::vector<std::string>& kmers,
const std::vector<int>& counts)
{
Matrix2D<int> comp(k, 4, 0) ;
for(size_t i=0; i<kmers.size(); i++)
{
std::string kmer = kmers[i] ;
int count = counts[i] ;
for (size_t j=0; j<k ; j++)
{ int base = int(kmer[j]) - int('0') ;
comp(j, base) += count ;
}
}
return(comp);
}
Matrix3D<int> compute_dinucleotide(size_t k,
const std::vector<std::string>& kmers,
const std::vector<int>& counts)
{
Matrix3D<int> comp2(4, k-1, 4) ;
for(size_t i=0; i<kmers.size(); i++)
{ std::string kmer = kmers[i] ;
int count = counts[i] ;
for (size_t j=0; j<k-1; j++)
{ int base1 = int(kmer[j]) - int('0') ;
int base2 = int(kmer[j+1]) - int('0') ;
comp2(base1, j, base2) += count ;
}
}
return(comp2);
}
std::vector<double> compute_exp_values(size_t k,
const std::vector<std::string>& kmers,
const Matrix2D<int>& comp1,
const Matrix3D<int>& comp2)
{ std::vector<double> expected(kmers.size()) ;
for(size_t i=0; i<kmers.size(); i++)
{ std::string kmer = kmers[i] ;
double exp = comp1(0, int(kmer[0]) - int('0')) ;
for(size_t j=0; j<k-1; j++)
{ int base1 = int(kmer[j]) - int('0') ;
int base2 = int(kmer[j+1]) - int('0') ;
exp *= ((double)comp2(base1, j, base2) /
(double)comp1(j+1, base2)) ;
}
expected[i] = exp ;
}
return expected;
}
std::vector<double> compute_pvalues(const std::vector<int>& counts,
const std::vector<double>& expected)
{
std::vector<double> pvalues(counts.size()) ;
for(size_t i=0; i<counts.size(); i++)
{
pvalues[i] = poisson_cdf((double)counts[i],
expected[i],
false) ;
}
return pvalues ;
}
std::pair<std::vector<std::string>, std::vector<double>>
compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
size_t k)
{
// get kmer count
std::unordered_map<std::string, int> kmer_count = get_kmer_count(consensus_sequences,
k) ;
/*
consensus_sequences.get_dim() ;
std::unordered_map<std::string, int> kmer_count ;
// ANN
kmer_count["000"] = 1 ;
kmer_count["001"] = 2 ;
kmer_count["002"] = 3 ;
kmer_count["003"] = 4 ;
kmer_count["010"] = 5 ;
kmer_count["011"] = 6 ;
kmer_count["012"] = 7 ;
kmer_count["013"] = 8 ;
kmer_count["020"] = 9 ;
kmer_count["021"] = 10 ;
kmer_count["022"] = 11 ;
kmer_count["023"] = 12 ;
kmer_count["030"] = 13 ;
kmer_count["031"] = 14 ;
kmer_count["032"] = 15 ;
kmer_count["033"] = 16 ;
// CNN
kmer_count["100"] = 1 ;
kmer_count["101"] = 2 ;
kmer_count["102"] = 3 ;
kmer_count["103"] = 4 ;
kmer_count["110"] = 5 ;
kmer_count["111"] = 6 ;
kmer_count["112"] = 7 ;
kmer_count["113"] = 8 ;
kmer_count["120"] = 9 ;
kmer_count["121"] = 10 ;
kmer_count["122"] = 11 ;
kmer_count["123"] = 12 ;
kmer_count["130"] = 13 ;
kmer_count["131"] = 14 ;
kmer_count["132"] = 15 ;
kmer_count["133"] = 16 ;
// GNN
kmer_count["200"] = 1 ;
kmer_count["201"] = 2 ;
kmer_count["202"] = 3 ;
kmer_count["203"] = 4 ;
kmer_count["210"] = 5 ;
kmer_count["211"] = 6 ;
kmer_count["212"] = 7 ;
kmer_count["213"] = 8 ;
kmer_count["220"] = 9 ;
kmer_count["221"] = 10 ;
kmer_count["222"] = 11 ;
kmer_count["223"] = 12 ;
kmer_count["230"] = 13 ;
kmer_count["231"] = 14 ;
kmer_count["232"] = 15 ;
kmer_count["233"] = 16 ;
// TNN
kmer_count["300"] = 1 ;
kmer_count["301"] = 2 ;
kmer_count["302"] = 3 ;
kmer_count["303"] = 4 ;
kmer_count["310"] = 5 ;
kmer_count["311"] = 6 ;
kmer_count["312"] = 7 ;
kmer_count["313"] = 8 ;
kmer_count["320"] = 9 ;
kmer_count["321"] = 10 ;
kmer_count["322"] = 11 ;
kmer_count["323"] = 12 ;
kmer_count["330"] = 13 ;
kmer_count["331"] = 14 ;
kmer_count["332"] = 15 ;
kmer_count["333"] = 16 ;
*/
// turn to vectors
std::vector<std::string> kmers(kmer_count.size()) ;
size_t i=0 ;
for(const auto& iter : kmer_count)
{ kmers[i] = iter.first ;
i++ ;
}
std::sort(kmers.begin(), kmers.end()) ;
std::vector<int> counts(kmer_count.size()) ;
i=0 ;
for(const auto& kmer : kmers)
{ counts[i] = kmer_count[kmer] ;
i++ ;
}
kmer_count.clear() ;
// get mononucleotide composition in kmer
// this is a probability matrix modeling the kmer
Matrix2D<int> comp1 = compute_mononucleotide(k, kmers, counts) ;
// get dinucleotide composition in kmer
// this is a probability matrix modeling the kmer
Matrix3D<int> comp2 = compute_dinucleotide(k, kmers, counts) ;
// get expected number of occurence for each kmer
std::vector<double> expected = compute_exp_values(k,
kmers,
comp1,
comp2) ;
// compute the pvalue associated to each kmer
std::vector<double> pvalues = compute_pvalues(counts,
expected) ;
return std::make_pair(kmers, pvalues) ;
}
int main()
{
// Matrix3D<double> mat(1,5,4,0.) ;
// mat(0,0,0) = 1. ; mat(0,1,1) = 1. ; mat(0,2,2) = 1. ; mat(0,3,3) = 1. ; mat(0,4,2) = 1. ;
std::string file = "/local/groux/scATAC-seq/data/"
"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
Matrix3D<double> mat = sequence_to_consensus(Matrix2D<int>(file)) ;
size_t k = 12 ;
auto kmers_pvalues = kmers::compute_kmer_pvalue(mat, k) ;
// for(size_t i=0; i<kmers_pvalues.first.size(); i++)
// { std::cerr << kmers_pvalues.first[i] << " " << kmers_pvalues.second[i] << std::endl ; }
std::vector<size_t> index = order(kmers_pvalues.second, true) ;
for(size_t i=0 ; i<10; i++)
{ size_t idx = index[i] ;
std::cout << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ;
}
/*
size_t n_class = 2 ;
size_t n_iter = 50 ;sequence_to_consensus
size_t n_shift = 1 ;
bool flip = true ;
bool bckg_class = false ;
std::string seed = "" ;
size_t n_thread = 1 ;
std::string file = "/local/groux/scATAC-seq/data/"
"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
*/
// std::string file = "/local/groux/scATAC-seq/test.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
/*
Matrix2D<double> model_final = test_consensus(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
*/
/*
Matrix2D<double> model_final = test_sequence(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
*/
/*
Matrix2D<double> model_final = test_aggregate(file) ;
*/
// std::cout << model_final << std::endl ;
/*
double p0 = 0 ;
// double l0 = log(p0) ;
double p1 = 1. - (3.*p0) ;
// double l1 = log(p1) ;
// Matrix3D<double> cseq(1, 4, 4) ;
// cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
// cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
// cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
// cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;
Matrix2D<int> seq(1, 4, -1) ;
seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
std::cout << cseq << std::endl << std::endl ;
Matrix2D<double> mod(4, 4, -1) ;
mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;
std::cout << mod << std::endl ;
std::cout << score_consensus_subseq(cseq, 0,0,mod)
<< std::endl ;
*/
return EXIT_SUCCESS ;
}

Event Timeline