diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
index 740d6f3..54f1c72 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
@@ -1,38 +1,38 @@
# some paths
## directories
-results_dir='results/10xgenomics_PBMC_5k_classification_3'
+results_dir='results/10xgenomics_PBMC_5k_motifs_classification_3'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
n_core=24
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat'
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
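
Note on the seed line above: each run draws a 15-character alphanumeric seed from /dev/urandom and records it in $file_seed next to the output file name so the EMSequence run can be reproduced. The short C++ sketch below is illustrative only (it is not part of this diff) and does roughly the same thing with the standard <random> facilities.

// Illustrative sketch: generate a 15-character seed string, roughly
// equivalent to the `tr -dc _A-Z-a-z-0-9 < /dev/urandom | head -c15` one-liner above.
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <random>
#include <string>

int main()
{   const std::string alphabet = "_-"
                                 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                 "abcdefghijklmnopqrstuvwxyz"
                                 "0123456789" ;
    std::random_device rd ;                                   // non-deterministic source
    std::uniform_int_distribution<std::size_t> pick(0, alphabet.size()-1) ;
    std::string seed ;
    for(std::size_t i=0; i<15; i++)
    {   seed += alphabet[pick(rd)] ; }
    std::cout << seed << std::endl ;
    return EXIT_SUCCESS ;
}
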
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
index 1fc02f9..2ef7199 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
@@ -1,104 +1,104 @@
setwd(file.path("/", "local", "groux", "scATAC-seq"))
# libraries
library(RColorBrewer)
# functions
source(file.path("scripts", "functions.R"))
# the number of classes searched
n.classes = c(23)
# path to the images for the logo
path.a = file.path("res/A.png")
path.c = file.path("res/C.png")
path.g = file.path("res/G.png")
path.t = file.path("res/T.png")
################## sequence patterns around ctcf motifs ##################
for(k in n.classes)
{
# sequence
data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k)))
model.seq = data$models
model.prob = data$prob
data = NULL
# open chromatin
model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
# nucleosomes
model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
# plot classes
col = brewer.pal(3, "Set1")
- X11(width=26, height=12)
- # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
- # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
- # units="in", res=720, width=26, height=12)
+ # X11(width=26, height=12)
+ png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
+ sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
+ units="in", res=720, width=26, height=12)
m = matrix(1:24, nrow=6, ncol=4, byrow=F)
layout(m)
# order from most to least probable class
ord = order(model.prob, decreasing=T)
ref.open = model.open[ord,, drop=F][,]
ref.nucl = model.nucl[ord,, drop=F][,]
ref.seq = model.seq[,,ord, drop=F][,,]
prob = model.prob[ord]
class = c(1:nrow(ref.open))[ord]
for(i in 1:nrow(ref.open))
{ # plot logo
plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
# x-axis
x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
axis(1, at=x.at, labels=x.lab)
# y-axis is [0,1] for min/max signal
y.at = seq(0, 2, length.out=2)
y.lab = c("min", "max")
axis(2, at=y.at, labels=y.lab)
# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
}
# inlets with center
row_n = 1 # row counter
col_n = 1 # column counter
row_h = 1/nrow(m) # height of row
col_w = 1/ncol(m) # width of column
row_cor = row_h / 2
col_cor = col_w / 3
for(i in 1:nrow(ref.open))
{ # plot logo center
left = (col_w*col_n) - col_w
right = left + col_w
left = right - col_cor
bottom = 1 - (row_h*row_n)
top = bottom + row_h
bottom = top - row_cor
par(fig=c(left, right, bottom, top), new=T)
idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
# xaxis
# x.at = seq(1, length(idx), length.out = 3)
# x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
x.at = ceiling(length(idx)/2)
x.lab = 0
axis(1, at=x.at, labels=x.lab)
# yaxis
axis(2, at=y.at, labels=y.lab)
row_n = row_n + 1
if(i %% nrow(m) == 0)
{ col_n = col_n + 1
row_n = 1
}
}
- # dev.off()
+ dev.off()
}
diff --git a/scripts/toy_data/generate_matrix_data_sequence.R b/scripts/toy_data/generate_matrix_data_sequence.R
index c80a82a..d34cbd7 100644
--- a/scripts/toy_data/generate_matrix_data_sequence.R
+++ b/scripts/toy_data/generate_matrix_data_sequence.R
@@ -1,255 +1,255 @@
setwd(file.path("/", "local", "groux", "scATAC-seq"))
# required libraries and functions
library(abind)
# functions
#' Converts a vector of characters containing a DNA sequence
#' into a vector of integers : A->1, C->2, G->3, T->4. Any
#' non-ACGT character triggers an error.
#' \param sequence the DNA sequence stored as a vector of
#' characters.
#' \return a vector of integers.
#' \author Romain Groux
dna.to.int = function(sequence)
{ seq.len = length(sequence)
seq.int = vector(length=seq.len, mode="numeric")
for(i in 1:seq.len)
{ if(sequence[i] == "A")
{ seq.int[i] = 1 }
else if(sequence[i] == "C")
{ seq.int[i] = 2 }
else if(sequence[i] == "G")
{ seq.int[i] = 3 }
else if(sequence[i] == "T")
{ seq.int[i] = 4}
else
{ stop(sprintf("Error! Unrecognized character in DNA sequence at position %d : %s", i, sequence[i])) }
}
return(seq.int)
}
#' The inverse function of dna.to.int().
#' \param sequence the DNA sequence stored as a vector of
#' integers: A->1, C->2, G->3, T->4
#' \return a vector of characters.
#' \author Romain Groux
int.to.dna = function(sequence)
{ seq.len = length(sequence)
seq.let = vector(length=seq.len, mode="character")
for(i in 1:seq.len)
{ if(sequence[i] == 1)
{ seq.let[i] = "A" }
else if(sequence[i] == 2)
{ seq.let[i] = "C" }
else if(sequence[i] == 3)
{ seq.let[i] = "G" }
else if(sequence[i] == 4)
{ seq.let[i] = "T"}
else
{ stop(sprintf("Error! Unrecognized character in int sequence at position %d : %d", i, sequence[i])) }
}
return(seq.let)
}
simulate_data = function(n_seq, l_seq, classes, prob_bg, p_classes, p_flip)
{
# the alphabet A->1, C->2, G->3, T->4
alphabet = c(1, 2, 3, 4)
l_alpha = length(alphabet)
# binding site length
l_bs = ncol(classes[,,1])
# number of classes
n_class = dim(classes)[3]
# checks
if(length(p_classes )!= n_class)
{ stop(sprintf("Error! %d classes detected but %d class probability given!", n_class, length(p_classes))) }
for(k in 1:n_class)
{ if((nrow(classes[,,k]) != 4) || (ncol(classes[,,k]) != l_bs))
{ stop(sprintf("Error! Check the dimensions of class %d motif : %d / %d!", k, nrow(classes[,,k]), ncol(classes[,,k]))) }
}
# last position (inclusive) at which a binding site can begin and still lie entirely within the sequence
last_pos_bs = l_seq - l_bs + 1
# data structures
sequences = matrix(data=0, n_seq, l_seq) # the sequences
bs_starts = vector(length=n_seq, mode="numeric") # the starting positions of the binding site
bs_flips = vector(length=n_seq, mode="numeric") # the orientation of the binding site
bs_classes = vector(length=n_seq, mode="numeric") # the class from which the binding site was sampled
bs_contents = matrix(data=0, nrow=n_seq, ncol=l_bs) # the binding site sequences
bs_probs = array(data=0, dim=c(l_alpha, l_bs, n_class)) # the class binding site probability matrices
for(i in 1:n_seq)
{
# sample from a uniform distribution where the binding site should start
bs_starts[i] = sample(1:last_pos_bs, 1)
# sample a class
class = sample(1:n_class, 1, prob=p_classes)
bs_classes[i] = class
# sample a flip state (0->forward, 1->reverse)
flip = rbinom(1, 1, prob=p_flip)
bs_flips[i] = flip
# to store the int seq
seq = vector(length=l_seq, mode="numeric")
seq_bs = vector(length=l_bs, mode="numeric")
# over the sequence
j = 1
while(j <= l_seq)
{ # binding site starts
if(j == bs_starts[i])
{ for(k in 0:(l_bs-1))
{ # reverse strand
if(flip)
{ base = sample(alphabet, 1, prob=rev(classes[,,class][,l_bs-k]))
seq[j+k] = base
seq_bs[k+1] = base
bs_probs[l_alpha-base+1, l_bs-k, class] = bs_probs[l_alpha-base+1, l_bs-k, class] + 1
}
# forward strand
else
{ base = sample(alphabet, 1, prob=classes[,,class][,k+1])
seq[j+k] = base
seq_bs[k+1] = base
bs_probs[base, k+1, class] = bs_probs[base, k+1, class] + 1
}
}
j = j + k
# this is background sequence
} else {
seq[j] = sample(alphabet, 1, prob=prob_bg)
}
j = j + 1
}
sequences[i,] = int.to.dna(seq)
bs_contents[i,] = int.to.dna(seq_bs)
}
# normalize
for(i in 1:n_class)
{ bs_probs[,,i] = sweep(bs_probs[,,i], 2, colSums(bs_probs[,,i]), "/") } # normalise each column (position) to sum to 1
return(list(sequences=sequences, sites=bs_contents, motifs=bs_probs, starts=bs_starts, flips=bs_flips, classes=bs_classes))
}
# some general parameters
-n_seq = 2000 # number of sequences
-l_seq = 30 # length of sequences
+n_seq = 10 # number of sequences
+l_seq = 10 # length of sequences
# the base probabilities inside the binding site (A,C,G,T)
-motif_class1 = matrix(data=c(0.7, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7,
- 0.1, 0.7, 0.1, 0.1, 0.1, 0.1, 0.7, 0.1,
- 0.1, 0.1, 0.7, 0.1, 0.1, 0.7, 0.1, 0.1,
- 0.1, 0.1, 0.1, 0.7, 0.7, 0.1, 0.1, 0.1),
+motif_class1 = matrix(data=c(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
+ 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
+ 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
nrow=4, ncol=8, byrow=T)
-motif_class2 = matrix(data=c(0.1, 0.1, 0.1, 0.7, 0.7, 0.1, 0.1, 0.1,
- 0.1, 0.7, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
- 0.7, 0.1, 0.7, 0.1, 0.1, 0.1, 0.1, 0.1,
- 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 0.7, 0.7),
+motif_class2 = matrix(data=c(0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0,
+ 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
nrow=4, ncol=8, byrow=T)
dir.create(file.path("data", "toy_data"), showWarnings=FALSE)
# ------------------------------------------- 1 class with 1 motif/sequence, no flip, a really simple toy example -------------------------------------------
data = matrix(nrow=4, ncol=10, byrow=T,
data=c('T', 'T', 'C', 'C', 'T', 'T', 'A', 'G', 'C', 'T',
'T', 'T', 'C', 'C', 'T', 'T', 'G', 'C', 'T', 'A',
'T', 'T', 'C', 'C', 'T', 'T', 'C', 'T', 'A', 'G',
'T', 'T', 'C', 'C', 'T', 'T', 'T', 'A', 'G', 'C'))
write.table(data, file=file.path("data", "toy_data", "simulated_sequences_toy.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
rm(data)
# -------------------------------------------------------- 1 class with 1 motif/sequence, no flip -----------------------------------------------------
# seed
set.seed(20190715)
# the class binding site
motif_classes = array(data=motif_class1, dim=c(dim(motif_class1), 1))
# the class probability
p_classes = c(1)
# the probability of having a binding site on the reverse strand
p_flip = 0
# the base probabilities outside the binding site (A,C,G,T)
prob_bg = rep(0.25, 4)
# simulate the data
data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip)
# save
write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(motif_class1, file=file.path("data", "toy_data", "simulated_sequences_motif.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
# clean
rm(motif_classes, p_classes, p_flip, prob_bg, data)
# -------------------------------------------------------- 1 class with 1 motif/sequence, flip --------------------------------------------------------
# seed
set.seed(201803142)
# the class binding site
motif_classes = array(data=motif_class1, dim=c(dim(motif_class1), 1))
# the class probability
p_classes = c(1)
# the probability of having a binding site on the reverse strand
p_flip = 0.5
# the base probabilities outside the binding site (A,C,G,T)
prob_bg = rep(0.25, 4)
# simulate the data
data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip)
# save
write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_1class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_1class_flip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
# clean
rm(motif_classes, p_classes, p_flip, prob_bg, data)
# -------------------------------------------------------- 2 classes with 1 bs/sequence, flip --------------------------------------------------------
# seed
set.seed(201803143)
# the class binding site
motif_classes = abind(motif_class1, motif_class2, along=3)
# the class probability
-p_classes = c(0.7, 0.3)
+p_classes = c(0.5, 0.5)
# the probability of having a binding site on the reverse strand
-p_flip = 0.4
+p_flip = 0.5
# the base probabilities outside the binding site (A,C,G,T)
prob_bg = rep(0.25, 4)
# simulate the data
data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip)
# save
write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_2class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_2class_flip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$motifs[,,2], file=file.path("data", "toy_data", "simulated_sequences_2class_flip_motif2.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n')
# clean
rm(motif_classes, p_classes, p_flip, prob_bg, data)
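
The simulate_data() function above embeds one binding site per sequence at a uniformly drawn start position; when the site is flipped, base k of the site is drawn from the reversed motif column and the count is recorded at the complementary base. The stand-alone C++ sketch below (a hypothetical helper, not repository code) restates that index arithmetic in 0-based form.

// Hypothetical helper illustrating the forward/reverse-strand index arithmetic
// of simulate_data(): k is the 0-based offset inside the site, base is the
// sampled base in 0..3 (A,C,G,T), l_bs the site length, l_alpha the alphabet size.
#include <cstddef>

struct MotifIndex
{   std::size_t column ;   // motif column the base is drawn from / counted in
    std::size_t row ;      // row of the count matrix to increment
} ;

MotifIndex motif_index(bool flip, std::size_t k, std::size_t base,
                       std::size_t l_bs, std::size_t l_alpha = 4)
{   if(flip)
    {   // reverse strand: reversed column, complementary base (A<->T, C<->G)
        return { l_bs - 1 - k, l_alpha - 1 - base } ; }
    // forward strand: indices are used as-is
    return { k, base } ;
}
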
diff --git a/src/Applications/EMReadApplication.cpp b/src/Applications/EMReadApplication.cpp
index 5ece4c9..d2e12b3 100644
--- a/src/Applications/EMReadApplication.cpp
+++ b/src/Applications/EMReadApplication.cpp
@@ -1,155 +1,175 @@
#include <EMReadApplication.hpp>
#include <EMRead.hpp>
#include <iostream>
#include <string>
+#include <vector>
+#include <utility> // std::move()
#include <stdexcept> // std::invalid_argument
#include <boost/program_options.hpp>
#include <Matrix2D.hpp>
#include <Matrix4D.hpp>
+#include <matrix_utility.hpp> // filter()
+
namespace po = boost::program_options ;
EMReadApplication::EMReadApplication(int argn, char** argv)
- : file_read(""), file_out(""),
+ : file_read(""), file_filter(""), file_out(""),
n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false),
n_threads(0), seed(""), runnable(true)
{
// parse command line options and set the fields
this->parseOptions(argn, argv) ;
}
int EMReadApplication::run()
{ if(this->runnable)
- { EMRead em(Matrix2D<int>(this->file_read),
+ {
+ // read data
+ Matrix2D<int> data(this->file_read) ;
+
+ // filter out some rows if needed
+ std::vector<size_t> filter ;
+ if(this->file_filter != "")
+ { // it is a column vector, easier to use the Matrix2D interface
+ // to read it rather than coding a dedicated function for it :)
+ filter = Matrix2D<size_t>(this->file_filter).get_data() ;
+ std::sort(filter.begin(), filter.end()) ;
+ data = filter_rows(filter, data) ;
+ }
+
+ EMRead em(std::move(data),
this->n_class,
this->n_iter,
this->n_shift,
this->flip,
this->bckg_class,
this->seed,
this->n_threads) ;
em.classify() ;
em.get_post_prob().save(this->file_out) ;
return EXIT_SUCCESS ;
}
else
{ return EXIT_FAILURE ; }
}
void EMReadApplication::parseOptions(int argn, char** argv)
{
// no option to parse
if(argv == nullptr)
{ std::string message = "no options to parse!" ;
throw std::invalid_argument(message) ;
}
// help messages
std::string desc_msg = "\n"
"EMRead is a probabilistic partitioning algorithm that \n"
"sofetly assigns genomic regions to classes given the shape \n"
"of the read density over the region. The assignment \n"
"probabilities are returned through stdout.\n\n" ;
std::string opt_help_msg = "Produces this help message." ;
std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n "
"by default 0 (no parallelization)." ;
std::string opt_read_msg = "The path to the file containing the read density data" ;
+ std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n"
+ "indices of rows to filter out in the data." ;
std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n"
"in binary format." ;
std::string opt_iter_msg = "The number of iterations." ;
std::string opt_class_msg = "The number of classes to find." ;
std::string opt_shift_msg = "Enables this number of column of shifting "
"freedom to realign the data. By default, shifting is "
"disabled (equivalent to --shift 1)." ;
std::string opt_flip_msg = "Enables flipping to realign the data.";
std::string opt_bckg_msg = "Adds a class to model the signal background. This class\n"
"contains the mean number of read at each position and\n"
"is never updated." ;
std::string opt_seed_msg = "A value to seed the random number generator.";
// option parser
boost::program_options::variables_map vm ;
boost::program_options::options_description desc(desc_msg) ;
std::string seeding_tmp ;
desc.add_options()
("help,h", opt_help_msg.c_str())
- ("read", po::value<std::string>(&(this->file_read)), opt_read_msg.c_str())
-
- ("out", po::value<std::string>(&(this->file_out)), opt_file_out_msg.c_str())
+ ("read", po::value<std::string>(&(this->file_read)), opt_read_msg.c_str())
+ ("filter", po::value<std::string>(&(this->file_filter)), opt_filter_msg.c_str())
+ ("out", po::value<std::string>(&(this->file_out)), opt_file_out_msg.c_str())
- ("iter,i", po::value<size_t>(&(this->n_iter)), opt_iter_msg.c_str())
- ("class,c", po::value<size_t>(&(this->n_class)), opt_class_msg.c_str())
- ("shift,s", po::value<size_t>(&(this->n_shift)), opt_shift_msg.c_str())
+ ("iter,i", po::value<size_t>(&(this->n_iter)), opt_iter_msg.c_str())
+ ("class,c", po::value<size_t>(&(this->n_class)), opt_class_msg.c_str())
+ ("shift,s", po::value<size_t>(&(this->n_shift)), opt_shift_msg.c_str())
("flip", opt_flip_msg.c_str())
("bgclass", opt_bckg_msg.c_str())
- ("seed", po::value<std::string>(&(this->seed)), opt_seed_msg.c_str())
- ("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;
+ ("seed", po::value<std::string>(&(this->seed)), opt_seed_msg.c_str())
+ ("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;
// parse
try
{ po::store(po::parse_command_line(argn, argv, desc), vm) ;
po::notify(vm) ;
}
catch(std::invalid_argument& e)
{ std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
throw std::invalid_argument(msg) ;
}
catch(...)
{ throw std::invalid_argument("An unknown error occured while parsing the options") ; }
bool help = vm.count("help") ;
// check for improper option settings
if(this->file_read == "" and
(not help))
{ std::string msg("Error! No data were given (--read)!") ;
throw std::invalid_argument(msg) ;
}
if(this->file_out == "" and
(not help))
{ std::string msg("Error! No output file given (--out)!") ;
throw std::invalid_argument(msg) ;
}
// no iter given -> 1 iter
if(this->n_iter == 0)
{ this->n_iter = 1 ; }
// no class number given -> 1 class
if(this->n_class == 0)
{ this->n_class = 1 ; }
// no shift given, value of 1 -> no shift
if(this->n_shift == 0)
{ this->n_shift = 1 ; }
// set flip
if(vm.count("flip"))
{ this->flip = true ; }
// set background class
if(vm.count("bgclass"))
{ this->bckg_class = true ; }
// help invoked, run() cannot be invoked
if(help)
{ std::cout << desc << std::endl ;
this->runnable = false ;
return ;
}
// everything fine, run() can be called
else
{ this->runnable = true ;
return ;
}
}
int main(int argn, char** argv)
{ EMReadApplication app(argn, argv) ;
return app.run() ;
}
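
The new --filter option reads a single-column text file of 0-based row indices, sorts them, and drops the corresponding rows from the data matrix before running the EM. The actual filter_rows() lives in matrix_utility.hpp and is not shown in this diff; the sketch below only illustrates the intended behaviour, using a plain vector-of-vectors instead of Matrix2D.

// Illustration only (not the repository's filter_rows()): drop the rows whose
// 0-based indices appear in the sorted list rows_to_drop.
#include <cstddef>
#include <vector>

std::vector<std::vector<int>>
filter_rows_sketch(const std::vector<std::size_t>& rows_to_drop,   // sorted, 0-based
                   const std::vector<std::vector<int>>& data)
{   std::vector<std::vector<int>> kept ;
    std::size_t next = 0 ;                          // next index to drop
    for(std::size_t i=0; i<data.size(); i++)
    {   if(next < rows_to_drop.size() and rows_to_drop[next] == i)
        {   next++ ;                                // this row is filtered out
            continue ; }
        kept.push_back(data[i]) ;
    }
    return kept ;
}
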
diff --git a/src/Applications/EMReadApplication.hpp b/src/Applications/EMReadApplication.hpp
index 0847a54..5f76c66 100644
--- a/src/Applications/EMReadApplication.hpp
+++ b/src/Applications/EMReadApplication.hpp
@@ -1,103 +1,109 @@
#ifndef EMREADAPPLICATION_HPP
#define EMREADAPPLICATION_HPP
#include <ApplicationInterface.hpp>
#include <string>
/*!
* \brief The EMReadApplication class is a wrapper around an EMRead
* instance creating an autonomous application to classify data by directly
* passing all the options and parameters from the command line.
*/
class EMReadApplication: public ApplicationInterface
{
public:
EMReadApplication() = delete ;
EMReadApplication(const EMReadApplication& app) = delete ;
/*!
* \brief Constructs an object from the command line
* options.
* \param argn the number of options passed to the
* main() function.
* \param argv the vector of options passed to the
* main() function.
*/
EMReadApplication(int argn, char** argv) ;
/*!
* \brief Runs the application. The data are classified
* using the given settings and the posterior probability
* matrix is saved to the given output file.
* The matrix is a 4D matrix with dimensions :
* regions, classes, shifts, flips.
* \return an exit code EXIT_SUCCESS or EXIT_FAILURE
* to return to the OS.
*/
virtual int run() override ;
private:
/*!
* \brief Parses the program command line options and
* sets the object field accordingly.
* If the help option is detected, the "runnable"
* field is set to false and subsequent calls to
* run() will produce nothing.
* \param argn the number of options passed to the
* main() function.
* \param argv the vector of options passed to the
* main() function.
* \throw std::invalid_argument if an error is found
* in the program options.
*/
void parseOptions(int argn, char** argv) ;
/*!
* \brief the path to the file containing the read
* density data.
*/
std::string file_read ;
+ /*!
+ * \brief the path to the file containing the
+ * (0-based) indices of the rows to filter out
+ * from the data.
+ */
+ std::string file_filter ;
/*!
* \brief the path to the file in which the probability
* matrix will be saved.
*/
std::string file_out ;
/*!
* \brief the number of classes to partition the data into.
*/
size_t n_class ;
/*!
* \brief the number of iterations allowed.
*/
size_t n_iter ;
/*!
* \brief the shifting freedom.
*/
size_t n_shift ;
/*!
* \brief whether flipping freedom is allowed.
*/
bool flip ;
/*!
* \brief whether a constant class to model the
* read count background should be added. This
* class has the mean read count at each
* position.
*/
bool bckg_class ;
/*!
* \brief the number of threads.
*/
size_t n_threads ;
/*!
* \brief a seed to initialise the random number generator.
*/
std::string seed ;
/*!
* \brief a flag indicating whether the core of run() can be
* run or not.
*/
bool runnable ;
} ;
#endif // EMREADAPPLICATION_HPP
diff --git a/src/Applications/EMSequenceApplication.cpp b/src/Applications/EMSequenceApplication.cpp
index 3260df7..1b8ab5f 100644
--- a/src/Applications/EMSequenceApplication.cpp
+++ b/src/Applications/EMSequenceApplication.cpp
@@ -1,283 +1,295 @@
#include <EMSequenceApplication.hpp>
#include <EMSequence.hpp>
#include <iostream>
#include <string>
#include <utility> // std::move()
#include <stdexcept> // std::invalid_argument
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp> // boost::split()
#include <Matrix2D.hpp>
#include <Matrix4D.hpp>
-#include <dna_utility.hpp>
+#include <matrix_utility.hpp> // filter()
+
namespace po = boost::program_options ;
EMSequenceApplication::EMSequenceApplication(int argn, char** argv)
- : file_seq(""), files_motif(""), file_out(""),
+ : file_seq(""), files_motif(""), file_filter(""), file_out(""),
n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false),
n_threads(0), seed(""), runnable(true)
{
// parse command line options and set the fields
this->parseOptions(argn, argv) ;
}
int EMSequenceApplication::run()
{ if(this->runnable)
{ EMSequence* em(nullptr) ;
// data
Matrix2D<int> data(this->file_seq) ;
+ // filter out some rows if needed
+ std::vector<size_t> filter ;
+ if(this->file_filter != "")
+ { // it is a column vector, easier to use the Matrix2D interface
+ // to read it rather than coding a dedicated function for it :)
+ filter = Matrix2D<size_t>(this->file_filter).get_data() ;
+ std::sort(filter.begin(), filter.end()) ;
+ data = filter_rows(filter, data) ;
+ }
+
// seeds motifs randomly
if(this->files_motif == "")
{ em = new EMSequence(std::move(data),
this->n_class,
this->n_iter,
this->n_shift,
this->flip,
this->bckg_class,
this->seed,
this->n_threads) ;
}
// seeds motifs with the given matrices
else
{ // model
std::vector<std::string> motif_paths ;
boost::split(motif_paths, this->files_motif, [](char c){return c == ',';}) ;
// this->n_class = motif_paths.size() + this->bckg_class ;
size_t model_ncol = data.get_ncol() - this->n_shift + 1 ;
// add the given motif, random motifs (if needed) and
// background class (if needed)
Matrix3D<double> model = this->init_model(model_ncol,
data,
motif_paths) ;
em = new EMSequence(std::move(data),
std::move(model),
this->n_iter,
this->flip,
this->n_threads) ;
}
// classify
em->classify() ;
em->get_post_prob().save(this->file_out) ;
// clean
delete em ;
em = nullptr ;
return EXIT_SUCCESS ;
}
else
{ return EXIT_FAILURE ; }
}
void EMSequenceApplication::parseOptions(int argn, char** argv)
{
// no option to parse
if(argv == nullptr)
{ std::string message = "no options to parse!" ;
throw std::invalid_argument(message) ;
}
// help messages
std::string desc_msg = "\n"
"EMSequence is a probabilistic partitioning algorithm that \n"
"sofetly assigns sequences to classes given their motif content.\n"
"The assignment probabilities are written in binary format as a 4D "
"matrix.\n\n" ;
std::string opt_help_msg = "Produces this help message." ;
std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n "
"by default 0 (no parallelization)." ;
std::string opt_seq_msg = "The path to the file containing the sequences" ;
std::string opt_motifs_msg = "A coma separated list of path to files containing the initial motifs\n"
"values. The motifs should be probability matrices in horizontal format.\n"
"If the motifs are too short after accounting for shifting, extra\n"
"columns with uniform probabilities will be added on each side. The\n"
"given number of classes (--class) should at least be the number of\n"
"initial motifs. If the number of classes is bigger than the number of"
"given motifs, the remaining classes are initialised randomly\n." ;
+ std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n"
+ "indices of rows to filter out in the data." ;
std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n"
"in binary format." ;
std::string opt_iter_msg = "The number of iterations." ;
std::string opt_class_msg = "The number of classes to find." ;
std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n"
"the data. By default, shifting is disabled (equivalent to\n"
"--shift 1)." ;
std::string opt_flip_msg = "Enables flipping to realign the data.";
std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n"
"contains the sequence background probabilities at each position\n"
"and is never updated." ;
std::string opt_seed_msg = "A value to seed the random number generator.";
// option parser
boost::program_options::variables_map vm ;
boost::program_options::options_description desc(desc_msg) ;
std::string seeding_tmp ;
desc.add_options()
("help,h", opt_help_msg.c_str())
- ("seq", po::value<std::string>(&(this->file_seq)), opt_seq_msg.c_str())
-
- ("motifs", po::value<std::string>(&(this->files_motif)), opt_motifs_msg.c_str())
-
- ("out", po::value<std::string>(&(this->file_out)), opt_file_out_msg.c_str())
+ ("seq", po::value<std::string>(&(this->file_seq)), opt_seq_msg.c_str())
+ ("motifs", po::value<std::string>(&(this->files_motif)), opt_motifs_msg.c_str())
+ ("filter", po::value<std::string>(&(this->file_filter)), opt_filter_msg.c_str())
+ ("out", po::value<std::string>(&(this->file_out)), opt_file_out_msg.c_str())
- ("iter,i", po::value<size_t>(&(this->n_iter)), opt_iter_msg.c_str())
- ("class,c", po::value<size_t>(&(this->n_class)), opt_class_msg.c_str())
- ("shift,s", po::value<size_t>(&(this->n_shift)), opt_shift_msg.c_str())
+ ("iter,i", po::value<size_t>(&(this->n_iter)), opt_iter_msg.c_str())
+ ("class,c", po::value<size_t>(&(this->n_class)), opt_class_msg.c_str())
+ ("shift,s", po::value<size_t>(&(this->n_shift)), opt_shift_msg.c_str())
("flip", opt_flip_msg.c_str())
("bgclass", opt_bckg_msg.c_str())
- ("seed", po::value<std::string>(&(this->seed)), opt_seed_msg.c_str())
- ("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;
+ ("seed", po::value<std::string>(&(this->seed)), opt_seed_msg.c_str())
+ ("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;
// parse
try
{ po::store(po::parse_command_line(argn, argv, desc), vm) ;
po::notify(vm) ;
}
catch(std::invalid_argument& e)
{ std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
throw std::invalid_argument(msg) ;
}
catch(...)
{ throw std::invalid_argument("An unknown error occured while parsing the options") ; }
bool help = vm.count("help") ;
// check for improper option settings
if(this->file_seq == "" and
(not help))
{ std::string msg("Error! No data were given (--seq)!") ;
throw std::invalid_argument(msg) ;
}
if(this->file_out == "" and
(not help))
{ std::string msg("Error! No output file given (--out)!") ;
throw std::invalid_argument(msg) ;
}
// no iter given -> 1 iter
if(this->n_iter == 0)
{ this->n_iter = 1 ; }
// no class number given -> 1 class
if(this->n_class == 0)
{ this->n_class = 1 ; }
// no shift given, value of 1 -> no shift
if(this->n_shift == 0)
{ this->n_shift = 1 ; }
// set flip
if(vm.count("flip"))
{ this->flip = true ; }
// set background class
if(vm.count("bgclass"))
{ this->bckg_class = true ; }
// help invoked, run() cannot be invoked
if(help)
{ std::cout << desc << std::endl ;
this->runnable = false ;
return ;
}
// everything fine, run() can be called
else
{ this->runnable = true ;
return ;
}
}
Matrix3D<double> EMSequenceApplication::init_model(size_t model_len,
const Matrix2D<int>& data,
const std::vector<std::string>& motif_paths) const
{
int n_class_given = motif_paths.size() ;
int n_class_bckg = this->bckg_class ;
int n_class_rand = this->n_class - n_class_given - n_class_bckg ;
// number of classes should at least be number of motifs
if(n_class_given > (int)this->n_class)
{ char msg[4096] ;
sprintf(msg, "Error! number of class given (--class %zu) should at "
"least be equal to number of motifs (--motifs %d)",
this->n_class, n_class_given) ;
throw std::invalid_argument(msg) ;
}
// check if there is room for a background class
if((int)this->n_class < n_class_given+this->bckg_class)
{ char msg[4096] ;
sprintf(msg, "Error! no class left to add a background "
"class (--bgclass) with the given motifs (--motifs) (--class %zu)",
this->n_class) ;
throw std::invalid_argument(msg) ;
}
// init empty model
Matrix3D<double> model(this->n_class,
model_len,
4,
0.25) ;
// add given motifs
for(size_t i=0; i<motif_paths.size(); i++)
{ Matrix2D<double> matrix(motif_paths[i]) ;
// motif is too big for this shift
if(matrix.get_ncol() > model_len)
{ char msg[4096] ;
sprintf(msg,
"Error! In %s, motif column number is bigger "
"than data column number - shift + 1 "
"(%zu > %zu - %zu + 1)",
motif_paths[i].c_str(),
matrix.get_ncol(),
data.get_ncol(),
this->n_shift) ;
throw std::invalid_argument(msg) ;
}
// insert motif in middle of matrix
else
{ // size_t j_model = this->n_shift / 2 ;
size_t j_model = (model_len - matrix.get_ncol()) / 2 ;
for(size_t j_mat=0, j_mod=j_model; j_mat<matrix.get_ncol(); j_mat++, j_mod++)
{ for(size_t k=0; k<4; k++)
{ model(i,j_mod,k) = matrix(k,j_mat) ; }
}
}
}
// add random motifs and background class
// delegate this to EMSequence constructor
// (ensure that it is done properly)
if(n_class_rand > 0)
{ // initialise randomly
EMSequence em(data,
n_class_rand,
this->n_iter,
this->n_shift,
this->flip,
this->bckg_class,
this->seed,
this->n_threads) ;
Matrix3D<double> model_rand = em.get_sequence_models() ;
// copy them into model
for(int i_rand=0, i_mod=n_class_given; i_rand<n_class_rand; i_rand++, i_mod++)
{ for(int j=0; j<(int)model_len; j++)
{ for(int k=0; k<4; k++)
{ model(i_mod,j,k) = model_rand(i_rand,j,k) ; }
}
}
}
return model ;
}
int main(int argn, char** argv)
{ EMSequenceApplication app(argn, argv) ;
return app.run() ;
}
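
init_model() above inserts each user-supplied motif into the middle of a model of width data columns - shift + 1, padding both sides with uniform (0.25) columns, and delegates the remaining classes to a randomly initialised EMSequence. The sketch below is illustrative only (plain vectors rather than Matrix2D/Matrix3D) and shows just the centering step.

// Illustration of the motif-centering arithmetic in init_model(): a 4 x w motif
// is copied into the middle of a 4 x model_len model whose columns default to
// the uniform distribution. Assumes w <= model_len, as checked by init_model().
#include <cstddef>
#include <vector>

std::vector<std::vector<double>>
center_motif_sketch(const std::vector<std::vector<double>>& motif,   // 4 rows x w columns
                    std::size_t model_len)
{   std::size_t w = motif[0].size() ;
    // uniform model: 4 rows x model_len columns filled with 0.25
    std::vector<std::vector<double>> model(4, std::vector<double>(model_len, 0.25)) ;
    std::size_t offset = (model_len - w) / 2 ;      // same offset as j_model above
    for(std::size_t j=0; j<w; j++)
    {   for(std::size_t k=0; k<4; k++)
        {   model[k][offset + j] = motif[k][j] ; }
    }
    return model ;
}
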
diff --git a/src/Applications/EMSequenceApplication.hpp b/src/Applications/EMSequenceApplication.hpp
index ba1c5c9..1047b58 100644
--- a/src/Applications/EMSequenceApplication.hpp
+++ b/src/Applications/EMSequenceApplication.hpp
@@ -1,138 +1,141 @@
#ifndef EMSEQUENCEAPPLICATION_HPP
#define EMSEQUENCEAPPLICATION_HPP
#include <ApplicationInterface.hpp>
#include <string>
#include <vector>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
/*!
* \brief The EMSequenceApplication class is a wrapper around an EMSequence
* instance creating an autonomous application to classify sequences by directly
* passing all the options and parameters from the command line.
*/
class EMSequenceApplication: public ApplicationInterface
{
public:
EMSequenceApplication() = delete ;
EMSequenceApplication(const EMSequenceApplication& app) = delete ;
/*!
* \brief Constructs an object from the command line
* options.
* \param argn the number of options passed to the
* main() function.
* \param argv the vector of options passed to the
* main() function.
*/
EMSequenceApplication(int argn, char** argv) ;
/*!
* \brief Runs the application. The data are classified
* using the given settings and the posterior probability
* matrix is saved to the given output file.
* The matrix is a 4D matrix with dimensions :
* regions, classes, shifts, flips.
* \return an exit code EXIT_SUCCESS or EXIT_FAILURE
* to return to the OS.
*/
virtual int run() override ;
private:
/*!
* \brief Parses the program command line options and
* sets the object field accordingly.
* If the help option is detected, the "runnable"
* field is set to false and subsequent calls to
* run() will produce nothing.
* \param argn the number of options passed to the
* main() function.
* \param argv the vector of options passed to the
* main() function.
* \throw std::invalid_argument if an error is found
* in the program options.
*/
void parseOptions(int argn, char** argv) ;
/*!
* \brief Initialise the class models if matrices
* are given as initial class motifs.
* If the given class motifs are shorter than the
* model after accounting for shifting, extra columns
* with uniform probabilities will be added on each
* side.
* If the number of classes is higher than the
* number of given motifs, extra classes will be
* initialised randomly. A background class is included
* if needed.
* \param model_len the number of positions (columns)
* of the model to initialise.
* \param data the sequence matrix, in integer format.
* \param motif_paths the paths to the files containing
* the probability matrices to use to initialise the
* class motifs.
* \return the initialised class models.
*/
Matrix3D<double> init_model(size_t model_len,
const Matrix2D<int>& data,
const std::vector<std::string>& motif_paths) const ;
/*!
* \brief the path to the file containing the sequence
* data.
*/
std::string file_seq ;
-
/*!
* \brief a comma-separated list of files containing the
* initial motif matrices.
*/
std::string files_motif ;
-
+ /*!
+ * \brief the path to the file containing the
+ * (0-based) indices of the rows to filter out
+ * from the data.
+ */
+ std::string file_filter ;
/*!
* \brief the path to the file in which the probability
* matrix will be saved.
*/
std::string file_out ;
-
/*!
* \brief the number of classes to partition the data into.
*/
size_t n_class ;
/*!
* \brief the number of iterations allowed.
*/
size_t n_iter ;
/*!
* \brief the shifting freedom.
*/
size_t n_shift ;
/*!
* \brief whether flipping freedom is allowed.
*/
bool flip ;
/*!
* \brief whether a constant class to model the
* sequence background should be added. This
* class has the sequence background probabilities
* at each position.
*/
bool bckg_class ;
/*!
* \brief the number of threads.
*/
size_t n_threads ;
/*!
* \brief a seed to initialise the random number generator.
*/
std::string seed ;
/*!
* \brief a flag indicating whether the core of run() can be
* run or not.
*/
bool runnable ;
} ;
#endif // EMSEQUENCEAPPLICATION_HPP
diff --git a/src/Clustering/ConsensusSequenceLayer.cpp b/src/Clustering/ConsensusSequenceLayer.cpp
index b8a5a89..5eab0b1 100644
--- a/src/Clustering/ConsensusSequenceLayer.cpp
+++ b/src/Clustering/ConsensusSequenceLayer.cpp
@@ -1,371 +1,377 @@
#include <ConsensusSequenceLayer.hpp>
#include <vector>
#include <future> // std::promise, std::future
#include <stdexcept> // std::invalid_argument
#include <cmath> // log()
#include <functional> // std::bind(), std::ref()
#include <ThreadPool.hpp>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp> // dna::base_composition()
double ConsensusSequenceLayer::score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t start,
- const Matrix2D<double>& model_log)
+ const Matrix2D<double>& model)
{ size_t ncol = cons_seq.get_dim()[1] ;
size_t dim3 = cons_seq.get_dim()[2] ;
- if(start > ncol - model_log.get_nrow())
+ if(start > ncol - model.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
- start, ncol - model_log.get_nrow()) ;
+ start, ncol - model.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
- else if(model_log.get_nrow() > ncol)
+ else if(model.get_nrow() > ncol)
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
- model_log.get_nrow(), ncol) ;
+ model.get_nrow(), ncol) ;
throw std::invalid_argument(msg) ;
}
- else if(model_log.get_ncol() != 4)
+ else if(model.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
- model_log.get_ncol()) ;
+ model.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(dim3 != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
dim3) ;
throw std::invalid_argument(msg) ;
}
size_t from = start ;
- size_t to = from + model_log.get_nrow() ; // will score [from, to)
+ size_t to = from + model.get_nrow() ; // will score [from, to)
+
double ll = 0 ;
- for(size_t i=from, j=0; i<to; i++, j++)
- { for(size_t k=0; k<dim3; k++)
- { ll += model_log(j,k) + log(cons_seq(row,j,k)) ; }
+
+ for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
+ { double sum = 0. ;
+ for(size_t k=0; k<dim3; k++)
+ { sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ; }
+ ll += log(sum) ;
}
return ll ;
}
+
ConsensusSequenceLayer::ConsensusSequenceLayer(const Matrix3D<double>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data3DLayer(data, n_class, n_shift, flip,bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
ConsensusSequenceLayer::ConsensusSequenceLayer(Matrix3D<double>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data3DLayer(std::move(data), n_class, n_shift, flip, bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
ConsensusSequenceLayer::~ConsensusSequenceLayer()
{ ; }
void ConsensusSequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads) const
{
// dimension checks
this->check_loglikelihood_dim(loglikelihood) ;
this->check_loglikelihood_max_dim(loglikelihood_max) ;
- // compute the log prob model and the log prob reverse-complement model
- std::vector<Matrix2D<double>> model_log(this->n_class,
- Matrix2D<double>(this->l_model,
- this->n_category,
- 0.)) ;
- std::vector<Matrix2D<double>> model_log_rev = model_log ;
+ // compute the prob model and the prob reverse-complement model
+ std::vector<Matrix2D<double>> model(this->n_class,
+ Matrix2D<double>(this->l_model,
+ this->n_category,
+ 0.)) ;
+ std::vector<Matrix2D<double>> model_rev = model ;
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ // forward
- model_log[i](j,k) = log(this->model(i,j,k)) ;
+ model[i](j,k) = this->model(i,j,k) ;
// reverse
- model_log_rev[i](this->l_model-j-1,this->n_category-k-1)
- = log(this->model(i,j,k)) ;
+ model_rev[i](this->l_model-j-1,this->n_category-k-1)
+ = this->model(i,j,k) ;
}
}
}
// don't parallelize
if(threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihoods_routine(0, this->n_dim1,
loglikelihood,
loglikelihood_max,
- model_log,
- model_log_rev,
+ model,
+ model_rev,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_dim1, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ConsensusSequenceLayer::compute_loglikelihoods_routine,
this,
slice.first,
slice.second,
std::ref(loglikelihood),
std::ref(loglikelihood_max),
- std::ref(model_log),
- std::ref(model_log_rev),
+ std::ref(model),
+ std::ref(model_rev),
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void ConsensusSequenceLayer::compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
- const std::vector<Matrix2D<double>>& model_log,
- const std::vector<Matrix2D<double>>& model_log_rev,
+ const std::vector<Matrix2D<double>>& model,
+ const std::vector<Matrix2D<double>>& model_rev,
std::promise<bool>& done) const
{
// compute log likelihood
for(size_t i=from; i<to; i++)
{
// set max to min possible value
loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
for(size_t j=0; j<this->n_class; j++)
{
for(size_t s=0; s<this->n_shift; s++)
{ // forward strand
- { double ll_fw = std::max(score_consensus_subseq(this->data, i, s, model_log[j]),
+ { double ll_fw = std::max(score_consensus_subseq(this->data, i, s, model[j]),
ConsensusSequenceLayer::p_min_log);
// loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// apply lower bound here -> log(1e-100)
loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// keep track of max per row
if(ll_fw > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_fw ; }
}
// reverse
if(this->flip)
- { double ll_rev = std::max(score_consensus_subseq(this->data, i, s, model_log_rev[j]),
+ { double ll_rev = std::max(score_consensus_subseq(this->data, i, s, model_rev[j]),
ConsensusSequenceLayer::p_min_log) ;
// loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// apply lower bound here -> log(1e-100)
loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// keep track of max per row
if(ll_rev > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_rev ; }
}
}
}
}
done.set_value(true) ;
}
void ConsensusSequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads)
{ // don't parallelize
if(threads == nullptr)
{ std::promise<Matrix3D<double>> promise ;
std::future<Matrix3D<double>> future = promise.get_future() ;
this->update_model_routine(0,
this->n_dim1,
posterior_prob,
promise) ;
// this->model = future.get() ;
auto model = future.get() ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = model(i,j,k) ; }
}
}
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_dim1, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ConsensusSequenceLayer::update_model_routine,
this,
slice.first,
slice.second,
std::ref(posterior_prob),
std::ref(promises[i])))) ;
}
// reinitialise the model
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = 0. ; }
}
}
// wait until all threads are done working
// and update the model
for(auto& future : futures)
{ Matrix3D<double> model_part = future.get() ;
// for(size_t i=0; i<this->n_class; i++)
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) += model_part(i,j,k) ; }
}
}
}
// -------------------------- threads stop ---------------------------
}
// make sure to have no 0 values
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) =
std::max(this->model(i,j,k), ConsensusSequenceLayer::p_min) ;
}
}
}
// normalize to get probs
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ double sum = 0. ;
for(size_t k=0; k<this->n_category; k++)
{ sum += this->model(i,j,k) ; }
for(size_t k=0; k<this->n_category; k++)
{ double p = this->model(i,j,k) / sum ;
this->model(i,j,k) = p ;
}
}
}
}
void ConsensusSequenceLayer::update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& promise) const
{ // dimension checks
this->check_posterior_prob_dim(posterior_prob) ;
Matrix3D<double> model(this->n_class,
this->l_model,
this->n_category,
0.) ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t k=0; k < n_class_to_update; k++)
{ for(size_t s=0; s<this->n_shift; s++)
{ for(size_t j=0; j<this->l_model; j++)
{ // base prob on fw and rv strand
vector_d base_prob_fw(this->n_category, 0.) ;
vector_d base_prob_rv(this->n_category, 0.) ;
for(size_t i=from; i<to; i++)
{ for(size_t base=0, base_rv=3; base<this->n_dim3; base++, base_rv--)
{ // --------------- forward ---------------
{ base_prob_fw[base] +=
this->data(i,s+j,base) *
posterior_prob(i,k,s,ConsensusSequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
- { base_prob_rv[base_rv] +=
+ {
+ base_prob_rv[base_rv] +=
this->data(i,s+j,base) *
posterior_prob(i,k,s,ConsensusSequenceLayer::REVERSE) ;
}
}
}
// update this position of the model
for(size_t i=0,i_rev=base_prob_fw.size()-1; i<base_prob_fw.size(); i++,i_rev--)
{ // --------------- forward ---------------
{ model(k,j,i) += base_prob_fw[i] ; }
// --------------- reverse ---------------
if(this->flip)
{ model(k,this->l_model-j-1,i) += base_prob_rv[i] ; }
}
}
}
}
promise.set_value(model) ;
}
void ConsensusSequenceLayer::create_bckg_class()
{ // sequence composition
std::vector<double> base_comp =
dna::base_composition(this->data,
this->flip) ;
// set last class as background class
for(size_t i=0; i<this->n_category; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ this->model(this->n_class-1,j,i) = base_comp[i] ; }
}
}
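
The main change in score_consensus_subseq() is the order of the log and the sum over bases: instead of adding log(model) + log(consensus) for every base, the new code first takes the dot product of the 4-element consensus distribution with the 4-element model column and then adds the log of that expected emission probability. Below is a minimal stand-alone restatement of the new per-position computation (not the class method itself), assuming the two inputs are already aligned at the offset of interest.

// Minimal restatement of the per-position likelihood now computed by
// score_consensus_subseq(): ll = sum_j log( sum_k cons_seq[j][k] * model[j][k] ).
#include <array>
#include <cmath>
#include <cstddef>
#include <vector>

double score_subseq_sketch(const std::vector<std::array<double,4>>& cons_seq,  // per-position base probabilities
                           const std::vector<std::array<double,4>>& model)     // per-position model probabilities
{   double ll = 0. ;
    for(std::size_t j=0; j<model.size(); j++)
    {   double p = 0. ;
        for(std::size_t k=0; k<4; k++)
        {   p += cons_seq[j][k] * model[j][k] ; }  // expected emission probability
        ll += std::log(p) ;
    }
    return ll ;
}
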
diff --git a/src/Clustering/ConsensusSequenceLayer.hpp b/src/Clustering/ConsensusSequenceLayer.hpp
index 5ebba92..5eb71e8 100644
--- a/src/Clustering/ConsensusSequenceLayer.hpp
+++ b/src/Clustering/ConsensusSequenceLayer.hpp
@@ -1,196 +1,196 @@
#ifndef CONSENSUSSEQUENCELAYER_HPP
#define CONSENSUSSEQUENCELAYER_HPP
#include <Data3DLayer.hpp>
#include <vector>
#include <future> // std::promise
#include <ThreadPool.hpp>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
typedef std::vector<double> vector_d ;
class ConsensusSequenceLayer : public Data3DLayer
{
public:
/*!
* \brief Computes the log-likelihood of the sub-
* sequence - stored in a given row (1st dimension) -
* and starting at the offset <col> (2nd dimension) in
* the given sequence matrix.
* The subsequence length is determined by the model
* length.
- * \param log_cons_seq a probability matrix containing
+ * \param cons_seq a probability matrix containing
* the consensus sequence. Its dimensions are :
* 1st the different sequences
* 2nd the sequences length
* 3rd 4 for A,C,G,T.
* \param row the row containing the sequence of
* interest.
* \param col the index at which the sub-sequence
* is starting (1st index inside the subsequence
* of interest).
- * \param model_log a model containing the log
- * probability model.
+ * \param model a model containing the probability
+ * model.
* \return the log-likelihood of the sub-sequence
* given the model.
* \throw std::invalid_argument if 1) the offset is
* invalid, 2) the sequence and the model have
* incompatible dimensions or 3) the model 2nd dimension
* is not 4 (A,C,G,T).
*/
static double score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t col,
- const Matrix2D<double>& model_log) ;
+ const Matrix2D<double>& model) ;
public:
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model should be constant and model the
* background.
*/
ConsensusSequenceLayer(const Matrix3D<double>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model should be constant and model the
* background.
*/
ConsensusSequenceLayer(Matrix3D<double>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Destructor.
*/
virtual ~ConsensusSequenceLayer() ;
/*!
* \brief Computes the log likelihood of the data
* given the current model parameters.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of rows
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of loglikelihood.
* Its length should be equal to the data row number.
* \throw std::invalid_argument if the dimensions are
* incorrect.
*/
virtual void compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads=nullptr) const override ;
/*!
* \brief Updates the model given the posterior
* probabilities (the probabilities of each row
* in the data to be assigned to each class,
* for each shift and flip state).
* \param posterior_prob the data assignment probabilities to
* the different classes.
*/
virtual void update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads=nullptr) override ;
protected:
/*!
* \brief The routine that effectively performs the
* loglikelihood computations.
* \param from the index of the first row of the data
* to be considered.
* \param to the index past the last row of the data
* to be considered.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of rows
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of log_likelihood.
* Its length should be equal to the data row number.
* \param model_log a vector containing the matrices with
* the log values of the model for each class.
* \param model_log_rev a vector containing the matrices with
* the log values of the reverse strand model for each class
* (the 1st position in the model becomes the last in the
* reverse model and probabilities are swapped A<->T and C<->G).
* \param done a promise to be filled when the routine
* is done running.
*/
void compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model_log,
const std::vector<Matrix2D<double>>& model_log_rev,
std::promise<bool>& done) const ;
/*!
 * \brief The routine that effectively updates the model.
 * \param from the index of the first row of the
 * posterior probabilities to be considered.
 * \param to the index of the past last row of the
 * posterior probabilities to be considered.
 * \param posterior_prob the data assignment probabilities
 * to the different classes.
* \param done a promise containing the partial model
* computed from the given data slice. If several routines
* work together at updating the model, they need to be
* summed and normalized (by the column sum) to get the
* final model.
*/
void update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& done) const ;
/*!
* \brief Sets the last class as background class with the
 * mean base probabilities of the data at each position.
*/
void create_bckg_class() ;
protected:
} ;
#endif // CONSENSUSSEQUENCELAYER_HPP
diff --git a/src/Clustering/EMConsensusSequence.cpp b/src/Clustering/EMConsensusSequence.cpp
index 5a52a0e..9cac9ee 100644
--- a/src/Clustering/EMConsensusSequence.cpp
+++ b/src/Clustering/EMConsensusSequence.cpp
@@ -1,294 +1,293 @@
#include <EMConsensusSequence.hpp>
#include <string>
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <ConsensusSequenceLayer.hpp> // ConsensusSequenceLayer
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
#include <ThreadPool.hpp> // ThreadPool
#include <dna_utility.hpp> // dna::base_composition()
EMConsensusSequence::EMConsensusSequence(const Matrix3D<double>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_dim()[0],
seq_matrix.get_dim()[1],
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
cseq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->cseq_layer = new ConsensusSequenceLayer(seq_matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
    // initialise the models with the post prob
this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
EMConsensusSequence::EMConsensusSequence(Matrix3D<double>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_dim()[0],
seq_matrix.get_dim()[1],
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
cseq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
    // initialise the models with the post prob
this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
EMConsensusSequence::~EMConsensusSequence()
{ if(this->cseq_layer != nullptr)
{ delete this->cseq_layer ;
this->cseq_layer = nullptr ;
}
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
}
Matrix3D<double> EMConsensusSequence::get_sequence_models() const
{ return this->cseq_layer->get_model() ; }
EMConsensusSequence::exit_codes EMConsensusSequence::classify()
{
size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
{ // E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
-
}
bar.update() ; std::cerr << std::endl ;
return EMConsensusSequence::exit_codes::ITER_MAX ;
}
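// Minimal usage sketch (illustrative only; the parameter values below are
// arbitrary and mirror test_consensus() in src/main_test.cpp):
//
//     Matrix3D<double> cseq = /* n_seq x l_seq x 4 base probabilities */ ;
//     EMConsensusSequence em(cseq, 2, 50, 3, true, false, "", 1) ;
//     em.classify() ;                                // runs the E/M iterations above
//     Matrix4D<double> prob   = em.get_post_prob() ; // posterior assignments
//     Matrix3D<double> models = em.get_sequence_models() ;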
void EMConsensusSequence::compute_loglikelihood()
{ // compute the loglikelihood
this->cseq_layer->compute_loglikelihoods(this->loglikelihood,
this->loglikelihood_max,
this->threads) ;
// rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMConsensusSequence::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void EMConsensusSequence::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{
// rescale the values
for(size_t i=from; i<to; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
(this->loglikelihood(i,j,k,l) -
this->loglikelihood_max[i]) ;
}
}
}
}
done.set_value(true) ;
}
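// Subtracting the per-row maximum is the usual log-sum-exp rescaling:
// exp(loglikelihood - max) is at most 1, so the exponentials taken in
// compute_post_prob_routine() cannot overflow, and the constant factor
// exp(-max) cancels out when each row is normalised by its sum.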
void EMConsensusSequence::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMConsensusSequence::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
void EMConsensusSequence::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// reset grand total
// this->post_prob_tot = 0 ;
// this->post_prob_colsum = vector_d(n_class, 0) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
ConsensusSequenceLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
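// Each posterior entry above is first computed as
//     p(i,c,s,f) = exp(loglikelihood(i,c,s,f)) * post_state_prob(c,s,f)
// (the loglikelihood has already been rescaled by its row maximum), then
// divided by the row sum and floored at ConsensusSequenceLayer::p_min so
// that no assignment probability becomes exactly 0.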
void EMConsensusSequence::update_models()
{ this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
diff --git a/src/Clustering/EMSequence.cpp b/src/Clustering/EMSequence.cpp
index bf2bb0e..3f5bebb 100644
--- a/src/Clustering/EMSequence.cpp
+++ b/src/Clustering/EMSequence.cpp
@@ -1,356 +1,357 @@
#include <EMSequence.hpp>
#include <string>
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <cmath> // exp()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <SequenceLayer.hpp> // SequenceLayer
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
#include <ThreadPool.hpp> // ThreadPool
EMSequence::EMSequence(const Matrix2D<int>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->seq_layer = new SequenceLayer(seq_matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
    // initialise the models with the post prob
this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
+
EMSequence::EMSequence(Matrix2D<int>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->seq_layer = new SequenceLayer(std::move(seq_matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
    // initialise the models with the post prob
this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
+
EMSequence::EMSequence(const Matrix2D<int>& seq_matrix,
const Matrix3D<double>& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
motifs.get_dim()[0],
n_iter,
seq_matrix.get_ncol() - motifs.get_dim()[1] + 1,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// data and models
// background motif (if any) is the last of the given motifs
this->seq_layer = new SequenceLayer(seq_matrix,
motifs,
this->flip,
bckg_class) ;
    // initialise the class prob uniformly
this->set_state_prob_uniform() ;
}
EMSequence::EMSequence(Matrix2D<int>&& seq_matrix,
Matrix3D<double>&& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
motifs.get_dim()[0],
n_iter,
seq_matrix.get_ncol() - motifs.get_dim()[1] + 1,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// data and models
// background motif (if any) is the last of the given motifs
this->seq_layer = new SequenceLayer(std::move(seq_matrix),
std::move(motifs),
this->flip,
bckg_class) ;
    // initialise the class prob uniformly
this->set_state_prob_uniform() ;
}
EMSequence::~EMSequence()
-{ if(this->seq_layer != nullptr)
+{ if(this->seq_layer != nullptr)
{ delete this->seq_layer ;
this->seq_layer = nullptr ;
}
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
}
Matrix3D<double> EMSequence::get_sequence_models() const
{ return seq_layer->get_model() ; }
EMSequence::exit_codes EMSequence::classify()
-{
- size_t bar_update_n = this->n_iter ;
+{ size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
{ // E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return EMSequence::exit_codes::ITER_MAX ;
}
void EMSequence::compute_loglikelihood()
{ // compute the loglikelihood
this->seq_layer->compute_loglikelihoods(this->loglikelihood,
this->loglikelihood_max,
this->threads) ;
// rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMSequence::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void EMSequence::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{
// rescale the values
for(size_t i=from; i<to; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
(this->loglikelihood(i,j,k,l) -
this->loglikelihood_max[i]) ;
}
}
}
}
done.set_value(true) ;
}
void EMSequence::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMSequence::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
void EMSequence::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// reset grand total
// this->post_prob_tot = 0 ;
// this->post_prob_colsum = vector_d(n_class, 0) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
SequenceLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
void EMSequence::update_models()
{ this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
diff --git a/src/Clustering/SequenceLayer.cpp b/src/Clustering/SequenceLayer.cpp
index d3048b4..8906c96 100644
--- a/src/Clustering/SequenceLayer.cpp
+++ b/src/Clustering/SequenceLayer.cpp
@@ -1,423 +1,432 @@
#include <SequenceLayer.hpp>
#include <stdexcept> // std::invalid_argument
#include <limits> // numeric_limits
#include <cmath> // log(), pow()
#include <vector>
#include <algorithm> // std::max_element()
#include <future> // std::promise, std::future
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp> // dna::base_composition()
double SequenceLayer::score_subseq(const Matrix2D<int>& seq,
size_t row,
size_t start,
const Matrix2D<double>& model_log)
{
if(start > seq.get_ncol() - model_log.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, seq.get_ncol() - model_log.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model_log.get_nrow() > seq.get_ncol())
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model_log.get_nrow(), seq.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(model_log.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model_log.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
size_t from = start ;
size_t to = from + model_log.get_nrow() ; // will score [from, to)
int n_code = dna::char_to_int('N') ;
double ll = 0 ;
for(size_t i=from, j=0; i<to; i++, j++)
{ int base = seq(row,i) ;
// N char -> get max score
if(base == n_code)
{ std::vector<double> row = model_log.get_row(j) ;
ll += *(std::max_element(std::begin(row),
std::end(row))) ;
}
// A,C,G,T -> get its score
else
{ ll += model_log(j,base) ; }
}
return ll ;
}
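// Worked example (values chosen for illustration): scoring a subsequence
// encoding A,C at offset `start` against a 2-position log model returns
// model_log(0,A) + model_log(1,C); an N at any position contributes the
// maximum log probability of that model column instead.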
SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data2DLayer(data, n_class, n_shift, flip,bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
SequenceLayer::SequenceLayer(Matrix2D<int>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data2DLayer(std::move(data), n_class, n_shift, flip, bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
const Matrix3D<double>& model,
bool flip,
bool bckg_class)
: Data2DLayer(data, model,flip, bckg_class)
{}
SequenceLayer::SequenceLayer(Matrix2D<int>&& data,
Matrix3D<double>&& model,
bool flip,
bool bckg_class)
: Data2DLayer(std::move(data), std::move(model),flip, bckg_class)
{}
SequenceLayer::~SequenceLayer()
{ ; }
void SequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads) const
{
// dimension checks
this->check_loglikelihood_dim(loglikelihood) ;
this->check_loglikelihood_max_dim(loglikelihood_max) ;
// compute the log prob model and the log prob reverse-complement model
std::vector<Matrix2D<double>> model_log(this->n_class,
Matrix2D<double>(this->l_model,
this->n_category,
0.)) ;
std::vector<Matrix2D<double>> model_log_rev = model_log ;
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ // forward
model_log[i](j,k) = log(this->model(i,j,k)) ;
// reverse
model_log_rev[i](this->l_model-j-1,this->n_category-k-1)
= log(this->model(i,j,k)) ;
}
}
}
// don't parallelize
if(threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihoods_routine(0, this->n_row,
loglikelihood,
loglikelihood_max,
model_log,
model_log_rev,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&SequenceLayer::compute_loglikelihoods_routine,
this,
slice.first,
slice.second,
std::ref(loglikelihood),
std::ref(loglikelihood_max),
std::ref(model_log),
std::ref(model_log_rev),
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void SequenceLayer::compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model_log,
const std::vector<Matrix2D<double>>& model_log_rev,
std::promise<bool>& done) const
{
// compute log likelihood
for(size_t i=from; i<to; i++)
{
// set max to min possible value
loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
for(size_t j=0; j<this->n_class; j++)
- {
- for(size_t s=0; s<this->n_shift; s++)
+ { for(size_t s=0; s<this->n_shift; s++)
{ // forward strand
{ double ll_fw = std::max(score_subseq(this->data, i, s, model_log[j]),
SequenceLayer::p_min_log) ;
// loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// apply lower bound here -> log(1e-100)
loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// keep track of max per row
if(ll_fw > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_fw ; }
}
// reverse
if(this->flip)
{ double ll_rev = std::max(score_subseq(this->data, i, s, model_log_rev[j]),
SequenceLayer::p_min_log) ;
// loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// apply lower bound here -> log(1e-100)
loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// keep track of max per row
if(ll_rev > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_rev ; }
}
}
}
}
done.set_value(true) ;
}
void SequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads)
{ // don't parallelize
if(threads == nullptr)
{ std::promise<Matrix3D<double>> promise ;
std::future<Matrix3D<double>> future = promise.get_future() ;
this->update_model_routine(0,
this->n_row,
posterior_prob,
promise) ;
// this->model = future.get() ;
auto model = future.get() ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = model(i,j,k) ; }
}
}
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&SequenceLayer::update_model_routine,
this,
slice.first,
slice.second,
std::ref(posterior_prob),
std::ref(promises[i])))) ;
}
// reinitialise the model
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = 0. ; }
}
}
// wait until all threads are done working
// and update the model
for(auto& future : futures)
{ Matrix3D<double> model_part = future.get() ;
// for(size_t i=0; i<this->n_class; i++)
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) += model_part(i,j,k) ; }
}
}
}
// -------------------------- threads stop ---------------------------
}
// make sure to have no 0 values
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) =
std::max(this->model(i,j,k), SequenceLayer::p_min) ;
}
}
}
// normalize to get probs
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ double sum = 0. ;
for(size_t k=0; k<this->n_category; k++)
{ sum += this->model(i,j,k) ; }
for(size_t k=0; k<this->n_category; k++)
{ double p = this->model(i,j,k) / sum ;
this->model(i,j,k) = p ;
}
}
}
}
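// At this point each model entry is the posterior-weighted count of its base
// at its position, summed over rows and shifts; the p_min floor and the
// per-position normalisation above turn these counts into proper base
// probabilities that sum to 1 for every class and position.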
void SequenceLayer::update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& promise) const
{ // dimension checks
this->check_posterior_prob_dim(posterior_prob) ;
Matrix3D<double> model(this->n_class,
this->l_model,
this->n_category,
0.) ;
// the int code of A, C, G, T, N
static int a_code = dna::char_to_int('A') ;
static int c_code = dna::char_to_int('C') ;
static int g_code = dna::char_to_int('G') ;
static int t_code = dna::char_to_int('T') ;
static int n_code = dna::char_to_int('N') ;
// the int code of the reverse complement of A, C, G, T
static int a_code_r = dna::char_to_int('A', true) ;
static int c_code_r = dna::char_to_int('C', true) ;
static int g_code_r = dna::char_to_int('G', true) ;
static int t_code_r = dna::char_to_int('T', true) ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
- for(size_t k=0; k < n_class_to_update; k++)
+ for(size_t k=0; k<n_class_to_update; k++)
{ for(size_t s=0; s<this->n_shift; s++)
- { for(size_t j=0; j<this->l_model; j++)
+ { // for(size_t j=0; j<this->l_model; j++)
+ for(size_t j=0, j_rv=this->l_model-1; j<this->l_model; j++, j_rv--)
{ // base prob on fw and rv strand
vector_d base_prob_fw(this->n_category, 0.) ;
vector_d base_prob_rv(this->n_category, 0.) ;
for(size_t i=from; i<to; i++)
{ int base = this->data(i,s+j) ;
- int base_rev = this->n_category - base - 1 ;
+ int base_rv = this->n_category - base - 1 ; // complement
+ // int base_rv = this->n_category - this->data(i,s+j_rv) - 1 ;
+ // std::cerr << k << " "
+ // << s << " "
+ // << j << "/" << j_rv << " "
+ // << base << "/" << base_rv
+ // << std::endl ;
// N
if(base == n_code)
{ // --------------- forward ---------------
{ base_prob_fw[a_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[c_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[g_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[t_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
{ base_prob_rv[a_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[c_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[g_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[t_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
}
}
// A, C, G, T
else
{ // --------------- forward ---------------
{ base_prob_fw[base] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
- { base_prob_rv[base_rev] +=
+ { base_prob_rv[base_rv] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
}
}
}
// update this position of the model
- for(size_t i=0,i_rev=base_prob_fw.size()-1; i<base_prob_fw.size(); i++,i_rev--)
+ for(size_t i=0; i<base_prob_fw.size(); i++)
{ // --------------- forward ---------------
{ model(k,j,i) += base_prob_fw[i] ; }
// --------------- reverse ---------------
if(this->flip)
- { model(k,this->l_model-j-1,i) += base_prob_rv[i] ; }
+ { model(k,this->l_model-j-1,i) += base_prob_rv[i] ;
+ /* model(k,j_rv,i) += base_prob_rv[i] ; */
+
+ }
}
}
}
}
promise.set_value(model) ;
}
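// For the REVERSE flip state the loop accumulates the complement base
// (base_rv = n_category - base - 1) at the mirrored position l_model-j-1,
// so the reverse-strand contribution of a row is the reverse complement of
// its forward-strand contribution.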
void SequenceLayer::create_bckg_class()
{ // sequence composition
std::vector<double> base_comp =
dna::base_composition(this->data,
this->flip) ;
// set last class as background class
for(size_t i=0; i<this->n_category; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ this->model(this->n_class-1,j,i) = base_comp[i] ; }
}
}
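// The background class is thus a position-independent profile: every
// position of the last class holds the overall base composition of the
// data (the flip flag is forwarded to dna::base_composition()).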
diff --git a/src/Utility/matrices.cpp b/src/Utility/matrices.cpp
deleted file mode 100644
index 1b12201..0000000
--- a/src/Utility/matrices.cpp
+++ /dev/null
@@ -1,555 +0,0 @@
-#include <matrices.hpp>
-
-#include <iostream>
-#include <string>
-#include <vector>
-#include <iomanip>
-#include <fstream> // ifstream
-#include <sstream> // istringstream
-#include <stdexcept> // std::runtime_error()
-
-matrix2d_i read_matrix2d_i(const std::string &file_address)
-{
- matrix2d_i data ;
-
- std::ifstream file(file_address, std::ifstream::in) ;
- if(file.fail())
- { char msg[4096] ;
- sprintf(msg, "error! cannot open %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- std::string buffer_str ;
- vector_i buffer_vec ;
- int buffer_int ;
-
- // read file
- size_t n_line = 0 ;
- size_t row_len = 0 ;
-
-
- while(getline(file, buffer_str))
- { // check stream status and read content
- if(file.fail())
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "error! while reading %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- else if(buffer_str.size() == 0)
- { // this file only contains one eol char and should be considered as empty,
- // -> returns empty matrix not an error
- if(n_line == 0 and file.peek() == EOF and file.eof())
- { break ; }
-
- file.close() ;
- char msg[4096] ;
- sprintf(msg, "format error! while reading %s (empty line)", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // parse line
- buffer_vec.clear() ;
- std::istringstream buffer_ss(buffer_str) ;
- while(buffer_ss >> buffer_int)
- { buffer_vec.push_back(buffer_int) ; }
- // check for an error which likely indicates that a value could not be
- // casted into a type T (mixed data types in the file)
- if(buffer_ss.fail() and not buffer_ss.eof())
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // check that number of column is constant
- if(n_line == 0)
- { row_len = buffer_vec.size() ; }
- else if(buffer_vec.size() != row_len)
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "format error! variable number of columns in %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // update matrix content
- data.push_back(vector_i()) ;
- for(auto i : buffer_vec)
- { data.back().push_back(i) ; }
- n_line++ ;
- }
- file.close() ;
-
- return data ;
-}
-
-matrix2d_c read_matrix2d_c(const std::string &file_address)
-{
- matrix2d_c data ;
-
- std::ifstream file(file_address, std::ifstream::in) ;
- if(file.fail())
- { char msg[4096] ;
- sprintf(msg, "error! cannot open %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- std::string buffer_str ;
- vector_c buffer_vec ;
- char buffer_char ;
-
- // read file
- size_t n_line = 0 ;
- size_t row_len = 0 ;
-
-
- while(getline(file, buffer_str))
- { // check stream status and read content
- if(file.fail())
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "error! while reading %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- else if(buffer_str.size() == 0)
- { // this file only contains one eol char and should be considered as empty,
- // -> returns empty matrix not an error
- if(n_line == 0 and file.peek() == EOF and file.eof())
- { break ; }
-
- file.close() ;
- char msg[4096] ;
- sprintf(msg, "format error! while reading %s (empty line)", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // parse line
- buffer_vec.clear() ;
- std::istringstream buffer_ss(buffer_str) ;
- while(buffer_ss >> buffer_char)
- { buffer_vec.push_back(buffer_char) ; }
- // check for an error which likely indicates that a value could not be
- // casted into a type T (mixed data types in the file)
- if(buffer_ss.fail() and not buffer_ss.eof())
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // check that number of column is constant
- if(n_line == 0)
- { row_len = buffer_vec.size() ; }
- else if(buffer_vec.size() != row_len)
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "format error! variable number of columns in %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // update matrix content
- data.push_back(vector_c()) ;
- for(auto i : buffer_vec)
- { data.back().push_back(i) ; }
- n_line++ ;
- }
- file.close() ;
-
- return data ;
-}
-
-matrix4d_d read_matrix4d_d(const std::string &file_address)
-{
-
- std::ifstream file(file_address, std::ifstream::in) ;
- if(file.fail())
- { char msg[4096] ;
- sprintf(msg, "error! cannot open %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- // the data
- std::vector<size_t> data_dim(4,0) ;
- vector_d data ;
-
- // some buffers
- std::string buffer_str ;
- vector_d buffer_d ;
- std::vector<size_t> dim ;
-
- // read 1st line
- getline(file, buffer_str) ;
-
- // empty line
- if(buffer_str.size() == 0)
- { // this file only contains one eol char and should be considered as empty,
- // -> returns empty matrix not an error
- if(file.peek() == EOF and file.eof())
- { file.close() ;
- return matrix4d_d() ;
- }
- file.close() ;
- char msg[4096] ;
- sprintf(msg, "error! while reading %s (empty line)", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- if(file.fail())
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "error! while reading %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- bool found_4d_header = is_header_4d(buffer_str) ;
- do
- { if(file.fail())
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "error! while reading %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // check empty line
- if(buffer_str.size() == 0)
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "error! while reading %s (empty line)", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- // this is the beginning of a 3D slice -> get it using routine
- if(found_4d_header)
- { try
- { // get slice
- buffer_d.clear() ;
- dim.clear() ;
- found_4d_header = get_3d_slice(file_address, file, buffer_d, dim);
- // update data
- for(const auto& i : buffer_d)
- { data.push_back(i) ; }
- // update dim only for the 1st slice (the 1st slice set the dimensions)
- if(data_dim[3] == 0)
- { data_dim[0] = dim[0] ;
- data_dim[1] = dim[1] ;
- data_dim[2] = dim[2] ;
- }
- // check dimensions of the slice
- else
- { if(dim[0] != data_dim[0] or
- dim[1] != data_dim[1] or
- dim[2] != data_dim[2])
- { char msg[4096] ;
- sprintf(msg, "format error! slice have variable dimensions in %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- }
- data_dim[3]++ ;
- }
- catch(std::runtime_error& e)
- { file.close() ;
- throw e ;
- }
- }
- // this is an error, everything between two ',,,N' header
- // should be read at once. The only way out of the loop
- // is that no more header has been read because of eof
- else if(not found_4d_header and not file.eof())
- { file.close() ;
- char msg[4096] ;
- sprintf(msg, "error! while reading %s", file_address.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- } while(found_4d_header) ;
- file.close() ;
-
- // reformat data
- matrix4d_d matrix(data_dim[1],
- matrix3d_d(data_dim[0],
- matrix2d_d(data_dim[2],
- vector_d(data_dim[3])))) ;
- size_t n=0 ;
- for(size_t l=0; l<matrix[0][0][0].size(); l++)
- { for(size_t k=0; k<matrix[0][0].size(); k++)
- { for(size_t i=0; i<matrix.size(); i++)
- { for(size_t j=0; j<matrix[0].size(); j++)
- { matrix[i][j][k][l] = data[n] ;
- n++ ;
- }
- }
- }
- }
-
- return matrix ;
-}
-
-bool is_header_3d(const std::string &str)
-{ if(str[0] == ',' and
- str[1] == ',' and
- str.find(',', 2) == std::string::npos)
- { return true ; }
- return false ;
-}
-
-bool is_header_4d(const std::string &str)
-{ if(str[0] == ',' and
- str[1] == ',' and
- str[2] == ',' and
- str.find(',', 3) == std::string::npos)
- { return true ; }
- return false ;
-}
-
-bool get_3d_slice(const std::string& file_name, std::ifstream& file,
- vector_d& data, std::vector<size_t>& dim)
-{
- bool found_4d_header = false ; // the flag to return
-
- dim = {0,0,0} ;
-
- std::string buffer_str ;
- vector_d buffer_vec_d ;
- double buffer_d ;
-
- size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read
- size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values
- size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col
-
- while(getline(file, buffer_str))
- { if(file.fail())
- { char msg[4096] ;
- sprintf(msg, "error! while reading %s", file_name.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // check empty line
- if(buffer_str.size() == 0)
- { char msg[4096] ;
- sprintf(msg, "error! while reading %s (empty line)", file_name.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- // check whether this is the beginning of a 4D slice header, if so
- // break
- if(is_header_4d(buffer_str))
- { found_4d_header = true ;
- break ;
- }
- // check whether it is the beginning of a slice
- // 1st line in file should be
- if(is_header_3d(buffer_str))
- { // check that slice have a constant number of rows
- if(dim[2] == 1)
- { col_len = col_len_cur ;
- // dim[0] = row_len ;
- // dim[1] = col_len ;
- }
- else if(col_len_cur != col_len)
- { char msg[4096] ;
- sprintf(msg, "format error! slice have variable dimensions in %s", file_name.c_str()) ;
- throw std::runtime_error(msg) ;
- }
- dim[2]++ ;
- col_len_cur = 0 ;
- n_line++ ;
- continue ;
- }
- // 1st line in file should be a header and entering
- // this block is forbidden
- if(n_line == 0)
- { char msg[4096] ;
- sprintf(msg, "format error! first line is not a slice header in %s", file_name.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- // parse line
- row_len_cur = 0 ;
- buffer_vec_d.clear() ;
- std::istringstream buffer_ss(buffer_str) ;
- while(buffer_ss >> buffer_d)
- { buffer_vec_d.push_back(buffer_d) ;
- row_len_cur++ ;
- }
- // check for an error which likely indicates that a value could not be
- // casted into a type T (mixed data types in the file)
- if(buffer_ss.fail() and not buffer_ss.eof())
- { char msg[4096] ;
- sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_name.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- // check that number of column is constant
- if(n_line_data == 0)
- { row_len = row_len_cur ; }
- else if(row_len_cur != row_len)
- { char msg[4096] ;
- sprintf(msg, "format error! slice have variable dimensions in %s", file_name.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- // update matrix content
- for(auto i : buffer_vec_d)
- { data.push_back(i) ; }
- col_len_cur++ ;
- n_line_data++ ;
- n_line++ ;
- // update dimension
- dim[0] = row_len_cur ;
- dim[1] = col_len_cur ;
- }
-
- // check dimensions of last slice
- if(col_len_cur != dim[1])
- { char msg[4096] ;
- sprintf(msg, "format error! slice have variable dimensions 333 in %s", file_name.c_str()) ;
- throw std::runtime_error(msg) ;
- }
-
- return found_4d_header ;
-}
-
-std::ostream& operator << (std::ostream& stream, const vector_d& vector)
-{ // set streams
- stream.setf(std::ios::left) ;
- // display parameters
- char sep = ' ' ;
- size_t width = 8 ;
-
- for(const auto& x : vector)
- { stream << std::setw(width) << x << sep ; }
- return stream ;
-
-}
-
-std::ostream& operator << (std::ostream& stream, const matrix2d_i& matrix)
-{
- // set streams
- stream.setf(std::ios::left) ;
- // display parameters
- char sep = ' ' ;
- size_t width = 8 ;
- // other
- size_t n = 0 ;
- size_t n_tot = matrix.size()*matrix[0].size() ;
-
-
- for(const auto& row : matrix)
- { for(const auto x : row)
- { stream << std::setw(width) << x << sep ;
- n++ ;
- }
- if(n<n_tot)
- { stream << std::endl ; }
- }
- return stream ;
-}
-
-std::ostream& operator << (std::ostream& stream, const matrix2d_c& matrix)
-{
- // set streams
- stream.setf(std::ios::left) ;
- // display parameters
- char sep = ' ' ;
- size_t width = 8 ;
- // other
- size_t n = 0 ;
- size_t n_tot = matrix.size()*matrix[0].size() ;
-
-
- for(const auto& row : matrix)
- { for(const auto x : row)
- { stream << std::setw(width) << x << sep ;
- n++ ;
- }
- if(n<n_tot)
- { stream << std::endl ; }
- }
- return stream ;
-}
-
-std::ostream& operator << (std::ostream& stream, const matrix2d_d& matrix)
-{
- // set streams
- stream.setf(std::ios::left) ;
- // display parameters
- char sep = ' ' ;
- size_t width = 8 ;
- // other
- size_t n = 0 ;
- size_t n_tot = matrix.size()*matrix[0].size() ;
-
-
- for(const auto& row : matrix)
- { for(const auto x : row)
- { stream << std::setw(width) << x << sep ;
- n++ ;
- }
- if(n<n_tot)
- { stream << std::endl ; }
- }
- return stream ;
-}
-
-std::ostream& operator << (std::ostream& stream, const matrix3d_d& matrix)
-{
- size_t x_dim = matrix.size() ;
- size_t y_dim = matrix[0].size() ;
- size_t z_dim = matrix[0][0].size() ;
-
- // display parameters
- char sep = ' ' ;
- size_t width = 8 ;
- size_t precision = 4 ;
- // set streams
- stream.setf(std::ios::left) ;
- stream << std::setprecision(precision) << std::fixed ;
- // other
- size_t n = 0 ;
- size_t n_tot = x_dim*y_dim*z_dim ;
-
- // if the matrix has at least one 0 dimension (no data), don't do anything
- if(x_dim==0 or y_dim==0 or z_dim==0)
- { return stream ; }
-
- stream.setf(std::ios::left) ;
- stream << std::setprecision(precision) << std::fixed ;
-
- for(size_t z=0; z<z_dim; z++)
- { stream << ",," << z << std::endl ;
- for(size_t x=0; x<x_dim; x++)
- { for(size_t y=0; y<y_dim; y++, n++)
- { stream << std::setw(width) << matrix[x][y][z] << sep ; }
- if(n<n_tot)
- { stream << std::endl ; }
- }
- }
- return stream ;
-}
-
-std::ostream& operator << (std::ostream& stream, const matrix4d_d& matrix)
-{
- size_t dim_0 = matrix.size() ;
- size_t dim_1 = matrix[0].size() ;
- size_t dim_2 = matrix[0][0].size() ;
- size_t dim_3 = matrix[0][0][0].size() ;
-
- // if the matrix has at least one 0 dimension (no data), don't do anything
- if(dim_0==0 or dim_1==0 or dim_2==0 or dim_3==0)
- { return stream ; }
-
- // display parameters
- char sep = ' ' ;
- size_t width = 8 ;
- size_t precision = 4 ;
- // set streams
- stream.setf(std::ios::left) ;
- stream << std::setprecision(precision) << std::fixed ;
- // other
- size_t n = 0 ;
- size_t n_tot = dim_0*dim_1*dim_2*dim_3 ;
-
- for(size_t x_3=0; x_3<dim_3; x_3++)
- { stream << ",,," << x_3 << std::endl ;
- for(size_t x_2=0; x_2<dim_2; x_2++)
- { stream << ",," << x_2 << std::endl ;
- for(size_t x_0=0; x_0<dim_0; x_0++)
- { for(size_t x_1=0; x_1<dim_1; x_1++, n++)
- { stream << std::setw(width) << matrix[x_0][x_1][x_2][x_3] << sep ; }
- // avoids terminal eol
- if(n < n_tot)
- { stream << std::endl ; }
- }
- }
- }
- return stream ;
-}
diff --git a/src/main_test.cpp b/src/main_test.cpp
index de4471a..2256a7a 100644
--- a/src/main_test.cpp
+++ b/src/main_test.cpp
@@ -1,148 +1,457 @@
#include <iostream>
#include <string>
+#include <vector>
+#include <utility>
+#include <cmath>
#include <Matrix2D.hpp>
+#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
-#include <CorrelationMatrixCreator.hpp>
-#include <SequenceMatrixCreator.hpp>
+#include <ConsensusSequenceModelComputer.hpp>
+#include <EMConsensusSequence.hpp>
+#include <SequenceModelComputer.hpp>
+#include <EMSequence.hpp>
#include <dna_utility.hpp>
-#include <EMJoint.hpp>
-using namespace std ;
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
+
Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
const Matrix4D<double>& prob,
size_t class_k)
{
size_t n_seq = seq.get_nrow() ;
size_t l_seq = seq.get_ncol() ;
// size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
bool flip = n_flip == 2 ;
size_t l_model = l_seq - n_shift + 1 ;
Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;
// aggregate
for(size_t i=0; i<n_seq; i++)
{ std::cerr << "i " << i << std::endl ;
for(size_t s=0; s<n_shift; s++)
{ //std::cerr << "s " << s << std::endl ;
double tot = 0. ;
// aggregate
for(size_t j=0; j<l_model; j++)
{ // std::cerr << "j " << j << std::endl ;
// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
int base = seq(i,s+j) ;
// N should contribute equally to A,C,G,T -> same as doing nothing
if(base == dna::char_to_int('N'))
{ continue ; }
int base_rv = 4 - base - 1 ;
// --------- forward ---------
// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
cons_seq(i,j,base) += prob(i,class_k,s,0) ;
tot += prob(i,class_k,s,0) ;
// --------- reverse ---------
if(flip)
{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ;
tot += prob(i,class_k,s,1) ;
}
}
}
}
// normalize
for(size_t i=0; i<n_seq; i++)
{ for(size_t j=0; j<l_model; j++)
{ double tot = 0. ;
for(size_t k=0; k<4; k++)
{ tot += cons_seq(i,j,k) ; }
for(size_t k=0; k<4; k++)
{ cons_seq(i,j,k) /= tot ; }
}
}
return cons_seq ;
}
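// The result is one probability profile per input sequence: for each row,
// the posterior probabilities of class `class_k` are distributed over the
// bases observed at every shift (complemented for the flipped state) and
// each position is then normalised so its four base probabilities sum to 1.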
+Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
+{
+ size_t n_row = seq.get_nrow() ;
+ size_t n_col = seq.get_ncol() ;
-int main()
+ Matrix2D<int> iseq(n_row, n_col, 0.) ;
+
+ for(size_t i=0; i<n_row; i++)
+ { for(size_t j=0; j<n_col; j++)
+ { iseq(i,j) = dna::char_to_int(seq(i,j)) ; }
+ }
+ return iseq ;
+}
+
+Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
{
- // sequences
- Matrix2D<int> seq(4, 7, 0) ;
- seq(0,0) = 0 ; seq(0,1) = 3 ; seq(0,2) = 3 ; seq(0,3) = 3 ; seq(0,4) = 1 ; seq(0,5) = 1 ; seq(0,6) = 1 ;
- seq(1,0) = 1 ; seq(1,1) = 0 ; seq(1,2) = 3 ; seq(1,3) = 3 ; seq(1,4) = 3 ; seq(1,5) = 1 ; seq(1,6) = 1 ;
- seq(2,0) = 1 ; seq(2,1) = 1 ; seq(2,2) = 3 ; seq(2,3) = 0 ; seq(2,4) = 0 ; seq(2,5) = 0 ; seq(2,6) = 1 ;
- seq(3,0) = 1 ; seq(3,1) = 1 ; seq(3,2) = 1 ; seq(3,3) = 3 ; seq(3,4) = 0 ; seq(3,5) = 0 ; seq(3,6) = 0 ;
-
- // probabilities
- Matrix4D<double> prob(4, 1, 4, 2) ;
- // row 1, cluster 1
- prob(0,0,0,0) = 999 ; prob(0,0,0,1) = .01 ; // shift 1, both flips
- prob(0,0,1,0) = .01 ; prob(0,0,1,1) = .01 ; // shift 2, both flips
- prob(0,0,2,0) = .01 ; prob(0,0,2,1) = .01 ; // shift 3, both flips
- prob(0,0,3,0) = .01 ; prob(0,0,3,1) = .01 ; // shift 4, both flips
- // row 2, cluster 1
- prob(1,0,0,0) = .01 ; prob(1,0,0,1) = .01 ; // shift 1, both flips
- prob(1,0,1,0) = 999 ; prob(1,0,1,1) = .01 ; // shift 2, both flips
- prob(1,0,2,0) = .01 ; prob(1,0,2,1) = .01 ; // shift 3, both flips
- prob(1,0,3,0) = .01 ; prob(1,0,3,1) = .01 ; // shift 4, both flips
- // row 3, cluster 1
- prob(2,0,0,0) = .01 ; prob(2,0,0,1) = .01 ; // shift 1, both flips
- prob(2,0,1,0) = .01 ; prob(2,0,1,1) = .01 ; // shift 2, both flips
- prob(2,0,2,0) = .01 ; prob(2,0,2,1) = 999 ; // shift 3, both flips
- prob(2,0,3,0) = .01 ; prob(2,0,3,1) = .01 ; // shift 4, both flips
- // row 4, cluster 1
- prob(3,0,0,0) = .01 ; prob(3,0,0,1) = .01 ; // shift 1, both flips
- prob(3,0,1,0) = .01 ; prob(3,0,1,1) = .01 ; // shift 2, both flips
- prob(3,0,2,0) = .01 ; prob(3,0,2,1) = .01 ; // shift 3, both flips
- prob(3,0,3,0) = .01 ; prob(3,0,3,1) = 999 ; // shift 4, both flips
- //normalize
- for(size_t i=0; i<prob.get_dim()[0]; i++)
- { double tot = 0. ;
- for(size_t j=0; j<prob.get_dim()[1]; j++)
- { for(size_t k=0; k<prob.get_dim()[2]; k++)
- { for(size_t l=0; l<prob.get_dim()[3]; l++)
- { tot += prob(i,j,k,l) ; }
+ size_t n_row = seq.get_nrow() ;
+ size_t n_col = seq.get_ncol() ;
+ size_t n_dim3 = 4 ; // A, C, G, T
+
+ int base_n = dna::char_to_int('N') ;
+
+ Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;
+
+
+    double prob = 1. - 3*(1e-100) ; // probability mass left for the observed base (row sums to 1)
+
+ for(size_t i=0; i<n_row; i++)
+ { for(size_t j=0; j<n_col; j++)
+ { int base = seq(i,j) ;
+ // N
+ if(base == base_n)
+ { cseq(i,j,0) = 0.25 ;
+ cseq(i,j,1) = 0.25 ;
+ cseq(i,j,2) = 0.25 ;
+ cseq(i,j,3) = 0.25 ;
}
+ // A C G T
+ else
+ { cseq(i,j, base) = prob ; }
+ }
+ }
+ return cseq ;
+}
+
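+// Illustrative mapping (example values): an 'A' becomes the probability
+// vector (1 - 3e-100, 1e-100, 1e-100, 1e-100) and an 'N' becomes
+// (0.25, 0.25, 0.25, 0.25), so each row of the returned Matrix3D is a
+// near-one-hot encoding of the corresponding sequence.
+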
+double score_consensus_subseq(const Matrix3D<double>& cons_seq,
+ size_t row,
+ size_t start,
+ const Matrix2D<double>& model)
+{ size_t ncol = cons_seq.get_dim()[1] ;
+ // size_t dim1 = cons_seq.get_dim()[0] ;
+ // size_t dim2 = cons_seq.get_dim()[1] ;
+ size_t dim3 = cons_seq.get_dim()[2] ;
+
+ if(start > ncol - model.get_nrow())
+ { char msg[4096] ;
+ sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
+ start, ncol - model.get_nrow()) ;
+ throw std::invalid_argument(msg) ;
+ }
+ else if(model.get_nrow() > ncol)
+ { char msg[4096] ;
+ sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
+ model.get_nrow(), ncol) ;
+ throw std::invalid_argument(msg) ;
+ }
+ else if(model.get_ncol() != 4)
+ { char msg[4096] ;
+ sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
+ model.get_ncol()) ;
+ throw std::invalid_argument(msg) ;
+ }
+ else if(dim3 != 4)
+ { char msg[4096] ;
+ sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
+ dim3) ;
+ throw std::invalid_argument(msg) ;
+ }
+
+ /*
+
+ double ll = 0 ;
+ for(size_t j_seq=from, j_mod=0; j_seq<to; j_seq++, j_mod++)
+ { for(size_t k=0; k<dim3; k++)
+ { std::cerr << ll
+ << " *= "
+ << "("
+ << model(j_mod,k)
+ << " * "
+ << cons_seq(row,j_seq,k)
+ << ")"
+ << std::endl ;
+ ll *= model(j_mod,k) * cons_seq(row,j_seq,k) ;
+ }
+ }*/
+
+ size_t from = start ;
+ size_t to = from + model.get_nrow() ; // will score [from, to)
+
+ double ll = 0. ;
+
+ for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
+ { double sum = 0. ;
+ for(size_t k=0; k<dim3; k++)
+ { std::cerr << sum
+ << "+= ("
+ << cons_seq(row, j_seq, k)
+ << " * "
+ << model(j_mod, k)
+ << ")"
+ << std::endl ;
+ sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ;
}
- for(size_t j=0; j<prob.get_dim()[1]; j++)
- { for(size_t k=0; k<prob.get_dim()[2]; k++)
- { for(size_t l=0; l<prob.get_dim()[3]; l++)
- { prob(i,j,k,l) /= tot ; }
+ std::cerr << sum << std::endl ;
+ std::cerr << "---------------" << std::endl ;
+ ll += log(sum) ;
+ }
+
+ return ll ;
+}
+
+
+Matrix2D<double> test_consensus(const std::string& file,
+ size_t n_class,
+ size_t n_iter,
+ size_t n_shift,
+ bool flip,
+ bool bckg_class,
+ const std::string& seed,
+ size_t n_thread)
+{ size_t n_flip = flip + 1 ;
+
+ Matrix2D<int> seq(file) ;
+ // Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
+ Matrix3D<double> cseq = sequence_to_consensus(seq) ;
+
+ // partition
+ EMConsensusSequence em(cseq,
+ n_class,
+ n_iter,
+ n_shift,
+ flip,
+ bckg_class,
+ seed,
+ n_thread) ;
+ em.classify() ;
+
+ // get post prob
+ Matrix4D<double> prob = em.get_post_prob() ;
+
+ std::cerr << prob << std::endl ;
+
+ // compute models
+ ConsensusSequenceModelComputer mc(std::move(cseq),
+ prob,
+ bckg_class,
+ n_thread) ;
+
+ Matrix2D<double> model = mc.get_model() ;
+
+ // compute the class prob
+ size_t n_row = prob.get_dim()[0] ;
+ std::vector<double> class_prob(n_class, 0.) ;
+ double p_tot = 0. ;
+ for(size_t i=0; i<n_row; i++)
+ { for(size_t j=0; j<n_class; j++)
+ { for(size_t k=0; k<n_shift; k++)
+ { for(size_t l=0; l<n_flip; l++)
+ { class_prob[j] += prob(i,j,k,l) ;
+ p_tot += prob(i,j,k,l) ;
+ }
}
}
}
+ for(auto& prob : class_prob)
+ { prob /= p_tot ; }
+ // create a matrix containing the class prob in the 1st
+ // column and the model in the remaining columns
+ Matrix2D<double> model_final(model.get_nrow(),
+ model.get_ncol() + 1) ;
+ size_t i_class = 0 ;
+ for(size_t i=0; i<model_final.get_nrow(); i++)
+ { if((i != 0) and (i % 4 == 0))
+ { i_class++ ; }
+ model_final(i,0) = class_prob[i_class] ;
+ }
+ // fill the remaining with the model parameters
+ for(size_t i=0; i<model.get_nrow(); i++)
+ { for(size_t j=0; j<model.get_ncol(); j++)
+ { model_final(i,j+1) = model(i,j) ; }
+ }
+ return model_final ;
+}
- /*
- size_t class_k = 0 ;
- Matrix3D<double> cons_seq = compute_consensus_sequences(seq,
- prob,
- class_k) ;
- // print
- for(size_t i=0; i<cons_seq.get_dim()[0]; i++)
- { for(size_t k=0; k<cons_seq.get_dim()[2]; k++)
- { for(size_t j=0; j<cons_seq.get_dim()[1]; j++)
- { std::cout << std::setprecision(4) << cons_seq(i,j,k) << " " ;
+Matrix2D<double> test_sequence(const std::string& file,
+ size_t n_class,
+ size_t n_iter,
+ size_t n_shift,
+ bool flip,
+ bool bckg_class,
+ const std::string& seed,
+ size_t n_thread)
+{ size_t n_flip = flip + 1 ;
+
+ // Matrix2D<int> seq(file) ;
+ Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
+
+ // partition
+ EMSequence em(seq,
+ n_class,
+ n_iter,
+ n_shift,
+ flip,
+ bckg_class,
+ seed,
+ n_thread) ;
+ em.classify() ;
+
+ // get post prob
+ Matrix4D<double> prob = em.get_post_prob() ;
+
+ std::cerr << prob << std::endl ;
+
+ // compute models
+ SequenceModelComputer mc(std::move(seq),
+ prob,
+ bckg_class,
+ n_thread) ;
+
+ Matrix2D<double> model = mc.get_model() ;
+
+ // compute the class prob
+ size_t n_row = prob.get_dim()[0] ;
+ std::vector<double> class_prob(n_class, 0.) ;
+ double p_tot = 0. ;
+ for(size_t i=0; i<n_row; i++)
+ { for(size_t j=0; j<n_class; j++)
+ { for(size_t k=0; k<n_shift; k++)
+ { for(size_t l=0; l<n_flip; l++)
+ { class_prob[j] += prob(i,j,k,l) ;
+ p_tot += prob(i,j,k,l) ;
+ }
}
- std::cout << std::endl ;
}
- std::cout << std::endl << std::endl ;
}
+ for(auto& prob : class_prob)
+ { prob /= p_tot ; }
+ // create a matrix containing the class prob in the 1st
+ // column and the model in the remaining columns
+ Matrix2D<double> model_final(model.get_nrow(),
+ model.get_ncol() + 1) ;
+ size_t i_class = 0 ;
+ for(size_t i=0; i<model_final.get_nrow(); i++)
+ { if((i != 0) and (i % 4 == 0))
+ { i_class++ ; }
+ model_final(i,0) = class_prob[i_class] ;
+ }
+ // fill the remaining with the model parameters
+ for(size_t i=0; i<model.get_nrow(); i++)
+ { for(size_t j=0; j<model.get_ncol(); j++)
+ { model_final(i,j+1) = model(i,j) ; }
+ }
+ return model_final ;
+}
+
+Matrix2D<double> test_aggregate(const std::string& file)
+{ Matrix2D<int> seq(file) ;
+ size_t nrow = seq.get_nrow() ;
+ size_t ncol = seq.get_ncol() ;
+
+ Matrix2D<double> model_final(4, ncol+1, 1e-100) ;
+
+ for(size_t i=0; i<4; i++)
+ { model_final(i,0) = 1. ; }
+
+ std::vector<double> sum(ncol+1, 0.) ;
+
+ for(size_t i=0; i<nrow; i++)
+ { for(size_t j=0; j<ncol; j++)
+ { int base = seq(i,j) ;
+ if(base == dna::char_to_int('N'))
+ { continue ; }
+ model_final(base, j+1) += 1. ;
+ sum[j+1] += 1. ;
+ }
+ }
+
+ for(size_t i=0; i<model_final.get_nrow(); i++)
+    { for(size_t j=1; j<model_final.get_ncol(); j++) // data sits in columns 1..ncol; stay in bounds
+ { model_final(i,j) /= sum[j] ; }
+ }
+
+
+ return model_final ;
+}
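+
+// In the returned matrix, column 0 holds a dummy class probability of 1 and
+// columns 1..ncol hold the per-position base frequencies of the input
+// sequences (N characters are skipped and do not contribute to the counts).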
+
+int main()
+{
+ size_t n_class = 2 ;
+ size_t n_iter = 50 ;
+ size_t n_shift = 3 ;
+ bool flip = true ;
+ bool bckg_class = false ;
+ std::string seed = "" ;
+ size_t n_thread = 1 ;
+
+ /*
+ std::string file = "/local/groux/scATAC-seq/data/"
+ "10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
+ */
+
+ // std::string file = "/local/groux/scATAC-seq/test.mat" ;
+ // std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
+ // std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
+ std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
+
+ /*
+ Matrix2D<double> model_final = test_consensus(file,
+ n_class,
+ n_iter,
+ n_shift,
+ flip,
+ bckg_class,
+ seed,
+ n_thread) ;
+ */
+
+ Matrix2D<double> model_final = test_sequence(file,
+ n_class,
+ n_iter,
+ n_shift,
+ flip,
+ bckg_class,
+ seed,
+ n_thread) ;
+
+ /*
+ Matrix2D<double> model_final = test_aggregate(file) ;
*/
- prob.save("test_prob_artificial.mat4d") ;
- std::cout << seq << std::endl ;
+ std::cout << model_final << std::endl ;
+
+
+ /*
+ double p0 = 0 ;
+ // double l0 = log(p0) ;
+
+ double p1 = 1. - (3.*p0) ;
+ // double l1 = log(p1) ;
+
+ // Matrix3D<double> cseq(1, 4, 4) ;
+ // cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
+ // cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
+ // cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
+ // cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;
+ Matrix2D<int> seq(1, 4, -1) ;
+ seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
+ Matrix3D<double> cseq = sequence_to_consensus(seq) ;
+
+ std::cout << cseq << std::endl << std::endl ;
+
+ Matrix2D<double> mod(4, 4, -1) ;
+ mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
+ mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
+ mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
+ mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;
+
+ std::cout << mod << std::endl ;
+
+ std::cout << score_consensus_subseq(cseq, 0,0,mod)
+ << std::endl ;
+ */
return EXIT_SUCCESS ;
}
