Page Menu · Home · c4science

No One · Temporary

File Metadata

Created
Thu, Oct 31, 13:30
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh
index f5a0b24..d32dffe 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh
@@ -1,52 +1,52 @@
# Classify reads around CTCF motif occurrences (p < 10e-6) in the 10x Genomics
# PBMC 5k scATAC-seq data: run EMRead per number of classes k, then convert the
# class posterior probabilities into read and sequence models with ProbToModel.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=28
+n_core=32
# open chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
# random alphanumeric seed, logged below for reproducibility
# NOTE(review): ${1:-15} takes the seed length from the script's 1st positional
# argument (default 15) -- confirm this is intended and not a leftover
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
# classify on the open-chromatin signal, then derive models on all three data types
bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
# 1nucl chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat'
file_aic=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
# same classification, this time driven by the nucleosome (fragment-center) signal
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh
index 431cde7..04c51f3 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh
@@ -1,52 +1,52 @@
# Classify reads around EBF1 motif occurrences (p < 10e-6): EMRead per class
# number k on the open-chromatin and nucleosome signals, then ProbToModel to
# turn the posteriors into read/sequence models.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=28
+n_core=32
# open chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat'
file_mod3=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
# 1nucl chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat'
file_mod3=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat'
file_aic=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh
index ec4656a..c0104d9 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh
@@ -1,53 +1,53 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/myc_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=28
+n_core=32
# open chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat'
file_mod3=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
# 1nucl chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat'
file_mod3=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat'
file_aic=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R
index 5971095..d30e6b1 100644
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R
@@ -1,95 +1,95 @@
# Plot the SP1 motif classification results: for each number of classes k,
# load the read/nucleosome/sequence models produced by the classification
# scripts and render one PNG per k with a sequence logo and the open-chromatin
# and nucleosome signals overlaid for each class.
# NOTE(review): read.read.models(), read.sequence.models() and plot.logo() are
# presumably defined in scripts/functions.R -- verify against that file.
setwd(file.path("/", "local", "groux", "scATAC-seq"))
# libraries
library(RColorBrewer)
# functions
source(file.path("scripts", "functions.R"))
# the minimum number of classes searched
k.min = 1
# the maximum number of classes searched
k.max = 10
# path to the images for the logo
path.a = file.path("res/A.png")
path.c = file.path("res/C.png")
path.g = file.path("res/G.png")
path.t = file.path("res/T.png")
################## sequence patterns around sp1 motifs ##################
for(k in k.min:k.max)
{
# open chromatin
data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1",
sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k)))
model.open = data$models
model.prob = data$prob
# release the temporary list
data = NULL
# nucleosomes
- sprintf("sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models
+ sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_1nucl_fragment_center_model.mat", k)))$models
# sequence
model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1",
sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models
# plot classes
col = brewer.pal(3, "Set1")
# X11(width=17, height=10)
png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1",
sprintf("sp1_motifs_10e-7_classification_open_bin1bp_%dclass.png", k)),
units="in", res=720, width=18, height=12)
# 5x2 panel grid, filled column by column
m = matrix(1:10, nrow=5, ncol=2, byrow=F)
layout(m)
# order from most to least probable class
ord = order(model.prob, decreasing=T)
ref.open = model.open[ord,, drop=F]
ref.nucl = model.nucl[ord,, drop=F]
ref.seq = model.seq[,,ord, drop=F]
prob = model.prob[ord]
class = c(1:nrow(ref.open))[ord]
for(i in 1:nrow(ref.open))
{ # plot logo
plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
# x-axis
x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3)
x.at = (x.lab + ncol(ref.open)) / 2
axis(1, at=x.at, labels=x.lab)
# y-axis is [0,1] for min/max signal
x.at = seq(0, 1, 0.5)
axis(2, at=x.at, labels=x.at)
# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
}
# second pass: draw a zoomed-in inset (central positions) on top of each panel
row_n = 1 # row counter
col_n = 1 # column counter
for(i in 1:nrow(ref.open))
{ # plot logo center
right = 0.5*col_n - 0.01
left = right - 0.2
bottom = 1-(row_n*(0.2))+0.05
top = bottom + 0.15
par(fig=c(left, right, bottom, top), new=T)
# central window; NOTE(review): 380:420 is hard-coded -- assumes models are
# ~800 positions wide with the motif at the center, confirm against the data
idx = 380:420
plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
# xaxis
x.at = 1:length(idx)
axis(1, at=x.at, labels=x.at)
# yaxis
x.at = seq(0, 2, by=1)
axis(2, at=x.at, labels=x.at)
row_n = row_n + 1
if(i %% 5 == 0)
{ col_n = col_n + 1
row_n = 1
}
}
dev.off()
}
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh
index d9104d0..6c12400 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh
@@ -1,52 +1,52 @@
# Classify reads around SP1 motif occurrences (p < 10e-7): EMRead per class
# number k on the open-chromatin and nucleosome signals, then ProbToModel to
# turn the posteriors into read/sequence models.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=28
+n_core=32
# open chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
# 1nucl chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat'
file_aic=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh
index 247a840..013fa9f 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh
@@ -1,36 +1,36 @@
# Joint classification (reads + sequence) around CTCF motifs (p < 10e-6):
# EMJoint per class number k, then ProbToModel to derive per-class models.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=24
+n_core=28
# open chromatin and nucleosomes
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
# NOTE(review): EMJoint only receives the open-chromatin reads and the
# sequences; the 1nucl matrix is used below for model derivation only
bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh
index 2204ba9..e938ca5 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh
@@ -1,36 +1,36 @@
# Joint classification (reads + sequence) around EBF1 motifs (p < 10e-6):
# EMJoint per class number k, then ProbToModel to derive per-class models.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=24
+n_core=28
# open chromatin and nucleosomes
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'ebf1_motifs_10e-6_sequences_'$k'class_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh
index 6ea2fcf..345c0cd 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh
@@ -1,36 +1,36 @@
# Joint classification (reads + sequence) around MYC motifs (p < 10e-6):
# EMJoint per class number k, then ProbToModel to derive per-class models.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/myc_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=12
+n_core=28
# open chromatin and nucleosomes
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'myc_motifs_10e-6_sequences_'$k'class_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh
index 5f561db..ccd2282 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh
@@ -1,35 +1,35 @@
# Joint classification (reads + sequence) around SP1 motifs (p < 10e-7):
# EMJoint per class number k, then ProbToModel to derive per-class models.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=12
+n_core=28
# open chromatin and nucleosomes
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh
index 61a8ded..bcaa4f4 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh
@@ -1,38 +1,38 @@
# Sequence-only classification around CTCF motifs (p < 10e-6): EMSequence per
# class number k (shift 21), then ProbToModel to derive read/sequence models
# on all four data matrices from the sequence-based posteriors.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_3'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=24
+n_core=28
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
index 54f1c72..ca5821c 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
@@ -1,38 +1,38 @@
# Sequence-only classification around SP1 motifs (p < 10e-7): EMSequence per
# class number k (shift 21), then ProbToModel to derive read/sequence models
# on all four data matrices from the sequence-based posteriors.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_3'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=24
+n_core=28
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh
index a34f0ec..f9a59cf 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh
@@ -1,38 +1,38 @@
# Sequence-only classification around CTCF motifs (p < 10e-6), no shifting
# (n_shift=1, unlike classification_3): EMSequence per class number k, then
# ProbToModel on all four data matrices.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_4'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='1'
-n_core=24
+n_core=28
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh
index cf0d6b3..82b61b4 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh
@@ -1,38 +1,38 @@
# Sequence-only classification around SP1 motifs (p < 10e-7), no shifting
# (n_shift=1, unlike classification_3): EMSequence per class number k, then
# ProbToModel on all four data matrices.
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_4'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='1'
-n_core=24
+n_core=28
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
# NOTE(review): ${1:-15} uses the script's 1st positional argument as the seed
# length (default 15) -- confirm this is intended
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat'
# NOTE(review): file_aic is assigned but never used in this script
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
diff --git a/src/Clustering/ConsensusSequenceLayer.cpp b/src/Clustering/ConsensusSequenceLayer.cpp
index 5eab0b1..05c7e1e 100644
--- a/src/Clustering/ConsensusSequenceLayer.cpp
+++ b/src/Clustering/ConsensusSequenceLayer.cpp
@@ -1,377 +1,371 @@
#include <ConsensusSequenceLayer.hpp>
#include <vector>
#include <string> // std::string, std::to_string()
#include <future> // std::promise, std::future
#include <stdexcept> // std::invalid_argument
#include <cmath> // log()
#include <functional> // std::bind(), std::ref()
#include <ThreadPool.hpp>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp> // dna::base_composition()
/*
 * Computes the log-likelihood of the sub-sequence of <cons_seq> stored in
 * row <row> and starting at offset <start>, under the given probability
 * <model>. The sub-sequence length is the model length. At each position,
 * the likelihood is the dot product of the consensus base probabilities
 * and the model base probabilities ; the logs of these per-position values
 * are summed up.
 * Throws std::invalid_argument if the model is longer than the sequences,
 * if <start> leaves less than one model length of sequence, or if the
 * model 2nd dimension or the data 3rd dimension is not 4 (A,C,G,T).
 */
double ConsensusSequenceLayer::score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t start,
const Matrix2D<double>& model)
{ size_t ncol = cons_seq.get_dim()[1] ;
size_t dim3 = cons_seq.get_dim()[2] ;
// check the model length first : <ncol - model.get_nrow()> below would
// underflow (size_t arithmetic) if the model were longer than the
// sequences, making the offset check meaningless
if(model.get_nrow() > ncol)
{ char msg[4096] ;
// snprintf() rather than sprintf() : cannot overflow <msg>
snprintf(msg, sizeof(msg),
"Error! given model is longer than sequences (%zu / %zu)",
model.get_nrow(), ncol) ;
throw std::invalid_argument(msg) ;
}
else if(start > ncol - model.get_nrow())
{ char msg[4096] ;
snprintf(msg, sizeof(msg),
"Error! given start (%zu) is too high. Max value is %zu",
start, ncol - model.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_ncol() != 4)
{ char msg[4096] ;
snprintf(msg, sizeof(msg),
"Error! given model 2nd dimension is not 4 (%zu)",
model.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(dim3 != 4)
{ char msg[4096] ;
snprintf(msg, sizeof(msg),
"Error! given data 3rd dimension is not 4 (%zu)",
dim3) ;
throw std::invalid_argument(msg) ;
}
// will score [start, to)
size_t to = start + model.get_nrow() ;
double ll = 0 ;
for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ double sum = 0. ;
for(size_t k=0; k<dim3; k++)
{ sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ; }
ll += log(sum) ;
}
return ll ;
}
/*
 * Constructs the layer from a copy of the given data.
 * The base class stores the data ; here only the sequence specific
 * fields are set : the number of categories is 4 (A,C,G,T) and the
 * model is allocated filled with 0.
 */
ConsensusSequenceLayer::ConsensusSequenceLayer(const Matrix3D<double>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data3DLayer(data, n_class, n_shift, flip,bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// if required, turn the last class into a constant background class
// set to the data base composition
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
/*
 * Constructs the layer by moving the given data into the base class
 * (no copy). Otherwise identical to the copy constructor : the number
 * of categories is 4 (A,C,G,T) and the model is allocated filled
 * with 0.
 */
ConsensusSequenceLayer::ConsensusSequenceLayer(Matrix3D<double>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data3DLayer(std::move(data), n_class, n_shift, flip, bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// if required, turn the last class into a constant background class
// set to the data base composition
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
/*!
 * \brief Destructor. Nothing to release explicitly, the members clean
 * up after themselves.
 */
ConsensusSequenceLayer::~ConsensusSequenceLayer() = default ;
/*
 * Computes, for every row of the data, the log-likelihood of each
 * (class, shift, flip) state given the current class models, and stores
 * in <loglikelihood_max> the maximum value found per row.
 * The work is either done on the calling thread (threads == nullptr) or
 * split into row slices distributed over the thread pool.
 */
void ConsensusSequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads) const
{
// dimension checks
this->check_loglikelihood_dim(loglikelihood) ;
this->check_loglikelihood_max_dim(loglikelihood_max) ;
// compute the prob model and the prob reverse-complement model
// model[i] holds the probability matrix of class i ; model_rev[i] is its
// reverse complement : position j maps to l_model-j-1 and base k maps to
// n_category-k-1 (A<->T, C<->G)
std::vector<Matrix2D<double>> model(this->n_class,
Matrix2D<double>(this->l_model,
this->n_category,
0.)) ;
std::vector<Matrix2D<double>> model_rev = model ;
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ // forward
model[i](j,k) = this->model(i,j,k) ;
// reverse
model_rev[i](this->l_model-j-1,this->n_category-k-1)
= this->model(i,j,k) ;
}
}
}
// don't parallelize
if(threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihoods_routine(0, this->n_dim1,
loglikelihood,
loglikelihood_max,
model,
model_rev,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_dim1, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ConsensusSequenceLayer::compute_loglikelihoods_routine,
this,
slice.first,
slice.second,
std::ref(loglikelihood),
std::ref(loglikelihood_max),
std::ref(model),
std::ref(model_rev),
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
/*
 * Computes the log-likelihoods of rows [from,to) for every class, shift
 * and flip state, and the per-row maximum. Fills <done> with true when
 * finished so the caller can synchronize.
 * NOTE(review) : per this patch the raw score is stored unclamped here ;
 * the p_min_log lower bound is now applied later, in
 * EMConsensusSequence::compute_loglikelihood_routine() (see the
 * corresponding hunk of this diff) — confirm no other caller relied on
 * the clamping happening here.
 */
void ConsensusSequenceLayer::compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model,
const std::vector<Matrix2D<double>>& model_rev,
std::promise<bool>& done) const
{
// compute log likelihood
for(size_t i=from; i<to; i++)
{
// set max to min possible value
loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
for(size_t j=0; j<this->n_class; j++)
{
for(size_t s=0; s<this->n_shift; s++)
{ // forward strand
- { double ll_fw = std::max(score_consensus_subseq(this->data, i, s, model[j]),
- ConsensusSequenceLayer::p_min_log);
- // loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
- // apply lower bound here -> log(1e-100)
+ { double ll_fw = score_consensus_subseq(this->data, i, s, model[j]) ;
loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// keep track of max per row
if(ll_fw > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_fw ; }
}
// reverse
if(this->flip)
- { double ll_rev = std::max(score_consensus_subseq(this->data, i, s, model_rev[j]),
- ConsensusSequenceLayer::p_min_log) ;
- // loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
- // apply lower bound here -> log(1e-100)
+ { double ll_rev = score_consensus_subseq(this->data, i, s, model_rev[j]) ;
loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// keep track of max per row
if(ll_rev > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_rev ; }
}
}
}
}
done.set_value(true) ;
}
/*
 * Updates the class models from the given posterior probabilities
 * (M-step). The computation is either done on the calling thread or
 * split into row slices over the thread pool ; each slice produces a
 * partial (unnormalized) model and the partial models are summed up.
 * If a background class is used, the last class is never updated
 * (bool bckg_class converts to 0/1 in <n_class_to_update>).
 * Finally every value is bounded below by ConsensusSequenceLayer::p_min
 * (no 0 allowed in the model) and each (class,position) row is
 * normalized into probabilities.
 */
void ConsensusSequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads)
{ // don't parallelize
if(threads == nullptr)
{ std::promise<Matrix3D<double>> promise ;
std::future<Matrix3D<double>> future = promise.get_future() ;
this->update_model_routine(0,
this->n_dim1,
posterior_prob,
promise) ;
// copy the computed model, skipping the background class if any
auto model = future.get() ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = model(i,j,k) ; }
}
}
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_dim1, n_threads) ;
// get promises and futures
// each thread will fill its promise with the partial model computed
// on its own slice of rows
std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ConsensusSequenceLayer::update_model_routine,
this,
slice.first,
slice.second,
std::ref(posterior_prob),
std::ref(promises[i])))) ;
}
// reinitialise the model
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = 0. ; }
}
}
// wait until all threads are done working
// and update the model by summing the partial models
for(auto& future : futures)
{ Matrix3D<double> model_part = future.get() ;
// only the non-background classes are updated
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) += model_part(i,j,k) ; }
}
}
}
// -------------------------- threads stop ---------------------------
}
// make sure to have no 0 values
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) =
std::max(this->model(i,j,k), ConsensusSequenceLayer::p_min) ;
}
}
}
// normalize to get probs
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ double sum = 0. ;
for(size_t k=0; k<this->n_category; k++)
{ sum += this->model(i,j,k) ; }
for(size_t k=0; k<this->n_category; k++)
{ double p = this->model(i,j,k) / sum ;
this->model(i,j,k) = p ;
}
}
}
}
/*
 * Computes, over the rows [from,to) of the data, the (unnormalized)
 * contribution of this slice to the new model and returns it through the
 * promise. For each class, shift state and model position, the per-base
 * probabilities of the data are accumulated, weighted by the posterior
 * probability of the corresponding (row,class,shift,flip) state. On the
 * reverse strand the base is complemented (base_rv counts down from 3)
 * and the position is mirrored (l_model-j-1).
 * The background class, if any, is left to 0 (it is never updated).
 */
void ConsensusSequenceLayer::update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& promise) const
{ // dimension checks
this->check_posterior_prob_dim(posterior_prob) ;
Matrix3D<double> model(this->n_class,
this->l_model,
this->n_category,
0.) ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t k=0; k < n_class_to_update; k++)
{ for(size_t s=0; s<this->n_shift; s++)
{ for(size_t j=0; j<this->l_model; j++)
{ // base prob on fw and rv strand
vector_d base_prob_fw(this->n_category, 0.) ;
vector_d base_prob_rv(this->n_category, 0.) ;
for(size_t i=from; i<to; i++)
{ for(size_t base=0, base_rv=3; base<this->n_dim3; base++, base_rv--)
{ // --------------- forward ---------------
{ base_prob_fw[base] +=
this->data(i,s+j,base) *
posterior_prob(i,k,s,ConsensusSequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
{
base_prob_rv[base_rv] +=
this->data(i,s+j,base) *
posterior_prob(i,k,s,ConsensusSequenceLayer::REVERSE) ;
}
}
}
// update this position of the model
for(size_t i=0,i_rev=base_prob_fw.size()-1; i<base_prob_fw.size(); i++,i_rev--)
{ // --------------- forward ---------------
{ model(k,j,i) += base_prob_fw[i] ; }
// --------------- reverse ---------------
if(this->flip)
{ model(k,this->l_model-j-1,i) += base_prob_rv[i] ; }
}
}
}
}
promise.set_value(model) ;
}
void ConsensusSequenceLayer::create_bckg_class()
{ // sequence composition
std::vector<double> base_comp =
dna::base_composition(this->data,
this->flip) ;
// set last class as background class
for(size_t i=0; i<this->n_category; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ this->model(this->n_class-1,j,i) = base_comp[i] ; }
}
}
diff --git a/src/Clustering/ConsensusSequenceLayer.hpp b/src/Clustering/ConsensusSequenceLayer.hpp
index 5eb71e8..c0a18b8 100644
--- a/src/Clustering/ConsensusSequenceLayer.hpp
+++ b/src/Clustering/ConsensusSequenceLayer.hpp
@@ -1,196 +1,202 @@
#ifndef CONSENSUSSEQUENCELAYER_HPP
#define CONSENSUSSEQUENCELAYER_HPP
#include <Data3DLayer.hpp>
#include <vector>
#include <future> // std::promise
#include <ThreadPool.hpp>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
typedef std::vector<double> vector_d ;
class ConsensusSequenceLayer : public Data3DLayer
{
public:
/*!
* \brief Computes the log-likelihood of the sub-
* sequence - stored in a given row (1st dimension) -
* and starting at the offset <col> (2nd dimension) in
* the given sequence matrix.
* The subsequence length is determined by the model
* length.
* \param cons_seq a probability matrix containing
* the consensus sequence. Its dimensions are :
* 1st the different sequences
* 2nd the sequences length
* 3rd 4 for A,C,G,T.
* \param row the row containing the sequence of
* interest.
* \param col the index at which the sub-sequence
* is starting (1st index inside the subsequence
* of interest).
* \param model a model containing the probability
* model.
* \return the log-likelihood of the sub-sequence
* given the model.
* \throw std::invalid_argument if 1) the offset is
* invalid, 2) the sequence and the model have
* incompatible dimensions or 3) the model 2nd dimension
* is not 4 (A,C,G,T).
*/
static double score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t col,
const Matrix2D<double>& model) ;
public:
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
*/
ConsensusSequenceLayer(const Matrix3D<double>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Constructs an object with the
* given data (moved in, no copy) and an
* empty (0 values) model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
*/
ConsensusSequenceLayer(Matrix3D<double>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Destructor.
*/
virtual ~ConsensusSequenceLayer() ;
/*!
* \brief Computes the log likelihood of the data
* given the current model parameters.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of row
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of loglikelihood.
* Its length should be equal to the data row number.
* \throw std::invalid_argument if the dimensions are
* incorrect.
*/
virtual void compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads=nullptr) const override ;
/*!
* \brief Updates the model given the posterior
* probabilities (the probabilities of each row
* in the data to be assigned to each class,
* for each shift and flip state).
* 0 values inside the model are forbidden.
* The lowest possible value inside the model
* is ConsensusSequenceLayer::p_min.
* \param posterior_prob the data assignment probabilities to
* the different classes.
*/
virtual void update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads=nullptr) override ;
protected:
/*!
* \brief The routine that effectively performs the
* loglikelihood computations.
* \param from the index of the first row of the data
* to considered.
* \param to the index of the past last row of the data
* to considered.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of row
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of log_likelihood.
* Its length should be equal to the data row number.
* \param model_log a vector containing the matrices with
* the values of the model for each class.
* NOTE(review) : despite the name, the implementation
* (compute_loglikelihoods()) passes plain probability
* matrices, not log-transformed ones — confirm and
* consider renaming.
* \param model_log_rev a vector containing the matrices with
* the values of the reverse strand model for each class
* (the 1st position in the model becomes the last in the
* reverse model and probabilities are swapped A<->T and C<->G).
* \param done a promise to be filled when the routine
* is done running.
*/
void compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model_log,
const std::vector<Matrix2D<double>>& model_log_rev,
std::promise<bool>& done) const ;
/*!
* \brief The routine that effectively update the model.
* 0 values inside the model are forbidden.
* The lowest possible value inside the model
* is ConsensusSequenceLayer::p_min.
* \param from the index of the first row of the
* posterior probabilities to considered.
* \param to the index of the past last row of the
* posterior probabilities to considered.
* \param posterior_prob the data assignment probabilities
* to the different classes.
* \param done a promise containing the partial model
* computed from the given data slice. If several routines
* work together at updating the model, they need to be
* summed and normalized (by the column sum) to get the
* final model.
*/
void update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& done) const ;
/*!
* \brief Sets the last class as background class with the
* mean base probabilities of the data at each position.
*/
void create_bckg_class() ;
protected:
} ;
#endif // CONSENSUSSEQUENCELAYER_HPP
diff --git a/src/Clustering/DataLayer.cpp b/src/Clustering/DataLayer.cpp
index a1d8de4..6ab5ff5 100644
--- a/src/Clustering/DataLayer.cpp
+++ b/src/Clustering/DataLayer.cpp
@@ -1,60 +1,60 @@
#include <DataLayer.hpp>
#include <stdexcept> // std::invalid_argument
#include <cmath> // log()
#include <utility> // std::move()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ThreadPool.hpp>
/*!
 * \brief Default constructor. Leaves every field to its default value ;
 * the model is not allocated.
 */
DataLayer::DataLayer() = default ;
/*
 * Constructs a layer from its dimensions only.
 * The model matrix is NOT allocated here : the number of categories
 * depends on the concrete subclass, which is responsible for creating
 * the model.
 */
DataLayer::DataLayer(size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: flip(flip),
n_class(n_class),
n_shift(n_shift),
n_flip(flip + 1),
bckg_class(bckg_class)
{ // models cannot be initialised here
// as the number of categories depends
// on the exact (sub)class
}
/*
 * Constructs a layer from a copy of the given model ; the model
 * dimensions (number of classes, model length, number of categories)
 * are read back from this->model.
 * NOTE(review) : this relies on <model> being declared before the
 * dimension fields in the class declaration (members are initialised
 * in declaration order) — confirm against DataLayer.hpp.
 */
DataLayer::DataLayer(const Matrix3D<double>& model,
bool flip,
bool bckg_class)
: model(model),
flip(flip),
n_class(this->model.get_dim()[0]),
l_model(this->model.get_dim()[1]),
n_category(this->model.get_dim()[2]),
n_flip(flip + 1),
bckg_class(bckg_class)
{ ; }
/*
 * Constructs a layer by taking ownership of the given model ; the model
 * dimensions are read back from this->model afterwards (members are
 * initialised in declaration order, <model> first).
 * std::move() is required here : inside the constructor the named
 * rvalue-reference parameter <model> is an lvalue, so <model(model)>
 * would silently invoke the Matrix3D copy constructor instead of the
 * move constructor.
 */
DataLayer::DataLayer(Matrix3D<double>&& model,
bool flip,
bool bckg_class)
: model(std::move(model)),
flip(flip),
n_class(this->model.get_dim()[0]),
l_model(this->model.get_dim()[1]),
n_category(this->model.get_dim()[2]),
n_flip(flip + 1),
bckg_class(bckg_class)
{ ; }
/*!
 * \brief Destructor. The members release their own resources.
 */
DataLayer::~DataLayer() = default ;
// returns a copy of the model matrix
Matrix3D<double> DataLayer::get_model() const
{ return this->model ; }
// lower bound for any probability stored in the models ; raised from
// 1e-100 to 1e-300 by this patch (still above the smallest normal
// double, so log(p_min) stays finite)
-const double DataLayer::p_min = 1e-100 ;
+const double DataLayer::p_min = 1e-300 ;
const double DataLayer::p_min_log = log(DataLayer::p_min) ;
diff --git a/src/Clustering/EMConsensusSequence.cpp b/src/Clustering/EMConsensusSequence.cpp
index 9cac9ee..1dd3343 100644
--- a/src/Clustering/EMConsensusSequence.cpp
+++ b/src/Clustering/EMConsensusSequence.cpp
@@ -1,293 +1,294 @@
#include <EMConsensusSequence.hpp>
#include <string>
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <ConsensusSequenceLayer.hpp> // SequenceLayer
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
#include <ThreadPool.hpp> // ThreadPool
#include <dna_utility.hpp> // dna::base_composition()
/*
 * Constructs an EM engine working on a copy of the given consensus
 * sequence matrix. The posterior probabilities are initialised randomly
 * from <seed> and the layer model is then initialised from them.
 */
EMConsensusSequence::EMConsensusSequence(const Matrix3D<double>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_dim()[0],
seq_matrix.get_dim()[1],
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
cseq_layer(nullptr)
{
// initialise post prob randomly
// (<loglikelihood_max> is fully initialised in the member init list
// above ; the redundant reassignment was removed)
this->set_post_prob_random(seed) ;
// data and models
this->cseq_layer = new ConsensusSequenceLayer(seq_matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
/*
 * Constructs an EM engine taking ownership of the given consensus
 * sequence matrix. The EMBase initialisation reads the matrix
 * dimensions before the matrix is moved into the layer (the move only
 * happens in the constructor body), so this is safe.
 * The posterior probabilities are initialised randomly from <seed> and
 * the layer model is then initialised from them.
 */
EMConsensusSequence::EMConsensusSequence(Matrix3D<double>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_dim()[0],
seq_matrix.get_dim()[1],
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
cseq_layer(nullptr)
{
// initialise post prob randomly
// (<loglikelihood_max> is fully initialised in the member init list
// above ; the redundant reassignment was removed)
this->set_post_prob_random(seed) ;
// data and models
this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
/*
 * Deletes the owned data layer and the thread pool.
 * The pool is joined first so that no worker thread is still running
 * when the pool is deleted.
 */
EMConsensusSequence::~EMConsensusSequence()
{ if(this->cseq_layer != nullptr)
{ delete this->cseq_layer ;
this->cseq_layer = nullptr ;
}
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
}
/*!
 * \brief Returns a copy of the per-class consensus sequence models.
 */
Matrix3D<double> EMConsensusSequence::get_sequence_models() const
{
return cseq_layer->get_model() ;
}
/*
 * Runs the EM optimization. Each iteration performs the E-step (data
 * log-likelihoods, then posterior probabilities given the current
 * models) followed by the M-step (class probabilities, model update,
 * recentering of the state probabilities). A progress bar is displayed
 * on stderr. Always runs the full number of iterations and returns
 * ITER_MAX.
 */
EMConsensusSequence::exit_codes EMConsensusSequence::classify()
{
size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
// the counter is named <iter> so it does not shadow the <n_iter>
// member field
for(size_t iter=0; iter<this->n_iter; iter++)
{ // E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return EMConsensusSequence::exit_codes::ITER_MAX ;
}
/*
 * E-step, part 1 : asks the data layer to compute the log-likelihood of
 * every (row, class, shift, flip) state and the per-row maxima, then
 * rescales the values row by row (see compute_loglikelihood_routine()).
 * The rescaling is either done on the calling thread or split into row
 * slices over the thread pool.
 */
void EMConsensusSequence::compute_loglikelihood()
{ // compute the loglikelihood
this->cseq_layer->compute_loglikelihoods(this->loglikelihood,
this->loglikelihood_max,
this->threads) ;
// rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMConsensusSequence::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
/*
 * Rescales the log-likelihood values of rows [from,to) by subtracting
 * from each value the maximum value of its row (the largest value per
 * row becomes 0, so exp() cannot overflow in compute_post_prob()).
 * Per this patch, any rescaled value below
 * ConsensusSequenceLayer::p_min_log is clamped to it, to avoid
 * numerical issues when computing the posterior probabilities.
 * Fills <done> with true when finished.
 */
void EMConsensusSequence::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{
// rescale the values
for(size_t i=from; i<to; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
- (this->loglikelihood(i,j,k,l) -
- this->loglikelihood_max[i]) ;
+ std::max(this->loglikelihood(i,j,k,l) -
+ this->loglikelihood_max[i],
+ ConsensusSequenceLayer::p_min_log) ;
}
}
}
}
done.set_value(true) ;
}
/*
 * E-step, part 2 : computes the posterior probability of every
 * (row, class, shift, flip) state from the rescaled log-likelihoods and
 * the current state probabilities, then accumulates the overall sum
 * (post_prob_tot) and the per-class column sums (post_prob_colsum) from
 * the partial sums returned by the routine(s).
 * The work is either done on the calling thread or split into row
 * slices over the thread pool.
 */
void EMConsensusSequence::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMConsensusSequence::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
/*
 * Computes the posterior probabilities of rows [from,to) : for each row,
 * exp(rescaled loglikelihood) is weighted by the state probability, the
 * row is normalized by its own sum, and every value is bounded below by
 * ConsensusSequenceLayer::p_min (no 0 allowed).
 * The per-class column sums over this slice are returned through the
 * promise ; the caller accumulates them over all slices.
 * NOTE(review) : the loop variables n_class/n_shift/n_flip shadow the
 * member fields of the same names — consider renaming.
 */
void EMConsensusSequence::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// unnormalized post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
ConsensusSequenceLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
void EMConsensusSequence::update_models()
{ this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
diff --git a/src/Clustering/EMConsensusSequence.hpp b/src/Clustering/EMConsensusSequence.hpp
index 8ba881c..01ff753 100644
--- a/src/Clustering/EMConsensusSequence.hpp
+++ b/src/Clustering/EMConsensusSequence.hpp
@@ -1,184 +1,199 @@
#ifndef EMCONSENSUSSEQUENCE_HPP
#define EMCONSENSUSSEQUENCE_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <future> // std::promise
#include <Matrix3D.hpp>
#include <ConsensusSequenceLayer.hpp>
typedef std::vector<double> vector_d ;
class EMConsensusSequence : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* given consensus sequences (rows) according to
* their motif content.
* The sequences models are initialised randomly.
* \param sequence_matrix a matrix containing the
* consensus sequences in a probability matrix.
* Its dimensions are :
* 1st the number of consensus sequences
* 2nd the length of the consensus sequences
* 3rd 4 for A,C,G,T
* The sums over the 1st and 2nd dimensions should
* be 1. The overall sum of the matrix values should
* be the 1st dimension.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the background
* by setting all its parameters, at all positions, to the
* background base probabilities. Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMConsensusSequence(const Matrix3D<double>& sequence_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* given consensus sequences (rows) according to
* their motif content. The matrix is moved in,
* no copy is made.
* The sequences models are initialised randomly.
* \param sequence_matrix a matrix containing the
* consensus sequences in a probability matrix.
* Its dimensions are :
* 1st the number of consensus sequences
* 2nd the length of the consensus sequences
* 3rd 4 for A,C,G,T
* The sums over the 1st and 2nd dimensions should
* be 1. The overall sum of the matrix values should
* be the 1st dimension.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the background
* by setting all its parameters, at all positions, to the
* background base probabilities. Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMConsensusSequence(Matrix3D<double>&& sequence_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
EMConsensusSequence(const EMConsensusSequence& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMConsensusSequence() override ;
/*!
* \brief Returns the class sequence model.
* \return the class sequence model.
*/
Matrix3D<double> get_sequence_models() const ;
/*!
* \brief Runs the sequence model optimization and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMConsensusSequence::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current models, for all layers and the joint
* likelihood for each state (the sum of the layer
* likelihoods for all layers, for a given state).
* To avoid numerical issues when computing posterior
* probabilities, the lowest possible value authorized
* as log likelihood is ConsensusSequenceLayer::p_min_log.
* Any value below is replaced by this one.
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood().
* This method rescales the loglikelihood values by
* subtracting to each value the maximum loglikelihood
* value found in the same data row.
* To avoid numerical issues when computing posterior
* probabilities, the lowest possible value authorized
* as log likelihood is ConsensusSequenceLayer::p_min_log.
* Any value below is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilities.
* To avoid numerical issues the lowest possible
* value authorized as posterior probability is
* ConsensusSequenceLayer::p_min. Any value below
* is replaced by this one.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilities.
* To avoid numerical issues the lowest possible
* value authorized as posterior probability is
* ConsensusSequenceLayer::p_min. Any value below
* is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param post_prob_colsum the partial column (over the classes)
* sum of posterior probabilities. If several routines
* are running together, the colsums are retrieved by
* summing up the vectors together.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Update the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the max loglikelihood value for
* each data row.
*/
std::vector<double> loglikelihood_max ;
/*!
* \brief A pointer to the object managing
* the data and their model.
*/
ConsensusSequenceLayer* cseq_layer ;
} ;
#endif // EMCONSENSUSSEQUENCE_HPP
diff --git a/src/Clustering/EMJoint.cpp b/src/Clustering/EMJoint.cpp
index 7ff0a16..c15a79b 100644
--- a/src/Clustering/EMJoint.cpp
+++ b/src/Clustering/EMJoint.cpp
@@ -1,837 +1,814 @@
#include <EMJoint.hpp>
#include <algorithm> // std::max
#include <cmath> // exp()
#include <cstdio> // snprintf()
#include <functional> // std::bind(), std::ref()
#include <future> // std::promise, std::future
#include <stdexcept>
#include <string>
#include <utility> // std::pair, std::move()
#include <vector>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ReadLayer.hpp>
#include <SequenceLayer.hpp>
#include <ThreadPool.hpp>
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
/*!
 * \brief Writes every element of the given vector to the stream,
 * each one followed by a single space character.
 * \param stream the stream to write to.
 * \param v the vector to print.
 * \return a reference to the stream.
 */
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(typename std::vector<T>::const_iterator iter = v.begin();
      iter != v.end(); ++iter)
  { stream << *iter << " " ; }
  return stream ;
}
/* Constructs an EMJoint that partitions the regions using one
 * ReadLayer per read density matrix. All matrices must have the same
 * dimensions as the first one. The posterior probabilities are
 * randomly initialised from <seed> and each layer model is computed
 * from them.
 * NOTE(review): read_matrices[0] is dereferenced in the initialisation
 * list, so an empty vector crashes before the n_layer check below.
 */
EMJoint::EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
                 size_t n_class,
                 size_t n_iter,
                 size_t n_shift,
                 bool flip,
                 bool bckg_class,
                 const std::string& seed,
                 size_t n_threads)
    : EMBase(read_matrices[0].get_nrow(),
             read_matrices[0].get_ncol(),
             n_class,
             n_iter,
             n_shift,
             flip,
             n_threads),
      n_layer(read_matrices.size()),
      loglikelihood_layer(n_layer,
                          Matrix4D<double>(this->n_row,
                                           this->n_class,
                                           this->n_shift,
                                           this->n_flip,
                                           0.)),
      loglikelihood_max(this->n_layer,
                        vector_d(this->n_row, 0.)),
      read_layers(),
      seq_layer(nullptr),
      consseq_layer(nullptr)
{   // check data matrices and their dimensions
    if(this->n_layer == 0)
    {   throw std::invalid_argument("Error! No data layer given!") ; }
    for(const auto& matrix : read_matrices)
    {   if(matrix.get_nrow() != this->n_row)
        {   char msg[4096] ;
            // snprintf() bounds the write to the buffer, unlike sprintf()
            snprintf(msg, sizeof(msg),
                     "Error! Read layers have variable row numbers "
                     "(%zu and %zu)!",
                     matrix.get_nrow(), this->n_row) ;
            throw std::invalid_argument(msg) ;
        }
        else if(matrix.get_ncol() != this->n_col)
        {   char msg[4096] ;
            snprintf(msg, sizeof(msg),
                     "Error! Read layers have variable column numbers "
                     "(%zu and %zu)!",
                     matrix.get_ncol(), this->n_col) ;
            throw std::invalid_argument(msg) ;
        }
    }
    // initialise post prob randomly
    this->set_post_prob_random(seed) ;
    // create one read layer per matrix and initialise its model
    // from the posterior probabilities
    for(const auto& matrix : read_matrices)
    {   this->read_layers.push_back(new ReadLayer(matrix,
                                                  this->n_class,
                                                  this->n_shift,
                                                  this->flip,
                                                  bckg_class,
                                                  this->threads)) ;
        this->read_layers.back()->update_model(this->post_prob,
                                               this->threads) ;
    }
}
/* Move-constructor variant: identical to the copying overload but the
 * read matrices are moved into their ReadLayer instead of copied.
 * NOTE(review): read_matrices[0] is dereferenced in the initialisation
 * list, so an empty vector crashes before the n_layer check below.
 */
EMJoint::EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
                 size_t n_class,
                 size_t n_iter,
                 size_t n_shift,
                 bool flip,
                 bool bckg_class,
                 const std::string& seed,
                 size_t n_threads)
    : EMBase(read_matrices[0].get_nrow(),
             read_matrices[0].get_ncol(),
             n_class,
             n_iter,
             n_shift,
             flip,
             n_threads),
      n_layer(read_matrices.size()),
      loglikelihood_layer(n_layer,
                          Matrix4D<double>(this->n_row,
                                           this->n_class,
                                           this->n_shift,
                                           this->n_flip,
                                           0.)),
      loglikelihood_max(this->n_layer,
                        vector_d(this->n_row, 0.)),
      read_layers(),
      seq_layer(nullptr),
      consseq_layer(nullptr)
{   // check data matrices and their dimensions
    if(this->n_layer == 0)
    {   throw std::invalid_argument("Error! No data layer given!") ; }
    for(const auto& matrix : read_matrices)
    {   if(matrix.get_nrow() != this->n_row)
        {   char msg[4096] ;
            // snprintf() bounds the write to the buffer, unlike sprintf()
            snprintf(msg, sizeof(msg),
                     "Error! Read layers have variable row numbers "
                     "(%zu and %zu)!",
                     matrix.get_nrow(), this->n_row) ;
            throw std::invalid_argument(msg) ;
        }
        else if(matrix.get_ncol() != this->n_col)
        {   char msg[4096] ;
            snprintf(msg, sizeof(msg),
                     "Error! Read layers have variable column numbers "
                     "(%zu and %zu)!",
                     matrix.get_ncol(), this->n_col) ;
            throw std::invalid_argument(msg) ;
        }
    }
    // initialise post prob randomly
    this->set_post_prob_random(seed) ;
    // create one read layer per matrix (moving the data in) and
    // initialise its model from the posterior probabilities
    for(auto& matrix : read_matrices)
    {   this->read_layers.push_back(new ReadLayer(std::move(matrix),
                                                  this->n_class,
                                                  this->n_shift,
                                                  this->flip,
                                                  bckg_class,
                                                  this->threads)) ;
        this->read_layers.back()->update_model(this->post_prob,
                                               this->threads) ;
    }
}
/* Constructs an EMJoint over read density layers plus one DNA
 * sequence layer. All matrices must match the dimensions of the first
 * read matrix. The posterior probabilities are randomly initialised
 * from <seed> and each layer model is computed from them.
 * NOTE(review): read_matrices[0] is dereferenced in the initialisation
 * list, so an empty vector crashes before any check below.
 */
EMJoint::EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
                 const Matrix2D<int>& seq_matrix,
                 size_t n_class,
                 size_t n_iter,
                 size_t n_shift,
                 bool flip,
                 bool bckg_class,
                 const std::string& seed,
                 size_t n_threads)
    : EMBase(read_matrices[0].get_nrow(),
             read_matrices[0].get_ncol(),
             n_class,
             n_iter,
             n_shift,
             flip,
             n_threads),
      n_layer(read_matrices.size()+1),
      loglikelihood_layer(this->n_layer,
                          Matrix4D<double>(this->n_row,
                                           this->n_class,
                                           this->n_shift,
                                           this->n_flip,
                                           0.)),
      loglikelihood_max(this->n_layer,
                        vector_d(this->n_row, 0.)),
      read_layers(),
      seq_layer(nullptr),
      consseq_layer(nullptr)
{   // check data matrices and their dimensions
    for(const auto& matrix : read_matrices)
    {   if(matrix.get_nrow() != this->n_row)
        {   char msg[4096] ;
            // snprintf() bounds the write to the buffer, unlike sprintf()
            snprintf(msg, sizeof(msg),
                     "Error! A read matrix row number is different than expected "
                     "(%zu instead of %zu)!",
                     matrix.get_nrow(), this->n_row) ;
            throw std::invalid_argument(msg) ;
        }
        else if(matrix.get_ncol() != this->n_col)
        {   char msg[4096] ;
            snprintf(msg, sizeof(msg),
                     "Error! A read matrix column number is different than expected "
                     "(%zu instead of %zu)!",
                     matrix.get_ncol(), this->n_col) ;
            throw std::invalid_argument(msg) ;
        }
    }
    if(seq_matrix.get_nrow() != this->n_row)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! The sequence matrix row number is different than expected "
                 "(%zu instead of %zu)!",
                 seq_matrix.get_nrow(), this->n_row) ;
        throw std::invalid_argument(msg) ;
    }
    else if(seq_matrix.get_ncol() != this->n_col)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! The sequence matrix column number is different than expected "
                 "(%zu instead of %zu)!",
                 seq_matrix.get_ncol(), this->n_col) ;
        throw std::invalid_argument(msg) ;
    }
    // initialise post prob randomly
    this->set_post_prob_random(seed) ;
    // create one read layer per matrix and initialise its model
    // from the posterior probabilities
    for(const auto& matrix : read_matrices)
    {   this->read_layers.push_back(new ReadLayer(matrix,
                                                  this->n_class,
                                                  this->n_shift,
                                                  this->flip,
                                                  bckg_class,
                                                  this->threads)) ;
        this->read_layers.back()->update_model(this->post_prob,
                                               this->threads) ;
    }
    // create the sequence layer and initialise its model
    // from the posterior probabilities
    this->seq_layer = new SequenceLayer(seq_matrix,
                                        this->n_class,
                                        this->n_shift,
                                        this->flip,
                                        bckg_class) ;
    this->seq_layer->update_model(this->post_prob,
                                  this->threads) ;
}
/* Move-constructor variant of the read + sequence overload: the data
 * matrices are moved into their layers instead of copied.
 * NOTE(review): read_matrices[0] is dereferenced in the initialisation
 * list, so an empty vector crashes before any check below.
 */
EMJoint::EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
                 Matrix2D<int>&& seq_matrix,
                 size_t n_class,
                 size_t n_iter,
                 size_t n_shift,
                 bool flip,
                 bool bckg_class,
                 const std::string& seed,
                 size_t n_threads)
    : EMBase(read_matrices[0].get_nrow(),
             read_matrices[0].get_ncol(),
             n_class,
             n_iter,
             n_shift,
             flip,
             n_threads),
      n_layer(read_matrices.size()+1),
      loglikelihood_layer(this->n_layer,
                          Matrix4D<double>(this->n_row,
                                           this->n_class,
                                           this->n_shift,
                                           this->n_flip,
                                           0.)),
      loglikelihood_max(this->n_layer,
                        vector_d(this->n_row, 0.)),
      read_layers(),
      seq_layer(nullptr),
      consseq_layer(nullptr)
{   // check data matrices and their dimensions
    for(const auto& matrix : read_matrices)
    {   if(matrix.get_nrow() != this->n_row)
        {   char msg[4096] ;
            // snprintf() bounds the write to the buffer, unlike sprintf()
            snprintf(msg, sizeof(msg),
                     "Error! A read matrix row number is different than expected "
                     "(%zu instead of %zu)!",
                     matrix.get_nrow(), this->n_row) ;
            throw std::invalid_argument(msg) ;
        }
        else if(matrix.get_ncol() != this->n_col)
        {   char msg[4096] ;
            snprintf(msg, sizeof(msg),
                     "Error! A read matrix column number is different than expected "
                     "(%zu instead of %zu)!",
                     matrix.get_ncol(), this->n_col) ;
            throw std::invalid_argument(msg) ;
        }
    }
    if(seq_matrix.get_nrow() != this->n_row)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! The sequence matrix row number is different than expected "
                 "(%zu instead of %zu)!",
                 seq_matrix.get_nrow(), this->n_row) ;
        throw std::invalid_argument(msg) ;
    }
    else if(seq_matrix.get_ncol() != this->n_col)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! The sequence matrix column number is different than expected "
                 "(%zu instead of %zu)!",
                 seq_matrix.get_ncol(), this->n_col) ;
        throw std::invalid_argument(msg) ;
    }
    // initialise post prob randomly
    this->set_post_prob_random(seed) ;
    // create one read layer per matrix (moving the data in) and
    // initialise its model from the posterior probabilities
    for(auto& matrix : read_matrices)
    {   this->read_layers.push_back(new ReadLayer(std::move(matrix),
                                                  this->n_class,
                                                  this->n_shift,
                                                  this->flip,
                                                  bckg_class,
                                                  this->threads)) ;
        this->read_layers.back()->update_model(this->post_prob,
                                               this->threads) ;
    }
    // create the sequence layer (moving the data in) and initialise
    // its model from the posterior probabilities
    this->seq_layer = new SequenceLayer(std::move(seq_matrix),
                                        this->n_class,
                                        this->n_shift,
                                        this->flip,
                                        bckg_class) ;
    this->seq_layer->update_model(this->post_prob,
                                  this->threads) ;
}
/* Constructs an EMJoint over read density layers plus one consensus
 * sequence layer. All matrices must match the dimensions of the first
 * read matrix. The posterior probabilities are randomly initialised
 * from <seed> and each layer model is computed from them.
 * NOTE(review): read_matrices[0] is dereferenced in the initialisation
 * list, so an empty vector crashes before any check below.
 */
EMJoint::EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
                 const Matrix3D<double>& consseq_matrix,
                 size_t n_class,
                 size_t n_iter,
                 size_t n_shift,
                 bool flip,
                 bool bckg_class,
                 const std::string& seed,
                 size_t n_threads)
    : EMBase(read_matrices[0].get_nrow(),
             read_matrices[0].get_ncol(),
             n_class,
             n_iter,
             n_shift,
             flip,
             n_threads),
      n_layer(read_matrices.size()+1),
      loglikelihood_layer(this->n_layer,
                          Matrix4D<double>(this->n_row,
                                           this->n_class,
                                           this->n_shift,
                                           this->n_flip,
                                           0.)),
      loglikelihood_max(this->n_layer,
                        vector_d(this->n_row, 0.)),
      read_layers(),
      seq_layer(nullptr),
      consseq_layer(nullptr)
{   // check data matrices and their dimensions
    for(const auto& matrix : read_matrices)
    {   if(matrix.get_nrow() != this->n_row)
        {   char msg[4096] ;
            // snprintf() bounds the write to the buffer, unlike sprintf()
            snprintf(msg, sizeof(msg),
                     "Error! A read matrix row number is different than expected "
                     "(%zu instead of %zu)!",
                     matrix.get_nrow(), this->n_row) ;
            throw std::invalid_argument(msg) ;
        }
        else if(matrix.get_ncol() != this->n_col)
        {   char msg[4096] ;
            snprintf(msg, sizeof(msg),
                     "Error! A read matrix column number is different than expected "
                     "(%zu instead of %zu)!",
                     matrix.get_ncol(), this->n_col) ;
            throw std::invalid_argument(msg) ;
        }
    }
    if(consseq_matrix.get_dim()[0] != this->n_row)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! The consensus sequence matrix row number is different than expected "
                 "(%zu instead of %zu)!",
                 consseq_matrix.get_dim()[0], this->n_row) ;
        throw std::invalid_argument(msg) ;
    }
    else if(consseq_matrix.get_dim()[1] != this->n_col)
    {   char msg[4096] ;
        // bug fix: report the column number (dim[1]) rather than the
        // row number (dim[0]), and name the consensus sequence matrix
        snprintf(msg, sizeof(msg),
                 "Error! The consensus sequence matrix column number is different than expected "
                 "(%zu instead of %zu)!",
                 consseq_matrix.get_dim()[1], this->n_col) ;
        throw std::invalid_argument(msg) ;
    }
    // initialise post prob randomly
    this->set_post_prob_random(seed) ;
    // create one read layer per matrix and initialise its model
    // from the posterior probabilities
    for(const auto& matrix : read_matrices)
    {   this->read_layers.push_back(new ReadLayer(matrix,
                                                  this->n_class,
                                                  this->n_shift,
                                                  this->flip,
                                                  bckg_class,
                                                  this->threads)) ;
        this->read_layers.back()->update_model(this->post_prob,
                                               this->threads) ;
    }
    // create the consensus sequence layer and initialise its model
    // from the posterior probabilities
    this->consseq_layer = new ConsensusSequenceLayer(consseq_matrix,
                                                     this->n_class,
                                                     this->n_shift,
                                                     this->flip,
                                                     bckg_class) ;
    this->consseq_layer->update_model(this->post_prob,
                                      this->threads) ;
}
/* Move-constructor variant of the read + consensus sequence overload:
 * the data matrices are moved into their layers instead of copied.
 * NOTE(review): read_matrices[0] is dereferenced in the initialisation
 * list, so an empty vector crashes before any check below.
 */
EMJoint::EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
                 Matrix3D<double>&& consseq_matrix,
                 size_t n_class,
                 size_t n_iter,
                 size_t n_shift,
                 bool flip,
                 bool bckg_class,
                 const std::string& seed,
                 size_t n_threads)
    : EMBase(read_matrices[0].get_nrow(),
             read_matrices[0].get_ncol(),
             n_class,
             n_iter,
             n_shift,
             flip,
             n_threads),
      n_layer(read_matrices.size()+1),
      loglikelihood_layer(this->n_layer,
                          Matrix4D<double>(this->n_row,
                                           this->n_class,
                                           this->n_shift,
                                           this->n_flip,
                                           0.)),
      loglikelihood_max(this->n_layer,
                        vector_d(this->n_row, 0.)),
      read_layers(),
      seq_layer(nullptr),
      consseq_layer(nullptr)
{   // check data matrices and their dimensions
    for(const auto& matrix : read_matrices)
    {   if(matrix.get_nrow() != this->n_row)
        {   char msg[4096] ;
            // snprintf() bounds the write to the buffer, unlike sprintf()
            snprintf(msg, sizeof(msg),
                     "Error! A read matrix row number is different than expected "
                     "(%zu instead of %zu)!",
                     matrix.get_nrow(), this->n_row) ;
            throw std::invalid_argument(msg) ;
        }
        else if(matrix.get_ncol() != this->n_col)
        {   char msg[4096] ;
            snprintf(msg, sizeof(msg),
                     "Error! A read matrix column number is different than expected "
                     "(%zu instead of %zu)!",
                     matrix.get_ncol(), this->n_col) ;
            throw std::invalid_argument(msg) ;
        }
    }
    if(consseq_matrix.get_dim()[0] != this->n_row)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! The consensus sequence matrix row number is different than expected "
                 "(%zu instead of %zu)!",
                 consseq_matrix.get_dim()[0], this->n_row) ;
        throw std::invalid_argument(msg) ;
    }
    else if(consseq_matrix.get_dim()[1] != this->n_col)
    {   char msg[4096] ;
        // bug fix: report the column number (dim[1]) rather than the
        // row number (dim[0]), and name the consensus sequence matrix
        snprintf(msg, sizeof(msg),
                 "Error! The consensus sequence matrix column number is different than expected "
                 "(%zu instead of %zu)!",
                 consseq_matrix.get_dim()[1], this->n_col) ;
        throw std::invalid_argument(msg) ;
    }
    // initialise post prob randomly
    this->set_post_prob_random(seed) ;
    // create one read layer per matrix (moving the data in) and
    // initialise its model from the posterior probabilities
    for(auto& matrix : read_matrices)
    {   this->read_layers.push_back(new ReadLayer(std::move(matrix),
                                                  this->n_class,
                                                  this->n_shift,
                                                  this->flip,
                                                  bckg_class,
                                                  this->threads)) ;
        this->read_layers.back()->update_model(this->post_prob,
                                               this->threads) ;
    }
    // create the consensus sequence layer (moving the data in) and
    // initialise its model from the posterior probabilities
    this->consseq_layer = new ConsensusSequenceLayer(std::move(consseq_matrix),
                                                     this->n_class,
                                                     this->n_shift,
                                                     this->flip,
                                                     bckg_class) ;
    this->consseq_layer->update_model(this->post_prob,
                                      this->threads) ;
}
/* Releases every owned resource: the thread pool (unless EMBase has
 * already freed it) and each data layer. Deleting a null pointer is a
 * no-op, so no guards are needed around the layer deletions.
 */
EMJoint::~EMJoint()
{   // join and free the thread pool if still alive
    if(this->threads != nullptr)
    {   this->threads->join() ;
        delete this->threads ;
        this->threads = nullptr ;
    }
    // read data and models
    for(auto& layer : this->read_layers)
    {   delete layer ;
        layer = nullptr ;
    }
    // sequence data and models
    delete this->seq_layer ;
    this->seq_layer = nullptr ;
    // consensus sequence data and models
    delete this->consseq_layer ;
    this->consseq_layer = nullptr ;
}
std::vector<Matrix3D<double>> EMJoint::get_read_models() const
{ std::vector<Matrix3D<double>> models ;
for(const auto& ptr : this->read_layers)
{ models.push_back(ptr->get_model()) ; }
return models ;
}
/* Returns a copy of the sequence layer model, or an empty matrix if
 * no sequence layer exists.
 */
Matrix3D<double> EMJoint::get_sequence_models() const
{   return (this->seq_layer == nullptr) ? Matrix3D<double>()
                                        : this->seq_layer->get_model() ;
}
/* Returns a copy of the consensus sequence layer model, or an empty
 * matrix if no consensus sequence layer exists.
 */
Matrix3D<double> EMJoint::get_consensus_sequence_models() const
{   return (this->consseq_layer == nullptr) ? Matrix3D<double>()
                                            : this->consseq_layer->get_model() ;
}
/* Runs the EM optimization for the configured number of iterations,
 * alternating E-step (likelihood and posterior probabilities) and
 * M-step (class probabilities and model updates), and reports the
 * progress on stderr. Always returns ITER_MAX.
 */
EMJoint::exit_codes EMJoint::classify()
{   ConsoleProgressBar bar(std::cerr, this->n_iter, 60, "classifying") ;
    // optimize the partition; the loop counter is named iter to avoid
    // shadowing the n_iter member
    for(size_t iter=0; iter<this->n_iter; iter++)
    {   // E-step
        this->compute_loglikelihood() ;
        this->compute_post_prob() ;
        // M-step
        this->compute_class_prob() ;
        this->update_models() ;
        this->center_post_state_prob() ;
        bar.update() ;
    }
    bar.update() ; std::cerr << std::endl ;
    return EMJoint::exit_codes::ITER_MAX ;
}
/* Computes, for every layer, the loglikelihood of each (row, class,
 * shift, flip) state given the current models, then delegates to
 * compute_loglikelihood_routine() (serially or on the thread pool)
 * to sum the layers into the joint loglikelihood and rescale it.
 */
void EMJoint::compute_loglikelihood()
{ // compute the loglikelihood for each layer
size_t i = 0 ;
// read layers first, then the optional sequence and consensus
// sequence layers; i indexes the per-layer result slots
for(auto& ptr : this->read_layers)
{ ptr->compute_loglikelihoods(this->loglikelihood_layer[i],
this->loglikelihood_max[i],
this->threads) ;
i++ ;
}
if(this->seq_layer != nullptr)
{ this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i],
this->loglikelihood_max[i],
this->threads) ;
i++ ;
}
if(this->consseq_layer != nullptr)
{ this->consseq_layer->compute_loglikelihoods(this->loglikelihood_layer[i],
this->loglikelihood_max[i],
this->threads) ;
i++ ;
}
// sum the likelihood for each state, over all layers
// and rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t j=0; j<n_threads; j++)
{ futures[j] = promises[j].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t j=0; j<n_threads; j++)
{ auto slice = slices[j] ;
this->threads->addJob(std::move(
std::bind(&EMJoint::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[j])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
-/*
-void EMJoint::compute_loglikelihood_routine(size_t from,
- size_t to,
- std::promise<bool>& done)
-{
- // limite value range
- for(size_t i=from; i<to; i++)
- { for(size_t j=0; j<this->n_class; j++)
- { for(size_t k=0; k<this->n_shift; k++)
- { for(size_t l=0; l<this->n_flip; l++)
- {
- // reset
- this->loglikelihood(i,j,k,l) = 0. ;
- // sum
- for(size_t m=0; m<this->n_layer; m++)
- { this->loglikelihood(i,j,k,l) +=
- (this->loglikelihood_layer[m](i,j,k,l) -
- this->loglikelihood_max[m][i]) ;
- }
- }
- }
- }
- }
- done.set_value(true) ;
-}
-*/
-
/* Routine of compute_loglikelihood(): for rows [from, to), sums the
 * per-layer rescaled loglikelihoods into the joint loglikelihood,
 * then rescales each row by its maximum so the largest value is 0.
 * Fulfils <done> when finished.
 */
void EMJoint::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{ // the max likelihood found per row
std::vector<double> rowmax(to-from, std::numeric_limits<double>::lowest()) ;
// sum over layers
for(size_t i=from, i_rowmax=0; i<to; i++, i_rowmax++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{
// reset
this->loglikelihood(i,j,k,l) = 0. ;
// sum
for(size_t m=0; m<this->n_layer; m++)
{ // add rescaled layer value
this->loglikelihood(i,j,k,l) +=
(this->loglikelihood_layer[m](i,j,k,l) -
this->loglikelihood_max[m][i]) ;
}
// keep track of max
if(this->loglikelihood(i,j,k,l) > rowmax[i_rowmax])
{ rowmax[i_rowmax] = this->loglikelihood(i,j,k,l) ; }
}
}
}
}
// rescale, clamping at ReadLayer::p_min_log to avoid numerical
// issues when the values are later exponentiated
for(size_t i=from, i_rowmax=0; i<to; i++, i_rowmax++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
- { this->loglikelihood(i,j,k,l) -= rowmax[i_rowmax] ; }
+ { // this->loglikelihood(i,j,k,l) -= rowmax[i_rowmax] ;
+ this->loglikelihood(i,j,k,l) =
+ std::max(this->loglikelihood(i,j,k,l) - rowmax[i_rowmax],
+ ReadLayer::p_min_log) ;
+ }
}
}
}
done.set_value(true) ;
}
/* Computes the posterior probabilities from the joint loglikelihood
 * and the state probabilities, either serially or on the thread pool,
 * and aggregates the total and per-class sums of posterior
 * probabilities from the partial column sums returned by the routine.
 */
void EMJoint::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMJoint::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
/* Routine of compute_post_prob(): for rows [from, to), computes the
 * unnormalised posterior of every state, normalises each row by its
 * sum (clamping at ReadLayer::p_min to avoid zero probabilities), and
 * fulfils <post_prob_colsum> with the per-class partial column sums.
 */
void EMJoint::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
// unnormalised posterior: exp(rescaled loglikelihood)
// weighted by the state probability
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
// double p = std::max(exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
// this->post_state_prob(n_class,n_shift,n_flip),
// ReadLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize, clamping each posterior at ReadLayer::p_min
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
ReadLayer::p_min) ;
// double p = this->post_prob(i,n_class,n_shift,n_flip) /
// this->post_prob_rowsum[i] ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
/* Updates the model of every layer from the current posterior
 * probabilities.
 */
void EMJoint::update_models()
{   // read layers reuse the precomputed per-class posterior sums
    for(auto& layer : this->read_layers)
    {   layer->update_model(this->post_prob,
                            this->post_prob_colsum,
                            this->threads) ;
    }
    // sequence data and models
    if(this->seq_layer != nullptr)
    {   this->seq_layer->update_model(this->post_prob,
                                      this->threads) ;
    }
    // consensus sequence data and models
    if(this->consseq_layer != nullptr)
    {   this->consseq_layer->update_model(this->post_prob,
                                          this->threads) ;
    }
}
diff --git a/src/Clustering/EMJoint.hpp b/src/Clustering/EMJoint.hpp
index aeea97b..3d9cb80 100644
--- a/src/Clustering/EMJoint.hpp
+++ b/src/Clustering/EMJoint.hpp
@@ -1,408 +1,424 @@
#ifndef EMJOINT_HPP
#define EMJOINT_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ReadLayer.hpp>
#include <SequenceLayer.hpp>
#include <ConsensusSequenceLayer.hpp>
typedef std::vector<double> vector_d ;
class EMJoint : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
* with the given shifting and flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
 * positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
* with the given shifting and flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
 * positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
* and sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
 * positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
const Matrix2D<int>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
* and sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
 * positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
Matrix2D<int>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
 * and consensus sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param consseq_matrix a matrix containing the DNA
* consensus sequences for the regions of interest as
* a probability matrix. Its dimensions are:
* 1st the number of regions,
* 2nd the region length.
* 3rd 4 for A,C,G,T
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
 * positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
const Matrix3D<double>& consseq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
 * and consensus sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param consseq_matrix a matrix containing the DNA
* consensus sequences for the regions of interest as
* a probability matrix. Its dimensions are:
* 1st the number of regions,
* 2nd the region length.
* 3rd 4 for A,C,G,T
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
 * positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
Matrix3D<double>&& consseq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
EMJoint(const EMJoint& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMJoint() override ;
/*!
* \brief Returns all layer read models.
* The models are in the same order
* as the data were given to the
* constructor.
* \return a vector containing the
* models.
*/
std::vector<Matrix3D<double>> get_read_models() const ;
/*!
* \brief Returns the sequence models.
* \return a vector containing the
* models.
*/
Matrix3D<double> get_sequence_models() const ;
/*!
* \brief Returns the consensus sequence
* models.
* \return a vector containing the
* models.
*/
Matrix3D<double> get_consensus_sequence_models() const ;
/*!
* \brief Runs the sequence model optimization and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMJoint::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current models, for all layers and the joint
* likelihood for each state (the sum of the layer
* likelihoods for all layers, for a given state).
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood() that
* computes the joint loglikelihood by summing the
* individual loglikelihood obtained from each data layer.
* At the same time, this method rescales the loglikelihood
* values by subtracting to each value the maximum
* loglikelihood value found in the same data row,
* for each layer.
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ReadLayer::p_min. Any value below is replaced
+ * by this one.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ReadLayer::p_min. Any value below is replaced
+ * by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done the partial column (over the classes)
* sum of posterior probabilities. If several routines
* are running together, the colsums are retrieved by
* summing up the vectors together.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Update the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the number of data layers.
*/
size_t n_layer ;
/*!
* \brief the log likelihood buffers for each individual
* layer (one element per layer).
*/
std::vector<Matrix4D<double>> loglikelihood_layer ;
/*!
* \brief the max loglikelihood value for
* each data layer (1st dimension)
* and each data row of the given layer
* (2nd dimension).
*/
std::vector<vector_d> loglikelihood_max ;
/*!
* \brief A vector containing the pointers
* to the objects managing all the read
* data and models.
*/
std::vector<ReadLayer*> read_layers ;
/*!
* \brief A pointer to the object managing
* the sequence data and their models.
*/
SequenceLayer* seq_layer ;
/*!
* \brief A pointer to the object managing
* the consensus sequences data and their
* models.
*/
ConsensusSequenceLayer* consseq_layer ;
} ;
#endif // EMJOINT_HPP
diff --git a/src/Clustering/EMRead.cpp b/src/Clustering/EMRead.cpp
index a8cb150..580f10f 100644
--- a/src/Clustering/EMRead.cpp
+++ b/src/Clustering/EMRead.cpp
@@ -1,314 +1,315 @@
#include <EMRead.hpp>
#include <string>
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <cmath> // exp()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <ReadLayer.hpp> // ReadLayer
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
#include <ThreadPool.hpp> // ThreadPool
EMRead::EMRead(const Matrix2D<int>& read_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrix.get_nrow(),
read_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
read_layer(nullptr)
{ this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
this->read_layer = new ReadLayer(read_matrix,
this->n_class,
this->n_shift,
flip,
bckg_class,
this->threads) ;
// initialise the models with the post prob
this->read_layer->update_model(this->post_prob,
this->threads) ;
}
EMRead::EMRead(Matrix2D<int>&& read_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrix.get_nrow(),
read_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
read_layer(nullptr)
{ this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
this->read_layer = new ReadLayer(std::move(read_matrix),
this->n_class,
this->n_shift,
flip,
bckg_class,
this->threads) ;
// initialise the models with the post prob
this->read_layer->update_model(this->post_prob,
this->threads) ;
}
EMRead::~EMRead()
{ if(this->read_layer!= nullptr)
{ delete this->read_layer ;
this->read_layer = nullptr ;
}
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
}
Matrix3D<double> EMRead::get_read_models() const
{ return read_layer->get_model() ; }
EMRead::exit_codes EMRead::classify()
{
size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
{
// E-step
this->compute_loglikelihood() ;
// std::cerr << this->post_prob_rowsum << std::endl ;
// std::cerr << this->post_prob_colsum << std::endl ;
this->compute_post_prob() ;
// M-step
// std::cerr << this->post_prob_rowsum << std::endl ;
// std::cerr << this->post_prob_colsum << std::endl ;
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return EMRead::exit_codes::ITER_MAX ;
}
void EMRead::compute_loglikelihood()
{ // compute the loglikelihood
this->read_layer->compute_loglikelihoods(this->loglikelihood,
this->loglikelihood_max,
this->threads) ;
/*
// rescale the values
for(size_t i=0; i<this->n_row; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
(this->loglikelihood(i,j,k,l) -
this->loglikelihood_max[i]) ;
}
}
}
}
*/
// rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMRead::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void EMRead::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{
// rescale the values
for(size_t i=from; i<to; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
- (this->loglikelihood(i,j,k,l) -
- this->loglikelihood_max[i]) ;
+ std::max(this->loglikelihood(i,j,k,l) -
+ this->loglikelihood_max[i],
+ ReadLayer::p_min_log) ;
}
}
}
}
done.set_value(true) ;
}
void EMRead::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMRead::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
void EMRead::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// reset grand total
// this->post_prob_tot = 0 ;
// this->post_prob_colsum = vector_d(n_class, 0) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{ // avoid p=0. by rounding errors
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
ReadLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
void EMRead::update_models()
{ this->read_layer->update_model(this->post_prob,
this->post_prob_colsum,
this->threads) ;
}
diff --git a/src/Clustering/EMRead.hpp b/src/Clustering/EMRead.hpp
index 0c49b6a..e3e727a 100644
--- a/src/Clustering/EMRead.hpp
+++ b/src/Clustering/EMRead.hpp
@@ -1,173 +1,188 @@
#ifndef EMREAD_HPP
#define EMREAD_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <future> // std::promise
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <ReadLayer.hpp>
typedef std::vector<double> vector_d ;
class EMRead : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* region (rows) according to the shape of the signal
* with the given shifting and flipping freedom.
* \param read_matrix a matrix containing the read
* density (ChIP-seq or related signal) for the
* regions of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the
* background by setting all its parameters, at all
* positions, to the mean number of counts. Since
* the background is constant, this class will never
* be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMRead(const Matrix2D<int>& read_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region (rows) according to the shape of the signal
* with the given shifting and flipping freedom.
* \param read_matrix a matrix containing the read
* density (ChIP-seq or related signal) for the
* regions of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the
* background by setting all its parameters, at all
* positions, to the mean number of counts. Since
* the background is constant, this class will never
* be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMRead(Matrix2D<int>&& read_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
EMRead(const EMRead& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMRead() override ;
/*!
* \brief Returns the class read signal model.
* \return the class read signal model.
*/
Matrix3D<double> get_read_models() const ;
/*!
* \brief Runs the read signal model optimization and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMRead::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current models, for all layers and the joint
* likelihood for each state (the sum of the layer
* likelihoods for all layers, for a given state).
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood().
* This method rescales the loglikelihood values by
* subtracting to each value the maximum loglikelihood
* value found in the same data row.
- * This method
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ReadLayer::p_min. Any value below is replaced
+ * by this one.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ReadLayer::p_min. Any value below is replaced
+ * by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done the partial column (over the classes)
* sum of posterior probabilities. If several routines
* are running together, the colsums are retrieved by
* summing up the vectors together.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Update the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the max loglikelihood value for
* each data row.
*/
std::vector<double> loglikelihood_max ;
/*!
* \brief A pointer to the object managing
* the data and their model.
*/
ReadLayer* read_layer ;
} ;
#endif // EMREAD_HPP
diff --git a/src/Clustering/EMSequence.cpp b/src/Clustering/EMSequence.cpp
index 3f5bebb..4844f2c 100644
--- a/src/Clustering/EMSequence.cpp
+++ b/src/Clustering/EMSequence.cpp
@@ -1,357 +1,358 @@
#include <EMSequence.hpp>
#include <string>
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <cmath> // exp()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <SequenceLayer.hpp> // SequenceLayer
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
#include <ThreadPool.hpp> // ThreadPool
EMSequence::EMSequence(const Matrix2D<int>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->seq_layer = new SequenceLayer(seq_matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
EMSequence::EMSequence(Matrix2D<int>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->seq_layer = new SequenceLayer(std::move(seq_matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
EMSequence::EMSequence(const Matrix2D<int>& seq_matrix,
const Matrix3D<double>& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
motifs.get_dim()[0],
n_iter,
seq_matrix.get_ncol() - motifs.get_dim()[1] + 1,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// data and models
// background motif (if any) is the last of the given motifs
this->seq_layer = new SequenceLayer(seq_matrix,
motifs,
this->flip,
bckg_class) ;
// initialise the class prob uniformly
this->set_state_prob_uniform() ;
}
EMSequence::EMSequence(Matrix2D<int>&& seq_matrix,
Matrix3D<double>&& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
motifs.get_dim()[0],
n_iter,
seq_matrix.get_ncol() - motifs.get_dim()[1] + 1,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// data and models
// background motif (if any) is the last of the given motifs
this->seq_layer = new SequenceLayer(std::move(seq_matrix),
std::move(motifs),
this->flip,
bckg_class) ;
// initialise the class prob uniformly
this->set_state_prob_uniform() ;
}
EMSequence::~EMSequence()
{ if(this->seq_layer != nullptr)
{ delete this->seq_layer ;
this->seq_layer = nullptr ;
}
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
}
Matrix3D<double> EMSequence::get_sequence_models() const
{ return seq_layer->get_model() ; }
EMSequence::exit_codes EMSequence::classify()
{ size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
{ // E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return EMSequence::exit_codes::ITER_MAX ;
}
void EMSequence::compute_loglikelihood()
{ // compute the loglikelihood
this->seq_layer->compute_loglikelihoods(this->loglikelihood,
this->loglikelihood_max,
this->threads) ;
// rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMSequence::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void EMSequence::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{
// rescale the values
for(size_t i=from; i<to; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
- (this->loglikelihood(i,j,k,l) -
- this->loglikelihood_max[i]) ;
+ std::max(this->loglikelihood(i,j,k,l) -
+ this->loglikelihood_max[i],
+ SequenceLayer::p_min_log) ;
}
}
}
}
done.set_value(true) ;
}
void EMSequence::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMSequence::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
void EMSequence::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// reset grand total
// this->post_prob_tot = 0 ;
// this->post_prob_colsum = vector_d(n_class, 0) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
SequenceLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
void EMSequence::update_models()
{ this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
diff --git a/src/Clustering/EMSequence.hpp b/src/Clustering/EMSequence.hpp
index 622532b..537c285 100644
--- a/src/Clustering/EMSequence.hpp
+++ b/src/Clustering/EMSequence.hpp
@@ -1,240 +1,255 @@
#ifndef EMSEQUENCE_HPP
#define EMSEQUENCE_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <future> // std::promise
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <SequenceLayer.hpp>
typedef std::vector<double> vector_d ;
class EMSequence : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* given sequences (rows) according to their motif
* content.
* The sequences models are initialised randomly.
* \param sequence_matrix a matrix containing the sequences
* of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the background
* by setting all its parameters, at all positions, to the
* background base probabilities. Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMSequence(const Matrix2D<int>& sequence_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* given sequences (rows) according to their motif
* content.
* The sequences models are initialised randomly.
* \param sequence_matrix a matrix containing the sequences
* of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the background
* by setting all its parameters, at all positions, to the
* background base probabilities. Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMSequence(Matrix2D<int>&& sequence_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* given sequences (rows) according to their motif
* content.
* The sequences class models are initialised using
* the given motifs. The class probabilities are
* initialised uniformly.
* The shifting freedom is set to (data number
* of columns) - (the model 2nd dimension)
* + 1.
* \param sequence_matrix a matrix containing the sequences
* of interest.
* \param motifs a matrix containing the different initial
* class models with the following dimensions :
* dim1 the number of classes
* dim2 the model length
* dim3 4 for A,C,G,T
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param flip whether flipping is allowed.
* \param bckg_class indicates that the last class in the
* given motifs is used to model the background and it
* should never be updated.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMSequence(const Matrix2D<int>& sequence_matrix,
const Matrix3D<double>& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* given sequences (rows) according to their motif
* content.
* The sequences class models are initialised using
* the given motifs. The class probabilities are
* initialised uniformly.
* The shifting freedom is set to (data number
* of columns) - (the model 2nd dimension)
* + 1.
* \param sequence_matrix a matrix containing the sequences
* of interest.
* \param motifs a matrix containing the different initial
* class models with the following dimensions :
* dim1 the number of classes
* dim2 the model length
* dim3 4 for A,C,G,T
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param flip whether flipping is allowed.
* \param bckg_class indicates that the last class in the
* given motifs is used to model the background and it
* should never be updated.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMSequence(Matrix2D<int>&& sequence_matrix,
Matrix3D<double>&& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads=0) ;
EMSequence(const EMSequence& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMSequence() override ;
/*!
* \brief Returns the class sequence model.
* \return the class sequence model.
*/
Matrix3D<double> get_sequence_models() const ;
/*!
* \brief Runs the sequence model optimization and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMSequence::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current models, for all layers and the joint
* likelihood for each state (the sum of the layer
* likelihoods for all layers, for a given state).
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is SequenceLayer::p_min_log. Any
+ * value below is replaced by this one.
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood().
* This method rescales the loglikelihood values by
* substacting to each value the maximum loglikelihood
* value found in the same data row.
- * This method
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilties.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * SequenceLayer::p_min. Any value below is replaced
+ * by this one.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilties.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * SequenceLayer::p_min. Any value below is replaced
+ * by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done the partial column (over the classes)
* sum of posterior probabilities. If several routines
* are running together, the colsums are retrieved by
* summing up the vectors together.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Update the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the max loglikelihood value for
* each data row.
*/
std::vector<double> loglikelihood_max ;
/*!
* \brief A pointer to the object managing
* the data and their model.
*/
SequenceLayer* seq_layer ;
} ;
#endif // EMSEQUENCE_HPP
diff --git a/src/Clustering/ReadLayer.cpp b/src/Clustering/ReadLayer.cpp
index 3c0e70d..9370245 100644
--- a/src/Clustering/ReadLayer.cpp
+++ b/src/Clustering/ReadLayer.cpp
@@ -1,492 +1,496 @@
#include <ReadLayer.hpp>
#include <stdexcept> // std::invalid_argument
#include <limits> // numeric_limits
#include <cmath> // log(), exp(), pow()
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <Statistics.hpp> // poisson_pmf()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ThreadPool.hpp>
#include <iostream>
typedef std::vector<double> vector_d ;
ReadLayer::ReadLayer(const Matrix2D<int>& data,
                     size_t n_class,
                     size_t n_shift,
                     bool flip,
                     bool bckg_class,
                     ThreadPool* threads)
    : Data2DLayer(data, n_class, n_shift, flip, bckg_class),
      window_means(n_row, n_shift, 0.)
{   // read data carry a single category per position
    this->n_category = 1 ;
    // start from a zero-filled model
    this->model = Matrix3D<double>(this->n_class,
                                   this->l_model,
                                   this->n_category,
                                   0) ;
    // if requested, turn the last class into a constant background class
    if(this->bckg_class)
    {   this->create_bckg_class() ; }
    // precompute the per-window mean read counts
    this->compute_window_means(threads) ;
}
ReadLayer::ReadLayer(Matrix2D<int>&& data,
                     size_t n_class,
                     size_t n_shift,
                     bool flip,
                     bool bckg_class,
                     ThreadPool* threads)
    : Data2DLayer(std::move(data), n_class, n_shift, flip, bckg_class),
      window_means(n_row, n_shift, 0.)
{   // read data carry a single category per position
    this->n_category = 1 ;
    // start from a zero-filled model
    this->model = Matrix3D<double>(this->n_class,
                                   this->l_model,
                                   this->n_category,
                                   0) ;
    // if requested, turn the last class into a constant background class
    if(this->bckg_class)
    {   this->create_bckg_class() ; }
    // precompute the per-window mean read counts
    this->compute_window_means(threads) ;
}
ReadLayer::ReadLayer(const Matrix2D<int>& data,
                     const Matrix3D<double>& model,
                     bool flip,
                     bool bckg_class,
                     ThreadPool* threads)
    : Data2DLayer(data, model, flip, bckg_class),
      window_means(n_row, n_shift, 0.)
{   // a read model must have exactly one category (3rd dimension of length 1)
    if(this->n_category > 1)
    {   char msg[4096] ;
        sprintf(msg,
                "Error! model is expected to have length 1 on "
                "3rd dimension, not %zu",
                this->n_category) ;
        throw std::invalid_argument(msg) ;
    }
    // precompute the per-window mean read counts
    this->compute_window_means(threads) ;
}
ReadLayer::ReadLayer(Matrix2D<int>&& data,
                     Matrix3D<double>&& model,
                     bool flip,
                     bool bckg_class,
                     ThreadPool* threads)
    : Data2DLayer(std::move(data), std::move(model), flip, bckg_class),
      window_means(n_row, n_shift, 0.)
{   // a read model must have exactly one category (3rd dimension of length 1)
    if(this->n_category > 1)
    {   char msg[4096] ;
        sprintf(msg,
                "Error! model is expected to have length 1 on "
                "3rd dimension, not %zu",
                this->n_category) ;
        throw std::invalid_argument(msg) ;
    }
    // precompute the per-window mean read counts
    this->compute_window_means(threads) ;
}
// Destructor ; all members release themselves (RAII), nothing to do.
ReadLayer::~ReadLayer()
{ ; }
// Computes the log likelihood of every data row, for each class, shift and
// flip state, into <loglikelihood>, and the per-row maximum into
// <loglikelihood_max>. The actual work is done by
// compute_loglikelihoods_routine(), either on the calling thread (threads ==
// nullptr) or split row-wise over the given thread pool.
void ReadLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
                                       vector_d& loglikelihood_max,
                                       ThreadPool* threads) const
{   // dimension checks
    this->check_loglikelihood_dim(loglikelihood) ;
    this->check_loglikelihood_max_dim(loglikelihood_max) ;
    // don't parallelize
    if(threads == nullptr)
    {   std::promise<bool> promise ;
        std::future<bool> future = promise.get_future() ;
        // process all rows [0, n_row) on this thread
        this->compute_loglikelihoods_routine(0,
                                             this->n_row,
                                             std::ref(loglikelihood),
                                             std::ref(loglikelihood_max),
                                             promise) ;
        future.get() ;
    }
    // parallelize
    else
    {   size_t n_threads = threads->getNThread() ;
        // compute the slices on which each thread will work
        std::vector<std::pair<size_t,size_t>> slices =
                ThreadPool::split_range(0, this->n_row, n_threads) ;
        // get promises and futures
        // the function run by the threads will simply fill the promise with
        // "true" to indicate that they are done
        std::vector<std::promise<bool>> promises(n_threads) ;
        std::vector<std::future<bool>> futures(n_threads) ;
        for(size_t i=0; i<n_threads; i++)
        {   futures[i] = promises[i].get_future() ; }
        // distribute work to threads
        // -------------------------- threads start --------------------------
        for(size_t i=0; i<n_threads; i++)
        {   auto slice = slices[i] ;
            threads->addJob(std::move(
                                std::bind(&ReadLayer::compute_loglikelihoods_routine,
                                          this,
                                          slice.first,
                                          slice.second,
                                          std::ref(loglikelihood),
                                          std::ref(loglikelihood_max),
                                          std::ref(promises[i])))) ;
        }
        // wait until all threads are done working
        for(auto& future : futures)
        {   future.get() ; }
        // -------------------------- threads stop ---------------------------
    }
}
// The routine that effectively computes the Poisson log likelihoods for the
// rows in [from, to), for each class, shift and flip state. A copy of the
// model normalized to an average signal of 1 is used for scoring, so the
// original model is left untouched. The per-row maximum is recorded in
// loglikelihood_max. <done> is fulfilled when the routine completes.
void ReadLayer::compute_loglikelihoods_routine(size_t from,
                                               size_t to,
                                               Matrix4D<double>& loglikelihood,
                                               vector_d& loglikelihood_max,
                                               std::promise<bool>& done) const
{
    // normalize the models (mean signal of 1 per class)
    Matrix3D<double> model_norm = this->model ;
    for(size_t i=0; i<this->n_class; i++)
    {   double mean = 0. ;
        for(size_t j=0; j<this->l_model; j++)
        {   mean += model_norm(i,j,0) ; }
        mean /= this->l_model ;
        for(size_t j=0; j<this->l_model; j++)
        {   model_norm(i,j,0) /= mean ; }
    }
    // compute log likelihood
    for(size_t i=from; i<to; i++)
    {   // set max to min possible value
        loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
        for(size_t j=0; j<this->n_class; j++)
        {   for(size_t s_fw=0, s_rev=this->n_shift-1;
                s_fw<this->n_shift; s_fw++, s_rev--)
            {   // slice is [from_fw,to)
                // from_dat_fw    to_dat_fw    [from_dat_fw, to_dat_fw]
                // fw    |---------->>>----------|
                //           ----------------------------------> data
                // rev   |----------<<<----------|   [from_dat_rev, to_dat_rev]
                // to_dat_rev can be -1 -> int
                // to_dat_rev      from_dat_rev
                // log likelihood
                double ll_fw  = 0. ;
                double ll_rev = 0. ;
                // --------------- forward ---------------
                size_t from_dat_fw = s_fw ;
                size_t to_dat_fw   = from_dat_fw + this->l_model - 1 ;
                // --------------- reverse ---------------
                size_t from_dat_rev = this->n_col - 1 - s_fw ;
                // size_t to_dat_rev = from_dat_rev - (this->l_model - 1) ;
                // NOTE(review): this loop uses "<" and thus scores l_model-1
                // positions, whereas update_model_routine() uses "<=" and
                // covers l_model positions -- confirm whether skipping the
                // last model position here is intentional.
                for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev;
                    j_dat_fw<to_dat_fw;
                    j_dat_fw++, j_ref_fw++, j_dat_rev--)
                {
                    double ll ;
                    // --------------- forward ---------------
                    ll = log(poisson_pmf(this->data(i,j_dat_fw),
                                         model_norm(j,j_ref_fw,0)*
                                         this->window_means(i,s_fw))) ;
                    // p(A|B) may be so unlikely that it is rounded to 0,
                    // giving log(0) = -inf ; clamp to the lowest authorized
                    // log probability instead
                    if(std::isinf(ll))
                    {   ll = ReadLayer::p_min_log ; }
                    ll_fw += ll ;
                    // --------------- reverse ---------------
                    if(this->flip)
                    {   ll = log(poisson_pmf(this->data(i,j_dat_rev),
                                             model_norm(j,j_ref_fw,0)*
                                             this->window_means(i,s_rev))) ;
                        // same clamping as on the forward strand
                        if(std::isinf(ll))
                        {   ll = ReadLayer::p_min_log ; }
                        ll_rev += ll ;
                    }
                }
                loglikelihood(i,j,from_dat_fw,flip_states::FORWARD) = ll_fw ;
                // keep track of the max per row
                if(ll_fw > loglikelihood_max[i])
                {   loglikelihood_max[i] = ll_fw ; }
                if(this->flip)
                {   loglikelihood(i,j,from_dat_fw,flip_states::REVERSE) = ll_rev ;
                    // keep track of the max per row
                    if(ll_rev > loglikelihood_max[i])
                    {   loglikelihood_max[i] = ll_rev ; }
                }
            }
        }
    }
    done.set_value(true) ;
}
void ReadLayer::update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads)
{
// computing sum over the columns (classes)
size_t n_row = posterior_prob.get_dim()[0] ;
size_t n_class = posterior_prob.get_dim()[1] ;
size_t n_shift = posterior_prob.get_dim()[2] ;
size_t n_flip = posterior_prob.get_dim()[3] ;
vector_d colsum(n_class, 0.) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ colsum[j] += posterior_prob(i,j,k,l) ; }
}
}
}
this->update_model(posterior_prob,
colsum,
threads) ;
}
// Updates the model from the posterior probabilities, using the precomputed
// per-class sums <posterior_prob_colsum>. The work is delegated to
// update_model_routine(), either on the calling thread or split row-wise over
// the thread pool (partial models are then summed). The last class is left
// untouched when this->bckg_class is set, and every model value is finally
// clamped to at least ReadLayer::p_min so that poisson_pmf() never returns 0.
void ReadLayer::update_model(const Matrix4D<double>& posterior_prob,
                             const vector_d& posterior_prob_colsum,
                             ThreadPool* threads)
{
    // don't parallelize
    if(threads == nullptr)
    {   std::promise<Matrix3D<double>> promise ;
        std::future<Matrix3D<double>> future = promise.get_future() ;
        this->update_model_routine(0,
                                   this->n_row,
                                   posterior_prob,
                                   posterior_prob_colsum,
                                   promise) ;
        // this->model = future.get() ;
        auto model = future.get() ;
        // copy back only the classes to update (the background class, if
        // any, is the last one and stays constant)
        size_t n_class_to_update = this->n_class - this->bckg_class ;
        for(size_t i=0; i<n_class_to_update; i++)
        {   for(size_t j=0; j<this->l_model; j++)
            {   for(size_t k=0; k<this->n_category; k++)
                {   this->model(i,j,k) = model(i,j,k) ; }
            }
        }
    }
    // parallelize
    else
    {   size_t n_threads = threads->getNThread() ;
        // compute the slices on which each thread will work
        std::vector<std::pair<size_t,size_t>> slices =
                ThreadPool::split_range(0, this->n_row, n_threads) ;
        // get promises and futures
        // the function run by the threads will simply fill the promise with
        // "true" to indicate that they are done
        std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
        std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
        for(size_t i=0; i<n_threads; i++)
        {   futures[i] = promises[i].get_future() ; }
        // distribute work to threads
        // -------------------------- threads start --------------------------
        for(size_t i=0; i<n_threads; i++)
        {   auto slice = slices[i] ;
            threads->addJob(std::move(
                                std::bind(&ReadLayer::update_model_routine,
                                          this,
                                          slice.first,
                                          slice.second,
                                          std::ref(posterior_prob),
                                          std::ref(posterior_prob_colsum),
                                          std::ref(promises[i])))) ;
        }
        // reinitialise the model
        /*
        this->model = Matrix3D<double>(this->n_class,
                                       this->l_model,
                                       this->n_category,
                                       0.) ;
        */
        // zero only the classes that will be recomputed
        size_t n_class_to_update = this->n_class - this->bckg_class ;
        for(size_t i=0; i<n_class_to_update; i++)
        {   for(size_t j=0; j<this->l_model; j++)
            {   for(size_t k=0; k<this->n_category; k++)
                {   this->model(i,j,k) = 0. ; }
            }
        }
        // wait until all threads are done working
        // and update the mode
        for(auto& future : futures)
        {   Matrix3D<double> model_part = future.get() ;
            // for(size_t i=0; i<this->n_class; i++)
            // sum the partial models computed by each thread
            for(size_t i=0; i<n_class_to_update; i++)
            {   for(size_t j=0; j<this->l_model; j++)
                {   for(size_t k=0; k<this->n_category; k++)
                    {   this->model(i,j,k) +=
                                model_part(i,j,k) ;
                    }
                }
            }
        }
        // -------------------------- threads stop ---------------------------
    }
    // avoid 0's in the model to ensure that pmf_poisson() never
    // return 0
    for(size_t i=0; i<this->n_class; i++)
    {   for(size_t j=0; j<this->l_model; j++)
        {   for(size_t k=0; k<this->n_category; k++)
            {   this->model(i,j,k) =
                        std::max(this->model(i,j,k), ReadLayer::p_min) ;
            }
        }
    }
}
// The routine that effectively computes a partial model update from the rows
// in [from, to). For every class to update (the background class, if any, is
// skipped), every row and every shift, the data counts are added to the model
// weighted by the corresponding posterior probability and normalized by the
// class posterior sum. The partial model is returned through <promise>.
void ReadLayer::update_model_routine(size_t from,
                                     size_t to,
                                     const Matrix4D<double>& posterior_prob,
                                     const vector_d& posterior_prob_colsum,
                                     std::promise<Matrix3D<double>>& promise) const
{
    // dimension checks
    this->check_posterior_prob_dim(posterior_prob) ;
    this->check_posterior_prob_colsum_dim(posterior_prob_colsum) ;
    // partial model
    Matrix3D<double> model(this->n_class,
                           this->l_model,
                           this->n_category,
                           0.) ;
    // the background class (last one), if present, is never updated
    size_t n_class_to_update = this->n_class - this->bckg_class ;
    for(size_t n_class=0; n_class < n_class_to_update; n_class++)
    {   for(size_t i=from; i<to; i++)
        {   for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
            {   // --------------- forward ---------------
                // data window [from_dat_fw, to_dat_fw] aligned with the model
                int from_dat_fw = n_shift ;
                int to_dat_fw   = from_dat_fw + this->l_model - 1 ;
                for(int j_dat_fw=from_dat_fw, j_ref_fw=0;
                    j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++)
                {   model(n_class,j_ref_fw,0) +=
                            (posterior_prob(i,n_class,n_shift,flip_states::FORWARD) *
                            this->data(i,j_dat_fw)) /
                            posterior_prob_colsum[n_class] ;
                }
                // --------------- reverse ---------------
                if(this->flip)
                {   // data window walked backwards: [to_dat_rev, from_dat_rev]
                    int from_dat_rev = this->n_col - 1 - n_shift ;
                    int to_dat_rev   = from_dat_rev - (this->l_model - 1) ;
                    for(int j_dat_rev=from_dat_rev, j_ref_fw=0;
                        j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++)
                    {   model(n_class,j_ref_fw,0) +=
                                (posterior_prob(i,n_class,n_shift,flip_states::REVERSE) *
                                this->data(i,j_dat_rev)) /
                                posterior_prob_colsum[n_class] ;
                    }
                }
            }
        }
    }
    promise.set_value(model) ;
}
// Computes the mean read count of every l_model-long window (one per shift
// state) of every data row, storing the results in this->window_means. The
// work is delegated to compute_window_means_routine(), either on the calling
// thread (threads == nullptr) or split row-wise over the thread pool.
void ReadLayer::compute_window_means(ThreadPool* threads)
{   // don't parallelize
    if(threads == nullptr)
    {   std::promise<bool> promise ;
        std::future<bool> future = promise.get_future() ;
        this->compute_window_means_routine(0, this->n_row, promise) ;
        future.get() ;
    }
    // parallelize
    else
    {   size_t n_threads = threads->getNThread() ;
        // compute the slices on which each thread will work
        std::vector<std::pair<size_t,size_t>> slices =
                ThreadPool::split_range(0, this->n_row, n_threads) ;
        // get promises and futures
        // the function run by the threads will simply fill the promise with
        // "true" to indicate that they are done
        std::vector<std::promise<bool>> promises(n_threads) ;
        std::vector<std::future<bool>> futures(n_threads) ;
        for(size_t i=0; i<n_threads; i++)
        {   futures[i] = promises[i].get_future() ; }
        // distribute work to threads
        // -------------------------- threads start --------------------------
        for(size_t i=0; i<n_threads; i++)
        {   auto slice = slices[i] ;
            threads->addJob(std::move(
                                std::bind(&ReadLayer::compute_window_means_routine,
                                          this,
                                          slice.first,
                                          slice.second,
                                          std::ref(promises[i])))) ;
        }
        // wait until all threads are done working
        for(auto& future : futures)
        {   future.get() ; }
        // -------------------------- threads stop ---------------------------
    }
}
// The routine that effectively computes the window means for the rows in
// [from, to). For each row and each shift state s, the mean of the data
// counts over the window [s, s+l_model) is stored in this->window_means.
// <done> is fulfilled when the routine completes.
// Fix: the inner loop variables used to shadow the <from>/<to> parameters,
// which was confusing and error-prone ; they are now named distinctly.
void ReadLayer::compute_window_means_routine(size_t from,
                                             size_t to,
                                             std::promise<bool>& done)
{   double l_window = double(this->l_model) ;
    for(size_t i=from; i<to; i++)
    {   // one window per shift state
        for(size_t s=0; s<this->n_shift; s++)
        {   double sum = 0. ;
            // window slice is [s, win_end)
            size_t win_end = s + this->l_model ;
            for(size_t j=s; j<win_end; j++)
            {   sum += this->data(i,j) ; }
            this->window_means(i,s) = sum / l_window ;
        }
    }
    done.set_value(true) ;
}
// Checks that the given vector contains exactly one value per class ; throws
// std::invalid_argument otherwise.
void ReadLayer::check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const
{   // early exit on the valid case
    if(posterior_prob_colsum.size() == this->n_class)
    {   return ; }
    char msg[4096] ;
    sprintf(msg,
            "Error! posterior_class_prob matrix size is not "
            "equal to model class number : %zu / %zu",
            posterior_prob_colsum.size(), this->n_class) ;
    throw std::invalid_argument(msg) ;
}
// Turns the last class into a flat background model: every position of that
// class is set to the overall mean count of the data.
void ReadLayer::create_bckg_class()
{   // total number of cells in the data matrix
    double n = (double)this->data.get_nrow() *
               (double)this->data.get_ncol() ;
    // accumulate the overall mean count
    double mean = 0. ;
    for(size_t i=0; i<this->data.get_nrow(); i++)
    {   for(size_t j=0; j<this->data.get_ncol(); j++)
        {   mean += ((double)this->data(i,j)) / n ; }
    }
    // fill the last class with the mean at every position
    for(size_t j=0; j<this->l_model; j++)
    {   this->model(this->n_class-1,j,0) = mean ; }
}
diff --git a/src/Clustering/ReadLayer.hpp b/src/Clustering/ReadLayer.hpp
index 5d52c92..cd001e4 100644
--- a/src/Clustering/ReadLayer.hpp
+++ b/src/Clustering/ReadLayer.hpp
@@ -1,269 +1,278 @@
#ifndef READLAYER_HPP
#define READLAYER_HPP
#include <Data2DLayer.hpp>
#include <future> // std::promise
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ThreadPool.hpp>
typedef std::vector<double> vector_d ;
class ReadLayer : public Data2DLayer
{
    public:
        /*!
         * \brief Constructs an object with the
         * given data and an empty (0 values)
         * model.
         * \param data the data.
         * \param n_class the number of classes
         * of the model.
         * \param n_shift the number of shift
         * states of the model.
         * \param flip whether flipping is allowed.
         * \param bckg_class whether the last class
         * of the model is constant and models the
         * background.
         * \param threads a pointer to a thread pool to
         * parallelize the computations. If nullptr is given,
         * the computations are performed by the main thread.
         */
        ReadLayer(const Matrix2D<int>& data,
                  size_t n_class,
                  size_t n_shift,
                  bool flip,
                  bool bckg_class,
                  ThreadPool* threads = nullptr) ;
        /*!
         * \brief Constructs an object with the
         * given data and an empty (0 values)
         * model.
         * \param data the data.
         * \param n_class the number of classes
         * of the model.
         * \param n_shift the number of shift
         * states of the model.
         * \param flip whether flipping is allowed.
         * \param bckg_class whether the last class
         * of the model is constant and models the
         * background.
         * \param threads a pointer to a thread pool to
         * parallelize the computations. If nullptr is given,
         * the computations are performed by the main thread.
         */
        ReadLayer(Matrix2D<int>&& data,
                  size_t n_class,
                  size_t n_shift,
                  bool flip,
                  bool bckg_class,
                  ThreadPool* threads = nullptr) ;
        /*!
         * \brief Construct an object with the
         * given data and model.
         * \param data the data.
         * \param model the model.
         * \param flip whether flipping is allowed.
         * \param bckg_class whether the last class
         * of the model is constant and models the
         * background.
         * \param threads a pointer to a thread pool to
         * parallelize the computations. If nullptr is given,
         * the computations are performed by the main thread.
         */
        ReadLayer(const Matrix2D<int>& data,
                  const Matrix3D<double>& model,
                  bool flip,
                  bool bckg_class,
                  ThreadPool* threads = nullptr) ;
        /*!
         * \brief Construct an object with the
         * given data and model.
         * \param data the data.
         * \param model the model.
         * \param flip whether flipping is allowed.
         * \param bckg_class whether the last class
         * of the model is constant and models the
         * background.
         * \param threads a pointer to a thread pool to
         * parallelize the computations. If nullptr is given,
         * the computations are performed by the main thread.
         */
        ReadLayer(Matrix2D<int>&& data,
                  Matrix3D<double>&& model,
                  bool flip,
                  bool bckg_class,
                  ThreadPool* threads = nullptr) ;
        /*!
         * Destructor
         */
        virtual ~ReadLayer() override ;
        /*!
         * \brief Computes the log likelihood of the data
         * given the current model parameters.
         * During this process, a normalized version of the
         * models, having a sum of signal of 1 count in average,
         * is used (a copy of the models is normalized, meaning
         * that the original models can still be retrieved with the
         * dedicated getter).
         * \param loglikelihood a matrix to store the
         * results. It should have the following dimensions :
         * 1st : same as the data number of row
         * 2nd : same as the model number of classes
         * 3rd : same as the number of shifts
         * 4th : same as the number of flip states
         * \param loglikelihood_max a vector containing the
         * max value for each row of loglikelihood.
         * Its length should be equal to the data row number.
         * \param threads a pointer to a thread pool to
         * parallelize the computations. If nullptr is given,
         * the computations are performed by the main thread.
         * \throw std::invalid_argument if the dimensions are
         * incorrect.
         */
        virtual void compute_loglikelihoods(Matrix4D<double>& loglikelihood,
                                            vector_d& loglikelihood_max,
                                            ThreadPool* threads=nullptr) const override ;
        /*!
         * \brief Updates the model given the posterior
         * probabilities (the probabilities of each row
         * in the data to be assigned to each class,
         * for each shift and flip state).
         * 0 values inside the model are forbidden.
         * The lowest possible value inside the model
         * is ReadLayer::p_min.
         * \param posterior_prob the data assignment
         * probabilities to the different classes.
         * \param threads a pointer to a thread pool to
         * parallelize the computations. If nullptr is given,
         * the computations are performed by the main thread.
         */
        virtual void update_model(const Matrix4D<double>& posterior_prob,
                                  ThreadPool* threads=nullptr) override ;
        /*!
         * \brief Updates the model given the posterior
         * probabilities (the probabilities of each row
         * in the data to be assigned to each class,
         * for each shift and flip state).
         * This method does the same as the virtual method it
         * overloads. The only difference is that, for run time
         * gain, it is given the sum over the columns of the
         * posterior_prob matrix which is computed by the virtual
         * method.
         * 0 values inside the model are forbidden.
         * The lowest possible value inside the model
         * is ReadLayer::p_min.
         * \param posterior_prob the data assignment probabilities to
         * the different classes.
         * \param posterior_prob_colsum the sum over the columns
         * (classes) of the posterior_prob matrix.
         * \param threads a pointer to a thread pool to
         * parallelize the computations. If nullptr is given,
         * the computations are performed by the main thread.
         */
        void update_model(const Matrix4D<double>& posterior_prob,
                          const vector_d& posterior_prob_colsum,
                          ThreadPool* threads=nullptr) ;
    protected:
        /*!
         * \brief The routine that effectively performs the
         * loglikelihood computations.
         * \param from the index of the first row of the data
         * to be considered.
         * \param to the index of the past last row of the data
         * to be considered.
         * \param loglikelihood a matrix to store the
         * results. It should have the following dimensions :
         * 1st : same as the data number of row
         * 2nd : same as the model number of classes
         * 3rd : same as the number of shifts
         * 4th : same as the number of flip states
         * \param loglikelihood_max a vector containing the
         * max value for each row of log_likelihood.
         * Its length should be equal to the data row number.
         * \param done a promise to be filled when the routine
         * is done running.
         */
        void compute_loglikelihoods_routine(size_t from,
                                            size_t to,
                                            Matrix4D<double>& loglikelihood,
                                            vector_d& loglikelihood_max,
                                            std::promise<bool>& done) const ;
        /*!
         * \brief The routine that effectively update the model.
         * 0 values inside the model are forbidden.
         * The lowest possible value inside the model
         * is ReadLayer::p_min.
         * \param from the index of the first row of the
         * posterior probabilities to be considered.
         * \param to the index of the past last row of the
         * posterior probabilities to be considered.
         * \param posterior_prob the data assignment probabilities
         * to the different classes.
         * \param posterior_prob_colsum the sum over the columns
         * (classes) of the posterior_prob matrix.
         * \param promise a promise containing the partial model
         * computed from the given data slice. If several routines
         * work together to update the model, the promise matrices
         * need to be summed up to get the final model.
         */
        void update_model_routine(size_t from,
                                  size_t to,
                                  const Matrix4D<double>& posterior_prob,
                                  const vector_d& posterior_prob_colsum,
                                  std::promise<Matrix3D<double>>& promise) const ;
        /*!
         * \brief Computes the mean number of reads present in
         * each slice (of length l_model), in each row
         * of the data and store them in this->window_means.
         * \param threads a pointer to a thread pool to
         * parallelize the computations. If nullptr is given,
         * the computations are performed by the main thread.
         */
        void compute_window_means(ThreadPool* threads) ;
        /*!
         * \brief The routine that effectively computes the
         * window means.
         * \param from the index of the first row of the
         * data to be considered.
         * \param to the index of the past last row of the
         * data to be considered.
         * \param done a promise to fill when the routine
         * is done running.
         */
        void compute_window_means_routine(size_t from,
                                          size_t to,
                                          std::promise<bool>& done) ;
        /*!
         * \brief Checks that the argument has compatible
         * dimensions with the data and models. If this is
         * not the case, throw a std::invalid_argument with
         * a relevant message.
         * \param posterior_prob_colsum a vector containing the
         * class probabilities.
         * It should have a length equal to the number of
         * classes.
         * \throw std::invalid_argument if the dimensions are
         * incorrect.
         */
        void check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const ;
        /*!
         * \brief Sets the last class as background class with the mean
         * number of counts in the data at each position.
         */
        void create_bckg_class() ;
        /*!
         * \brief contains the data means, for
         * each window of size l_model.
         */
        Matrix2D<double> window_means ;
} ;
#endif // READLAYER_HPP
diff --git a/src/Clustering/SequenceLayer.cpp b/src/Clustering/SequenceLayer.cpp
index 8906c96..d14859f 100644
--- a/src/Clustering/SequenceLayer.cpp
+++ b/src/Clustering/SequenceLayer.cpp
@@ -1,432 +1,432 @@
#include <SequenceLayer.hpp>
#include <stdexcept> // std::invalid_argument
#include <limits> // numeric_limits
#include <cmath> // log(), pow()
#include <vector>
#include <algorithm> // std::max_element()
#include <future> // std::promise, std::future
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp> // dna::base_composition()
// Scores the subsequence of row <row> starting at <start> against the given
// log probability model. N characters get the best score of the model column ;
// A,C,G,T get their own log probability. Returns the summed log likelihood.
// Fix: the argument checks are reordered so that "model longer than the
// sequences" is rejected BEFORE computing seq.get_ncol() - model_log.get_nrow(),
// which would otherwise wrap around (both operands are unsigned).
double SequenceLayer::score_subseq(const Matrix2D<int>& seq,
                                   size_t row,
                                   size_t start,
                                   const Matrix2D<double>& model_log)
{
    // a sequence model must have exactly 4 columns (A,C,G,T)
    if(model_log.get_ncol() != 4)
    {   char msg[4096] ;
        sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
                model_log.get_ncol()) ;
        throw std::invalid_argument(msg) ;
    }
    // reject models longer than the sequences before doing unsigned
    // subtraction below
    else if(model_log.get_nrow() > seq.get_ncol())
    {   char msg[4096] ;
        sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
                model_log.get_nrow(), seq.get_ncol()) ;
        throw std::invalid_argument(msg) ;
    }
    // the model must fit entirely inside the sequence from <start>
    else if(start > seq.get_ncol() - model_log.get_nrow())
    {   char msg[4096] ;
        sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
                start, seq.get_ncol() - model_log.get_nrow()) ;
        throw std::invalid_argument(msg) ;
    }

    size_t from = start ;
    size_t to   = from + model_log.get_nrow() ; // will score [from, to)
    int n_code  = dna::char_to_int('N') ;
    double ll = 0 ;
    for(size_t i=from, j=0; i<to; i++, j++)
    {   int base = seq(row,i) ;
        // N char -> get max score of the model column
        if(base == n_code)
        {   std::vector<double> row = model_log.get_row(j) ;
            ll += *(std::max_element(std::begin(row),
                                     std::end(row))) ;
        }
        // A,C,G,T -> get its score
        else
        {   ll += model_log(j,base) ; }
    }
    return ll ;
}
SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
                             size_t n_class,
                             size_t n_shift,
                             bool flip,
                             bool bckg_class)
    : Data2DLayer(data, n_class, n_shift, flip,bckg_class)
{   // sequence data have 4 categories (A,C,G,T)
    this->n_category = 4 ;
    // start from a zero-filled model
    this->model = Matrix3D<double>(this->n_class,
                                   this->l_model,
                                   this->n_category,
                                   0.) ;
    // if requested, turn the last class into a constant background class
    if(this->bckg_class)
    {   this->create_bckg_class() ; }
}
SequenceLayer::SequenceLayer(Matrix2D<int>&& data,
                             size_t n_class,
                             size_t n_shift,
                             bool flip,
                             bool bckg_class)
    : Data2DLayer(std::move(data), n_class, n_shift, flip, bckg_class)
{   // sequence data have 4 categories (A,C,G,T)
    this->n_category = 4 ;
    // start from a zero-filled model
    this->model = Matrix3D<double>(this->n_class,
                                   this->l_model,
                                   this->n_category,
                                   0.) ;
    // if requested, turn the last class into a constant background class
    if(this->bckg_class)
    {   this->create_bckg_class() ; }
}
// Constructs an object from the given data and an already initialised model.
SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
                             const Matrix3D<double>& model,
                             bool flip,
                             bool bckg_class)
    : Data2DLayer(data, model,flip, bckg_class)
{ ; }
// Constructs an object by taking ownership of the given data and model.
SequenceLayer::SequenceLayer(Matrix2D<int>&& data,
                             Matrix3D<double>&& model,
                             bool flip,
                             bool bckg_class)
    : Data2DLayer(std::move(data), std::move(model),flip, bckg_class)
{ ; }
// Destructor ; all members release themselves (RAII), nothing to do.
SequenceLayer::~SequenceLayer()
{}
// Computes the log likelihood of every sequence row, for each class, shift
// and flip state, into <loglikelihood>, and the per-row maximum into
// <loglikelihood_max>. The class models are first turned into log
// probability matrices (forward and reverse complement), then the scoring is
// delegated to compute_loglikelihoods_routine(), either on the calling
// thread (threads == nullptr) or split row-wise over the thread pool.
void SequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
                                           vector_d& loglikelihood_max,
                                           ThreadPool* threads) const
{
    // dimension checks
    this->check_loglikelihood_dim(loglikelihood) ;
    this->check_loglikelihood_max_dim(loglikelihood_max) ;

    // compute the log prob model and the log prob reverse-complement model
    std::vector<Matrix2D<double>> model_log(this->n_class,
                                            Matrix2D<double>(this->l_model,
                                                             this->n_category,
                                                             0.)) ;
    std::vector<Matrix2D<double>> model_log_rev = model_log ;
    for(size_t i=0; i<this->n_class; i++)
    {   for(size_t j=0; j<this->l_model; j++)
        {   for(size_t k=0; k<this->n_category; k++)
            {   // forward
                model_log[i](j,k) = log(this->model(i,j,k)) ;
                // reverse: position and base are both mirrored
                // (reverse complement)
                model_log_rev[i](this->l_model-j-1,this->n_category-k-1)
                        = log(this->model(i,j,k)) ;
            }
        }
    }

    // don't parallelize
    if(threads == nullptr)
    {   std::promise<bool> promise ;
        std::future<bool> future = promise.get_future() ;
        this->compute_loglikelihoods_routine(0, this->n_row,
                                             loglikelihood,
                                             loglikelihood_max,
                                             model_log,
                                             model_log_rev,
                                             promise) ;
        future.get() ;
    }
    // parallelize
    else
    {   size_t n_threads = threads->getNThread() ;
        // compute the slices on which each thread will work
        std::vector<std::pair<size_t,size_t>> slices =
                ThreadPool::split_range(0, this->n_row, n_threads) ;
        // get promises and futures
        // the function run by the threads will simply fill the promise with
        // "true" to indicate that they are done
        std::vector<std::promise<bool>> promises(n_threads) ;
        std::vector<std::future<bool>> futures(n_threads) ;
        for(size_t i=0; i<n_threads; i++)
        {   futures[i] = promises[i].get_future() ; }
        // distribute work to threads
        // -------------------------- threads start --------------------------
        for(size_t i=0; i<n_threads; i++)
        {   auto slice = slices[i] ;
            threads->addJob(std::move(
                                std::bind(&SequenceLayer::compute_loglikelihoods_routine,
                                          this,
                                          slice.first,
                                          slice.second,
                                          std::ref(loglikelihood),
                                          std::ref(loglikelihood_max),
                                          std::ref(model_log),
                                          std::ref(model_log_rev),
                                          std::ref(promises[i])))) ;
        }
        // wait until all threads are done working
        for(auto& future : futures)
        {   future.get() ; }
        // -------------------------- threads stop ---------------------------
    }
}
void SequenceLayer::compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model_log,
const std::vector<Matrix2D<double>>& model_log_rev,
std::promise<bool>& done) const
{
// compute log likelihood
for(size_t i=from; i<to; i++)
{
// set max to min possible value
loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
for(size_t j=0; j<this->n_class; j++)
- { for(size_t s=0; s<this->n_shift; s++)
+ {
+ for(size_t s=0; s<this->n_shift; s++)
{ // forward strand
- { double ll_fw = std::max(score_subseq(this->data, i, s, model_log[j]),
- SequenceLayer::p_min_log) ;
+ { double ll_fw = score_subseq(this->data, i, s, model_log[j]) ;
// loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// apply lower bound here -> log(1e-100)
loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// keep track of max per row
if(ll_fw > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_fw ; }
}
// reverse
if(this->flip)
- { double ll_rev = std::max(score_subseq(this->data, i, s, model_log_rev[j]),
- SequenceLayer::p_min_log) ;
+ { double ll_rev = score_subseq(this->data, i, s, model_log_rev[j]) ;
// loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// apply lower bound here -> log(1e-100)
loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// keep track of max per row
if(ll_rev > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_rev ; }
}
}
}
}
done.set_value(true) ;
}
+
void SequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
                                 ThreadPool* threads)
{   // Re-estimates the model (class x position x category probabilities)
    // from the data and the posterior probabilities
    // (row x class x shift x flip). Only the first
    // n_class - bckg_class classes are re-estimated : the background
    // class (the last one), if any, is left untouched by the
    // accumulation phase (it is still lower-bounded and re-normalized
    // with the others at the end).
    // don't parallelize
    if(threads == nullptr)
    {   std::promise<Matrix3D<double>> promise ;
        std::future<Matrix3D<double>> future = promise.get_future() ;
        // process all data rows synchronously in this thread
        this->update_model_routine(0,
                                   this->n_row,
                                   posterior_prob,
                                   promise) ;
        // this->model = future.get() ;
        auto model = future.get() ;
        // copy back the re-estimated classes, skipping the background class
        size_t n_class_to_update = this->n_class - this->bckg_class ;
        for(size_t i=0; i<n_class_to_update; i++)
        {   for(size_t j=0; j<this->l_model; j++)
            {   for(size_t k=0; k<this->n_category; k++)
                {   this->model(i,j,k) = model(i,j,k) ; }
            }
        }
    }
    // parallelize
    else
    {   size_t n_threads = threads->getNThread() ;
        // compute the slices on which each thread will work
        std::vector<std::pair<size_t,size_t>> slices =
            ThreadPool::split_range(0, this->n_row, n_threads) ;
        // get promises and futures
        // each routine fills its promise with the partial model computed
        // from its own slice of the data rows
        std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
        std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
        for(size_t i=0; i<n_threads; i++)
        {   futures[i] = promises[i].get_future() ; }
        // distribute work to threads
        // -------------------------- threads start --------------------------
        for(size_t i=0; i<n_threads; i++)
        {   auto slice = slices[i] ;
            threads->addJob(std::move(
                std::bind(&SequenceLayer::update_model_routine,
                          this,
                          slice.first,
                          slice.second,
                          std::ref(posterior_prob),
                          std::ref(promises[i])))) ;
        }
        // reinitialise the model
        // (safe while the workers run : the routines accumulate into their
        // own local partial models, not into this->model)
        size_t n_class_to_update = this->n_class - this->bckg_class ;
        for(size_t i=0; i<n_class_to_update; i++)
        {   for(size_t j=0; j<this->l_model; j++)
            {   for(size_t k=0; k<this->n_category; k++)
                {   this->model(i,j,k) = 0. ; }
            }
        }
        // wait until all threads are done working
        // and update the model by summing the partial models
        for(auto& future : futures)
        {   Matrix3D<double> model_part = future.get() ;
            // for(size_t i=0; i<this->n_class; i++)
            for(size_t i=0; i<n_class_to_update; i++)
            {   for(size_t j=0; j<this->l_model; j++)
                {   for(size_t k=0; k<this->n_category; k++)
                    {   this->model(i,j,k) += model_part(i,j,k) ; }
                }
            }
        }
        // -------------------------- threads stop ---------------------------
    }
    // make sure to have no 0 values
    // (every parameter is lower-bounded by SequenceLayer::p_min)
    for(size_t i=0; i<this->n_class; i++)
    {   for(size_t j=0; j<this->l_model; j++)
        {   for(size_t k=0; k<this->n_category; k++)
            {   this->model(i,j,k) =
                    std::max(this->model(i,j,k), SequenceLayer::p_min) ;
            }
        }
    }
    // normalize to get probs
    // (each (class, position) distribution over the categories sums to 1)
    for(size_t i=0; i<this->n_class; i++)
    {   for(size_t j=0; j<this->l_model; j++)
        {   double sum = 0. ;
            for(size_t k=0; k<this->n_category; k++)
            {   sum += this->model(i,j,k) ; }
            for(size_t k=0; k<this->n_category; k++)
            {   double p = this->model(i,j,k) / sum ;
                this->model(i,j,k) = p ;
            }
        }
    }
}
void SequenceLayer::update_model_routine(size_t from,
                                         size_t to,
                                         const Matrix4D<double>& posterior_prob,
                                         std::promise<Matrix3D<double>>& promise) const
{   // Computes a partial, un-normalized model (class x position x category)
    // from the data rows [from, to) : every base occurrence contributes
    // the posterior probability of its (row, class, shift, flip) state.
    // The partial model is handed back through the promise ; partial
    // models produced by several routines must be summed and normalized
    // by the caller to obtain the final model.
    // dimension checks
    this->check_posterior_prob_dim(posterior_prob) ;
    Matrix3D<double> model(this->n_class,
                           this->l_model,
                           this->n_category,
                           0.) ;
    // the int code of A, C, G, T, N
    static int a_code = dna::char_to_int('A') ;
    static int c_code = dna::char_to_int('C') ;
    static int g_code = dna::char_to_int('G') ;
    static int t_code = dna::char_to_int('T') ;
    static int n_code = dna::char_to_int('N') ;
    // the int code of the reverse complement of A, C, G, T
    static int a_code_r = dna::char_to_int('A', true) ;
    static int c_code_r = dna::char_to_int('C', true) ;
    static int g_code_r = dna::char_to_int('G', true) ;
    static int t_code_r = dna::char_to_int('T', true) ;
    // the background class (last one), if any, is not re-estimated
    size_t n_class_to_update = this->n_class - this->bckg_class ;
    for(size_t k=0; k<n_class_to_update; k++)
    {   for(size_t s=0; s<this->n_shift; s++)
        {   // for(size_t j=0; j<this->l_model; j++)
            // j walks the model forward ; j_rv is the mirrored position
            // used to update the model from the reverse strand
            for(size_t j=0, j_rv=this->l_model-1; j<this->l_model; j++, j_rv--)
            {   // base prob on fw and rv strand
                vector_d base_prob_fw(this->n_category, 0.) ;
                vector_d base_prob_rv(this->n_category, 0.) ;
                for(size_t i=from; i<to; i++)
                {   int base = this->data(i,s+j) ;
                    // complement code (assumes the base coding is such that
                    // base + complement == n_category - 1 ; unused for N)
                    int base_rv = this->n_category - base - 1 ; // complement
                    // int base_rv = this->n_category - this->data(i,s+j_rv) - 1 ;
                    // std::cerr << k << " "
                    //           << s << " "
                    //           << j << "/" << j_rv << " "
                    //           << base << "/" << base_rv
                    //           << std::endl ;
                    // N : contributes its posterior mass to all four bases
                    // equivalently, on both strands
                    if(base == n_code)
                    {   // --------------- forward ---------------
                        {   base_prob_fw[a_code] +=
                                posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
                            base_prob_fw[c_code] +=
                                posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
                            base_prob_fw[g_code] +=
                                posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
                            base_prob_fw[t_code] +=
                                posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
                        }
                        // --------------- reverse ---------------
                        if(this->flip)
                        {   base_prob_rv[a_code_r] +=
                                posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
                            base_prob_rv[c_code_r] +=
                                posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
                            base_prob_rv[g_code_r] +=
                                posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
                            base_prob_rv[t_code_r] +=
                                posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
                        }
                    }
                    // A, C, G, T
                    else
                    {   // --------------- forward ---------------
                        {   base_prob_fw[base] +=
                                posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
                        }
                        // --------------- reverse ---------------
                        if(this->flip)
                        {   base_prob_rv[base_rv] +=
                                posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
                        }
                    }
                }
                // update this position of the model
                for(size_t i=0; i<base_prob_fw.size(); i++)
                {   // --------------- forward ---------------
                    {   model(k,j,i) += base_prob_fw[i] ; }
                    // --------------- reverse ---------------
                    // the reverse strand contribution goes to the mirrored
                    // model position
                    if(this->flip)
                    {   model(k,this->l_model-j-1,i) += base_prob_rv[i] ;
                        /* model(k,j_rv,i) += base_prob_rv[i] ; */
                    }
                }
            }
        }
    }
    promise.set_value(model) ;
}
void SequenceLayer::create_bckg_class()
{ // sequence composition
std::vector<double> base_comp =
dna::base_composition(this->data,
this->flip) ;
// set last class as background class
for(size_t i=0; i<this->n_category; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ this->model(this->n_class-1,j,i) = base_comp[i] ; }
}
}
diff --git a/src/Clustering/SequenceLayer.hpp b/src/Clustering/SequenceLayer.hpp
index 2238ff4..4cd5bf8 100644
--- a/src/Clustering/SequenceLayer.hpp
+++ b/src/Clustering/SequenceLayer.hpp
@@ -1,231 +1,237 @@
#ifndef SEQUENCELAYER_HPP
#define SEQUENCELAYER_HPP
#include <Data2DLayer.hpp>
#include <iostream>
#include <string>
#include <future> // std::promise
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ThreadPool.hpp>
typedef std::vector<double> vector_d ;
class SequenceLayer : public Data2DLayer
{
public:
/*!
* \brief Computes the log-likelihood of the sub-
* sequence - stored in a given row - and starting
* at the offset <col> in the given sequence matrix.
* The subsequence length is determined by the model
* length.
* \param seq the sequences in integer format.
* \param row the row containing the sequence of
* interest.
* \param col the index at which the sub-sequence
* is starting (1st index inside the subsequence
* of interest).
* \param model_log a model containing the log
* probability model.
* \return the log-likelihood of the sub-sequence
* given the model.
* \throw std::invalid_argument if 1) the offset is
* invalid, 2) the sequence and the model have
* incompatible dimensions or 3) the model 2n dimension
* is not 4 (A,C,G,T).
*/
static double score_subseq(const Matrix2D<int>& seq,
size_t row,
size_t col,
const Matrix2D<double>& model_log) ;
public:
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is held constant and models
* the background.
*/
SequenceLayer(const Matrix2D<int>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is held constant and models
* the background.
*/
SequenceLayer(Matrix2D<int>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Construct an object with the
* given data and model.
* The shifting freedom is set to (data number
* of columns) - (the model 2nd dimension)
* + 1.
* \param data the data. The sequences
* should be stored as integer values :
* A:0, C:1, G:2, T:3, else:5.
* \param model the model with the following
* dimensions :
* dim1 the number of classes
* dim2 the model length
* dim3 4 (A,C,G,T)
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is held constant and models
* the background.
*/
SequenceLayer(const Matrix2D<int>& data,
const Matrix3D<double>& model,
bool flip,
bool bckg_class) ;
/*!
* \brief Construct an object with the
* given data and model.
* The shifting freedom is set to (data number
* of columns) - (the model 2nd dimension)
* + 1.
* \param data the data. The sequences
* should be stored as integer values :
* A:0, C:1, G:2, T:3, else:5.
* \param model the model with the following
* dimensions :
* dim1 the number of classes
* dim2 the model length
* dim3 4 (A,C,G,T)
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is held constant and models
* the background.
*/
SequenceLayer(Matrix2D<int>&& data,
Matrix3D<double>&& model,
bool flip,
bool bckg_class) ;
/*!
* Destructor
*/
virtual ~SequenceLayer() override ;
/*!
* \brief Computes the log likelihood of the data
* given the current model parameters.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of row
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of loglikelihood.
* Its length should be equal to the data row number.
* \throw std::invalid_argument if the dimensions are
* incorrect.
*/
virtual void compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads=nullptr) const override ;
/*!
* \brief Updates the model given the posterior
* probabilities (the probabilities of each row
* in the data to be assigned to each class,
* for each shift and flip state).
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is SequenceLayer::p_min.
* \param posterior_prob the data assignment probabilities to
* the different classes.
*/
virtual void update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads=nullptr) override ;
protected:
/*!
* \brief The routine that effectively performs the
* loglikelihood computations.
* \param from the index of the first row of the data
* to be considered.
* \param to the index of the past last row of the data
* to be considered.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of row
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of log_likelihood.
* Its length should be equal to the data row number.
* \param model_log a vector containing the matrices with
* the log values of the model for each class.
* \param model_log_rev a vector containing the matrices with
* the log values of the reverse strand model for each class
* (the 1st position in the model becomes the last in the
* reverse model and probabilities are swapped A<->T and C<->G).
* \param done a promise to be filled when the routine
* is done running.
*/
void compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model_log,
const std::vector<Matrix2D<double>>& model_log_rev,
std::promise<bool>& done) const ;
/*!
* \brief The routine that effectively update the model.
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is SequenceLayer::p_min.
* \param from the index of the first row of the
* posterior probabilities to considered.
* \param to the index of the past last row of the
* posterior probabilities to considered.
* \param posterior_prob the data assignment probabilities
* to the different classes.
* \param done a promise containing the partial model
* computed from the given data slice. If several routines
* work together at updating the model, they need to be
* summed and normalized (by the column sum) to get the
* final model.
*/
void update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& done) const ;
/*!
* \brief Sets the last class as background class with the
* mean base probabilities of the data at each position.
*/
void create_bckg_class() ;
} ;
#endif // SEQUENCELAYER_HPP
diff --git a/src/main_test.cpp b/src/main_test.cpp
index 2256a7a..7e2a898 100644
--- a/src/main_test.cpp
+++ b/src/main_test.cpp
@@ -1,457 +1,453 @@
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <cmath>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ConsensusSequenceModelComputer.hpp>
#include <EMConsensusSequence.hpp>
#include <SequenceModelComputer.hpp>
#include <EMSequence.hpp>
#include <dna_utility.hpp>
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
/*!
 * \brief Computes, for one class, a consensus representation of each
 * sequence by aggregating the posterior probabilities over all shift
 * (and flip) states : each (sequence, position) gets a probability
 * distribution over A,C,G,T.
 * \param seq the sequences in integer format (N's are skipped).
 * \param prob the posterior probabilities
 * (row x class x shift x flip).
 * \param class_k the index of the class of interest.
 * \return a (n_seq x l_model x 4) matrix of base probabilities.
 */
Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
                                             const Matrix4D<double>& prob,
                                             size_t class_k)
{
    size_t n_seq   = seq.get_nrow() ;
    size_t l_seq   = seq.get_ncol() ;
    size_t n_shift = prob.get_dim()[2] ;
    size_t n_flip  = prob.get_dim()[3] ;
    bool   flip    = (n_flip == 2) ;
    size_t l_model = l_seq - n_shift + 1 ;
    // loop invariant : the integer code of 'N'
    const int base_n = dna::char_to_int('N') ;
    Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;
    // aggregate the posterior mass of each base over all shifts
    for(size_t i=0; i<n_seq; i++)
    {   for(size_t s=0; s<n_shift; s++)
        {   for(size_t j=0; j<l_model; j++)
            {   int base = seq(i,s+j) ;
                // N should participate to A,C,G,T equivalently
                // -> same as doing nothing
                if(base == base_n)
                {   continue ; }
                int base_rv = 4 - base - 1 ; // complement
                // --------- forward ---------
                cons_seq(i,j,base) += prob(i,class_k,s,0) ;
                // --------- reverse ---------
                if(flip)
                {   cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ; }
            }
        }
    }
    // normalize each position into a probability distribution
    for(size_t i=0; i<n_seq; i++)
    {   for(size_t j=0; j<l_model; j++)
        {   double tot = 0. ;
            for(size_t k=0; k<4; k++)
            {   tot += cons_seq(i,j,k) ; }
            // a position which received no mass (e.g. only N's)
            // is left as-is to avoid a 0/0 division
            if(tot == 0.)
            {   continue ; }
            for(size_t k=0; k<4; k++)
            {   cons_seq(i,j,k) /= tot ; }
        }
    }
    return cons_seq ;
}
/*!
 * \brief Converts a character sequence matrix into its integer
 * representation by applying dna::char_to_int() to every element.
 * \param seq the character sequences.
 * \return a matrix of the same dimensions with the integer codes.
 */
Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
{   const size_t n_row = seq.get_nrow() ;
    const size_t n_col = seq.get_ncol() ;
    Matrix2D<int> converted(n_row, n_col, 0.) ;
    for(size_t r=0; r<n_row; r++)
    {   for(size_t c=0; c<n_col; c++)
        {   converted(r,c) = dna::char_to_int(seq(r,c)) ; }
    }
    return converted ;
}
/*!
 * \brief Turns an integer coded sequence matrix into a consensus
 * sequence matrix : each (row, position) carries a probability
 * distribution over A,C,G,T. A known base receives (almost) all the
 * probability mass, an N receives a flat 0.25 on every base.
 * \param seq the sequences in integer format.
 * \return a (n_row x n_col x 4) matrix of base probabilities.
 */
Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
{   const size_t n_row  = seq.get_nrow() ;
    const size_t n_col  = seq.get_ncol() ;
    const size_t n_dim3 = 4 ; // A, C, G, T
    const int base_n = dna::char_to_int('N') ;
    // every cell starts at the pseudo-probability 1e-100
    Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;
    const double prob = 1. - 3*(1e-100) ; // p(base) - p(!base)
    for(size_t i=0; i<n_row; i++)
    {   for(size_t j=0; j<n_col; j++)
        {   const int base = seq(i,j) ;
            if(base == base_n)
            {   // N : flat distribution over the four bases
                for(size_t k=0; k<n_dim3; k++)
                {   cseq(i,j,k) = 0.25 ; }
            }
            else
            {   // A, C, G or T
                cseq(i,j, base) = prob ;
            }
        }
    }
    return cseq ;
}
/*!
 * \brief Computes the log-likelihood of the consensus sub-sequence
 * starting at offset <start> in the given row, under the given
 * letter probability model. At each position, the data probability
 * is the dot product of the data base distribution and the model
 * base distribution ; the log of these values is summed.
 * \param cons_seq the consensus sequences (n_seq x length x 4).
 * \param row the row containing the consensus sequence of interest.
 * \param start the first position of the sub-sequence to score.
 * \param model the letter probability model (length x 4).
 * \return the log-likelihood of the sub-sequence given the model.
 * \throw std::invalid_argument if the model is longer than the
 * sequences, if the offset is invalid, or if either last dimension
 * is not 4 (A,C,G,T).
 */
double score_consensus_subseq(const Matrix3D<double>& cons_seq,
                              size_t row,
                              size_t start,
                              const Matrix2D<double>& model)
{   size_t ncol = cons_seq.get_dim()[1] ;
    size_t dim3 = cons_seq.get_dim()[2] ;
    // check the model length FIRST : <ncol - model.get_nrow()> is
    // unsigned and would wrap around if the model were longer than
    // the sequences, making the offset check (and its message) bogus
    if(model.get_nrow() > ncol)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! given model is longer than sequences (%zu / %zu)",
                 model.get_nrow(), ncol) ;
        throw std::invalid_argument(msg) ;
    }
    else if(start > ncol - model.get_nrow())
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! given start (%zu) is too high. Max value is %zu",
                 start, ncol - model.get_nrow()) ;
        throw std::invalid_argument(msg) ;
    }
    else if(model.get_ncol() != 4)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! given model 2nd dimension is not 4 (%zu)",
                 model.get_ncol()) ;
        throw std::invalid_argument(msg) ;
    }
    else if(dim3 != 4)
    {   char msg[4096] ;
        snprintf(msg, sizeof(msg),
                 "Error! given data 3rd dimension is not 4 (%zu)",
                 dim3) ;
        throw std::invalid_argument(msg) ;
    }
    size_t to = start + model.get_nrow() ; // will score [start, to)
    double ll = 0. ;
    for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
    {   // p(position) = sum over bases of p(base|data) * p(base|model)
        double sum = 0. ;
        for(size_t k=0; k<dim3; k++)
        {   sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ; }
        ll += log(sum) ;
    }
    return ll ;
}
Matrix2D<double> test_consensus(const std::string& file,
- size_t n_class,
- size_t n_iter,
- size_t n_shift,
- bool flip,
- bool bckg_class,
- const std::string& seed,
- size_t n_thread)
+ size_t n_class,
+ size_t n_iter,
+ size_t n_shift,
+ bool flip,
+ bool bckg_class,
+ const std::string& seed,
+ size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
// partition
EMConsensusSequence em(cseq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
- std::cerr << prob << std::endl ;
-
// compute models
ConsensusSequenceModelComputer mc(std::move(cseq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_sequence(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
- // Matrix2D<int> seq(file) ;
- Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
+ Matrix2D<int> seq(file) ;
+ // Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
// partition
EMSequence em(seq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
- std::cerr << prob << std::endl ;
-
// compute models
SequenceModelComputer mc(std::move(seq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
/*!
 * \brief Computes the per-column base composition of the integer
 * coded sequences contained in the given file. The returned matrix
 * has 4 rows (A,C,G,T) ; its 1st column contains the class
 * probability (1.0, there is a single class) and the remaining
 * columns the per-position base probabilities. N's are ignored.
 * \param file the address of the file with the sequences.
 * \return a 4 x (ncol+1) matrix as described above.
 */
Matrix2D<double> test_aggregate(const std::string& file)
{   Matrix2D<int> seq(file) ;
    size_t nrow = seq.get_nrow() ;
    size_t ncol = seq.get_ncol() ;
    // 4 bases x (1 class probability column + ncol data columns)
    Matrix2D<double> model_final(4, ncol+1, 1e-100) ;
    for(size_t i=0; i<4; i++)
    {   model_final(i,0) = 1. ; }
    // count the bases per column ; sum[c] holds the count for
    // column c of model_final
    std::vector<double> sum(ncol+1, 0.) ;
    const int base_n = dna::char_to_int('N') ;
    for(size_t i=0; i<nrow; i++)
    {   for(size_t j=0; j<ncol; j++)
        {   int base = seq(i,j) ;
            if(base == base_n)
            {   continue ; }
            model_final(base, j+1) += 1. ;
            sum[j+1] += 1. ;
        }
    }
    // normalize the data columns 1 .. ncol into probabilities
    // (the previous bound <get_ncol()+1> read/wrote one column past the
    // end of both model_final and sum)
    for(size_t i=0; i<model_final.get_nrow(); i++)
    {   for(size_t j=1; j<model_final.get_ncol(); j++)
        {   // a column containing only N's has no counts :
            // skip it to avoid a division by zero
            if(sum[j] == 0.)
            {   continue ; }
            model_final(i,j) /= sum[j] ;
        }
    }
    return model_final ;
}
int main()
{
size_t n_class = 2 ;
size_t n_iter = 50 ;
- size_t n_shift = 3 ;
+ size_t n_shift = 1 ;
bool flip = true ;
bool bckg_class = false ;
std::string seed = "" ;
size_t n_thread = 1 ;
- /*
+
std::string file = "/local/groux/scATAC-seq/data/"
"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
- */
// std::string file = "/local/groux/scATAC-seq/test.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
- std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
+ // std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
/*
Matrix2D<double> model_final = test_consensus(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
- */
+ */
Matrix2D<double> model_final = test_sequence(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
+
/*
Matrix2D<double> model_final = test_aggregate(file) ;
*/
std::cout << model_final << std::endl ;
/*
double p0 = 0 ;
// double l0 = log(p0) ;
double p1 = 1. - (3.*p0) ;
// double l1 = log(p1) ;
// Matrix3D<double> cseq(1, 4, 4) ;
// cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
// cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
// cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
// cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;
Matrix2D<int> seq(1, 4, -1) ;
seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
std::cout << cseq << std::endl << std::endl ;
Matrix2D<double> mod(4, 4, -1) ;
mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;
std::cout << mod << std::endl ;
std::cout << score_consensus_subseq(cseq, 0,0,mod)
<< std::endl ;
*/
return EXIT_SUCCESS ;
}

Event Timeline