F90326980 — raw diff (249 KB), attached to R8820 scATAC-seq
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh
index f5a0b24..d32dffe 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.sh
@@ -1,52 +1,52 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=28
+n_core=32
# open chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
# 1nucl chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat'
file_aic=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
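A note on the seed generation used throughout these scripts: in seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo), the expansion ${1:-15} refers to the script's first positional argument and only falls back to 15 when none is given, so invoking the script with an argument silently changes the seed length; $file_aic is also assigned but never used. A minimal standalone sketch with the length made explicit (assuming a fixed 15-character seed is what is intended):

    # draw 15 characters from /dev/urandom, restricted to [_A-Za-z0-9-]
    seed_len=15
    seed=$(LC_ALL=C tr -dc '_A-Za-z0-9-' < /dev/urandom | head -c "$seed_len")
    echo "$seed"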
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh
index 431cde7..04c51f3 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.sh
@@ -1,52 +1,52 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=28
+n_core=32
# open chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat'
file_mod3=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
# 1nucl chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat'
file_mod3=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat'
file_aic=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh
index ec4656a..c0104d9 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.sh
@@ -1,53 +1,53 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/myc_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=28
+n_core=32
# open chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat'
file_mod3=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
# 1nucl chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat'
file_mod3=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat'
file_aic=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R
index 5971095..d30e6b1 100644
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R
@@ -1,95 +1,95 @@
setwd(file.path("/", "local", "groux", "scATAC-seq"))
# libraries
library(RColorBrewer)
# functions
source(file.path("scripts", "functions.R"))
# the minimum number of classes searched
k.min = 1
# the maximum number of classes searched
k.max = 10
# path to the images for the logo
path.a = file.path("res/A.png")
path.c = file.path("res/C.png")
path.g = file.path("res/G.png")
path.t = file.path("res/T.png")
################## sequence patterns around sp1 motifs ##################
for(k in k.min:k.max)
{
# open chromatin
data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1",
sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k)))
model.open = data$models
model.prob = data$prob
data = NULL
# nucleosomes
model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1",
- sprintf("sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models
+ sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_1nucl_fragment_center_model.mat", k)))$models
# sequence
model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1",
sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models
# plot classes
col = brewer.pal(3, "Set1")
# X11(width=17, height=10)
png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1",
sprintf("sp1_motifs_10e-7_classification_open_bin1bp_%dclass.png", k)),
units="in", res=720, width=18, height=12)
m = matrix(1:10, nrow=5, ncol=2, byrow=F)
layout(m)
# order from most to least probable class
ord = order(model.prob, decreasing=T)
ref.open = model.open[ord,, drop=F]
ref.nucl = model.nucl[ord,, drop=F]
ref.seq = model.seq[,,ord, drop=F]
prob = model.prob[ord]
class = c(1:nrow(ref.open))[ord]
for(i in 1:nrow(ref.open))
{ # plot logo
plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
# x-axis
x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3)
x.at = (x.lab + ncol(ref.open)) / 2
axis(1, at=x.at, labels=x.lab)
# y-axis is [0,1] for min/max signal
x.at = seq(0, 1, 0.5)
axis(2, at=x.at, labels=x.at)
# plot signal (multiplied by 2 because the y-axis goes up to 2 bits)
lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
}
row_n = 1 # row counter
col_n = 1 # column counter
for(i in 1:nrow(ref.open))
{ # plot logo center
right = 0.5*col_n - 0.01
left = right - 0.2
bottom = 1-(row_n*(0.2))+0.05
top = bottom + 0.15
par(fig=c(left, right, bottom, top), new=T)
idx = 380:420
plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
# plot signal (multiplied by 2 because the y-axis goes up to 2 bits)
lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
# xaxis
x.at = 1:length(idx)
axis(1, at=x.at, labels=x.at)
# yaxis
x.at = seq(0, 2, by=1)
axis(2, at=x.at, labels=x.at)
row_n = row_n + 1
if(i %% 5 == 0)
{ col_n = col_n + 1
row_n = 1
}
}
dev.off()
}
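The one-line change above swaps which nucleosome model the R script loads: instead of the model from the independent 1nucl classification (...1nucl_bin1bp_fragment_center_%dclass_model.mat), it now reads the nucleosome model derived from the open-chromatin classification (...open_bin1bp_read_atac_%dclass_1nucl_fragment_center_model.mat, produced as $file_mod2 in classification_sp1_motif.sh), so the open-chromatin and nucleosome profiles drawn for each class come from the same partition.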
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh
index d9104d0..6c12400 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.sh
@@ -1,52 +1,52 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=28
+n_core=32
# open chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
# 1nucl chromatin
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_sequences_model.mat'
file_aic=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMRead --read $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
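The four classification_1 scripts above are identical up to the motif prefix (ctcf, ebf1 and myc at 10e-6, sp1 at 10e-7), so a parameterized driver would remove the duplication. A sketch under that assumption (the classify_motif function is hypothetical, not part of the repository; only the EMRead step and one ProbToModel step are shown):

    #!/bin/bash
    # hypothetical driver factoring the per-motif classification_1 scripts
    data_dir='data/10xgenomics_PBMC_5k_motifs'
    results_dir='results/10xgenomics_PBMC_5k_motifs_classification_1'
    n_iter=20 ; n_shift=21 ; n_core=32
    mkdir -p "$results_dir"
    classify_motif() {
        local prefix=$1   # e.g. ctcf_motifs_10e-6
        local mat_open="$data_dir/${prefix}_open_bin1bp_read_atac.mat"
        for k in $(seq 1 10) ; do
            local seed ; seed=$(LC_ALL=C tr -dc '_A-Za-z0-9-' < /dev/urandom | head -c 15)
            local prob="$results_dir/${prefix}_open_bin1bp_read_atac_${k}class_prob.mat4d"
            echo "$prob $seed" >> "$results_dir/${prefix}_seed.txt"
            bin/EMRead --read "$mat_open" --class "$k" --shift "$n_shift" --flip \
                       --iter "$n_iter" --seed "$seed" --thread "$n_core" --out "$prob"
            bin/ProbToModel --read "$mat_open" --prob "$prob" --thread "$n_core" \
                1> "$results_dir/${prefix}_open_bin1bp_read_atac_${k}class_model.mat"
        done
    }
    for m in ctcf_motifs_10e-6 ebf1_motifs_10e-6 myc_motifs_10e-6 sp1_motifs_10e-7 ; do
        classify_motif "$m"
    done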
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh
index 247a840..013fa9f 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh
@@ -1,36 +1,36 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=24
+n_core=28
# open chromatin and nucleosomes
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat'
file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh
index 2204ba9..e938ca5 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh
@@ -1,36 +1,36 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=24
+n_core=28
# open chromatin and nucleosomes
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'ebf1_motifs_10e-6_sequences_'$k'class_model.mat'
file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh
index 6ea2fcf..345c0cd 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh
@@ -1,36 +1,36 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/myc_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=12
+n_core=28
# open chromatin and nucleosomes
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'myc_motifs_10e-6_sequences_'$k'class_model.mat'
file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh
index 5f561db..ccd2282 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.sh
@@ -1,35 +1,35 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=12
+n_core=28
# open chromatin and nucleosomes
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat'
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
done
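For orientation, the three pipelines use different EM entry points: classification_1 partitions on a single read-signal matrix with EMRead and then maps the other signals onto the resulting classes with ProbToModel; classification_2 partitions jointly on reads and sequence with EMJoint; classification_3 and _4 partition on sequence alone with EMSequence (classification_4 with n_shift=1, i.e. no shifting). Schematically (READS.mat and SEQS.mat are placeholders):

    bin/EMRead     --read READS.mat                --class $k --shift $n_shift --flip ...
    bin/EMJoint    --read READS.mat --seq SEQS.mat --class $k --shift $n_shift --flip ...
    bin/EMSequence --seq  SEQS.mat                 --class $k --shift $n_shift --flip ...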
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh
index 61a8ded..bcaa4f4 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh
@@ -1,38 +1,38 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_3'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=24
+n_core=28
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat'
file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
index 54f1c72..ca5821c 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_sp1_motif.sh
@@ -1,38 +1,38 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_3'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='21'
-n_core=24
+n_core=28
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat'
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh
index a34f0ec..f9a59cf 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh
@@ -1,38 +1,38 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_4'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat"
## file with seeds
file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='1'
-n_core=24
+n_core=28
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat'
file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh
index cf0d6b3..82b61b4 100755
--- a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh
+++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_sp1_motif.sh
@@ -1,38 +1,38 @@
# some paths
## directories
results_dir='results/10xgenomics_PBMC_5k_motifs_classification_4'
data_dir='data/10xgenomics_PBMC_5k_motifs'
## input
file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat"
file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat"
## file with seeds
file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt'
mkdir -p $results_dir
touch $file_seed
# parameters
n_iter='20'
n_shift='1'
-n_core=24
+n_core=28
# sequences
for k in 1 2 3 4 5 6 7 8 9 10
do
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d'
file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat'
file_mod4=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat'
file_aic=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_aic.txt'
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4
done
diff --git a/src/Clustering/ConsensusSequenceLayer.cpp b/src/Clustering/ConsensusSequenceLayer.cpp
index 5eab0b1..05c7e1e 100644
--- a/src/Clustering/ConsensusSequenceLayer.cpp
+++ b/src/Clustering/ConsensusSequenceLayer.cpp
@@ -1,377 +1,371 @@
#include <ConsensusSequenceLayer.hpp>
#include <vector>
#include <future> // std::promise, std::future
#include <stdexcept> // std::invalid_argument
#include <cmath> // log()
#include <functional> // std::bind(), std::ref()
#include <ThreadPool.hpp>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp> // dna::base_composition()
double ConsensusSequenceLayer::score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t start,
const Matrix2D<double>& model)
{ size_t ncol = cons_seq.get_dim()[1] ;
size_t dim3 = cons_seq.get_dim()[2] ;
if(start > ncol - model.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, ncol - model.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_nrow() > ncol)
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model.get_nrow(), ncol) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(dim3 != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
dim3) ;
throw std::invalid_argument(msg) ;
}
size_t from = start ;
size_t to = from + model.get_nrow() ; // will score [from, to)
double ll = 0 ;
for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ double sum = 0. ;
for(size_t k=0; k<dim3; k++)
{ sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ; }
ll += log(sum) ;
}
return ll ;
}
ConsensusSequenceLayer::ConsensusSequenceLayer(const Matrix3D<double>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data3DLayer(data, n_class, n_shift, flip, bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
ConsensusSequenceLayer::ConsensusSequenceLayer(Matrix3D<double>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data3DLayer(std::move(data), n_class, n_shift, flip, bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
ConsensusSequenceLayer::~ConsensusSequenceLayer()
{ ; }
void ConsensusSequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads) const
{
// dimension checks
this->check_loglikelihood_dim(loglikelihood) ;
this->check_loglikelihood_max_dim(loglikelihood_max) ;
// compute the prob model and the prob reverse-complement model
std::vector<Matrix2D<double>> model(this->n_class,
Matrix2D<double>(this->l_model,
this->n_category,
0.)) ;
std::vector<Matrix2D<double>> model_rev = model ;
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ // forward
model[i](j,k) = this->model(i,j,k) ;
// reverse
model_rev[i](this->l_model-j-1,this->n_category-k-1)
= this->model(i,j,k) ;
}
}
}
// don't parallelize
if(threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihoods_routine(0, this->n_dim1,
loglikelihood,
loglikelihood_max,
model,
model_rev,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_dim1, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ConsensusSequenceLayer::compute_loglikelihoods_routine,
this,
slice.first,
slice.second,
std::ref(loglikelihood),
std::ref(loglikelihood_max),
std::ref(model),
std::ref(model_rev),
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void ConsensusSequenceLayer::compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model,
const std::vector<Matrix2D<double>>& model_rev,
std::promise<bool>& done) const
{
// compute log likelihood
for(size_t i=from; i<to; i++)
{
// set max to min possible value
loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
for(size_t j=0; j<this->n_class; j++)
{
for(size_t s=0; s<this->n_shift; s++)
{ // forward strand
- { double ll_fw = std::max(score_consensus_subseq(this->data, i, s, model[j]),
- ConsensusSequenceLayer::p_min_log);
- // loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
- // apply lower bound here -> log(1e-100)
+ { double ll_fw = score_consensus_subseq(this->data, i, s, model[j]) ;
loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// keep track of max per row
if(ll_fw > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_fw ; }
}
// reverse
if(this->flip)
- { double ll_rev = std::max(score_consensus_subseq(this->data, i, s, model_rev[j]),
- ConsensusSequenceLayer::p_min_log) ;
- // loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
- // apply lower bound here -> log(1e-100)
+ { double ll_rev = score_consensus_subseq(this->data, i, s, model_rev[j]) ;
loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// keep track of max per row
if(ll_rev > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_rev ; }
}
}
}
}
done.set_value(true) ;
}
void ConsensusSequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads)
{ // don't parallelize
if(threads == nullptr)
{ std::promise<Matrix3D<double>> promise ;
std::future<Matrix3D<double>> future = promise.get_future() ;
this->update_model_routine(0,
this->n_dim1,
posterior_prob,
promise) ;
// this->model = future.get() ;
auto model = future.get() ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = model(i,j,k) ; }
}
}
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_dim1, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ConsensusSequenceLayer::update_model_routine,
this,
slice.first,
slice.second,
std::ref(posterior_prob),
std::ref(promises[i])))) ;
}
// reinitialise the model
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = 0. ; }
}
}
// wait until all threads are done working
// and update the model
for(auto& future : futures)
{ Matrix3D<double> model_part = future.get() ;
// for(size_t i=0; i<this->n_class; i++)
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) += model_part(i,j,k) ; }
}
}
}
// -------------------------- threads stop ---------------------------
}
// make sure to have no 0 values
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) =
std::max(this->model(i,j,k), ConsensusSequenceLayer::p_min) ;
}
}
}
// normalize to get probs
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ double sum = 0. ;
for(size_t k=0; k<this->n_category; k++)
{ sum += this->model(i,j,k) ; }
for(size_t k=0; k<this->n_category; k++)
{ double p = this->model(i,j,k) / sum ;
this->model(i,j,k) = p ;
}
}
}
}
void ConsensusSequenceLayer::update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& promise) const
{ // dimension checks
this->check_posterior_prob_dim(posterior_prob) ;
Matrix3D<double> model(this->n_class,
this->l_model,
this->n_category,
0.) ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t k=0; k < n_class_to_update; k++)
{ for(size_t s=0; s<this->n_shift; s++)
{ for(size_t j=0; j<this->l_model; j++)
{ // base prob on fw and rv strand
vector_d base_prob_fw(this->n_category, 0.) ;
vector_d base_prob_rv(this->n_category, 0.) ;
for(size_t i=from; i<to; i++)
{ for(size_t base=0, base_rv=3; base<this->n_dim3; base++, base_rv--)
{ // --------------- forward ---------------
{ base_prob_fw[base] +=
this->data(i,s+j,base) *
posterior_prob(i,k,s,ConsensusSequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
{
base_prob_rv[base_rv] +=
this->data(i,s+j,base) *
posterior_prob(i,k,s,ConsensusSequenceLayer::REVERSE) ;
}
}
}
// update this position of the model
for(size_t i=0,i_rev=base_prob_fw.size()-1; i<base_prob_fw.size(); i++,i_rev--)
{ // --------------- forward ---------------
{ model(k,j,i) += base_prob_fw[i] ; }
// --------------- reverse ---------------
if(this->flip)
{ model(k,this->l_model-j-1,i) += base_prob_rv[i] ; }
}
}
}
}
promise.set_value(model) ;
}
void ConsensusSequenceLayer::create_bckg_class()
{ // sequence composition
std::vector<double> base_comp =
dna::base_composition(this->data,
this->flip) ;
// set last class as background class
for(size_t i=0; i<this->n_category; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ this->model(this->n_class-1,j,i) = base_comp[i] ; }
}
}
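The deletion above removes the log(1e-100) floor from the raw per-subsequence scores. For row i and offset s, with model length L, score_consensus_subseq() computes

    \ell(i,s) = \sum_{j=0}^{L-1} \log \sum_{k \in \{A,C,G,T\}} \mathrm{cons\_seq}(i,\, s+j,\, k)\,\mathrm{model}(j,k)

and this value is now stored unclamped; the lower bound is applied once, after subtracting the per-row maximum, in EMConsensusSequence::compute_loglikelihood_routine() (see the corresponding change further down). This avoids flooring scores that are small in absolute terms but still informative relative to the row maximum.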
diff --git a/src/Clustering/ConsensusSequenceLayer.hpp b/src/Clustering/ConsensusSequenceLayer.hpp
index 5eb71e8..c0a18b8 100644
--- a/src/Clustering/ConsensusSequenceLayer.hpp
+++ b/src/Clustering/ConsensusSequenceLayer.hpp
@@ -1,196 +1,202 @@
#ifndef CONSENSUSSEQUENCELAYER_HPP
#define CONSENSUSSEQUENCELAYER_HPP
#include <Data3DLayer.hpp>
#include <vector>
#include <future> // std::promise
#include <ThreadPool.hpp>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
typedef std::vector<double> vector_d ;
class ConsensusSequenceLayer : public Data3DLayer
{
public:
/*!
* \brief Computes the log-likelihood of the sub-
* sequence - stored in a given row (1st dimension) -
* and starting at the offset <col> (2nd dimension) in
* the given sequence matrix.
* The subsequence length is determined by the model
* length.
* \param cons_seq a probability matrix containing
* the consensus sequence. Its dimensions are :
* 1st the different sequences
* 2nd the sequences length
* 3rd 4 for A,C,G,T.
* \param row the row containing the sequence of
* interest.
* \param col the index at which the sub-sequence
* is starting (1st index inside the subsequence
* of interest).
* \param model a model containing the probability
* model.
* \return the log-likelihood of the sub-sequence
* given the model.
* \throw std::invalid_argument if 1) the offset is
* invalid, 2) the sequence and the model have
* incompatible dimensions or 3) the model 2nd dimension
* is not 4 (A,C,G,T).
*/
static double score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t col,
const Matrix2D<double>& model) ;
public:
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
*/
ConsensusSequenceLayer(const Matrix3D<double>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
*/
ConsensusSequenceLayer(Matrix3D<double>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Destructor.
*/
virtual ~ConsensusSequenceLayer() ;
/*!
* \brief Computes the log likelihood of the data
* given the current model parameters.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of rows
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of loglikelihood.
* Its length should be equal to the data row number.
* \throw std::invalid_argument if the dimensions are
* incorrect.
*/
virtual void compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads=nullptr) const override ;
/*!
* \brief Updates the model given the posterior
* probabilities (the probabilities of each row
* in the data to be assigned to each class,
* for each shift and flip state).
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is ConsensusSequenceLayer::p_min.
* \param posterior_prob the data assignment probabilities to
* the different classes.
*/
virtual void update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads=nullptr) override ;
protected:
/*!
* \brief The routine that effectively performs the
* loglikelihood computations.
* \param from the index of the first row of the data
* to be considered.
* \param to the index one past the last row of the data
* to be considered.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of rows
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of loglikelihood.
* Its length should be equal to the data row number.
* \param model a vector containing the probability
* matrices of the model for each class.
* \param model_rev a vector containing the probability
* matrices of the reverse-strand model for each class
* (the 1st position in the model becomes the last in the
* reverse model and probabilities are swapped A<->T and C<->G).
* \param done a promise to be filled when the routine
* is done running.
*/
void compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model,
const std::vector<Matrix2D<double>>& model_rev,
std::promise<bool>& done) const ;
/*!
* \brief The routine that effectively update the model.
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is ConsensusSequenceLayer::p_min.
* \param from the index of the first row of the
* posterior probabilities to be considered.
* \param to the index one past the last row of the
* posterior probabilities to be considered.
* \param posterior_prob the data assignment probabilities
* to the different classes.
* \param done a promise containing the partial model
* computed from the given data slice. If several routines
* work together at updating the model, they need to be
* summed and normalized (by the column sum) to get the
* final model.
*/
void update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& done) const ;
/*!
* \brief Sets the last class as background class with the
* mean base probabilities of the data at each position.
*/
void create_bckg_class() ;
protected:
} ;
#endif // CONSENSUSSEQUENCELAYER_HPP
diff --git a/src/Clustering/DataLayer.cpp b/src/Clustering/DataLayer.cpp
index a1d8de4..6ab5ff5 100644
--- a/src/Clustering/DataLayer.cpp
+++ b/src/Clustering/DataLayer.cpp
@@ -1,60 +1,60 @@
#include <DataLayer.hpp>
#include <stdexcept> // std::invalid_argument
#include <cmath> // log()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ThreadPool.hpp>
DataLayer::DataLayer()
{ ; }
DataLayer::DataLayer(size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: flip(flip),
n_class(n_class),
n_shift(n_shift),
n_flip(flip + 1),
bckg_class(bckg_class)
{ // models cannot be initialised here
// as the number of categories depends
// on the exact class
}
DataLayer::DataLayer(const Matrix3D<double>& model,
bool flip,
bool bckg_class)
: model(model),
flip(flip),
n_class(this->model.get_dim()[0]),
l_model(this->model.get_dim()[1]),
n_category(this->model.get_dim()[2]),
n_flip(flip + 1),
bckg_class(bckg_class)
{ ; }
DataLayer::DataLayer(Matrix3D<double>&& model,
bool flip,
bool bckg_class)
: model(model),
flip(flip),
n_class(this->model.get_dim()[0]),
l_model(this->model.get_dim()[1]),
n_category(this->model.get_dim()[2]),
n_flip(flip + 1),
bckg_class(bckg_class)
{ ; }
DataLayer::~DataLayer()
{ ; }
Matrix3D<double> DataLayer::get_model() const
{ return this->model ; }
-const double DataLayer::p_min = 1e-100 ;
+const double DataLayer::p_min = 1e-300 ;
const double DataLayer::p_min_log = log(DataLayer::p_min) ;
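Raising p_min from 1e-100 to 1e-300 tightens the probability floor to just above the smallest positive normal double (about 2.23e-308). This is safe here because the floor is now applied to max-rescaled log-likelihoods: log(1e-300) ≈ -690.78, and exp(-690.78) = 1e-300 is still representable, so posterior probabilities clamped at p_min never underflow to exact zero.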
diff --git a/src/Clustering/EMConsensusSequence.cpp b/src/Clustering/EMConsensusSequence.cpp
index 9cac9ee..1dd3343 100644
--- a/src/Clustering/EMConsensusSequence.cpp
+++ b/src/Clustering/EMConsensusSequence.cpp
@@ -1,293 +1,294 @@
#include <EMConsensusSequence.hpp>
#include <string>
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <ConsensusSequenceLayer.hpp> // ConsensusSequenceLayer
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
#include <ThreadPool.hpp> // ThreadPool
#include <dna_utility.hpp> // dna::base_composition()
EMConsensusSequence::EMConsensusSequence(const Matrix3D<double>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_dim()[0],
seq_matrix.get_dim()[1],
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
cseq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->cseq_layer = new ConsensusSequenceLayer(seq_matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
EMConsensusSequence::EMConsensusSequence(Matrix3D<double>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_dim()[0],
seq_matrix.get_dim()[1],
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
cseq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
EMConsensusSequence::~EMConsensusSequence()
{ if(this->cseq_layer != nullptr)
{ delete this->cseq_layer ;
this->cseq_layer = nullptr ;
}
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
}
Matrix3D<double> EMConsensusSequence::get_sequence_models() const
{ return this->cseq_layer->get_model() ; }
EMConsensusSequence::exit_codes EMConsensusSequence::classify()
{
size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
{ // E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return EMConsensusSequence::exit_codes::ITER_MAX ;
}
void EMConsensusSequence::compute_loglikelihood()
{ // compute the loglikelihood
this->cseq_layer->compute_loglikelihoods(this->loglikelihood,
this->loglikelihood_max,
this->threads) ;
// rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMConsensusSequence::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void EMConsensusSequence::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{
// rescale the values
for(size_t i=from; i<to; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
- (this->loglikelihood(i,j,k,l) -
- this->loglikelihood_max[i]) ;
+ std::max(this->loglikelihood(i,j,k,l) -
+ this->loglikelihood_max[i],
+ ConsensusSequenceLayer::p_min_log) ;
}
}
}
}
done.set_value(true) ;
}
void EMConsensusSequence::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMConsensusSequence::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
void EMConsensusSequence::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
ConsensusSequenceLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
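// To make the two passes above concrete, a sketch of the same update for a
// single row, with the states flattened into one vector (the real code
// indexes states by class, shift and flip) :
static std::vector<double> example_row_posterior(const std::vector<double>& loglik,
                                                 const std::vector<double>& prior)
{ std::vector<double> post(loglik.size()) ;
    double rowsum = 0. ;
    // 1st pass : unnormalized posterior p(state) = exp(loglik) * prior
    for(size_t s=0; s<loglik.size(); s++)
    { post[s] = exp(loglik[s]) * prior[s] ;
        rowsum += post[s] ;
    }
    // 2nd pass : normalize, flooring at p_min to avoid p=0 by rounding
    for(auto& p : post)
    { p = std::max(p / rowsum, ConsensusSequenceLayer::p_min) ; }
    return post ;
}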
void EMConsensusSequence::update_models()
{ this->cseq_layer->update_model(this->post_prob,
this->threads) ;
}
diff --git a/src/Clustering/EMConsensusSequence.hpp b/src/Clustering/EMConsensusSequence.hpp
index 8ba881c..01ff753 100644
--- a/src/Clustering/EMConsensusSequence.hpp
+++ b/src/Clustering/EMConsensusSequence.hpp
@@ -1,184 +1,199 @@
#ifndef EMCONSENSUSSEQUENCE_HPP
#define EMCONSENSUSSEQUENCE_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <future> // std::promise
#include <Matrix3D.hpp>
#include <ConsensusSequenceLayer.hpp>
typedef std::vector<double> vector_d ;
class EMConsensusSequence : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* given consensus sequences (rows) according to
* their motif content.
* The sequence models are initialised randomly.
* \param sequence_matrix a matrix containing the
* consensus sequences in a probability matrix.
* Its dimensions are :
* 1st the number of consensus sequences
* 2nd the length of the consensus sequences
* 3rd 4 for A,C,G,T
* For each sequence and position (1st and 2nd
* dimensions), the values over the 3rd dimension
* should sum to 1. The overall sum of the matrix
* values should thus be the product of the 1st
* and 2nd dimension sizes.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the background
* by setting all its parameters, at all positions, to the
* background base probabilities. Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMConsensusSequence(const Matrix3D<double>& sequence_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* given consensus sequences (rows) according to
* their motif content.
* The sequence models are initialised randomly.
* \param sequence_matrix a matrix containing the
* consensus sequences in a probability matrix.
* Its dimensions are :
* 1st the number of consensus sequences
* 2nd the length of the consensus sequences
* 3rd 4 for A,C,G,T
* For each sequence and position (1st and 2nd
* dimensions), the values over the 3rd dimension
* should sum to 1. The overall sum of the matrix
* values should thus be the product of the 1st
* and 2nd dimension sizes.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the background
* by setting all its parameters, at all positions, to the
* background base probabilities. Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMConsensusSequence(Matrix3D<double>&& sequence_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
EMConsensusSequence(const EMConsensusSequence& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMConsensusSequence() override ;
/*!
* \brief Returns the class sequence model.
* \return the class sequence model.
*/
Matrix3D<double> get_sequence_models() const ;
/*!
* \brief Runs the sequence model optimization and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMConsensusSequence::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current model, for each state (class, shift
* and flip combination).
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ConsensusSequenceLayer::p_min_log.
+ * Any value below is replaced by this one.
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood().
* This method rescales the loglikelihood values by
* subtracting from each value the maximum loglikelihood
* value found in the same data row.
- * This method
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ConsensusSequenceLayer::p_min_log.
+ * Any value below is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index one past the last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ConsensusSequenceLayer::p_min. Any value below
+ * is replaced by this one.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ConsensusSequenceLayer::p_min. Any value below
+ * is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index one past the last row
* in the data to consider.
* \param post_prob_colsum a promise to fill with the
* partial column (per class) sums of posterior
* probabilities. If several routines are running
* together, the overall colsums are obtained by
* summing up the returned vectors.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Update the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the max loglikelihood value for
* each data row.
*/
std::vector<double> loglikelihood_max ;
/*!
* \brief A pointer to the object managing
* the data and their model.
*/
ConsensusSequenceLayer* cseq_layer ;
} ;
#endif // EMCONSENSUSSEQUENCE_HPP
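A hedged usage sketch of this class. The 4-argument Matrix3D<double> constructor (three dimension sizes plus a fill value) is an assumption inferred from the Matrix4D<double> calls elsewhere in this diff, and all parameter values are illustrative only.
#include <cstddef> // size_t
#include <Matrix3D.hpp>
#include <EMConsensusSequence.hpp>
int main()
{ // fully ambiguous consensus sequences : each (sequence, position) pair
    // spreads probability 0.25 over A,C,G,T, so each position sums to 1
    size_t n_seq = 500, length = 200 ;
    Matrix3D<double> consseq(n_seq, length, 4, 0.25) ; // assumed constructor
    EMConsensusSequence em(consseq,
                           10, // n_class
                           20, // n_iter
                           21, // n_shift
                           true, // flip
                           false, // bckg_class
                           "myseed", // seed
                           4) ; // n_threads
    em.classify() ;
    Matrix3D<double> models = em.get_sequence_models() ;
    return 0 ;
}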
diff --git a/src/Clustering/EMJoint.cpp b/src/Clustering/EMJoint.cpp
index 7ff0a16..c15a79b 100644
--- a/src/Clustering/EMJoint.cpp
+++ b/src/Clustering/EMJoint.cpp
@@ -1,837 +1,814 @@
#include <EMJoint.hpp>
#include <string>
#include <vector>
#include <stdexcept>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <cmath> // exp()
#include <algorithm> // std::max()
#include <cstdio> // sprintf()
#include <limits> // std::numeric_limits
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ReadLayer.hpp>
#include <SequenceLayer.hpp>
#include <ThreadPool.hpp>
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
EMJoint::EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrices[0].get_nrow(),
read_matrices[0].get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
n_layer(read_matrices.size()),
loglikelihood_layer(n_layer,
Matrix4D<double>(this->n_row,
this->n_class,
this->n_shift,
this->n_flip,
0.)),
loglikelihood_max(this->n_layer,
vector_d(this->n_row, 0.)),
read_layers(),
seq_layer(nullptr),
consseq_layer(nullptr)
{
// check data matrices and their dimensions
if(this->n_layer == 0)
{ throw std::invalid_argument("Error! No data layer given!") ; }
for(const auto& matrix : read_matrices)
{ if(matrix.get_nrow() != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! Read layers have variable row numbers "
"(%zu and %zu)!",
matrix.get_nrow(), this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(matrix.get_ncol() != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! Read layers have variable column numbers "
"(%zu and %zu)!",
matrix.get_ncol(), this->n_col) ;
throw std::invalid_argument(msg) ;
}
}
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
// create read layer and initialise the models from the post prob
for(auto& matrix : read_matrices)
{ // create the layer
this->read_layers.push_back(new ReadLayer(matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class,
this->threads)) ;
this->read_layers.back()->update_model(this->post_prob,
this->threads) ;
}
}
EMJoint::EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrices[0].get_nrow(),
read_matrices[0].get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
n_layer(read_matrices.size()),
loglikelihood_layer(n_layer,
Matrix4D<double>(this->n_row,
this->n_class,
this->n_shift,
this->n_flip,
0.)),
loglikelihood_max(this->n_layer,
vector_d(this->n_row, 0.)),
read_layers(),
seq_layer(nullptr),
consseq_layer(nullptr)
{
// check data matrices and their dimensions
if(this->n_layer == 0)
{ throw std::invalid_argument("Error! No data layer given!") ; }
for(const auto& matrix : read_matrices)
{ if(matrix.get_nrow() != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! Read layers have variable row numbers "
"(%zu and %zu)!",
matrix.get_nrow(), this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(matrix.get_ncol() != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! Read layers have variable column numbers "
"(%zu and %zu)!",
matrix.get_ncol(), this->n_col) ;
throw std::invalid_argument(msg) ;
}
}
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
// create read layer and initialise the models from the post prob
for(auto& matrix : read_matrices)
{
// create the layer
this->read_layers.push_back(new ReadLayer(std::move(matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class,
this->threads)) ;
this->read_layers.back()->update_model(this->post_prob,
this->threads) ;
}
}
EMJoint::EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
const Matrix2D<int>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrices[0].get_nrow(),
read_matrices[0].get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
n_layer(read_matrices.size()+1),
loglikelihood_layer(this->n_layer,
Matrix4D<double>(this->n_row,
this->n_class,
this->n_shift,
this->n_flip,
0.)),
loglikelihood_max(this->n_layer,
vector_d(this->n_row, 0.)),
read_layers(),
seq_layer(nullptr),
consseq_layer(nullptr)
{ // check data matrices and their dimensions
for(const auto& matrix : read_matrices)
{ if(matrix.get_nrow() != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! A read matrix row number is different than expected "
"(%zu instead of %zu)!",
matrix.get_nrow(), this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(matrix.get_ncol() != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! A read matrix column number is different than expected "
"(%zu instead of %zu)!",
matrix.get_ncol(), this->n_col) ;
throw std::invalid_argument(msg) ;
}
}
if(seq_matrix.get_nrow() != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! The sequence matrix row number is different than expected "
"(%zu instead of %zu)!",
seq_matrix.get_nrow(), this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(seq_matrix.get_ncol() != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! The sequence matrix column number is different than expected "
"(%zu instead of %zu)!",
seq_matrix.get_ncol(), this->n_col) ;
throw std::invalid_argument(msg) ;
}
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
// create read layer and initialise the models from the post prob
for(auto& matrix : read_matrices)
{ // create the layer
this->read_layers.push_back(new ReadLayer(matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class,
this->threads)) ;
this->read_layers.back()->update_model(this->post_prob,
this->threads) ;
}
// create sequence layer and initialise the models from the post prob
this->seq_layer = new SequenceLayer(seq_matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
EMJoint::EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
Matrix2D<int>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrices[0].get_nrow(),
read_matrices[0].get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
n_layer(read_matrices.size()+1),
loglikelihood_layer(this->n_layer,
Matrix4D<double>(this->n_row,
this->n_class,
this->n_shift,
this->n_flip,
0.)),
loglikelihood_max(this->n_layer,
vector_d(this->n_row, 0.)),
read_layers(),
seq_layer(nullptr),
consseq_layer(nullptr)
{ // check data matrices and their dimensions
for(const auto& matrix : read_matrices)
{ if(matrix.get_nrow() != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! A read matrix row number is different than expected "
"(%zu instead of %zu)!",
matrix.get_nrow(), this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(matrix.get_ncol() != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! A read matrix column number is different than expected "
"(%zu instead of %zu)!",
matrix.get_ncol(), this->n_col) ;
throw std::invalid_argument(msg) ;
}
}
if(seq_matrix.get_nrow() != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! The sequence matrix row number is different than expected "
"(%zu instead of %zu)!",
seq_matrix.get_nrow(), this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(seq_matrix.get_ncol() != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! The sequence matrix column number is different than expected "
"(%zu instead of %zu)!",
seq_matrix.get_ncol(), this->n_col) ;
throw std::invalid_argument(msg) ;
}
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
// create read layer and initialise the models from the post prob
for(auto& matrix : read_matrices)
{
// create the layer
this->read_layers.push_back(new ReadLayer(std::move(matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class,
this->threads)) ;
this->read_layers.back()->update_model(this->post_prob,
this->threads) ;
}
// create sequence layer and initialise the models from the post prob
this->seq_layer = new SequenceLayer(std::move(seq_matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
EMJoint::EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
const Matrix3D<double>& consseq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrices[0].get_nrow(),
read_matrices[0].get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
n_layer(read_matrices.size()+1),
loglikelihood_layer(this->n_layer,
Matrix4D<double>(this->n_row,
this->n_class,
this->n_shift,
this->n_flip,
0.)),
loglikelihood_max(this->n_layer,
vector_d(this->n_row, 0.)),
read_layers(),
seq_layer(nullptr),
consseq_layer(nullptr)
{ // check data matrices and their dimensions
for(const auto& matrix : read_matrices)
{ if(matrix.get_nrow() != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! A read matrix row number is different than expected "
"(%zu instead of %zu)!",
matrix.get_nrow(), this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(matrix.get_ncol() != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! A read matrix column number is different than expected "
"(%zu instead of %zu)!",
matrix.get_ncol(), this->n_col) ;
throw std::invalid_argument(msg) ;
}
}
if(consseq_matrix.get_dim()[0] != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! The consensus sequence matrix row number is different than expected "
"(%zu instead of %zu)!",
consseq_matrix.get_dim()[0], this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(consseq_matrix.get_dim()[1] != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! The sequence matrix column number is different than expected "
"(%zu instead of %zu)!",
consseq_matrix.get_dim()[0], this->n_col) ;
throw std::invalid_argument(msg) ;
}
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
// create read layer and initialise the models from the post prob
for(auto& matrix : read_matrices)
{ // create the layer
this->read_layers.push_back(new ReadLayer(matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class,
this->threads)) ;
this->read_layers.back()->update_model(this->post_prob,
this->threads) ;
}
// create consensus sequence layer and initialise the models from the post prob
{
this->consseq_layer = new ConsensusSequenceLayer(consseq_matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
this->consseq_layer->update_model(this->post_prob,
this->threads) ;
}
}
EMJoint::EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
Matrix3D<double>&& consseq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrices[0].get_nrow(),
read_matrices[0].get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
n_layer(read_matrices.size()+1),
loglikelihood_layer(this->n_layer,
Matrix4D<double>(this->n_row,
this->n_class,
this->n_shift,
this->n_flip,
0.)),
loglikelihood_max(this->n_layer,
vector_d(this->n_row, 0.)),
read_layers(),
seq_layer(nullptr),
consseq_layer(nullptr)
{ // check data matrices and their dimensions
for(const auto& matrix : read_matrices)
{ if(matrix.get_nrow() != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! A read matrix row number is different than expected "
"(%zu instead of %zu)!",
matrix.get_nrow(), this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(matrix.get_ncol() != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! A read matrix column number is different than expected "
"(%zu instead of %zu)!",
matrix.get_ncol(), this->n_col) ;
throw std::invalid_argument(msg) ;
}
}
if(consseq_matrix.get_dim()[0] != this->n_row)
{ char msg[4096] ;
sprintf(msg, "Error! The consensus sequence matrix row number is different than expected "
"(%zu instead of %zu)!",
consseq_matrix.get_dim()[0], this->n_row) ;
throw std::invalid_argument(msg) ;
}
else if(consseq_matrix.get_dim()[1] != this->n_col)
{ char msg[4096] ;
sprintf(msg, "Error! The sequence matrix column number is different than expected "
"(%zu instead of %zu)!",
consseq_matrix.get_dim()[0], this->n_col) ;
throw std::invalid_argument(msg) ;
}
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
// create read layer and initialise the models from the post prob
for(auto& matrix : read_matrices)
{ // create the layer
this->read_layers.push_back(new ReadLayer(std::move(matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class,
this->threads)) ;
this->read_layers.back()->update_model(this->post_prob,
this->threads) ;
}
// create consensus sequence layer and initialise the models from the post prob
{
this->consseq_layer = new ConsensusSequenceLayer(std::move(consseq_matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
this->consseq_layer->update_model(this->post_prob,
this->threads) ;
}
}
EMJoint::~EMJoint()
{ // join the threads in case
// deleted by EMBase destructor
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
// read data and models
for(auto& ptr : this->read_layers)
{ if(ptr != nullptr)
{ delete ptr ;
ptr = nullptr ;
}
}
// sequence data and models
if(this->seq_layer != nullptr)
{ delete seq_layer ;
this->seq_layer = nullptr ;
}
// consensus sequence data and models
if(this->consseq_layer != nullptr)
{ delete this->consseq_layer ;
this->consseq_layer = nullptr ;
}
}
std::vector<Matrix3D<double>> EMJoint::get_read_models() const
{ std::vector<Matrix3D<double>> models ;
for(const auto& ptr : this->read_layers)
{ models.push_back(ptr->get_model()) ; }
return models ;
}
Matrix3D<double> EMJoint::get_sequence_models() const
{ if(this->seq_layer != nullptr)
{ return this->seq_layer->get_model() ; }
return Matrix3D<double>() ;
}
Matrix3D<double> EMJoint::get_consensus_sequence_models() const
{ if(this->consseq_layer != nullptr)
{ return this->consseq_layer->get_model() ; }
return Matrix3D<double>() ;
}
EMJoint::exit_codes EMJoint::classify()
{ size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
{ // E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return EMJoint::exit_codes::ITER_MAX ;
}
void EMJoint::compute_loglikelihood()
{ // compute the loglikelihood for each layer
size_t i = 0 ;
for(auto& ptr : this->read_layers)
{ ptr->compute_loglikelihoods(this->loglikelihood_layer[i],
this->loglikelihood_max[i],
this->threads) ;
i++ ;
}
if(this->seq_layer != nullptr)
{ this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i],
this->loglikelihood_max[i],
this->threads) ;
i++ ;
}
if(this->consseq_layer != nullptr)
{ this->consseq_layer->compute_loglikelihoods(this->loglikelihood_layer[i],
this->loglikelihood_max[i],
this->threads) ;
i++ ;
}
// sum the likelihood for each state, over all layers
// and rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t j=0; j<n_threads; j++)
{ futures[j] = promises[j].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t j=0; j<n_threads; j++)
{ auto slice = slices[j] ;
this->threads->addJob(std::move(
std::bind(&EMJoint::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[j])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
-/*
-void EMJoint::compute_loglikelihood_routine(size_t from,
- size_t to,
- std::promise<bool>& done)
-{
- // limite value range
- for(size_t i=from; i<to; i++)
- { for(size_t j=0; j<this->n_class; j++)
- { for(size_t k=0; k<this->n_shift; k++)
- { for(size_t l=0; l<this->n_flip; l++)
- {
- // reset
- this->loglikelihood(i,j,k,l) = 0. ;
- // sum
- for(size_t m=0; m<this->n_layer; m++)
- { this->loglikelihood(i,j,k,l) +=
- (this->loglikelihood_layer[m](i,j,k,l) -
- this->loglikelihood_max[m][i]) ;
- }
- }
- }
- }
- }
- done.set_value(true) ;
-}
-*/
-
void EMJoint::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{ // the max likelihood found per row
std::vector<double> rowmax(to-from, std::numeric_limits<double>::lowest()) ;
// sum over layers
for(size_t i=from, i_rowmax=0; i<to; i++, i_rowmax++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{
// reset
this->loglikelihood(i,j,k,l) = 0. ;
// sum
for(size_t m=0; m<this->n_layer; m++)
{ // add rescaled layer value
this->loglikelihood(i,j,k,l) +=
(this->loglikelihood_layer[m](i,j,k,l) -
this->loglikelihood_max[m][i]) ;
}
// keep track of max
if(this->loglikelihood(i,j,k,l) > rowmax[i_rowmax])
{ rowmax[i_rowmax] = this->loglikelihood(i,j,k,l) ; }
}
}
}
}
// rescale
for(size_t i=from, i_rowmax=0; i<to; i++, i_rowmax++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
- { this->loglikelihood(i,j,k,l) -= rowmax[i_rowmax] ; }
+ { // this->loglikelihood(i,j,k,l) -= rowmax[i_rowmax] ;
+ this->loglikelihood(i,j,k,l) =
+ std::max(this->loglikelihood(i,j,k,l) - rowmax[i_rowmax],
+ ReadLayer::p_min_log) ;
+ }
}
}
}
done.set_value(true) ;
}
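// A compact sketch of what this routine does for one row : per-layer log
// likelihoods arrive already centred on their own layer maximum, are
// summed into a joint value, then recentred on the joint row maximum and
// floored (requires <algorithm> for std::max_element) :
static std::vector<double> example_joint_row(
                            const std::vector<std::vector<double>>& layer_ll,
                            const std::vector<double>& layer_max)
{ size_t n_state = layer_ll[0].size() ;
    std::vector<double> joint(n_state, 0.) ;
    // sum the per-layer values, each rescaled by its own layer maximum
    for(size_t m=0; m<layer_ll.size(); m++)
    { for(size_t s=0; s<n_state; s++)
        { joint[s] += layer_ll[m][s] - layer_max[m] ; }
    }
    // recentre on the joint row maximum and floor at p_min_log
    double row_max = *std::max_element(joint.begin(), joint.end()) ;
    for(auto& value : joint)
    { value = std::max(value - row_max, ReadLayer::p_min_log) ; }
    return joint ;
}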
void EMJoint::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMJoint::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
void EMJoint::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
ReadLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
void EMJoint::update_models()
{ // read data and models
for(auto& ptr : this->read_layers)
{ ptr->update_model(this->post_prob,
this->post_prob_colsum,
this->threads) ;
}
// sequence data and models
if(this->seq_layer != nullptr)
{ this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
// consensus sequence data and models
if(this->consseq_layer != nullptr)
{ this->consseq_layer->update_model(this->post_prob,
this->threads) ;
}
}
diff --git a/src/Clustering/EMJoint.hpp b/src/Clustering/EMJoint.hpp
index aeea97b..3d9cb80 100644
--- a/src/Clustering/EMJoint.hpp
+++ b/src/Clustering/EMJoint.hpp
@@ -1,408 +1,424 @@
#ifndef EMJOINT_HPP
#define EMJOINT_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ReadLayer.hpp>
#include <SequenceLayer.hpp>
#include <ConsensusSequenceLayer.hpp>
typedef std::vector<double> vector_d ;
class EMJoint : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* regions according to all the given read densities
* with the given shifting and flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follows:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* regions according to all the given read densities
* with the given shifting and flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follows:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* regions according to all the given read densities
* and sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follows:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest. The matrix
* dimensions are as follows:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
const Matrix2D<int>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* regions according to all the given read densities
* and sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follows:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest. The matrix
* dimensions are as follows:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
Matrix2D<int>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* regions according to all the given read densities
* and consensus sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follows:
* 1st the number of regions,
* 2nd the region length.
* \param consseq_matrix a matrix containing the DNA
* consensus sequences for the regions of interest as
* a probability matrix. Its dimensions are:
* 1st the number of regions,
* 2nd the region length.
* 3rd 4 for A,C,G,T
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
const Matrix3D<double>& consseq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* regions according to all the given read densities
* and consensus sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follows:
* 1st the number of regions,
* 2nd the region length.
* \param consseq_matrix a matrix containing the DNA
* consensus sequences for the regions of interest as
* a probability matrix. Its dimensions are:
* 1st the number of regions,
* 2nd the region length.
* 3rd 4 for A,C,G,T
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilities (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
Matrix3D<double>&& consseq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
EMJoint(const EMJoint& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMJoint() override ;
/*!
* \brief Returns all layer read models.
* The models are in the same order
* as the data were given to the
* constructor.
* \return a vector containing the
* models.
*/
std::vector<Matrix3D<double>> get_read_models() const ;
/*!
* \brief Returns the sequence models.
* \return the sequence models.
*/
Matrix3D<double> get_sequence_models() const ;
/*!
* \brief Returns the consensus sequence
* models.
* \return the consensus sequence models.
*/
Matrix3D<double> get_consensus_sequence_models() const ;
/*!
* \brief Runs the optimization of all layer models and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMJoint::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current models, for all layers and the joint
* likelihood for each state (the sum of the layer
* likelihoods for all layers, for a given state).
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood() that
* computes the joint loglikelihood by summing the
* individual loglikelihood obtained from each data layer.
* At the same time, this method rescales the loglikelihood
* values by subtracting from each value the maximum
* loglikelihood value found in the same data row,
* for each layer.
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index one past the last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ReadLayer::p_min. Any value below is replaced
+ * by this one.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ReadLayer::p_min. Any value below is replaced
+ * by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index one past the last row
* in the data to consider.
* \param post_prob_colsum a promise to fill with the
* partial column (per class) sums of posterior
* probabilities. If several routines are running
* together, the overall colsums are obtained by
* summing up the returned vectors.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Update the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the number of data layers.
*/
size_t n_layer ;
/*!
* \brief the log likelihood buffers for each individual
* layer (one element per layer).
*/
std::vector<Matrix4D<double>> loglikelihood_layer ;
/*!
* \brief the max loglikelihood value for
* each data layer (1st dimension)
* and each data row of the given layer
* (2nd dimension).
*/
std::vector<vector_d> loglikelihood_max ;
/*!
* \brief A vector containing the pointers
* to the objects managing all the read
* data and models.
*/
std::vector<ReadLayer*> read_layers ;
/*!
* \brief A pointer to the object managing
* the sequence data and their models.
*/
SequenceLayer* seq_layer ;
/*!
* \brief A pointer to the object managing
* the consensus sequences data and their
* models.
*/
ConsensusSequenceLayer* consseq_layer ;
} ;
#endif // EMJOINT_HPP
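A hedged usage sketch combining one read density layer with a DNA sequence layer. The 3-argument Matrix2D<int> constructor (rows, columns, fill value) and the integer encoding of the sequences are assumptions; only the EMJoint interface below is taken from this header.
#include <cstddef> // size_t
#include <vector>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <EMJoint.hpp>
int main()
{ size_t n_region = 500, length = 200 ;
    // one read density layer (e.g. open chromatin counts), all zeros here
    std::vector<Matrix2D<int>> reads ;
    reads.push_back(Matrix2D<int>(n_region, length, 0)) ; // assumed constructor
    // the matching DNA sequences, integer encoded (scheme defined elsewhere)
    Matrix2D<int> sequences(n_region, length, 0) ;
    EMJoint em(reads,
               sequences,
               10, // n_class
               20, // n_iter
               21, // n_shift
               true, // flip
               true, // bckg_class : the last class models the background
               "myseed", // seed
               4) ; // n_threads
    em.classify() ;
    std::vector<Matrix3D<double>> read_models = em.get_read_models() ;
    Matrix3D<double> seq_models = em.get_sequence_models() ;
    return 0 ;
}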
diff --git a/src/Clustering/EMRead.cpp b/src/Clustering/EMRead.cpp
index a8cb150..580f10f 100644
--- a/src/Clustering/EMRead.cpp
+++ b/src/Clustering/EMRead.cpp
@@ -1,314 +1,315 @@
#include <EMRead.hpp>
#include <string>
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <cmath> // exp()
#include <algorithm> // std::max()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <ReadLayer.hpp> // ReadLayer
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
#include <ThreadPool.hpp> // ThreadPool
EMRead::EMRead(const Matrix2D<int>& read_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrix.get_nrow(),
read_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
read_layer(nullptr)
{
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
this->read_layer = new ReadLayer(read_matrix,
this->n_class,
this->n_shift,
flip,
bckg_class,
this->threads) ;
// initialise the models with the post prob
this->read_layer->update_model(this->post_prob,
this->threads) ;
}
EMRead::EMRead(Matrix2D<int>&& read_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(read_matrix.get_nrow(),
read_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
read_layer(nullptr)
{
// initialise post prob randomly
this->set_post_prob_random(seed) ;
// data and models
this->read_layer = new ReadLayer(std::move(read_matrix),
this->n_class,
this->n_shift,
flip,
bckg_class,
this->threads) ;
// initialise the models with the post prob
this->read_layer->update_model(this->post_prob,
this->threads) ;
}
EMRead::~EMRead()
{ if(this->read_layer!= nullptr)
{ delete this->read_layer ;
this->read_layer = nullptr ;
}
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
}
Matrix3D<double> EMRead::get_read_models() const
{ return read_layer->get_model() ; }
EMRead::exit_codes EMRead::classify()
{
size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
{
// E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return EMRead::exit_codes::ITER_MAX ;
}
void EMRead::compute_loglikelihood()
{ // compute the loglikelihood
this->read_layer->compute_loglikelihoods(this->loglikelihood,
this->loglikelihood_max,
this->threads) ;
// rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMRead::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void EMRead::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{
// rescale the values
for(size_t i=from; i<to; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
- (this->loglikelihood(i,j,k,l) -
- this->loglikelihood_max[i]) ;
+ std::max(this->loglikelihood(i,j,k,l) -
+ this->loglikelihood_max[i],
+ ReadLayer::p_min_log) ;
}
}
}
}
done.set_value(true) ;
}
void EMRead::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMRead::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
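// The parallel branch above reduces the per-slice results returned through
// the futures ; a sketch of that reduction alone, independent of the thread
// machinery (requires <algorithm> for std::fill) :
static void example_reduce_colsums(const std::vector<vector_d>& partials,
                                   vector_d& colsum,
                                   double& total)
{ std::fill(colsum.begin(), colsum.end(), 0.) ;
    total = 0. ;
    for(const auto& probs : partials)
    { for(size_t i=0; i<colsum.size(); i++)
        { colsum[i] += probs[i] ;
            total += probs[i] ;
        }
    }
}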
void EMRead::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{ // avoid p=0. by rounding errors
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
ReadLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
void EMRead::update_models()
{ this->read_layer->update_model(this->post_prob,
this->post_prob_colsum,
this->threads) ;
}
diff --git a/src/Clustering/EMRead.hpp b/src/Clustering/EMRead.hpp
index 0c49b6a..e3e727a 100644
--- a/src/Clustering/EMRead.hpp
+++ b/src/Clustering/EMRead.hpp
@@ -1,173 +1,188 @@
#ifndef EMREAD_HPP
#define EMREAD_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <future> // std::promise
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <ReadLayer.hpp>
typedef std::vector<double> vector_d ;
class EMRead : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* regions (rows) according to the shape of the signal
* with the given shifting and flipping freedom.
* \param read_matrix a matrix containing the read
* density (ChIP-seq or related signal) for the
* regions of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the
* background by setting all its parameters, at all
* positions, to the mean number of counts. Since
* the background is constant, this class will never
* be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMRead(const Matrix2D<int>& read_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* regions (rows) according to the shape of the signal
* with the given shifting and flipping freedom.
* \param read_matrix a matrix containing the read
* density (ChIP-seq or related signal) for the
* regions of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the
* background by setting all its parameters, at all
* positions, to the mean number of counts. Since
* the background is constant, this class will never
* be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMRead(Matrix2D<int>&& read_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
EMRead(const EMRead& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMRead() override ;
/*!
* \brief Returns the class read signal model.
* \return the class read signal model.
*/
Matrix3D<double> get_read_models() const ;
/*!
* \brief Runs the read signal model optimization and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMRead::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current model, for each state (class, shift
* and flip combination).
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood().
* This method rescales the loglikelihood values by
* subtracting from each value the maximum loglikelihood
* value found in the same data row.
- * This method
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is ReadLayer::p_min_log. Any
+ * value below is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index one past the last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ReadLayer::p_min. Any value below is replaced
+ * by this one.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * ReadLayer::p_min. Any value below is replaced
+ * by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param post_prob_colsum the partial column (over the classes)
* sum of posterior probabilities. If several routines
* are running together, the final colsums are obtained by
* summing up the per-routine vectors.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Updates the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the max loglikelihood value for
* each data row.
*/
std::vector<double> loglikelihood_max ;
/*!
* \brief A pointer to the object managing
* the data and their model.
*/
ReadLayer* read_layer ;
} ;
#endif // EMREAD_HPP
diff --git a/src/Clustering/EMSequence.cpp b/src/Clustering/EMSequence.cpp
index 3f5bebb..4844f2c 100644
--- a/src/Clustering/EMSequence.cpp
+++ b/src/Clustering/EMSequence.cpp
@@ -1,357 +1,358 @@
#include <EMSequence.hpp>
#include <string>
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <cmath> // exp()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <SequenceLayer.hpp> // SequenceLayer
#include <RandomNumberGenerator.hpp> // getRandomNumberGenerator()
#include <ConsoleProgressBar.hpp> // ConsoleProgressBar
#include <ThreadPool.hpp> // ThreadPool
EMSequence::EMSequence(const Matrix2D<int>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->seq_layer = new SequenceLayer(seq_matrix,
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
EMSequence::EMSequence(Matrix2D<int>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
n_class,
n_iter,
n_shift,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// initialise post prob randomly
// getRandomGenerator(seed) ;
this->set_post_prob_random(seed) ;
// data and models
this->seq_layer = new SequenceLayer(std::move(seq_matrix),
this->n_class,
this->n_shift,
this->flip,
bckg_class) ;
// initialise the models with the post prob
this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
EMSequence::EMSequence(const Matrix2D<int>& seq_matrix,
const Matrix3D<double>& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
motifs.get_dim()[0],
n_iter,
seq_matrix.get_ncol() - motifs.get_dim()[1] + 1,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// data and models
// background motif (if any) is the last of the given motifs
this->seq_layer = new SequenceLayer(seq_matrix,
motifs,
this->flip,
bckg_class) ;
// initialise the class prob uniformly
this->set_state_prob_uniform() ;
}
EMSequence::EMSequence(Matrix2D<int>&& seq_matrix,
Matrix3D<double>&& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads)
: EMBase(seq_matrix.get_nrow(),
seq_matrix.get_ncol(),
motifs.get_dim()[0],
n_iter,
seq_matrix.get_ncol() - motifs.get_dim()[1] + 1,
flip,
n_threads),
loglikelihood_max(n_row, 0.),
seq_layer(nullptr)
{
this->loglikelihood_max = vector_d(n_row, 0.) ;
// data and models
// background motif (if any) is the last of the given motifs
this->seq_layer = new SequenceLayer(std::move(seq_matrix),
std::move(motifs),
this->flip,
bckg_class) ;
// initialise the class prob uniformly
this->set_state_prob_uniform() ;
}
EMSequence::~EMSequence()
{ if(this->seq_layer != nullptr)
{ delete this->seq_layer ;
this->seq_layer = nullptr ;
}
if(this->threads != nullptr)
{ this->threads->join() ;
delete this->threads ;
this->threads = nullptr ;
}
}
Matrix3D<double> EMSequence::get_sequence_models() const
{ return seq_layer->get_model() ; }
EMSequence::exit_codes EMSequence::classify()
{ size_t bar_update_n = this->n_iter ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ;
// optimize the partition
for(size_t n_iter=0; n_iter<this->n_iter; n_iter++)
{ // E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->update_models() ;
this->center_post_state_prob() ;
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return EMSequence::exit_codes::ITER_MAX ;
}
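// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : classify()
// above alternates an E-step (log likelihoods, posterior
// probabilities) with an M-step (class probabilities, model update)
// for a fixed number of iterations. The self-contained toy below runs
// the same E/M alternation for a two-component Poisson mixture ;
// every name in it (pois_log_pmf_demo, lambda, prior, resp) is
// hypothetical and not part of the EMSequence API.
// ------------------------------------------------------------------
#include <array>
#include <cmath>
#include <cstdio>
#include <vector>
static double pois_log_pmf_demo(int k, double lambda)
{ return k*std::log(lambda) - lambda - std::lgamma(k+1.) ; }
int main()
{ std::vector<int> x = {1, 2, 1, 0, 9, 11, 10, 12} ;
    double lambda[2] = {1., 5.} ;  // initial class models
    double prior[2]  = {0.5, 0.5} ; // initial class probabilities
    for(int iter=0; iter<20; iter++)
    {   // E-step : posterior probability of each class, for each point
        std::vector<std::array<double,2>> resp(x.size()) ;
        for(size_t i=0; i<x.size(); i++)
        {   double p0 = prior[0]*std::exp(pois_log_pmf_demo(x[i], lambda[0])) ;
            double p1 = prior[1]*std::exp(pois_log_pmf_demo(x[i], lambda[1])) ;
            resp[i] = {p0/(p0+p1), p1/(p0+p1)} ;
        }
        // M-step : update the class probabilities and the class models
        for(int k=0; k<2; k++)
        {   double n_k = 0., sum_k = 0. ;
            for(size_t i=0; i<x.size(); i++)
            {   n_k += resp[i][k] ; sum_k += resp[i][k]*x[i] ; }
            prior[k]  = n_k / double(x.size()) ;
            lambda[k] = sum_k / n_k ;
        }
    }
    std::printf("lambda = %.2f / %.2f\n", lambda[0], lambda[1]) ;
    return 0 ;
}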
void EMSequence::compute_loglikelihood()
{ // compute the loglikelihood
this->seq_layer->compute_loglikelihoods(this->loglikelihood,
this->loglikelihood_max,
this->threads) ;
// rescale the values
// don't parallelize
if(this->threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihood_routine(0,
this->n_row,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMSequence::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
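// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : the
// parallelisation pattern used above - split the rows into slices,
// hand each worker a std::promise and wait on the matching futures -
// can be reproduced with plain std::thread. split_range_demo is a
// stand-in for ThreadPool::split_range, not the library routine.
// ------------------------------------------------------------------
#include <future>
#include <thread>
#include <utility>
#include <vector>
static std::vector<std::pair<size_t,size_t>>
split_range_demo(size_t from, size_t to, size_t n)
{   std::vector<std::pair<size_t,size_t>> slices ;
    size_t len = (to - from) / n ;
    for(size_t i=0; i<n; i++)
    {   size_t lo = from + i*len ;
        size_t hi = (i == n-1) ? to : lo + len ;
        slices.emplace_back(lo, hi) ;
    }
    return slices ;
}
static void work_on_slice_demo(size_t from, size_t to,
                               std::promise<bool>& done)
{   // process rows [from, to) here, then signal completion
    (void)from ; (void)to ;
    done.set_value(true) ;
}
void parallel_rows_demo(size_t n_row, size_t n_threads)
{   auto slices = split_range_demo(0, n_row, n_threads) ;
    std::vector<std::promise<bool>> promises(n_threads) ;
    std::vector<std::future<bool>>  futures(n_threads) ;
    std::vector<std::thread>        workers ;
    for(size_t i=0; i<n_threads; i++)
    {   futures[i] = promises[i].get_future() ;
        workers.emplace_back(work_on_slice_demo,
                             slices[i].first,
                             slices[i].second,
                             std::ref(promises[i])) ;
    }
    for(auto& f : futures) // block until every slice is processed
    {   f.get() ; }
    for(auto& w : workers)
    {   w.join() ; }
}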
void EMSequence::compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done)
{
// rescale the values
for(size_t i=from; i<to; i++)
{ for(size_t j=0; j<this->n_class; j++)
{ for(size_t k=0; k<this->n_shift; k++)
{ for(size_t l=0; l<this->n_flip; l++)
{ this->loglikelihood(i,j,k,l) =
- (this->loglikelihood(i,j,k,l) -
- this->loglikelihood_max[i]) ;
+ std::max(this->loglikelihood(i,j,k,l) -
+ this->loglikelihood_max[i],
+ SequenceLayer::p_min_log) ;
}
}
}
}
done.set_value(true) ;
}
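// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : the routine
// above subtracts the per-row maximum from every log likelihood (so
// that the largest value becomes 0 and exp() cannot overflow) and
// floors the result, exactly as a standard log-sum-exp stabilisation
// does. p_min_log_demo stands in for SequenceLayer::p_min_log.
// ------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <vector>
static const double p_min_log_demo = std::log(1e-100) ;
void rescale_row_demo(std::vector<double>& ll_row)
{   double row_max = *std::max_element(ll_row.begin(), ll_row.end()) ;
    for(double& ll : ll_row)
    {   // after this, exp(ll) lies in [1e-100, 1]
        ll = std::max(ll - row_max, p_min_log_demo) ;
    }
}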
void EMSequence::compute_post_prob()
{ // don't parallelize
if(this->threads == nullptr)
{ std::promise<vector_d> promise ;
std::future<vector_d> future = promise.get_future() ;
this->compute_post_prob_routine(0, this->n_row, promise) ;
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = future.get() ;
for(const auto& prob : this->post_prob_colsum)
{ this->post_prob_tot += prob ; }
}
// parallelize
else
{ size_t n_threads = this->threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row,n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector<std::promise<vector_d>> promises(n_threads) ;
std::vector<std::future<vector_d>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
this->threads->addJob(std::move(
std::bind(&EMSequence::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_colsum = vector_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; i<this->n_class; i++)
{ double prob = probs[i] ;
this->post_prob_colsum[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
}
void EMSequence::compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum)
{ vector_d colsums(this->n_class, 0.) ;
// reset grand total
// this->post_prob_tot = 0 ;
// this->post_prob_colsum = vector_d(n_class, 0) ;
// post prob
for(size_t i=from; i<to; i++)
{ // reset row sum to 0
this->post_prob_rowsum[i] = 0. ;
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) *
this->post_state_prob(n_class,n_shift,n_flip) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
this->post_prob_rowsum[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_class<this->n_class; n_class++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ for(size_t n_flip=0; n_flip<this->n_flip; n_flip++)
{
double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) /
this->post_prob_rowsum[i],
SequenceLayer::p_min) ;
this->post_prob(i,n_class,n_shift,n_flip) = p ;
colsums[n_class] += p ;
}
}
}
}
post_prob_colsum.set_value(colsums) ;
}
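// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : for one data
// row, the posterior of each state is exp(rescaled loglikelihood)
// times the state prior, normalised by the row sum and floored at a
// small p_min so that no posterior is exactly 0. p_min_demo stands in
// for SequenceLayer::p_min ; the flat std::vector replaces the 4D
// matrix over (class, shift, flip).
// ------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <vector>
static const double p_min_demo = 1e-100 ;
std::vector<double> posterior_row_demo(const std::vector<double>& ll,
                                       const std::vector<double>& prior)
{   std::vector<double> post(ll.size()) ;
    double rowsum = 0. ;
    for(size_t s=0; s<ll.size(); s++)
    {   post[s] = std::exp(ll[s]) * prior[s] ;
        rowsum += post[s] ;
    }
    for(double& p : post)
    {   p = std::max(p / rowsum, p_min_demo) ; }
    return post ;
}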
void EMSequence::update_models()
{ this->seq_layer->update_model(this->post_prob,
this->threads) ;
}
diff --git a/src/Clustering/EMSequence.hpp b/src/Clustering/EMSequence.hpp
index 622532b..537c285 100644
--- a/src/Clustering/EMSequence.hpp
+++ b/src/Clustering/EMSequence.hpp
@@ -1,240 +1,255 @@
#ifndef EMSEQUENCE_HPP
#define EMSEQUENCE_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <future> // std::promise
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <SequenceLayer.hpp>
typedef std::vector<double> vector_d ;
class EMSequence : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* given sequences (rows) according to their motif
* content.
* The sequence models are initialised randomly.
* \param sequence_matrix a matrix containing the sequences
* of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the background
* by setting all its parameters, at all positions, to the
* background base probabilities. Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMSequence(const Matrix2D<int>& sequence_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* given sequences (rows) according to their motif
* content.
* The sequence models are initialised randomly.
* \param sequence_matrix a matrix containing the sequences
* of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model the background
* by setting all its parameters, at all positions, to the
* background base probabilities. Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMSequence(Matrix2D<int>&& sequence_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* given sequences (rows) according to their motif
* content.
* The sequence class models are initialised using
* the given motifs. The class probabilities are
* initialised uniformly.
* The shifting freedom is set to (data number
* of columns) - (the model 2nd dimension)
* + 1.
* \param sequence_matrix a matrix containing the sequences
* of interest.
* \param motifs a matrix containing the different initial
* class models with the following dimensions :
* dim1 the number of classes
* dim2 the model length
* dim3 4 for A,C,G,T
* \param n_iter the number of optimization iterations.
* \param flip whether flipping is allowed.
* \param bckg_class indicates that the last class in the
* given motifs is used to model the background and it
* should never be updated.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMSequence(const Matrix2D<int>& sequence_matrix,
const Matrix3D<double>& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* given sequences (rows) according to their motif
* content.
* The sequence class models are initialised using
* the given motifs. The class probabilities are
* initialised uniformly.
* The shifting freedom is set to (data number
* of columns) - (the model 2nd dimension)
* + 1.
* \param sequence_matrix a matrix containing the sequences
* of interest.
* \param motifs a matrix containing the different initial
* class models with the following dimensions :
* dim1 the number of classes
* dim2 the model length
* dim3 4 for A,C,G,T
* \param n_iter the number of optimization iterations.
* \param flip whether flipping is allowed.
* \param bckg_class indicates that the last class in the
* given motifs is used to model the background and it
* should never be updated.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMSequence(Matrix2D<int>&& sequence_matrix,
Matrix3D<double>&& motifs,
size_t n_iter,
bool flip,
bool bckg_class,
size_t n_threads=0) ;
EMSequence(const EMSequence& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMSequence() override ;
/*!
* \brief Returns the class sequence model.
* \return the class sequence model.
*/
Matrix3D<double> get_sequence_models() const ;
/*!
* \brief Runs the sequence model optimization and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMSequence::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current models, for all layers and the joint
* likelihood for each state (the sum of the layer
* likelihoods for all layers, for a given state).
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is SequenceLayer::p_min_log. Any
+ * value below is replaced by this one.
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood().
* This method rescales the loglikelihood values by
* subtracting from each value the maximum loglikelihood
* value found in the same data row.
- * This method
+ * To avoid numerical issues when computing posterior
+ * probabilities, the lowest possible value authorized
+ * as log likelihood is SequenceLayer::p_min_log. Any
+ * value below is replaced by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * SequenceLayer::p_min. Any value below is replaced
+ * by this one.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilities.
+ * To avoid numerical issues the lowest possible
+ * value authorized as posterior probability is
+ * SequenceLayer::p_min. Any value below is replaced
+ * by this one.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param post_prob_colsum the partial column (over the classes)
* sum of posterior probabilities. If several routines
* are running together, the final colsums are obtained by
* summing up the per-routine vectors.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Updates the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the max loglikelihood value for
* each data row.
*/
std::vector<double> loglikelihood_max ;
/*!
* \brief A pointer to the object managing
* the data and their model.
*/
SequenceLayer* seq_layer ;
} ;
#endif // EMSEQUENCE_HPP
diff --git a/src/Clustering/ReadLayer.cpp b/src/Clustering/ReadLayer.cpp
index 3c0e70d..9370245 100644
--- a/src/Clustering/ReadLayer.cpp
+++ b/src/Clustering/ReadLayer.cpp
@@ -1,492 +1,496 @@
#include <ReadLayer.hpp>
#include <stdexcept> // std::invalid_argument
#include <limits> // numeric_limits
#include <cmath> // log(), exp(), pow()
#include <vector>
#include <future> // std::promise, std::future
#include <utility> // std::pair, std::move()
#include <functional> // std::bind(), std::ref()
#include <Statistics.hpp> // poisson_pmf()
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ThreadPool.hpp>
#include <iostream>
typedef std::vector<double> vector_d ;
ReadLayer::ReadLayer(const Matrix2D<int>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class,
ThreadPool* threads)
: Data2DLayer(data, n_class, n_shift, flip, bckg_class),
window_means(n_row, n_shift, 0.)
{ this->n_category = 1 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
// compute window means
this->compute_window_means(threads) ;
}
ReadLayer::ReadLayer(Matrix2D<int>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class,
ThreadPool* threads)
: Data2DLayer(std::move(data), n_class, n_shift, flip, bckg_class),
window_means(n_row, n_shift, 0.)
{ this->n_category = 1 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
// compute window means
this->compute_window_means(threads) ;
}
ReadLayer::ReadLayer(const Matrix2D<int>& data,
const Matrix3D<double>& model,
bool flip,
bool bckg_class,
ThreadPool* threads)
: Data2DLayer(data, model, flip, bckg_class),
window_means(n_row, n_shift, 0.)
{ // check that the model only has one category
if(this->n_category > 1)
{ char msg[4096] ;
sprintf(msg,
"Error! model is expected to have length 1 on "
"3rd dimension, not %zu",
this->n_category) ;
throw std::invalid_argument(msg) ;
}
// compute window means
this->compute_window_means(threads) ;
}
ReadLayer::ReadLayer(Matrix2D<int>&& data,
Matrix3D<double>&& model,
bool flip,
bool bckg_class,
ThreadPool* threads)
: Data2DLayer(std::move(data), std::move(model), flip, bckg_class),
window_means(n_row, n_shift, 0.)
{ // check that the model only has one category
if(this->n_category > 1)
{ char msg[4096] ;
sprintf(msg,
"Error! model is expected to have length 1 on "
"3rd dimension, not %zu",
this->n_category) ;
throw std::invalid_argument(msg) ;
}
// compute window means
this->compute_window_means(threads) ;
}
ReadLayer::~ReadLayer()
{}
void ReadLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads) const
{ // dimension checks
this->check_loglikelihood_dim(loglikelihood) ;
this->check_loglikelihood_max_dim(loglikelihood_max) ;
// don't parallelize
if(threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihoods_routine(0,
this->n_row,
std::ref(loglikelihood),
std::ref(loglikelihood_max),
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ReadLayer::compute_loglikelihoods_routine,
this,
slice.first,
slice.second,
std::ref(loglikelihood),
std::ref(loglikelihood_max),
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void ReadLayer::compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
std::promise<bool>& done) const
{
// normalize the models
Matrix3D<double> model_norm = this->model ;
for(size_t i=0; i<this->n_class; i++)
{ double mean = 0. ;
for(size_t j=0; j<this->l_model; j++)
{ mean += model_norm(i,j,0) ; }
mean /= this->l_model ;
for(size_t j=0; j<this->l_model; j++)
{ model_norm(i,j,0) /= mean ; }
}
// compute log likelihood
for(size_t i=from; i<to; i++)
{
// set max to min possible value
loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
for(size_t j=0; j<this->n_class; j++)
{ for(size_t s_fw=0, s_rev=this->n_shift-1;
s_fw<this->n_shift; s_fw++, s_rev--)
{ // slice is [from_fw,to)
// from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw]
// fw |---------->>>----------|
// ----------------------------------> data
// rev |----------<<<----------| [from_dat_rev, to_dat_rev]
// to_dat_rev can be -1 -> int
// to_dat_rev from_dat_rev
// log likelihood
double ll_fw = 0. ;
double ll_rev = 0. ;
// --------------- forward ---------------
size_t from_dat_fw = s_fw ;
size_t to_dat_fw = from_dat_fw + this->l_model - 1 ;
// --------------- reverse ---------------
size_t from_dat_rev = this->n_col - 1 - s_fw ;
// size_t to_dat_rev = from_dat_rev - (this->l_model - 1) ;
for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev;
j_dat_fw<=to_dat_fw; // inclusive : the window covers l_model positions
j_dat_fw++, j_ref_fw++, j_dat_rev--)
{
double ll ;
// --------------- forward ---------------
ll = log(poisson_pmf(this->data(i,j_dat_fw),
model_norm(j,j_ref_fw,0)*
this->window_means(i,s_fw))) ;
- // ll_fw += ll ;
// p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf
- ll_fw += std::max(ll, ReadLayer::p_min_log) ;
+ // prevent this by flooring the log likelihood at p_min_log
+ if(std::isinf(ll))
+ { ll = ReadLayer::p_min_log ; }
+ ll_fw += ll ;
// --------------- reverse ---------------
if(this->flip)
{ ll = log(poisson_pmf(this->data(i,j_dat_rev),
model_norm(j,j_ref_fw,0)*
this->window_means(i,s_rev))) ;
- // ll_rev += ll ;
// p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf
- ll_rev += std::max(ll, ReadLayer::p_min_log) ;
+ // prevent this by flooring the log likelihood at p_min_log
+ if(std::isinf(ll))
+ { ll = ReadLayer::p_min_log ; }
+ ll_rev += ll ;
}
}
loglikelihood(i,j,from_dat_fw,flip_states::FORWARD) = ll_fw ;
// keep track of the max per row
if(ll_fw > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_fw ; }
if(this->flip)
{ loglikelihood(i,j,from_dat_fw,flip_states::REVERSE) = ll_rev ;
// keep track of the max per row
if(ll_rev > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_rev ; }
}
}
}
}
done.set_value(true) ;
}
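// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : the routine
// above models each read count with a Poisson whose mean is the
// (normalised) model value scaled by the window mean, and replaces a
// log(0) = -inf term by a finite floor. poisson_log_pmf_demo and
// p_min_log_demo are stand-ins, not poisson_pmf() from Statistics.hpp,
// and lambda is assumed to be > 0.
// ------------------------------------------------------------------
#include <cmath>
#include <vector>
static const double p_min_log_demo = std::log(1e-100) ;
static double poisson_log_pmf_demo(int k, double lambda)
{ return k*std::log(lambda) - lambda - std::lgamma(k+1.) ; }
double window_loglikelihood_demo(const std::vector<int>& counts,
                                 const std::vector<double>& model_norm,
                                 double window_mean)
{   double ll = 0. ;
    for(size_t j=0; j<model_norm.size(); j++)
    {   double l = poisson_log_pmf_demo(counts[j],
                                        model_norm[j]*window_mean) ;
        // an extremely unlikely count yields -inf ; floor it
        if(std::isinf(l))
        {   l = p_min_log_demo ; }
        ll += l ;
    }
    return ll ;
}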
void ReadLayer::update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads)
{
// computing sum over the columns (classes)
size_t n_row = posterior_prob.get_dim()[0] ;
size_t n_class = posterior_prob.get_dim()[1] ;
size_t n_shift = posterior_prob.get_dim()[2] ;
size_t n_flip = posterior_prob.get_dim()[3] ;
vector_d colsum(n_class, 0.) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ colsum[j] += posterior_prob(i,j,k,l) ; }
}
}
}
this->update_model(posterior_prob,
colsum,
threads) ;
}
void ReadLayer::update_model(const Matrix4D<double>& posterior_prob,
const vector_d& posterior_prob_colsum,
ThreadPool* threads)
{
// don't parallelize
if(threads == nullptr)
{ std::promise<Matrix3D<double>> promise ;
std::future<Matrix3D<double>> future = promise.get_future() ;
this->update_model_routine(0,
this->n_row,
posterior_prob,
posterior_prob_colsum,
promise) ;
// this->model = future.get() ;
auto model = future.get() ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = model(i,j,k) ; }
}
}
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ReadLayer::update_model_routine,
this,
slice.first,
slice.second,
std::ref(posterior_prob),
std::ref(posterior_prob_colsum),
std::ref(promises[i])))) ;
}
// reinitialise the model
/*
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
*/
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = 0. ; }
}
}
// wait until all threads are done working
// and update the model
for(auto& future : futures)
{ Matrix3D<double> model_part = future.get() ;
// for(size_t i=0; i<this->n_class; i++)
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) +=
model_part(i,j,k) ;
}
}
}
}
// -------------------------- threads stop ---------------------------
}
// avoid 0's in the model to ensure that pmf_poisson() never
// return 0
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) =
std::max(this->model(i,j,k), ReadLayer::p_min) ;
}
}
}
}
void ReadLayer::update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
const vector_d& posterior_prob_colsum,
std::promise<Matrix3D<double>>& promise) const
{
// dimension checks
this->check_posterior_prob_dim(posterior_prob) ;
this->check_posterior_prob_colsum_dim(posterior_prob_colsum) ;
// partial model
Matrix3D<double> model(this->n_class,
this->l_model,
this->n_category,
0.) ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t n_class=0; n_class < n_class_to_update; n_class++)
{ for(size_t i=from; i<to; i++)
{ for(size_t n_shift=0; n_shift<this->n_shift; n_shift++)
{ // --------------- forward ---------------
int from_dat_fw = n_shift ;
int to_dat_fw = from_dat_fw + this->l_model - 1 ;
for(int j_dat_fw=from_dat_fw, j_ref_fw=0;
j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++)
{ model(n_class,j_ref_fw,0) +=
(posterior_prob(i,n_class,n_shift,flip_states::FORWARD) *
this->data(i,j_dat_fw)) /
posterior_prob_colsum[n_class] ;
}
// --------------- reverse ---------------
if(this->flip)
{ int from_dat_rev = this->n_col - 1 - n_shift ;
int to_dat_rev = from_dat_rev - (this->l_model - 1) ;
for(int j_dat_rev=from_dat_rev, j_ref_fw=0;
j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++)
{ model(n_class,j_ref_fw,0) +=
(posterior_prob(i,n_class,n_shift,flip_states::REVERSE) *
this->data(i,j_dat_rev)) /
posterior_prob_colsum[n_class] ;
}
}
}
}
}
promise.set_value(model) ;
}
void ReadLayer::compute_window_means(ThreadPool* threads)
{ // don't parallelize
if(threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_window_means_routine(0, this->n_row, promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&ReadLayer::compute_window_means_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void ReadLayer::compute_window_means_routine(size_t from,
size_t to,
std::promise<bool>& done)
{ double l_window = double(this->l_model) ;
for(size_t i=from; i<to; i++)
{ // s runs over the shift states ; it is named differently from
// the <from>/<to> row slice parameters to avoid shadowing them
for(size_t s=0; s<this->n_shift; s++)
{ double sum = 0. ;
// window slice is [s, s+l_model)
size_t win_end = s + this->l_model ;
for(size_t j=s; j<win_end; j++)
{ sum += this->data(i,j) ;}
this->window_means(i,s) = sum / l_window ;
}
}
done.set_value(true) ;
}
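// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : each shift
// state corresponds to one window of l_model columns, and the routine
// above stores the mean count of that window. The stand-alone
// function below computes the same sliding means for one row of
// counts.
// ------------------------------------------------------------------
#include <vector>
std::vector<double> window_means_demo(const std::vector<int>& row,
                                      size_t l_model)
{   size_t n_shift = row.size() - l_model + 1 ;
    std::vector<double> means(n_shift, 0.) ;
    for(size_t s=0; s<n_shift; s++)
    {   double sum = 0. ;
        for(size_t j=s; j<s+l_model; j++) // window is [s, s+l_model)
        {   sum += row[j] ; }
        means[s] = sum / double(l_model) ;
    }
    return means ;
}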
void ReadLayer::check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const
{ if(posterior_prob_colsum.size() != this->n_class)
{ char msg[4096] ;
sprintf(msg,
"Error! posterior_prob_colsum length is not "
"equal to the model class number : %zu / %zu",
posterior_prob_colsum.size(), this->n_class) ;
throw std::invalid_argument(msg) ;
}
}
void ReadLayer::create_bckg_class()
{
// mean count
double mean = 0. ;
double n = (double)this->data.get_nrow() *
(double)this->data.get_ncol() ;
for(size_t i=0; i<this->data.get_nrow(); i++)
{ for(size_t j=0; j<this->data.get_ncol(); j++)
{ mean += ((double)this->data(i,j)) / n ; }
}
// set last class as background class
for(size_t j=0; j<this->l_model; j++)
{ this->model(this->n_class-1,j,0) = mean ; }
}
diff --git a/src/Clustering/ReadLayer.hpp b/src/Clustering/ReadLayer.hpp
index 5d52c92..cd001e4 100644
--- a/src/Clustering/ReadLayer.hpp
+++ b/src/Clustering/ReadLayer.hpp
@@ -1,269 +1,278 @@
#ifndef READLAYER_HPP
#define READLAYER_HPP
#include <Data2DLayer.hpp>
#include <future> // std::promise
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ThreadPool.hpp>
typedef std::vector<double> vector_d ;
class ReadLayer : public Data2DLayer
{
public:
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
* \param threads a pointer to a thread pool to
* parallelize the computations. If nullptr is given,
* the computations are performed by the main thread.
*/
ReadLayer(const Matrix2D<int>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class,
ThreadPool* threads = nullptr) ;
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
* \param threads a pointer to a thread pool to
* parallelize the computations. If nullptr is given,
* the computations are performed by the main thread.
*/
ReadLayer(Matrix2D<int>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class,
ThreadPool* threads = nullptr) ;
/*!
* \brief Construct an object with the
* given data and model.
* \param data the data.
* \param model the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
* \param threads a pointer to a thread pool to
* parallelize the computations. If nullptr is given,
* the computations are performed by the main thread.
*/
ReadLayer(const Matrix2D<int>& data,
const Matrix3D<double>& model,
bool flip,
bool bckg_class,
ThreadPool* threads = nullptr) ;
/*!
* \brief Construct an object with the
* given data and model.
* \param data the data.
* \param model the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
* \param threads a pointer to a thread pool to
* parallelize the computations. If nullptr is given,
* the computations are performed by the main thread.
*/
ReadLayer(Matrix2D<int>&& data,
Matrix3D<double>&& model,
bool flip,
bool bckg_class,
ThreadPool* threads = nullptr) ;
/*!
* Destructor
*/
virtual ~ReadLayer() override ;
/*!
* \brief Computes the log likelihood of the data
* given the current model parameters.
* During this process, a normalized version of the
* models, having an average signal of 1 count per
* position, is used (a copy of the models is normalized,
* meaning that the original models can still be retrieved
* through the dedicated getter).
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of row
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of loglikelihood.
* Its length should be equal to the data row number.
* \param threads a pointer to a thread pool to
* parallelize the computations. If nullptr is given,
* the computations are performed by the main thread.
* \throw std::invalid_argument if the dimensions are
* incorrect.
*/
virtual void compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads=nullptr) const override ;
/*!
* \brief Updates the model given the posterior
* probabilities (the probabilities of each row
* in the data to be assigned to each class,
* for each shift and flip state).
- * \param posterior_prob the data assignment probabilities to
- * the different classes.
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is ReadLayer::p_min.
+ * \param posterior_prob the data assignment
+ * probabilities to the different classes.
* \param threads a pointer to a thread pool to
* parallelize the computations. If nullptr is given,
* the computations are performed by the main thread.
*/
virtual void update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads=nullptr) override ;
/*!
* \brief Updates the model given the posterior
* probabilities (the probabilities of each row
* in the data to be assigned to each class,
* for each shift and flip state).
* This method does the same as the virtual method it
* overloads. The only difference is that, for runtime
* efficiency, it is given the sum over the columns of the
* posterior_prob matrix, which the virtual method otherwise
* computes itself.
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is ReadLayer::p_min.
* \param posterior_prob the data assignment probabilities to
* the different classes.
* \param posterior_prob_colsum the sum over the columns
* (classes) of the posterior_prob matrix.
* \param threads a pointer to a thread pool to
* parallelize the computations. If nullptr is given,
* the computations are performed by the main thread.
*/
void update_model(const Matrix4D<double>& posterior_prob,
const vector_d& posterior_prob_colsum,
ThreadPool* threads=nullptr) ;
protected:
/*!
* \brief The routine that effectively performs the
* loglikelihood computations.
* \param from the index of the first row of the data
* to be considered.
* \param to the index of the past last row of the data
* to be considered.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of row
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of log_likelihood.
* Its length should be equal to the data row number.
* \param done a promise to be filled when the routine
* is done running.
*/
void compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
std::promise<bool>& done) const ;
/*!
* \brief The routine that effectively updates the model.
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is ReadLayer::p_min.
* \param from the index of the first row of the
* posterior probabilities to be considered.
* \param to the index of the past last row of the
* posterior probabilities to be considered.
* \param posterior_prob the data assignment probabilities
* to the different classes.
* \param posterior_prob_colsum the sum over the columns
* (classes) of the posterior_prob matrix.
* \param promise a promise containing the partial model
* computed from the given data slice. If several routines
* work together to update the model, the promise matrices
* need to be summed up to get the final model.
*/
void update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
const vector_d& posterior_prob_colsum,
std::promise<Matrix3D<double>>& promise) const ;
/*!
* \brief Computes the mean number of reads present in
* each slice (of length l_model), in each row
* of the data and store them in this->window_means.
* \param threads a pointer to a thread pool to
* parallelize the computations. If nullptr is given,
* the computations are performed by the main thread.
*/
void compute_window_means(ThreadPool* threads) ;
/*!
* \brief The routine that effectively computes the
* window means.
* \param from the index of the first row of the
* data to be considered.
* \param to the index of the past last row of the
* data to be considered.
* \param done a promise to fill when the routine
* is done running.
*/
void compute_window_means_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Checks that the argument has compatible
* dimensions with the data and models. If this is
* not the case, throw a std::invalid_argument with
* a relevant message.
* \param posterior_prob_colsum a vector containing the
* per-class sums of the posterior probabilities.
* It should have a length equal to the number of
* classes.
* \throw std::invalid_argument if the dimensions are
* incorrect.
*/
void check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const ;
/*!
* \brief Sets the last class as background class with the mean
* number of counts in the data at each position.
*/
void create_bckg_class() ;
/*!
* \brief contains the data means, for
* each window of size l_model.
*/
Matrix2D<double> window_means ;
} ;
#endif // READLAYER_HPP
diff --git a/src/Clustering/SequenceLayer.cpp b/src/Clustering/SequenceLayer.cpp
index 8906c96..d14859f 100644
--- a/src/Clustering/SequenceLayer.cpp
+++ b/src/Clustering/SequenceLayer.cpp
@@ -1,432 +1,432 @@
#include <SequenceLayer.hpp>
#include <stdexcept> // std::invalid_argument
#include <limits> // numeric_limits
#include <cmath> // log(), pow()
#include <vector>
#include <algorithm> // std::max_element()
#include <future> // std::promise, std::future
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <dna_utility.hpp> // dna::base_composition()
double SequenceLayer::score_subseq(const Matrix2D<int>& seq,
size_t row,
size_t start,
const Matrix2D<double>& model_log)
{
if(start > seq.get_ncol() - model_log.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, seq.get_ncol() - model_log.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model_log.get_nrow() > seq.get_ncol())
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model_log.get_nrow(), seq.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(model_log.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model_log.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
size_t from = start ;
size_t to = from + model_log.get_nrow() ; // will score [from, to)
int n_code = dna::char_to_int('N') ;
double ll = 0 ;
for(size_t i=from, j=0; i<to; i++, j++)
{ int base = seq(row,i) ;
// N char -> get max score
if(base == n_code)
{ std::vector<double> row = model_log.get_row(j) ;
ll += *(std::max_element(std::begin(row),
std::end(row))) ;
}
// A,C,G,T -> get its score
else
{ ll += model_log(j,base) ; }
}
return ll ;
}
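// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : score_subseq()
// above sums, position by position, the log probability of the
// observed base ; an N (any base) contributes the best possible score
// of its column. The stand-alone version below uses a row-major
// vector<vector<double>> in place of Matrix2D and assumes the A,C,G,T
// integer codes 0-3, any other code being treated as N (the exact
// non-ACGT code is an assumption here).
// ------------------------------------------------------------------
#include <algorithm>
#include <vector>
double score_subseq_demo(const std::vector<int>& seq,
                         size_t start,
                         const std::vector<std::vector<double>>& model_log)
{   double ll = 0. ;
    for(size_t j=0; j<model_log.size(); j++)
    {   int base = seq[start + j] ;
        if(base >= 0 && base <= 3) // A,C,G,T -> its own log prob
        {   ll += model_log[j][base] ; }
        else // N -> the best log prob of this column
        {   ll += *std::max_element(model_log[j].begin(),
                                    model_log[j].end()) ;
        }
    }
    return ll ;
}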
SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data2DLayer(data, n_class, n_shift, flip,bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
SequenceLayer::SequenceLayer(Matrix2D<int>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class)
: Data2DLayer(std::move(data), n_class, n_shift, flip, bckg_class)
{
this->n_category = 4 ;
// initialise the empty model
this->model = Matrix3D<double>(this->n_class,
this->l_model,
this->n_category,
0.) ;
// background class
if(this->bckg_class)
{ this->create_bckg_class() ; }
}
SequenceLayer::SequenceLayer(const Matrix2D<int>& data,
const Matrix3D<double>& model,
bool flip,
bool bckg_class)
: Data2DLayer(data, model,flip, bckg_class)
{}
SequenceLayer::SequenceLayer(Matrix2D<int>&& data,
Matrix3D<double>&& model,
bool flip,
bool bckg_class)
: Data2DLayer(std::move(data), std::move(model),flip, bckg_class)
{}
SequenceLayer::~SequenceLayer()
{ ; }
void SequenceLayer::compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads) const
{
// dimension checks
this->check_loglikelihood_dim(loglikelihood) ;
this->check_loglikelihood_max_dim(loglikelihood_max) ;
// compute the log prob model and the log prob reverse-complement model
std::vector<Matrix2D<double>> model_log(this->n_class,
Matrix2D<double>(this->l_model,
this->n_category,
0.)) ;
std::vector<Matrix2D<double>> model_log_rev = model_log ;
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ // forward
model_log[i](j,k) = log(this->model(i,j,k)) ;
// reverse
model_log_rev[i](this->l_model-j-1,this->n_category-k-1)
= log(this->model(i,j,k)) ;
}
}
}
// don't parallelize
if(threads == nullptr)
{ std::promise<bool> promise ;
std::future<bool> future = promise.get_future() ;
this->compute_loglikelihoods_routine(0, this->n_row,
loglikelihood,
loglikelihood_max,
model_log,
model_log_rev,
promise) ;
future.get() ;
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<bool>> promises(n_threads) ;
std::vector<std::future<bool>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&SequenceLayer::compute_loglikelihoods_routine,
this,
slice.first,
slice.second,
std::ref(loglikelihood),
std::ref(loglikelihood_max),
std::ref(model_log),
std::ref(model_log_rev),
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
}
void SequenceLayer::compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model_log,
const std::vector<Matrix2D<double>>& model_log_rev,
std::promise<bool>& done) const
{
// compute log likelihood
for(size_t i=from; i<to; i++)
{
// set max to min possible value
loglikelihood_max[i] = std::numeric_limits<double>::lowest() ;
for(size_t j=0; j<this->n_class; j++)
- { for(size_t s=0; s<this->n_shift; s++)
+ {
+ for(size_t s=0; s<this->n_shift; s++)
{ // forward strand
- { double ll_fw = std::max(score_subseq(this->data, i, s, model_log[j]),
- SequenceLayer::p_min_log) ;
+ { double ll_fw = score_subseq(this->data, i, s, model_log[j]) ;
// loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// the lower bound (log(1e-100)) is now applied later, when rescaling
loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ;
// keep track of max per row
if(ll_fw > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_fw ; }
}
// reverse
if(this->flip)
- { double ll_rev = std::max(score_subseq(this->data, i, s, model_log_rev[j]),
- SequenceLayer::p_min_log) ;
+ { double ll_rev = score_subseq(this->data, i, s, model_log_rev[j]) ;
// loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// the lower bound (log(1e-100)) is now applied later, when rescaling
loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ;
// keep track of max per row
if(ll_rev > loglikelihood_max[i])
{ loglikelihood_max[i] = ll_rev ; }
}
}
}
}
done.set_value(true) ;
}
+
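// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : the reverse
// strand model used in compute_loglikelihoods() above is built by
// reading the motif back to front and swapping complementary bases,
// i.e. rev[L-1-j][3-k] = model[j][k] for A,C,G,T coded 0-3.
// ------------------------------------------------------------------
#include <vector>
std::vector<std::vector<double>>
reverse_complement_demo(const std::vector<std::vector<double>>& model)
{   size_t L = model.size() ; // motif length, 4 columns per position
    std::vector<std::vector<double>> rev(L, std::vector<double>(4, 0.)) ;
    for(size_t j=0; j<L; j++)
    {   for(size_t k=0; k<4; k++)
        {   rev[L-1-j][3-k] = model[j][k] ; }
    }
    return rev ;
}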
void SequenceLayer::update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads)
{ // don't parallelize
if(threads == nullptr)
{ std::promise<Matrix3D<double>> promise ;
std::future<Matrix3D<double>> future = promise.get_future() ;
this->update_model_routine(0,
this->n_row,
posterior_prob,
promise) ;
// this->model = future.get() ;
auto model = future.get() ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = model(i,j,k) ; }
}
}
}
// parallelize
else
{ size_t n_threads = threads->getNThread() ;
// compute the slices on which each thread will work
std::vector<std::pair<size_t,size_t>> slices =
ThreadPool::split_range(0, this->n_row, n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector<std::promise<Matrix3D<double>>> promises(n_threads) ;
std::vector<std::future<Matrix3D<double>>> futures(n_threads) ;
for(size_t i=0; i<n_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; i<n_threads; i++)
{ auto slice = slices[i] ;
threads->addJob(std::move(
std::bind(&SequenceLayer::update_model_routine,
this,
slice.first,
slice.second,
std::ref(posterior_prob),
std::ref(promises[i])))) ;
}
// reinitialise the model
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) = 0. ; }
}
}
// wait until all threads are done working
// and update the model
for(auto& future : futures)
{ Matrix3D<double> model_part = future.get() ;
// for(size_t i=0; i<this->n_class; i++)
for(size_t i=0; i<n_class_to_update; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) += model_part(i,j,k) ; }
}
}
}
// -------------------------- threads stop ---------------------------
}
// make sure to have no 0 values
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ for(size_t k=0; k<this->n_category; k++)
{ this->model(i,j,k) =
std::max(this->model(i,j,k), SequenceLayer::p_min) ;
}
}
}
// normalize to get probs
for(size_t i=0; i<this->n_class; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ double sum = 0. ;
for(size_t k=0; k<this->n_category; k++)
{ sum += this->model(i,j,k) ; }
for(size_t k=0; k<this->n_category; k++)
{ double p = this->model(i,j,k) / sum ;
this->model(i,j,k) = p ;
}
}
}
}
void SequenceLayer::update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& promise) const
{ // dimension checks
this->check_posterior_prob_dim(posterior_prob) ;
Matrix3D<double> model(this->n_class,
this->l_model,
this->n_category,
0.) ;
// the int code of A, C, G, T, N
static int a_code = dna::char_to_int('A') ;
static int c_code = dna::char_to_int('C') ;
static int g_code = dna::char_to_int('G') ;
static int t_code = dna::char_to_int('T') ;
static int n_code = dna::char_to_int('N') ;
// the int code of the reverse complement of A, C, G, T
static int a_code_r = dna::char_to_int('A', true) ;
static int c_code_r = dna::char_to_int('C', true) ;
static int g_code_r = dna::char_to_int('G', true) ;
static int t_code_r = dna::char_to_int('T', true) ;
size_t n_class_to_update = this->n_class - this->bckg_class ;
for(size_t k=0; k<n_class_to_update; k++)
{ for(size_t s=0; s<this->n_shift; s++)
{ // for(size_t j=0; j<this->l_model; j++)
for(size_t j=0, j_rv=this->l_model-1; j<this->l_model; j++, j_rv--)
{ // base prob on fw and rv strand
vector_d base_prob_fw(this->n_category, 0.) ;
vector_d base_prob_rv(this->n_category, 0.) ;
for(size_t i=from; i<to; i++)
{ int base = this->data(i,s+j) ;
int base_rv = this->n_category - base - 1 ; // complement
// int base_rv = this->n_category - this->data(i,s+j_rv) - 1 ;
// std::cerr << k << " "
// << s << " "
// << j << "/" << j_rv << " "
// << base << "/" << base_rv
// << std::endl ;
// N
if(base == n_code)
{ // --------------- forward ---------------
{ base_prob_fw[a_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[c_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[g_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
base_prob_fw[t_code] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
{ base_prob_rv[a_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[c_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[g_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
base_prob_rv[t_code_r] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
}
}
// A, C, G, T
else
{ // --------------- forward ---------------
{ base_prob_fw[base] +=
posterior_prob(i,k,s,SequenceLayer::FORWARD) ;
}
// --------------- reverse ---------------
if(this->flip)
{ base_prob_rv[base_rv] +=
posterior_prob(i,k,s,SequenceLayer::REVERSE) ;
}
}
}
// update this position of the model
for(size_t i=0; i<base_prob_fw.size(); i++)
{ // --------------- forward ---------------
{ model(k,j,i) += base_prob_fw[i] ; }
// --------------- reverse ---------------
if(this->flip)
{ model(k,this->l_model-j-1,i) += base_prob_rv[i] ;
/* model(k,j_rv,i) += base_prob_rv[i] ; */
}
}
}
}
}
promise.set_value(model) ;
}
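// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file : for one class,
// the update above accumulates the posterior probability of every row
// into the model cell of the observed base, floors the accumulated
// counts at p_min so that no probability is 0, and renormalises each
// position to sum to 1. p_min_demo stands in for SequenceLayer::p_min
// and the shift is fixed at 0 for brevity.
// ------------------------------------------------------------------
#include <algorithm>
#include <vector>
static const double p_min_demo = 1e-100 ;
std::vector<std::vector<double>>
update_motif_demo(const std::vector<std::vector<int>>& seqs, // codes 0-3
                  const std::vector<double>& post, // one weight per row
                  size_t l_model)
{   std::vector<std::vector<double>> model(l_model,
                                           std::vector<double>(4, 0.)) ;
    // posterior-weighted base counts
    for(size_t i=0; i<seqs.size(); i++)
    {   for(size_t j=0; j<l_model; j++)
        {   model[j][seqs[i][j]] += post[i] ; }
    }
    // forbid 0 values, then renormalise every position
    for(auto& column : model)
    {   for(double& v : column)
        {   v = std::max(v, p_min_demo) ; }
        double sum = column[0]+column[1]+column[2]+column[3] ;
        for(double& v : column)
        {   v /= sum ; }
    }
    return model ;
}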
void SequenceLayer::create_bckg_class()
{ // sequence composition
std::vector<double> base_comp =
dna::base_composition(this->data,
this->flip) ;
// set last class as background class
for(size_t i=0; i<this->n_category; i++)
{ for(size_t j=0; j<this->l_model; j++)
{ this->model(this->n_class-1,j,i) = base_comp[i] ; }
}
}
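// ------------------------------------------------------------------
// Illustrative sketch, not part of the original file :
// create_bckg_class() above fills the last class with the overall
// base composition of the data, at every position, so that it scores
// any sequence like a 0-order background model. The stand-alone
// version below computes that composition for sequences coded 0-3,
// skipping non-ACGT codes, and assumes at least one A/C/G/T is present.
// ------------------------------------------------------------------
#include <vector>
std::vector<double>
base_composition_demo(const std::vector<std::vector<int>>& seqs)
{   std::vector<double> comp(4, 0.) ;
    double n = 0. ;
    for(const auto& row : seqs)
    {   for(int base : row)
        {   if(base >= 0 && base <= 3)
            {   comp[base] += 1. ; n += 1. ; }
        }
    }
    for(double& c : comp)
    {   c /= n ; }
    return comp ;
}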
diff --git a/src/Clustering/SequenceLayer.hpp b/src/Clustering/SequenceLayer.hpp
index 2238ff4..4cd5bf8 100644
--- a/src/Clustering/SequenceLayer.hpp
+++ b/src/Clustering/SequenceLayer.hpp
@@ -1,231 +1,237 @@
#ifndef SEQUENCELAYER_HPP
#define SEQUENCELAYER_HPP
#include <Data2DLayer.hpp>
#include <iostream>
#include <string>
#include <future> // std::promise
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ThreadPool.hpp>
typedef std::vector<double> vector_d ;
class SequenceLayer : public Data2DLayer
{
public:
/*!
* \brief Computes the log-likelihood of the sub-
* sequence - stored in a given row - and starting
* at the offset <col> in the given sequence matrix.
* The subsequence length is determined by the model
* length.
* \param seq the sequences in integer format.
* \param row the row containing the sequence of
* interest.
* \param col the index at which the sub-sequence
* is starting (1st index inside the subsequence
* of interest).
* \param model_log a model containing the log
* probability model.
* \return the log-likelihood of the sub-sequence
* given the model.
* \throw std::invalid_argument if 1) the offset is
* invalid, 2) the sequence and the model have
* incompatible dimensions or 3) the model 2nd dimension
* is not 4 (A,C,G,T).
*/
static double score_subseq(const Matrix2D<int>& seq,
size_t row,
size_t col,
const Matrix2D<double>& model_log) ;
public:
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
*/
SequenceLayer(const Matrix2D<int>& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Constructs an object with the
* given data and an empty (0 values)
* model.
* \param data the data.
* \param n_class the number of classes
* of the model.
* \param n_shift the number of shift
* states of the model.
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
*/
SequenceLayer(Matrix2D<int>&& data,
size_t n_class,
size_t n_shift,
bool flip,
bool bckg_class) ;
/*!
* \brief Construct an object with the
* given data and model.
* The shifting freedom is set to (data number
* of columns) - (the model 2nd dimension)
* + 1.
* \param data the data. The sequences
* should be stored as integer values :
* A:0, C:1, G:2, T:3, else:5.
* \param model the model with the following
* dimensions :
* dim1 the number of classes
* dim2 the model length
* dim3 4 (A,C,G,T)
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model is constant and models the
* background.
*/
SequenceLayer(const Matrix2D<int>& data,
const Matrix3D<double>& model,
bool flip,
bool bckg_class) ;
/*!
* \brief Construct an object with the
* given data and model.
* The shifting freedom is set to (data number
* of columns) - (the model 2nd dimension)
* + 1.
* \param data the data. The sequences
* should be stored as integer values :
* A:0, C:1, G:2, T:3, else:5.
* \param model the model with the following
* dimensions :
* dim1 the number of classes
* dim2 the model length
* dim3 4 (A,C,G,T)
* \param flip whether flipping is allowed.
* \param bckg_class whether the last class
* of the model should be kept constant and
* model the background.
*/
SequenceLayer(Matrix2D<int>&& data,
Matrix3D<double>&& model,
bool flip,
bool bckg_class) ;
/*!
* Destructor
*/
virtual ~SequenceLayer() override ;
/*!
* \brief Computes the log likelihood of the data
* given the current model parameters.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of rows
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of loglikelihood.
* Its length should be equal to the data row number.
* \throw std::invalid_argument if the dimensions are
* incorrect.
*/
virtual void compute_loglikelihoods(Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
ThreadPool* threads=nullptr) const override ;
/*!
* \brief Updates the model given the posterior
* probabilities (the probabilities of each row
* in the data to be assigned to each class,
* for each shift and flip state).
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is SequenceLayer::p_min.
* \param posterior_prob the data assignment probabilities to
* the different classes.
*/
virtual void update_model(const Matrix4D<double>& posterior_prob,
ThreadPool* threads=nullptr) override ;
protected:
/*!
* \brief The routine that effectively performs the
* loglikelihood computations.
* \param from the index of the first row of the data
* to be considered.
* \param to the index of the past-the-last row of the data
* to be considered.
* \param loglikelihood a matrix to store the
* results. It should have the following dimensions :
* 1st : same as the data number of rows
* 2nd : same as the model number of classes
* 3rd : same as the number of shifts
* 4th : same as the number of flip states
* \param loglikelihood_max a vector containing the
* max value for each row of loglikelihood.
* Its length should be equal to the data row number.
* \param model_log a vector containing the matrices with
* the log values of the model for each class.
* \param model_log_rev a vector containing the matrices with
* the log values of the reverse strand model for each class
* (the 1st position in the model becomes the last in the
* reverse model and probabilities are swapped A<->T and C<->G).
* \param done a promise to be filled when the routine
* is done running.
*/
void compute_loglikelihoods_routine(size_t from,
size_t to,
Matrix4D<double>& loglikelihood,
vector_d& loglikelihood_max,
const std::vector<Matrix2D<double>>& model_log,
const std::vector<Matrix2D<double>>& model_log_rev,
std::promise<bool>& done) const ;
/*!
* \brief The routine that effectively updates the model.
+ * 0 values inside the model are forbidden.
+ * The lowest possible value inside the model
+ * is SequenceLayer::p_min.
* \param from the index of the first row of the
* posterior probabilities to be considered.
* \param to the index of the past-the-last row of the
* posterior probabilities to be considered.
* \param posterior_prob the data assignment probabilities
* to the different classes.
* \param done a promise to be filled with the partial model
* computed from the given data slice. If several routines
* work together at updating the model, the partial models
* need to be summed and normalized (by the column sum) to
* get the final model.
*/
void update_model_routine(size_t from,
size_t to,
const Matrix4D<double>& posterior_prob,
std::promise<Matrix3D<double>>& done) const ;
/*!
* \brief Sets the last class as background class with the
* mean base probabilities of the data at each position.
*/
void create_bckg_class() ;
} ;
#endif // SEQUENCELAYER_HPP
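// --- Illustrative sketch, not part of the repository --------------------
// The scoring idea behind SequenceLayer::score_subseq() : the
// log-likelihood of the subsequence of length model_log.get_nrow()
// starting at <col> is the sum, over model positions, of the log
// probability of the observed base. Bases are integer coded
// (A:0, C:1, G:2, T:3) ; non-ACGT codes are not handled in this sketch.
inline double score_subseq_sketch(const Matrix2D<int>& seq,
                                  size_t row,
                                  size_t col,
                                  const Matrix2D<double>& model_log)
{   double ll = 0. ;
    for(size_t j=0; j<model_log.get_nrow(); j++)
    {   ll += model_log(j, seq(row, col+j)) ; }
    return ll ;
}
// -------------------------------------------------------------------------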
diff --git a/src/main_test.cpp b/src/main_test.cpp
index 2256a7a..7e2a898 100644
--- a/src/main_test.cpp
+++ b/src/main_test.cpp
@@ -1,457 +1,453 @@
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <cmath>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ConsensusSequenceModelComputer.hpp>
#include <EMConsensusSequence.hpp>
#include <SequenceModelComputer.hpp>
#include <EMSequence.hpp>
#include <dna_utility.hpp>
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
const Matrix4D<double>& prob,
size_t class_k)
{
size_t n_seq = seq.get_nrow() ;
size_t l_seq = seq.get_ncol() ;
// size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
bool flip = n_flip == 2 ;
size_t l_model = l_seq - n_shift + 1 ;
Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;
// aggregate
for(size_t i=0; i<n_seq; i++)
{ std::cerr << "i " << i << std::endl ;
for(size_t s=0; s<n_shift; s++)
{ //std::cerr << "s " << s << std::endl ;
double tot = 0. ;
// aggregate
for(size_t j=0; j<l_model; j++)
{ // std::cerr << "j " << j << std::endl ;
// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
int base = seq(i,s+j) ;
// N should contribute to A,C,G,T equally -> same as doing nothing
if(base == dna::char_to_int('N'))
{ continue ; }
int base_rv = 4 - base - 1 ; // complement : A<->T, C<->G
// --------- forward ---------
// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
cons_seq(i,j,base) += prob(i,class_k,s,0) ;
tot += prob(i,class_k,s,0) ;
// --------- reverse ---------
if(flip)
{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ;
tot += prob(i,class_k,s,1) ;
}
}
}
}
// normalize
for(size_t i=0; i<n_seq; i++)
{ for(size_t j=0; j<l_model; j++)
{ double tot = 0. ;
for(size_t k=0; k<4; k++)
{ tot += cons_seq(i,j,k) ; }
for(size_t k=0; k<4; k++)
{ cons_seq(i,j,k) /= tot ; }
}
}
return cons_seq ;
}
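// --- Worked sketch, not part of the repository ---------------------------
// How the shift-weighted aggregation above behaves : with two shift
// states of posterior weight 0.7 and 0.3 (values assumed for the
// example), an A aligned under shift 0 and a C under shift 1 give,
// after normalization, a column {A:0.7, C:0.3, G:0, T:0}.
void aggregation_weights_sketch()
{   double w_shift0 = 0.7, w_shift1 = 0.3 ;  // assumed posterior weights
    double col[4] = {0., 0., 0., 0.} ;       // A, C, G, T accumulator
    col[0] += w_shift0 ;                     // shift 0 aligns an A
    col[1] += w_shift1 ;                     // shift 1 aligns a C
    double tot = col[0] + col[1] + col[2] + col[3] ;
    for(double& c : col)
    {   c /= tot ; }                         // -> {0.7, 0.3, 0., 0.}
}
// -------------------------------------------------------------------------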
Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
Matrix2D<int> iseq(n_row, n_col, 0) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ iseq(i,j) = dna::char_to_int(seq(i,j)) ; }
}
return iseq ;
}
Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
size_t n_dim3 = 4 ; // A, C, G, T
int base_n = dna::char_to_int('N') ;
Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;
double prob = 1. - 3*(1e-100) ; // p(base) = 1 - 3*p(other base)
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ int base = seq(i,j) ;
// N
if(base == base_n)
{ cseq(i,j,0) = 0.25 ;
cseq(i,j,1) = 0.25 ;
cseq(i,j,2) = 0.25 ;
cseq(i,j,3) = 0.25 ;
}
// A C G T
else
{ cseq(i,j, base) = prob ; }
}
}
return cseq ;
}
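// --- Illustrative sketch, not part of the repository --------------------
// What sequence_to_consensus() produces : each A/C/G/T becomes an almost
// one-hot probability vector over (A,C,G,T) (1 - 3e-100 for the observed
// base, 1e-100 elsewhere) and each N becomes the uniform
// {0.25, 0.25, 0.25, 0.25}.
Matrix3D<double> sequence_to_consensus_demo()
{   Matrix2D<int> s(1, 4, 0) ;
    s(0,0) = 0 ; s(0,1) = 1 ; s(0,2) = 2 ;   // A, C, G
    s(0,3) = dna::char_to_int('N') ;         // N -> uniform row
    return sequence_to_consensus(s) ;        // c(0,3,k) == 0.25 for all k
}
// -------------------------------------------------------------------------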
double score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t start,
const Matrix2D<double>& model)
{ size_t ncol = cons_seq.get_dim()[1] ;
// size_t dim1 = cons_seq.get_dim()[0] ;
// size_t dim2 = cons_seq.get_dim()[1] ;
size_t dim3 = cons_seq.get_dim()[2] ;
if(start > ncol - model.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, ncol - model.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_nrow() > ncol)
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model.get_nrow(), ncol) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(dim3 != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
dim3) ;
throw std::invalid_argument(msg) ;
}
/*
double ll = 0 ;
for(size_t j_seq=from, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ for(size_t k=0; k<dim3; k++)
{ std::cerr << ll
<< " *= "
<< "("
<< model(j_mod,k)
<< " * "
<< cons_seq(row,j_seq,k)
<< ")"
<< std::endl ;
ll *= model(j_mod,k) * cons_seq(row,j_seq,k) ;
}
}*/
size_t from = start ;
size_t to = from + model.get_nrow() ; // will score [from, to)
double ll = 0. ;
for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ double sum = 0. ;
for(size_t k=0; k<dim3; k++)
{ std::cerr << sum
<< "+= ("
<< cons_seq(row, j_seq, k)
<< " * "
<< model(j_mod, k)
<< ")"
<< std::endl ;
sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ;
}
std::cerr << sum << std::endl ;
std::cerr << "---------------" << std::endl ;
ll += log(sum) ;
}
return ll ;
}
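// --- Worked sketch, not part of the repository ---------------------------
// score_consensus_subseq() on a perfect match : at each position
// sum_k cons(row,j,k)*model(j,k) is ~1, so the log-likelihood of a
// perfect one-hot match is ~0 ; a single mismatch contributes
// ~log(2e-100) ~= -229.6. The commented-out block at the end of main()
// below exercises the same computation.
double perfect_match_ll_sketch()
{   Matrix2D<int> s(1, 4, -1) ;
    s(0,0) = 0 ; s(0,1) = 1 ; s(0,2) = 2 ; s(0,3) = 3 ;  // ACGT
    Matrix3D<double> cseq = sequence_to_consensus(s) ;
    Matrix2D<double> mod(4, 4, 1e-100) ;
    for(size_t i=0; i<4; i++)
    {   mod(i,i) = 1. - 3e-100 ; }                       // one-hot model
    return score_consensus_subseq(cseq, 0, 0, mod) ;     // ~0.
}
// -------------------------------------------------------------------------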
Matrix2D<double> test_consensus(const std::string& file,
- size_t n_class,
- size_t n_iter,
- size_t n_shift,
- bool flip,
- bool bckg_class,
- const std::string& seed,
- size_t n_thread)
+ size_t n_class,
+ size_t n_iter,
+ size_t n_shift,
+ bool flip,
+ bool bckg_class,
+ const std::string& seed,
+ size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
// partition
EMConsensusSequence em(cseq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
- std::cerr << prob << std::endl ;
-
// compute models
ConsensusSequenceModelComputer mc(std::move(cseq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_sequence(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
- // Matrix2D<int> seq(file) ;
- Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
+ Matrix2D<int> seq(file) ;
+ // Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
// partition
EMSequence em(seq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
- std::cerr << prob << std::endl ;
-
// compute models
SequenceModelComputer mc(std::move(seq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
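// --- Layout note, not part of the repository -----------------------------
// model_final as returned by test_sequence() / test_consensus() above :
// each class occupies 4 consecutive rows (A, C, G, T), column 0 repeats
// the class probability on those rows, and columns 1..l_model hold the
// model parameters. A hypothetical accessor :
double class_probability(const Matrix2D<double>& model_final, size_t k)
{ return model_final(4*k, 0) ; }  // any of the 4 rows of class k would do
// -------------------------------------------------------------------------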
Matrix2D<double> test_aggregate(const std::string& file)
{ Matrix2D<int> seq(file) ;
size_t nrow = seq.get_nrow() ;
size_t ncol = seq.get_ncol() ;
Matrix2D<double> model_final(4, ncol+1, 1e-100) ;
for(size_t i=0; i<4; i++)
{ model_final(i,0) = 1. ; }
std::vector<double> sum(ncol+1, 0.) ;
for(size_t i=0; i<nrow; i++)
{ for(size_t j=0; j<ncol; j++)
{ int base = seq(i,j) ;
if(base == dna::char_to_int('N'))
{ continue ; }
model_final(base, j+1) += 1. ;
sum[j+1] += 1. ;
}
}
for(size_t i=0; i<model_final.get_nrow(); i++)
{ for(size_t j=1; j<model_final.get_ncol(); j++) // fixed off-by-one : valid columns are 0..get_ncol()-1
{ model_final(i,j) /= sum[j] ; }
}
return model_final ;
}
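// --- Worked sketch, not part of the repository ---------------------------
// test_aggregate()'s per-column counting : for a column observing bases
// {A, A, C} over 3 sequences, the aggregate frequencies are A:2/3, C:1/3
// and G,T ~0 (the 1e-100 pseudo-count keeps downstream log() finite).
void test_aggregate_column_sketch()
{   double counts[4] = {1e-100, 1e-100, 1e-100, 1e-100} ; // A,C,G,T
    int column[3] = {0, 0, 1} ;                           // A, A, C
    double sum = 0. ;
    for(int b : column)
    {   counts[b] += 1. ; sum += 1. ; }
    for(double& c : counts)
    {   c /= sum ; }                       // A:2/3, C:1/3, G,T: ~0
}
// -------------------------------------------------------------------------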
int main()
{
size_t n_class = 2 ;
size_t n_iter = 50 ;
- size_t n_shift = 3 ;
+ size_t n_shift = 1 ;
bool flip = true ;
bool bckg_class = false ;
std::string seed = "" ;
size_t n_thread = 1 ;
- /*
+
std::string file = "/local/groux/scATAC-seq/data/"
"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
- */
// std::string file = "/local/groux/scATAC-seq/test.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
- std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
+ // std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
/*
Matrix2D<double> model_final = test_consensus(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
- */
+ */
Matrix2D<double> model_final = test_sequence(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
+
/*
Matrix2D<double> model_final = test_aggregate(file) ;
*/
std::cout << model_final << std::endl ;
/*
double p0 = 0 ;
// double l0 = log(p0) ;
double p1 = 1. - (3.*p0) ;
// double l1 = log(p1) ;
// Matrix3D<double> cseq(1, 4, 4) ;
// cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
// cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
// cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
// cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;
Matrix2D<int> seq(1, 4, -1) ;
seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
std::cout << cseq << std::endl << std::endl ;
Matrix2D<double> mod(4, 4, -1) ;
mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;
std::cout << mod << std::endl ;
std::cout << score_consensus_subseq(cseq, 0,0,mod)
<< std::endl ;
*/
return EXIT_SUCCESS ;
}