Page MenuHomec4science

No OneTemporary

File Metadata

Created
Thu, Apr 25, 06:34
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
index 5125c78..8f4c839 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
index a4e36e8..3c42775 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
index a7385c4..3e642d4 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
index 723d374..8e8b816 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
index 52cbdab..661fa5d 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
index 934f5ea..3bc651a 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
similarity index 85%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
index 2ef7199..f42ad19 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
@@ -1,104 +1,112 @@
setwd(file.path("/", "local", "groux", "scATAC-seq"))
# libraries
library(RColorBrewer)
# functions
source(file.path("scripts", "functions.R"))
# the number of classes searched
n.classes = c(23)
+class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1",
+ "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF",
+ "NR4A1", "EGR", "GATA", "NFAT", "RUNX")
+
# path to the images for the logo
path.a = file.path("res/A.png")
path.c = file.path("res/C.png")
path.g = file.path("res/G.png")
path.t = file.path("res/T.png")
-################## sequence patterns around ctcf motifs ##################
+################## plot architecture around TF motifs ##################
for(k in n.classes)
{
# sequence
data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k)))
model.seq = data$models
model.prob = data$prob
data = NULL
# open chromatin
model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
# nucleosomes
model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
# plot classes
col = brewer.pal(3, "Set1")
- # X11(width=26, height=12)
+ # X11(width=24, height=12)
png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
- units="in", res=720, width=26, height=12)
+ units="in", res=720, width=24, height=12)
m = matrix(1:24, nrow=6, ncol=4, byrow=F)
layout(m)
# order from most to least probable class
ord = order(model.prob, decreasing=T)
ref.open = model.open[ord,, drop=F][,]
ref.nucl = model.nucl[ord,, drop=F][,]
ref.seq = model.seq[,,ord, drop=F][,,]
prob = model.prob[ord]
class = c(1:nrow(ref.open))[ord]
+ tf = class.tf[ord]
for(i in 1:nrow(ref.open))
{ # plot logo
plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
- main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
# x-axis
x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
axis(1, at=x.at, labels=x.lab)
# y-axis is [0,1] for min/max signal
y.at = seq(0, 2, length.out=2)
y.lab = c("min", "max")
axis(2, at=y.at, labels=y.lab)
# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
}
# inlets with center
row_n = 1 # row counter
col_n = 1 # column counter
row_h = 1/nrow(m) # height of row
col_w = 1/ncol(m) # width of column
- row_cor = row_h / 2
+ row_cor = row_h / 3
col_cor = col_w / 3
for(i in 1:nrow(ref.open))
{ # plot logo center
left = (col_w*col_n) - col_w
right = left + col_w
left = right - col_cor
bottom = 1 - (row_h*row_n)
- top = bottom + row_h
+ top = bottom + (0.9*row_h)
bottom = top - row_cor
- par(fig=c(left, right, bottom, top), new=T)
+ p= par(fig=c(left, right, bottom, top),
+ mar=c(0,0,0,0),
+ new=T)
idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
# xaxis
# x.at = seq(1, length(idx), length.out = 3)
# x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
x.at = ceiling(length(idx)/2)
x.lab = 0
axis(1, at=x.at, labels=x.lab)
# yaxis
axis(2, at=y.at, labels=y.lab)
row_n = row_n + 1
if(i %% nrow(m) == 0)
{ col_n = col_n + 1
row_n = 1
}
+ par(p)
}
dev.off()
}
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
index f10ec18..0bfd24e 100755
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
@@ -1,76 +1,76 @@
# paths
## dir
data_dir_p="data/10xgenomics_PBMC_5k_peaks"
data_dir="data/10xgenomics_PBMC_5k"
pwm_dir="data/pwm/jaspar_2018_clustering/"
hg19_dir="data/genomes"
results_dir="results/10xgenomics_PBMC_5k_peaks_classification_6"
## matrix files
file_mat_open=$data_dir_p/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat'
file_mat_nucl=$data_dir_p/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat'
file_mat_seq=$data_dir_p/'peaks_rmsk_sampled_sequences_1kb.mat'
## file with seeds
file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt'
mkdir -p $results_dir
touch $file_seed
# EM param
n_iter='1'
n_shift='971'
-n_core=24
+n_core=32
## PWM files
jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat"
hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat"
myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat"
pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat"
cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat"
irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat"
irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat"
lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat"
foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat"
sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat"
mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat"
elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat"
stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat"
nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat"
ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat"
e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat"
ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat"
klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat"
nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat"
egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat"
gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat"
nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat"
runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat"
# classify
for k in 23
do
## results files
file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d'
file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat'
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
# extend models
file_mod1_ext=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model_extended.mat'
file_mod2_ext=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat'
file_mod3_ext=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model_extended.mat'
file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk_sampled.bed'
file_fasta=$hg19_dir/'hg19.fasta'
file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam'
file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai'
file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam'
file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai'
bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext
bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext
bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext
done
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
index 1803c05..415a4c7 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
@@ -1,4 +1,4 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R
deleted file mode 100644
index 0531449..0000000
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R
+++ /dev/null
@@ -1,180 +0,0 @@
-setwd(file.path("/", "local", "groux", "scATAC-seq"))
-
-# libraries
-library(RColorBrewer)
-
-# functions
-source(file.path("scripts", "functions.R"))
-
-# the number of classes searched
-n.classes = c(23)
-
-# path to the images for the logo
-path.a = file.path("res/A.png")
-path.c = file.path("res/C.png")
-path.g = file.path("res/G.png")
-path.t = file.path("res/T.png")
-
-################## sequence patterns around ctcf motifs ##################
-
-for(k in n.classes)
-{
- # sequence
- data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
- sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
- model.seq = data$models
- model.prob = data$prob
- data = NULL
-
- # open chromatin
- model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
- sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
- # nucleosomes
- model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
- sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
-
- # plot classes
- col = brewer.pal(3, "Set1")
- X11(width=26, height=12)
- # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
- # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
- # units="in", res=720, width=18, height=12)
- m = matrix(1:24, nrow=6, ncol=4, byrow=F)
- layout(m)
- # order from most to least probable class
- ord = order(model.prob, decreasing=T)
- ref.open = model.open[ord,, drop=F][,]
- ref.nucl = model.nucl[ord,, drop=F][,]
- ref.seq = model.seq[,,ord, drop=F][,,]
- prob = model.prob[ord]
- class = c(1:nrow(ref.open))[ord]
- for(i in 1:nrow(ref.open))
- { # plot logo
- par(mar=c(2,2,2,0))
- plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
- main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
- # x-axis
- x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
- x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
- axis(1, at=x.at, labels=x.lab)
- # y-axis is [0,1] for min/max signal
- y.at = seq(0, 2, length.out=2)
- y.lab = c("min", "max")
- axis(2, at=y.at, labels=y.lab)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
- lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
- }
-
- # inlets with center
- row_n = 1 # row counter
- col_n = 1 # column counter
- row_h = 1/nrow(m) # height of row
- col_w = 1/ncol(m) # width of column
- row_cor = row_h / 2
- col_cor = col_w / 3
- for(i in 1:nrow(ref.open))
- { # plot logo center
- left = (col_w*col_n) - col_w
- right = left + col_w
- left = right - col_cor
- bottom = 1 - (row_h*row_n)
- top = bottom + row_h
- bottom = top - row_cor
-
- par(fig=c(left, right, bottom, top), new=T)
- idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
- plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
- lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
- # xaxis
- # x.at = seq(1, length(idx), length.out = 3)
- # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
- x.at = ceiling(length(idx)/2)
- x.lab = 0
- axis(1, at=x.at, labels=x.lab)
- # yaxis
- axis(2, at=y.at, labels=y.lab)
- row_n = row_n + 1
- if(i %% nrow(m) == 0)
- { col_n = col_n + 1
- row_n = 1
- }
- }
- # dev.off()
-}
-
-
-# PU.1
-X11(width=18, height=9)
- par(mfrow=c(2,1))
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,,4], path.a, path.c, path.g, path.t,
- main=sprintf("class 4 (p=%.2f)", model.prob[4]))
- # x-axis
- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
- axis(1, at=x.at, labels=x.lab)
- # y-axis is [0,1] for min/max signal
- y.at = seq(0, 2, length.out=2)
- y.lab = c("min", "max")
- axis(2, at=y.at, labels=y.lab)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(model.open[4,] / max(model.open[4,])), lwd=1, col=col[1])
- lines(2*(model.nucl[4,] / max(model.nucl[4,])), lwd=1, col=col[2])
-
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,495:535,4], path.a, path.c, path.g, path.t,
- main=sprintf("class 4 (p=%.2f)", model.prob[4]))
- lines(2*(model.open[4,495:535] / max(model.open[4,])), lwd=1, col=col[1])
- lines(2*(model.nucl[4,495:535] / max(model.nucl[4,])), lwd=1, col=col[2])
-
-# NFE2
-X11(width=18, height=9)
- par(mfrow=c(2,1))
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,,14], path.a, path.c, path.g, path.t,
- main=sprintf("class 14 (p=%.2f)", model.prob[14]))
- # x-axis
- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
- axis(1, at=x.at, labels=x.lab)
- # y-axis is [0,1] for min/max signal
- y.at = seq(0, 2, length.out=2)
- y.lab = c("min", "max")
- axis(2, at=y.at, labels=y.lab)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(model.open[14,] / max(model.open[14,])), lwd=1, col=col[1])
- lines(2*(model.nucl[14,] / max(model.nucl[14,])), lwd=1, col=col[2])
-
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,490:540,14], path.a, path.c, path.g, path.t,
- main=sprintf("class 14 (p=%.2f)", model.prob[14]))
- lines(2*(model.open[14,490:540] / max(model.open[14,])), lwd=1, col=col[1])
- lines(2*(model.nucl[14,490:540] / max(model.nucl[14,])), lwd=1, col=col[2])
-
-# CEPBB
-X11(width=18, height=9)
- par(mfrow=c(2,1))
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,,5], path.a, path.c, path.g, path.t,
- main=sprintf("class 5 (p=%.2f)", model.prob[5]))
- # x-axis
- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
- axis(1, at=x.at, labels=x.lab)
- # y-axis is [0,1] for min/max signal
- y.at = seq(0, 2, length.out=2)
- y.lab = c("min", "max")
- axis(2, at=y.at, labels=y.lab)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(model.open[5,] / max(model.open[5,])), lwd=1, col=col[1])
- lines(2*(model.nucl[5,] / max(model.nucl[5,])), lwd=1, col=col[2])
-
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,495:535,5], path.a, path.c, path.g, path.t,
- main=sprintf("class 5 (p=%.2f)", model.prob[5]))
- lines(2*(model.open[5,495:535] / max(model.open[5,])), lwd=1, col=col[1])
- lines(2*(model.nucl[5,495:535] / max(model.nucl[5,])), lwd=1, col=col[2])
-
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
new file mode 100644
index 0000000..43375ee
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
@@ -0,0 +1,210 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+# the number of classes searched
+n.classes = c(23)
+
+class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1",
+ "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF",
+ "NR4A1", "EGR", "GATA", "NFAT", "RUNX")
+
+# path to the images for the logo
+path.a = file.path("res/A.png")
+path.c = file.path("res/C.png")
+path.g = file.path("res/G.png")
+path.t = file.path("res/T.png")
+
+################## plot architecture around TF motifs ##################
+
+for(k in n.classes)
+{
+ # sequence
+ data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
+ model.seq = data$models
+ model.prob = data$prob
+ data = NULL
+
+ # open chromatin
+ model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
+ # nucleosomes
+ model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
+
+ # plot classes
+ col = brewer.pal(3, "Set1")
+ # X11(width=24, height=12)
+ png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
+ units="in", res=720, width=18, height=12)
+ m = matrix(1:24, nrow=6, ncol=4, byrow=F)
+ layout(m)
+ # order from most to least probable class
+ ord = order(model.prob, decreasing=T)
+ ref.open = model.open[ord,, drop=F][,]
+ ref.nucl = model.nucl[ord,, drop=F][,]
+ ref.seq = model.seq[,,ord, drop=F][,,]
+ prob = model.prob[ord]
+ class = c(1:nrow(ref.open))[ord]
+ tf = class.tf[ord]
+ for(i in 1:nrow(ref.open))
+ { # plot logo
+ par(mar=c(2,2,2,0))
+ plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
+ # x-axis
+ x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
+ x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
+ axis(1, at=x.at, labels=x.lab)
+ # y-axis is [0,1] for min/max signal
+ y.at = seq(0, 2, length.out=2)
+ y.lab = c("min", "max")
+ axis(2, at=y.at, labels=y.lab)
+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
+ lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
+ lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
+ }
+
+ # inlets with center
+ row_n = 1 # row counter
+ col_n = 1 # column counter
+ row_h = 1/nrow(m) # height of row
+ col_w = 1/ncol(m) # width of column
+ row_cor = row_h / 3
+ col_cor = col_w / 3
+ for(i in 1:nrow(ref.open))
+ { # plot logo center
+ left = (col_w*col_n) - col_w
+ right = left + col_w
+ left = right - col_cor
+ bottom = 1 - (row_h*row_n)
+ top = bottom + row_h
+ bottom = top - row_cor
+
+ p= par(fig=c(left, right, bottom, top),
+ mar=c(0,0,0,0),
+ new=T)
+ idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
+ plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
+ lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
+ lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
+ # xaxis
+ # x.at = seq(1, length(idx), length.out = 3)
+ # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
+ x.at = ceiling(length(idx)/2)
+ x.lab = 0
+ axis(1, at=x.at, labels=x.lab)
+ # yaxis
+ axis(2, at=y.at, labels=y.lab)
+ row_n = row_n + 1
+ if(i %% nrow(m) == 0)
+ { col_n = col_n + 1
+ row_n = 1
+ }
+ par(p)
+ }
+ dev.off()
+}
+
+
+
+################## zoom in the center ##################
+
+for(k in n.classes)
+{ idx = 516 + c(-100:+100)
+ # sequence
+ data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
+ model.seq = data$models[,idx,]
+ model.prob = data$prob
+ data = NULL
+
+ # open chromatin
+ model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models[,idx]
+ # nucleosomes
+ model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models[,idx]
+
+ # plot classes
+ col = brewer.pal(3, "Set1")
+ # X11(width=24, height=12)
+ png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_sampled_sequences_%dclass_2.png", k)),
+ units="in", res=720, width=18, height=12)
+ m = matrix(1:24, nrow=6, ncol=4, byrow=F)
+ layout(m)
+ # order from most to least probable class
+ ord = order(model.prob, decreasing=T)
+ ref.open = model.open[ord,, drop=F][,]
+ ref.nucl = model.nucl[ord,, drop=F][,]
+ ref.seq = model.seq[,,ord, drop=F][,,]
+ prob = model.prob[ord]
+ class = c(1:nrow(ref.open))[ord]
+ tf = class.tf[ord]
+ for(i in 1:nrow(ref.open))
+ { # plot logo
+ par(mar=c(2,2,2,0))
+ plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
+ # x-axis
+ x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
+ x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
+ axis(1, at=x.at, labels=x.lab)
+ # y-axis is [0,1] for min/max signal
+ y.at = seq(0, 2, length.out=2)
+ y.lab = c("min", "max")
+ axis(2, at=y.at, labels=y.lab)
+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
+ lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
+ lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
+ }
+
+ # inlets with center
+ row_n = 1 # row counter
+ col_n = 1 # column counter
+ row_h = 1/nrow(m) # height of row
+ col_w = 1/ncol(m) # width of column
+ row_cor = row_h / 3
+ col_cor = col_w / 3
+ for(i in 1:nrow(ref.open))
+ { # plot logo center
+ left = (col_w*col_n) - col_w
+ right = left + col_w
+ left = right - col_cor
+ bottom = 1 - (row_h*row_n)
+ top = bottom + row_h
+ bottom = top - row_cor
+
+ p= par(fig=c(left, right, bottom, top),
+ mar=c(0,0,0,0),
+ new=T)
+ idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
+ plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
+ lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
+ lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
+ # xaxis
+ # x.at = seq(1, length(idx), length.out = 3)
+ # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
+ x.at = ceiling(length(idx)/2)
+ x.lab = 0
+ axis(1, at=x.at, labels=x.lab)
+ # yaxis
+ axis(2, at=y.at, labels=y.lab)
+ row_n = row_n + 1
+ if(i %% nrow(m) == 0)
+ { col_n = col_n + 1
+ row_n = 1
+ }
+ par(p)
+ }
+ dev.off()
+}
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
index 80b9afd..33f3be9 100755
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
@@ -1,76 +1,76 @@
# paths
## dir
data_dir_p="data/10xgenomics_PBMC_5k_peaks"
data_dir="data/10xgenomics_PBMC_5k"
pwm_dir="data/pwm/jaspar_2018_clustering/"
hg19_dir="data/genomes"
results_dir="results/10xgenomics_PBMC_5k_peaks_classification_7"
## matrix files
file_mat_open=$data_dir_p/'peaks_rmsk_openchromatin_1kb_read_atac.mat'
file_mat_nucl=$data_dir_p/'peaks_rmsk_nucleosomes_1kb_fragment_center.mat'
file_mat_seq=$data_dir_p/'peaks_rmsk_sequences_1kb.mat'
## file with seeds
file_seed=$results_dir'/peaks_rmsk_seed.txt'
mkdir -p $results_dir
touch $file_seed
# EM param
n_iter='1'
n_shift='971'
-n_core=24
+n_core=32
## PWM files
jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat"
hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat"
myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat"
pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat"
cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat"
irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat"
irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat"
lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat"
foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat"
sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat"
mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat"
elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat"
stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat"
nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat"
ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat"
e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat"
ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat"
klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat"
nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat"
egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat"
gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat"
nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat"
runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat"
# classify
for k in 23
do
## results files
file_prob=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_prob.mat4d'
file_mod1=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model.mat'
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
# extend models
file_mod1_ext=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model_extended.mat'
file_mod2_ext=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat'
file_mod3_ext=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model_extended.mat'
file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk.bed'
file_fasta=$hg19_dir/'hg19.fasta'
file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam'
file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai'
file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam'
file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai'
bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext
bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext
bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext
done
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
index 2e4fecb..bbbfd09 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
@@ -1,4 +1,4 @@
-scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks_sampled.sh
+scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
diff --git a/src/Applications/.ProbToModelApplication.cpp.swp b/src/Applications/.ProbToModelApplication.cpp.swp
new file mode 100644
index 0000000..ace6708
Binary files /dev/null and b/src/Applications/.ProbToModelApplication.cpp.swp differ
diff --git a/src/Applications/ProbToModelApplication.cpp b/src/Applications/ProbToModelApplication.cpp
index dce386b..8a0613f 100644
--- a/src/Applications/ProbToModelApplication.cpp
+++ b/src/Applications/ProbToModelApplication.cpp
@@ -1,320 +1,320 @@
#include <ProbToModelApplication.hpp>
#include <boost/program_options.hpp>
#include <string>
#include <vector>
#include <iostream>
#include <utility> // std::move()
#include <stdexcept> // std::invalid_argument, std::runtime_error
#include <ModelComputer.hpp>
#include <ReadModelComputer.hpp>
#include <SequenceModelComputer.hpp>
#include <ConsensusSequenceModelComputer.hpp>
#include <Matrix2D.hpp>
#include <Matrix4D.hpp>
#include <matrix_utility.hpp> // filter_rows()
namespace po = boost::program_options ;
typedef std::vector<double> vector_d ;
ProbToModelApplication::ProbToModelApplication(int argn, char** argv)
: file_read(""), file_seq(""), file_prob(""), file_filter(""),
bckg_class(false),
n_threads(0), runnable(false)
{ this->parseOptions(argn, argv) ; }
ProbToModelApplication::~ProbToModelApplication()
{}
int ProbToModelApplication::run()
{ if(this->runnable)
{
// load data
std::string file_data ;
bool read_data = false ;
bool seq_data = false ;
bool consseq_data = false ;
if(this->file_read != "")
{ file_data = this->file_read ;
seq_data = consseq_data = false ; read_data = true ;
}
else if(this->file_seq != "")
{ file_data = this->file_seq ;
read_data = consseq_data = false ; seq_data = true ;
}
else if(this->file_consseq != "")
{ file_data = this->file_consseq ;
read_data = seq_data = false ; consseq_data = true ;
}
else
{ std::string msg("Error! Could not determine the type of the data!") ;
throw std::runtime_error(msg) ;
}
// row filter
std::vector<size_t> filter ;
if(this->file_filter != "")
{ // it is a column vector, easier to use the Matrix2D interface
// to read it rather than coding a function for :)
filter = Matrix2D<size_t>(this->file_filter).get_data() ;
std::sort(filter.begin(), filter.end()) ;
}
// prob
Matrix4D<double> prob ; prob.load(this->file_prob) ;
// get the data model
ModelComputer* ptr = nullptr ;
if(read_data)
{ Matrix2D<int> data(file_data) ;
// filter out some rows if needed
if(filter.size())
{ data = filter_rows(filter, data) ; }
this->check_dimensions(data, prob) ;
ptr = new ReadModelComputer(std::move(data),
prob,
this->bckg_class,
this->n_threads) ;
}
else if(seq_data)
{
Matrix2D<int> data(file_data) ;
// filter out some rows if needed
if(filter.size())
{ data = filter_rows(filter, data) ; }
this->check_dimensions(data, prob) ;
ptr = new SequenceModelComputer(std::move(data),
prob,
this->bckg_class,
this->n_threads) ;
}
else if(consseq_data)
{ Matrix3D<double> data ;
data.load(file_data) ;
// filter out some rows if needed
if(filter.size())
{ data = filter_rows(filter, data) ; }
this->check_dimensions(data, prob) ;
ptr = new ConsensusSequenceModelComputer(std::move(data),
prob,
this->bckg_class,
this->n_threads) ;
}
Matrix2D<double> model = ptr->get_model() ;
delete ptr ;
ptr = nullptr ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
vector_d class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
// 1st column contain the class prob
if(read_data)
{ for(size_t i=0; i<model_final.get_nrow(); i++)
{ model_final(i,0) = class_prob[i] ; }
}
- else if(seq_data)
+ else if(seq_data or consseq_data)
{ size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
std::cout << model_final << std::endl ;
return 0 ;
}
else
{ return 1 ; }
}
void ProbToModelApplication::parseOptions(int argn, char** argv)
{ // no option to parse
if(argv == nullptr)
{ std::string message = "no options to parse!" ;
throw std::invalid_argument(message) ;
}
// help messages
std::string desc_msg = "\n"
"ProbToRef reconstructs the class models \n"
"from the classification results (posterior \n"
"probabilities computed by ChIPPartitioning \n"
"and the data).\n\n" ;
std::string opt_help_msg = "Produces this help message." ;
std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n "
"by default 0 (no parallelization)." ;
std::string opt_read_msg = "The path to the file containing the data, if the data are\n"
"read counts.";
std::string opt_seq_msg = "The path to the file containing the data, if the data are\n"
"sequences.";
std::string opt_consseq_msg = "The path to the file containing the data, if the data are\n"
"consensus sequences.";
std::string opt_prob_msg = "The path to the file containing the posterior \n"
"probabilities." ;
std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n"
"indices of rows to filter out in the data. Once the rows have been\n"
"filtered out from the data, its rows order should correspond to the\n"
"probabilities." ;
std::string opt_bckg_msg = "Whether the last class of the classification (posterior\n"
"probabilities) is a background class." ;
// option parser
boost::program_options::variables_map vm ;
boost::program_options::options_description desc(desc_msg) ;
std::string seeding_tmp ;
desc.add_options()
("help,h", opt_help_msg.c_str())
("read,", po::value<std::string>(&(this->file_read)), opt_read_msg.c_str())
("seq,", po::value<std::string>(&(this->file_seq)), opt_seq_msg.c_str())
("consseq,", po::value<std::string>(&(this->file_consseq)), opt_seq_msg.c_str())
("prob,", po::value<std::string>(&(this->file_prob)), opt_prob_msg.c_str())
("filter", po::value<std::string>(&(this->file_filter)), opt_filter_msg.c_str())
("bgclass", opt_bckg_msg.c_str())
("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;
// parse
try
{ po::store(po::parse_command_line(argn, argv, desc), vm) ;
po::notify(vm) ;
}
catch(std::invalid_argument& e)
{ std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
throw std::invalid_argument(msg) ;
}
catch(...)
{ throw std::invalid_argument("An unknown error occured while parsing the options") ; }
bool help = vm.count("help") ;
// checks unproper option settings
if((this->file_read == "") and
(this->file_seq == "") and
(this->file_consseq == "") and
(not help))
{ std::string msg("Error! No data file was given (none of --read --seq --consseq)!") ;
throw std::invalid_argument(msg) ;
}
else if((this->file_read != "") and
(this->file_seq != "") and
(not help))
{ std::string msg("Error! --read and --seq are mutually exclusive!") ;
throw std::invalid_argument(msg) ;
}
else if((this->file_read != "") and
(this->file_consseq != "") and
(not help))
{ std::string msg("Error! --read and --consseq are mutually exclusive!") ;
throw std::invalid_argument(msg) ;
}
else if((this->file_seq != "") and
(this->file_consseq != "") and
(not help))
{ std::string msg("Error! --seq and --consseq are mutually exclusive!") ;
throw std::invalid_argument(msg) ;
}
else if(this->file_prob == "" and (not help))
{ std::string msg("Error! No posterior probabily file was given (--prob)!") ;
throw std::invalid_argument(msg) ;
}
// set background class
if(vm.count("bgclass"))
{ this->bckg_class = true ; }
// help invoked, run() cannot be invoked
if(help)
{ std::cout << desc << std::endl ;
this->runnable = false ;
return ;
}
// everything fine, run() can be called
else
{ this->runnable = true ;
return ;
}
}
void ProbToModelApplication::check_dimensions(const Matrix2D<int>& data,
const Matrix4D<double>& prob) const
{ if(data.get_nrow() != prob.get_dim()[0])
{ char msg[4096] ;
sprintf(msg, "Error! data and prob matrices have unequal "
"row numbers (%zu / %zu)!",
data.get_nrow(), prob.get_dim()[0]) ;
throw std::runtime_error(msg) ;
}
else if(data.get_ncol() < prob.get_dim()[2])
{ char msg[4096] ;
sprintf(msg, "Error! too many shift states for the data width!"
"(%zu shift states and %zu columns in data)!",
prob.get_dim()[2], data.get_ncol()) ;
throw std::runtime_error(msg) ;
}
}
void ProbToModelApplication::check_dimensions(const Matrix3D<double>& data,
const Matrix4D<double>& prob) const
{ if(data.get_dim()[0] != prob.get_dim()[0])
{ char msg[4096] ;
sprintf(msg, "Error! data and prob matrices have unequal "
"row numbers (%zu / %zu)!",
data.get_dim()[0], prob.get_dim()[0]) ;
throw std::runtime_error(msg) ;
}
else if(data.get_dim()[1] < prob.get_dim()[2])
{ char msg[4096] ;
sprintf(msg, "Error! too many shift states for the data width!"
"(%zu shift states and %zu data width)!",
prob.get_dim()[2], data.get_dim()[1]) ;
throw std::runtime_error(msg) ;
}
else if(data.get_dim()[2]!= 4)
{ char msg[4096] ;
sprintf(msg, "Error! data 3rd dimension is not 4!"
"(%zu)!",
data.get_dim()[2]) ;
throw std::runtime_error(msg) ;
}
}
int main(int argn, char** argv)
{ ProbToModelApplication app(argn, argv) ;
return app.run() ;
}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7fe6210..3086e51 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,157 +1,158 @@
# compiler options
add_compile_options(-std=c++14)
add_compile_options(-O3)
add_compile_options(-Wall)
add_compile_options(-Wextra)
add_compile_options(-Werror)
add_compile_options(-Wfatal-errors)
add_compile_options(-pedantic)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}")
add_definitions (${SEQAN_DEFINITIONS})
# include file location
include_directories(${Boost_INCLUDE_DIRS})
include_directories(${SEQAN_INCLUDE_DIRS})
include_directories("${scATACseq_SOURCE_DIR}/src/Matrix")
include_directories("${scATACseq_SOURCE_DIR}/src/Clustering")
include_directories("${scATACseq_SOURCE_DIR}/src/Random")
include_directories("${scATACseq_SOURCE_DIR}/src/Parallel")
include_directories("${scATACseq_SOURCE_DIR}/src/Statistics")
include_directories("${scATACseq_SOURCE_DIR}/src/GUI")
include_directories("${scATACseq_SOURCE_DIR}/src/Applications")
include_directories("${scATACseq_SOURCE_DIR}/src/Matrix")
include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools")
include_directories("${scATACseq_SOURCE_DIR}/src/Utility")
# compile modules into static libraries
## set output directory
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib")
## build instructions
add_library(Clustering "Clustering/DataLayer.cpp"
"Clustering/Data2DLayer.cpp"
"Clustering/Data3DLayer.cpp"
"Clustering/ReadLayer.cpp"
"Clustering/SequenceLayer.cpp"
"Clustering/ConsensusSequenceLayer.cpp"
"Clustering/ModelComputer.cpp"
"Clustering/ReadModelComputer.cpp"
"Clustering/SequenceModelComputer.cpp"
"Clustering/ConsensusSequenceModelComputer.cpp"
"Clustering/EMBase.cpp"
"Clustering/EMRead.cpp"
"Clustering/EMSequence.cpp"
"Clustering/EMConsensusSequence.cpp"
"Clustering/EMJoint.cpp")
add_library(Random "Random/Random.cpp"
"Random/RandomNumberGenerator.cpp")
add_library(Parallel "Parallel/ThreadPool.cpp")
-add_library(Statistics "Statistics/Statistics.cpp")
+add_library(Statistics "Statistics/Statistics.cpp"
+ "Statistics/KmersStatistics.cpp")
add_library(GUI "GUI/ConsoleProgressBar.cpp"
"GUI/Diplayable.cpp"
"GUI/Updatable.cpp")
add_library(GenomicTools "GenomicTools/MatrixCreator.cpp"
"GenomicTools/ReadMatrixCreator.cpp"
"GenomicTools/CorrelationMatrixCreator.cpp"
"GenomicTools/ClassReadDataCreator.cpp"
"GenomicTools/ClassSequenceDataCreator.cpp"
"GenomicTools/SequenceMatrixCreator.cpp"
"GenomicTools/GenomeRegion.cpp")
add_library(Utility "Utility/matrix_utility.cpp"
"Utility/dna_utility.cpp")
## resolve dependencies
target_link_libraries(Utility ${SEQAN_LIBRARIES})
target_link_libraries(Clustering Utility Random Statistics GUI Parallel ${SEQAN_LIBRARIES})
target_link_libraries(Parallel Threads::Threads)
target_link_libraries(GenomicTools Utility ${SEQAN_LIBRARIES})
# executables
## a toy for seqan
set(EXE_MAIN_TEST "main_test")
add_executable(${EXE_MAIN_TEST} "main_test.cpp")
target_link_libraries(${EXE_MAIN_TEST} GenomicTools Clustering)
set_target_properties(${EXE_MAIN_TEST} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## a toy for correlation matrix
set(EXE_MAIN_CORMAT "main_cormat")
add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp")
target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} Utility GenomicTools Random)
set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an application to read binary matrices and display them in txt
set(EXE_MAIN_MATBIN2TXT "MatrixBinToTxt")
add_executable(${EXE_MAIN_MATBIN2TXT} "Applications/MatrixBinToTxtApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_MAIN_MATBIN2TXT} Utility Boost::program_options)
set_target_properties(${EXE_MAIN_MATBIN2TXT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an application to know the indices of 0-signal rows in integer 2D matrices
set(EXE_MAIN_WHICHNULLROWS "WhichNullRows")
add_executable(${EXE_MAIN_WHICHNULLROWS} "Applications/WhichNullRowsApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_MAIN_WHICHNULLROWS} Boost::program_options)
set_target_properties(${EXE_MAIN_WHICHNULLROWS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an application to create a matrix from BED and a BAM file
set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator")
add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an application to create a sequence matrix from BED and a fasta file
set(EXE_MAIN_SEQMATRIX "SequenceMatrixCreator")
add_executable(${EXE_MAIN_SEQMATRIX} "Applications/SequenceMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_MAIN_SEQMATRIX} GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_MAIN_SEQMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an EMRead standalone
set(EXE_EMREAD "EMRead")
add_executable(${EXE_EMREAD} "Applications/EMReadApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_EMREAD} Clustering Boost::program_options)
set_target_properties(${EXE_EMREAD} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an EMSequence standalone
set(EXE_EMSEQ "EMSequence")
add_executable(${EXE_EMSEQ} "Applications/EMSequenceApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_EMSEQ} Clustering Utility Boost::program_options)
set_target_properties(${EXE_EMSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an EMConsensusSequence standalone
set(EXE_EMCONSSEQ "EMConsensusSequence")
add_executable(${EXE_EMCONSSEQ} "Applications/EMConsensusSequenceApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_EMCONSSEQ} Clustering Utility Boost::program_options)
set_target_properties(${EXE_EMCONSSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an EMJoint standalone
set(EXE_EMJOINT "EMJoint")
add_executable(${EXE_EMJOINT} "Applications/EMJointApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_EMJOINT} Clustering Utility Boost::program_options)
set_target_properties(${EXE_EMJOINT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to compute data models from the data and the post prob of an EM classification
set(EXE_PROB2REF "ProbToModel")
add_executable(${EXE_PROB2REF} "Applications/ProbToModelApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_PROB2REF} Clustering Utility Boost::program_options)
set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to extend read models from an EM classification
set(EXE_READMODELEXTENDER "ReadModelExtender")
add_executable(${EXE_READMODELEXTENDER} "Applications/ReadModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_READMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_READMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to extend read models from an EM classification
set(EXE_SEQUENCEMODELEXTENDER "SequenceModelExtender")
add_executable(${EXE_SEQUENCEMODELEXTENDER} "Applications/SequenceModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_SEQUENCEMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_SEQUENCEMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to extract read data from an EM classification
set(EXE_CLASSREADDATACREATOR "ClassReadDataCreator")
add_executable(${EXE_CLASSREADDATACREATOR} "Applications/ClassReadDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_CLASSREADDATACREATOR} Clustering GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_CLASSREADDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to extract sequence data from an EM classification
set(EXE_CLASSSEQDATACREATOR "ClassSequenceDataCreator")
add_executable(${EXE_CLASSSEQDATACREATOR} "Applications/ClassSequenceDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_CLASSSEQDATACREATOR} Clustering GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_CLASSSEQDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## a test suite
set(EXE_TESTS "unittests")
add_executable(${EXE_TESTS} "unittests.cpp"
"Unittests/unittests_matrix.cpp"
"Unittests/unittests_genomictools.cpp")
target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools)
set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
diff --git a/src/Statistics/KmersStatistics.cpp b/src/Statistics/KmersStatistics.cpp
new file mode 100644
index 0000000..58533a8
--- /dev/null
+++ b/src/Statistics/KmersStatistics.cpp
@@ -0,0 +1,230 @@
+#include <KmersStatistics.hpp>
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include <Matrix2D.hpp>
+#include <Matrix3D.hpp>
+#include <Statistics.hpp> // poisson_cdf()
+#include <dna_utility.hpp> // dna::consensus_to_kmer()
+
+Matrix2D<int> kmers::compute_mononucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts)
+{
+ Matrix2D<int> comp(k, 4, 0) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ {
+ std::string kmer = kmers[i] ;
+ int count = counts[i] ;
+
+ for (size_t j=0; j<k ; j++)
+ { int base = int(kmer[j]) - int('0') ;
+ comp(j, base) += count ;
+ }
+ }
+ return(comp);
+}
+
+Matrix3D<int> kmers::compute_dinucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts)
+{
+ Matrix3D<int> comp2(4, k-1, 4) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ { std::string kmer = kmers[i] ;
+ int count = counts[i] ;
+ for (size_t j=0; j<k-1; j++)
+ { int base1 = int(kmer[j]) - int('0') ;
+ int base2 = int(kmer[j+1]) - int('0') ;
+ comp2(base1, j, base2) += count ;
+ }
+ }
+ return(comp2);
+}
+
+std::vector<double> kmers::compute_exp_values(size_t k,
+ const std::vector<std::string>& kmers,
+ const Matrix2D<int>& comp1,
+ const Matrix3D<int>& comp2)
+{ std::vector<double> expected(kmers.size()) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ { std::string kmer = kmers[i] ;
+ double exp = comp1(0, int(kmer[0]) - int('0')) ;
+
+ for(size_t j=0; j<k-1; j++)
+ { int base1 = int(kmer[j]) - int('0') ;
+ int base2 = int(kmer[j+1]) - int('0') ;
+ exp *= ((double)comp2(base1, j, base2) /
+ (double)comp1(j+1, base2)) ;
+ }
+
+ expected[i] = exp ;
+ }
+ return expected;
+}
+
+
+std::vector<double> kmers::compute_pvalues(const std::vector<int>& counts,
+ const std::vector<double>& expected)
+{
+ std::vector<double> pvalues(counts.size()) ;
+
+ for(size_t i=0; i<counts.size(); i++)
+ {
+ pvalues[i] = poisson_cdf((double)counts[i],
+ expected[i],
+ false) ;
+ }
+ return pvalues ;
+}
+
+std::unordered_map<std::string, int>
+kmers::compute_kmer_count(const Matrix3D<double>& consensus_sequence,
+ size_t length)
+{ std::unordered_map<std::string, int> kmer_count ;
+
+ size_t n_row = consensus_sequence.get_dim()[0] ;
+ size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
+
+ for(size_t i=0; i<n_row; i++)
+ { for(size_t s=0; s<n_shift; s++)
+ { // get kmer
+ std::string kmer = dna::consensus_to_kmer(consensus_sequence,
+ i,
+ s,
+ s+length) ;
+ // insert
+ auto iter = kmer_count.find(kmer) ;
+ if(iter == kmer_count.end())
+ { kmer_count.emplace(kmer, 1) ; }
+ // update
+ else
+ { iter->second += 1 ; }
+ }
+ }
+ return kmer_count ;
+}
+
+std::pair<std::vector<std::string>, std::vector<double>>
+kmers::compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
+ size_t k)
+{
+
+ // get kmer count
+ std::unordered_map<std::string, int> kmer_count =
+ kmers::compute_kmer_count(consensus_sequences, k) ;
+
+ /*
+ consensus_sequences.get_dim() ;
+ std::unordered_map<std::string, int> kmer_count ;
+ // ANN
+ kmer_count["000"] = 1 ;
+ kmer_count["001"] = 2 ;
+ kmer_count["002"] = 3 ;
+ kmer_count["003"] = 4 ;
+ kmer_count["010"] = 5 ;
+ kmer_count["011"] = 6 ;
+ kmer_count["012"] = 7 ;
+ kmer_count["013"] = 8 ;
+ kmer_count["020"] = 9 ;
+ kmer_count["021"] = 10 ;
+ kmer_count["022"] = 11 ;
+ kmer_count["023"] = 12 ;
+ kmer_count["030"] = 13 ;
+ kmer_count["031"] = 14 ;
+ kmer_count["032"] = 15 ;
+ kmer_count["033"] = 16 ;
+ // CNN
+ kmer_count["100"] = 1 ;
+ kmer_count["101"] = 2 ;
+ kmer_count["102"] = 3 ;
+ kmer_count["103"] = 4 ;
+ kmer_count["110"] = 5 ;
+ kmer_count["111"] = 6 ;
+ kmer_count["112"] = 7 ;
+ kmer_count["113"] = 8 ;
+ kmer_count["120"] = 9 ;
+ kmer_count["121"] = 10 ;
+ kmer_count["122"] = 11 ;
+ kmer_count["123"] = 12 ;
+ kmer_count["130"] = 13 ;
+ kmer_count["131"] = 14 ;
+ kmer_count["132"] = 15 ;
+ kmer_count["133"] = 16 ;
+ // GNN
+ kmer_count["200"] = 1 ;
+ kmer_count["201"] = 2 ;
+ kmer_count["202"] = 3 ;
+ kmer_count["203"] = 4 ;
+ kmer_count["210"] = 5 ;
+ kmer_count["211"] = 6 ;
+ kmer_count["212"] = 7 ;
+ kmer_count["213"] = 8 ;
+ kmer_count["220"] = 9 ;
+ kmer_count["221"] = 10 ;
+ kmer_count["222"] = 11 ;
+ kmer_count["223"] = 12 ;
+ kmer_count["230"] = 13 ;
+ kmer_count["231"] = 14 ;
+ kmer_count["232"] = 15 ;
+ kmer_count["233"] = 16 ;
+ // TNN
+ kmer_count["300"] = 1 ;
+ kmer_count["301"] = 2 ;
+ kmer_count["302"] = 3 ;
+ kmer_count["303"] = 4 ;
+ kmer_count["310"] = 5 ;
+ kmer_count["311"] = 6 ;
+ kmer_count["312"] = 7 ;
+ kmer_count["313"] = 8 ;
+ kmer_count["320"] = 9 ;
+ kmer_count["321"] = 10 ;
+ kmer_count["322"] = 11 ;
+ kmer_count["323"] = 12 ;
+ kmer_count["330"] = 13 ;
+ kmer_count["331"] = 14 ;
+ kmer_count["332"] = 15 ;
+ kmer_count["333"] = 16 ;
+ */
+
+ // turn to vectors
+ std::vector<std::string> kmers(kmer_count.size()) ;
+ size_t i=0 ;
+ for(const auto& iter : kmer_count)
+ { kmers[i] = iter.first ;
+ i++ ;
+ }
+ std::sort(kmers.begin(), kmers.end()) ;
+ std::vector<int> counts(kmer_count.size()) ;
+ i=0 ;
+ for(const auto& kmer : kmers)
+ { counts[i] = kmer_count[kmer] ;
+ i++ ;
+ }
+ kmer_count.clear() ;
+
+ // get mononucleotide composition in kmer
+ // this is a probability matrix modeling the kmer
+ Matrix2D<int> comp1 = kmers::compute_mononucleotide(k, kmers, counts) ;
+
+ // get dinucleotide composition in kmer
+ // this is a probability matrix modeling the kmer
+ Matrix3D<int> comp2 = kmers::compute_dinucleotide(k, kmers, counts) ;
+
+ // get expected number of occurence for each kmer
+ std::vector<double> expected = kmers::compute_exp_values(k,
+ kmers,
+ comp1,
+ comp2) ;
+
+ // compute the pvalue associated to each kmer
+ std::vector<double> pvalues = kmers::compute_pvalues(counts,
+ expected) ;
+
+ return std::make_pair(kmers, pvalues) ;
+}
diff --git a/src/Statistics/KmersStatistics.hpp b/src/Statistics/KmersStatistics.hpp
new file mode 100644
index 0000000..fcc4f60
--- /dev/null
+++ b/src/Statistics/KmersStatistics.hpp
@@ -0,0 +1,133 @@
+#ifndef KMER_STATISTICS_HPP
+#define KMER_STATISTICS_HPP
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include <Matrix2D.hpp>
+#include <Matrix3D.hpp>
+
+
+namespace kmers {
+
+ /*!
+ * \brief Creates a table of all kmer found in
+ * a set of consensus sequences and return an
+ * associated pvalue for each one of them.
+ * To transform a consensus sequence into a kmer,
+ * the major base at each position is considered
+ * only.
+ * The pvalue is computed as 1 - p(count, exp)
+ * where p is the CDF of a Poisson distribution
+ * with <count> being the number of time a given
+ * kmer is observed in the data and <exp> its
+ * expected number of occurences. The expected
+ * number of occurences is computed from the
+ * observed di-nucleotide occurences in the given
+ * data.
+ * \param consensus_sequences
+ * \param k the kmer length.
+ * \return a pair of vectors containing the kmers
+ * in integer format (A:0, C:1, G:2, T:3) and
+ * their number of occurences (in the same order).
+ */
+ std::pair<std::vector<std::string>, std::vector<double>>
+ compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
+ size_t k) ;
+
+ /*!
+ * \brief Computes the number of occurences of each kmer
+ * appearing in a set of consensus sequences.
+ * To transform a consensus sequence into a kmer,
+ * the major base at each position is considered
+ * only.
+ * \param consensus_sequence a matrix containing the
+ * consensus sequences as a probability matrix. It has
+ * the following dimensions :
+ * 1st the number of consensus sequences
+ * 2nd the length of the consensus sequences
+ * 3rd 4 for A, C, G, T
+ * The summing over the 1st and 2nd dimensions should
+ * give 1s.
+ * \param k the kmer length.
+ * \return a map with the kmer found as keys and their
+ * number of occurences as value.
+ */
+ std::unordered_map<std::string, int>
+ compute_kmer_count(const Matrix3D<double>& consensus_sequence,
+ size_t k) ;
+
+ /*!
+ * \brief Compute the pvalue of a set of kmers given their
+ * number of occurences and their expected number of
+ * occurences.
+ * \param counts the number of occurences of each kmer.
+ * \param expected the expected number of occurences of
+ * each kmer.
+ * \return the pvalue of each kmer.
+ */
+ std::vector<double> compute_pvalues(const std::vector<int>& counts,
+ const std::vector<double>& expected) ;
+
+ /*!
+ * \brief Computes the number of expected occurences of a set
+ * of kmers given the mononucleotide and dinucleotide composition
+ * of the kmers.
+ * \param k the length of the kmers.
+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3)
+ * \param comp1 a matrix with the number of counts of each
+ * base at each position in the kmers. It has the following
+ * dimensions :
+ * 1st dim k
+ * 2nd dim 4 for A, C, G, T
+ * \param comp2 a matrix containing the number of counts of
+ * each pair of dinucleotide in the kmers. It has the
+ * following dimensions :
+ * 1st 4 for A, C, G, T (1st base of the dinucleotide)
+ * 2nd k-1
+ * 3rd 4 for A, C, G, T (2n base of the dinucleotide)
+ * \return the expected number of occurences for each
+ * kmer.
+ */
+ std::vector<double> compute_exp_values(size_t k,
+ const std::vector<std::string>& kmers,
+ const Matrix2D<int>& comp1,
+ const Matrix3D<int>& comp2) ;
+
+ /*!
+ * \brief Computes the number of occurences of each dinucleotide
+ * pair at each position of a given set of kmers.
+ * \param k the length of the kmers.
+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3).
+ * \param counts the number of occurences of each kmer.
+ * \return a matrix containing the number of counts of
+ * each pair of dinucleotide in the kmers. It has the
+ * following dimensions :
+ * 1st 4 for A, C, G, T (1st base of the dinucleotide)
+ * 2nd k-1
+ * 3rd 4 for A, C, G, T (2n base of the dinucleotide)
+ */
+ Matrix3D<int> compute_dinucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts) ;
+
+ /*!
+ * \brief Computes the number of occurences of each nucleotide
+ * at each position of a given set of kmers.
+ * \param k the length of the kmers.
+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3).
+ * \param counts the number of occurences of each kmer
+ * \return a matrix with the number of counts of each
+ * base at each position in the kmers. It has the following
+ * dimensions :
+ * 1st dim k
+ * 2nd dim 4 for A, C, G, T
+ */
+ Matrix2D<int> compute_mononucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts) ;
+
+}
+
+#endif // KMER_STATISTICS_HPP
diff --git a/src/Statistics/Statistics.cpp b/src/Statistics/Statistics.cpp
index c26f789..c671237 100644
--- a/src/Statistics/Statistics.cpp
+++ b/src/Statistics/Statistics.cpp
@@ -1,31 +1,45 @@
#include <Statistics.hpp>
#include <cmath> // M_PI, pow(), sqrt(), log(), lgamma()
#include <boost/math/distributions.hpp> // beta_distribution
#include <boost/random.hpp>
double normal_pmf(double x, double mean, double sd)
{ static double pi_2 = 2.*M_PI ;
return ( 1. / ( sd * sqrt(pi_2) )) * exp(-0.5 * pow((x-mean)/sd, 2.0 ) );
}
double poisson_pmf(int x, double lambda)
{ if((x != 0) and (lambda == 0))
{ return 0. ; }
else if((x == 0) and (lambda == 0))
{ return 1. ; }
// if(x + lambda == 0)
// { return 1. ; }
return exp(x * log(lambda) - lgamma(x + 1.0) - lambda);
}
+double poisson_cdf(int x, double lambda, bool lower_tail)
+{ double prob = 1. ;
+ if(lambda != 0)
+ { boost::math::poisson_distribution<> pois_dist(lambda) ;
+ prob = cdf(pois_dist, x) ;
+ }
+ // prob of an observation as big as x
+ if(lower_tail)
+ { return prob ; }
+ // prob of an observation at least as big as x (pvalue)
+ else
+ { return 1 - prob ; }
+}
+
double beta_pmf(double x, double alpha, double beta)
{
boost::math::beta_distribution<> beta_dist(alpha, beta) ;
double y = quantile(beta_dist, x) ;
return y ;
}
diff --git a/src/Statistics/Statistics.hpp b/src/Statistics/Statistics.hpp
index fdc9808..eb0a966 100755
--- a/src/Statistics/Statistics.hpp
+++ b/src/Statistics/Statistics.hpp
@@ -1,291 +1,307 @@
#ifndef STATISTCS_HPP
#define STATISTICS_HPP
#include <vector>
#include <numeric>
#include <cmath> // pow(), sqrt()
#include <assert.h>
#include <iostream>
/*!
* \brief Computes the density of x given a gaussian
* distribution with the given mean and standard
* deviation parameters.
* \param x the value of interest.
* \param mean the gaussian mean parameter value.
* \param sd the gaussian standard deviation value.
* \return the probability of x p(x|N(mean,sd))
*/
double normal_pmf(double x, double mean, double sd) ;
/*!
* \brief Computes the probability of x given a Poisson
* distribution with the given lambda parameter.
* \param x the value of interest.
* \param lambda the lambda parameter.
* \return the probability of the value given the
* distribution.
*/
double poisson_pmf(int x, double lambda) ;
+/*!
+ * \brief Computes the probability of observing a
+ * value samller or equal to x, given a poisson
+ * distribution of parameter lambda.
+ * \param x the value.
+ * \param lambda lambda the lambda parameter.
+ * \param lower_tail whether the lower tail of the
+ * distribution should be returned. If true, the
+ * probability p of observing a value smaller or as
+ * big as x is returned. Otherwised the probability
+ * of observing a value at least as big as x
+ * is returned (1-p).
+ * \return the probability of observing a value
+ * smaller or as big as x.
+ */
+double poisson_cdf(int x, double lambda, bool lower_tail=true) ;
/*!
* \brief Computes the probability of x given a Beta
* distribution with the given alpha and beta parameters.
* \param x the value of interest
* \param alpha the alpha parameter.
* \param beta the beta parameter.
* \return the probability of the value given the
* distribution.
*/
double beta_pmf(double x, double alpha, double beta) ;
/*!
* \brief Computes the weighted mean of a vector of measures <x> given their
* probability <p>. The sum of <p> is expected to be 1, if not it will
* be normalized.
* \param x a vector of measures.
* \param p a vector of probability associated to each element of <x>.
* \param from the starting index, if different from -1.
* \param to the ending index (not included), if different from -1.
* \return the mean.
*/
template<class T>
inline double mean(const std::vector<T>& x, const std::vector<double>& p) ;
/*!
* \brief Computes the standard deviation of a vector of measures <x> given
* their probability <p>.
* \param x a vector of measures.
* \param p a vector of probability associated to each element of <x>.
* \param unbiased whether the unbiased standard deviation should be returned
* (the square root of the variance is divided by a normalizing factor).
* \return the standard deviation.
*/
template<class T>
inline double sd(const std::vector<T>& x, const std::vector<double>& p, bool unbiased=true) ;
/*! \brief Computes the unbiased standard deviation of a vector of
* measures <x> given their probability <p>.
* \param x a vector of measures.
* \param p a vector of probability associated to each element of <x>.
* \return the unbiased standard deviation.
*/
template<class T>
inline double sd_unbiased(const std::vector<T> &x, const std::vector<double> &p) ;
/*! \brief Computes the biased (non-normalized) standard deviation of a vector of
* measures <x> given their probability <p>.
* \param x a vector of measures.
* \param p a vector of probability associated to each element of <x>.
* \return the biased standard deviation.
*/
template<class T>
inline double sd_biased(const std::vector<T> &x, const std::vector<double> &p) ;
/*!
* \brief Computes the pearson correlation coefficient for two given vectors
* using v1[from1-->to1) and v2[from2-->to2). If from1, to1, from2, to2 are equal
* to -1, they are not accounted and the first and last elements considered are
* the beginning and the end of the vectors.
* \param v1 the first vector.
* \param v2 the second vector.
* \param from1 the index of the first value to use in the first vector.
* \param to1 the index after the last value to use in the fist vector.
* \param from2 the index of the first value to use in the second vector.
* \param to2 the index after the last value to use in the second vector.
* \return the correlation coefficient
*/
template<class T>
inline double cor_pearson(const std::vector<T>& v1,
const std::vector<T>& v2,
int from1=-1, int to1=-1,
int from2=-1, int to2=-1) ;
/*!
* \brief Computes the pearson correlation coefficient for two given vectors
* using v1[from1-->to1) and v2(to2<--from2] (the second vector is read backward,
* from the end).
* If from1, to1, from2, to2 are equal to -1, they are not accounted and the first
* and last elements considered are the beginning and the end of the vectors (for
* the last element considered in v2 to be the 1st of the vector (the 0th), to2
* should be set to -1, which is ignored. However this enables the default behaviour
* using the 0th element as the last one).
* \param v1 the first vector.
* \param v2 the second vector.
* \param from1 the index of the first value to use in the first vector.
* \param to1 the index after the last value to use in the fist vector.
* \param from2 the index of the first value to use in the second vector.
* \param to2 the index after the last value to use in the second vector.
* \return the correlation coefficient
*/
template<class T>
inline double cor_pearson_rev(const std::vector<T>& v1,
const std::vector<T>& v2,
int from1=-1, int to1=-1,
int from2=-1, int to2=-1) ;
template<class T>
inline double mean(const std::vector<T>& x, const std::vector<double>& p)
{
assert(x.size() == p.size()) ;
double mean = 0. ;
double total = 0. ;
for(auto i : p)
{ total += i ; }
for(size_t i=0; i<x.size(); i++)
{ mean += x[i] * (p[i]/total) ; }
return mean ;
}
template<class T>
inline double sd(const std::vector<T>& x, const std::vector<double>& p, bool unbiased)
{ if(unbiased)
{ return sd_unbiased(x, p) ; }
else
{ return sd_biased(x,p) ; }
}
template<class T>
inline double sd_unbiased(const std::vector<T>& x, const std::vector<double>& p)
{
assert(x.size() == p.size()) ;
double v1 = 0. ;
double v2 = 0. ;
double m = mean(x,p) ;
double sd = 0. ;
double total = 0. ;
for(auto i : p)
{ total += i ; }
for(size_t i=0; i<x.size(); i++)
{ double p_norm = p[i] / total ;
sd += pow(x[i] - m, 2) * (p_norm) ;
v1 += p_norm ;
v2 += pow(p_norm,2) ;
}
return sqrt(sd / (v1 - (v2/v1))) ;
}
template<class T>
inline double sd_biased(const std::vector<T>& x, const std::vector<double>& p)
{
assert(x.size() == p.size()) ;
double m = mean(x,p) ;
double sd = 0. ;
double total = 0. ;
for(auto i : p)
{ total += i ; }
for(size_t i=0; i<x.size(); i++)
{ double p_norm = p[i] / total ;
sd += pow(x[i] - m, 2) * (p_norm) ;
}
return sqrt(sd) ;
}
template<class T>
inline double cor_pearson(const std::vector<T>& v1,
const std::vector<T>& v2,
int from1, int to1,
int from2, int to2)
{
if(from1 == -1)
{ from1 = 0 ; }
if(to1 == -1)
{ to1 = v1.size() ; }
if(from2 == -1)
{ from2 = 0 ; }
if(to2 == -1)
{ to2 = v2.size() ; }
// 0 <= from < to < v.size()
// if from == to -> no sense, no value : return nan
// if from > to -> no sense, backward : does a segfault
assert(from1 >= 0) ;
assert(from1 < to1) ;
assert(to1 <= v1.size()) ;
assert(from2 >= 0) ;
assert(from2 < to2) ;
assert(to2 <= v2.size()) ;
// the terms of the formula
double sum_v1v2 = 0. ;
double sum_v1 = 0. ;
double sum_v2 = 0. ;
double sum_v1_2 = 0. ;
double sum_v2_2 = 0. ;
// the effective number of values considered
double n = to1 - from1 ;
for(size_t i1=from1, i2=from2; i1<to1; i1++,i2++)
{ sum_v1v2 += v1[i1]*v2[i2] ;
sum_v1 += v1[i1] ;
sum_v2 += v2[i2] ;
sum_v1_2 += pow(v1[i1],2) ;
sum_v2_2 += pow(v2[i2],2) ;
}
return (n*sum_v1v2 - (sum_v1*sum_v2)) /
(sqrt(n*sum_v1_2 - pow(sum_v1, 2)) * sqrt(n*sum_v2_2 - pow(sum_v2, 2)) ) ;
}
template<class T>
inline double cor_pearson_rev(const std::vector<T>& v1,
const std::vector<T>& v2,
int from1, int to1,
int from2, int to2)
{
if(from1 == -1)
{ from1 = 0 ; }
if(to1 == -1)
{ to1 = v1.size() ; }
if(from2 == -1)
{ from2 = v2.size() - 1 ; }
if(to2 == -1)
{ to2 = 0 ; }
// beware -> for v1 [from1, to1)
// for v2 [to2, from2]
// use from1 and to1 for the loop conditions
// 0 <= from < to < v1.size()
// 0 <= to2 < from2 < v2.size() because for v2 [to2, from2]
// if from == to -> no sense, no value : return nan
// if from > to -> no sense, backward : does a segfault
assert(from1 >= 0) ;
assert(from1 < to1) ;
assert(to1 <= v1.size()) ;
assert(to2 >= 0) ;
assert(to2 < from2) ;
assert(from2 < v2.size()) ;
// the terms of the formula
double sum_v1v2 = 0. ;
double sum_v1 = 0. ;
double sum_v2 = 0. ;
double sum_v1_2 = 0. ;
double sum_v2_2 = 0. ;
// the effective number of values considered
double n = to1 - from1 ;
for(int i1=from1, i2=from2; i1<to1; i1++,i2--)
{ sum_v1v2 += v1[i1]*v2[i2] ;
sum_v1 += v1[i1] ;
sum_v2 += v2[i2] ;
sum_v1_2 += pow(v1[i1],2) ;
sum_v2_2 += pow(v2[i2],2) ;
}
return (n*sum_v1v2 - (sum_v1*sum_v2)) /
(sqrt(n*sum_v1_2 - pow(sum_v1, 2)) * sqrt(n*sum_v2_2 - pow(sum_v2, 2)) ) ;
}
#endif // STATISTICS_HPP
diff --git a/src/Utility/dna_utility.cpp b/src/Utility/dna_utility.cpp
index 2902fb1..3db580a 100644
--- a/src/Utility/dna_utility.cpp
+++ b/src/Utility/dna_utility.cpp
@@ -1,178 +1,210 @@
#include<dna_utility.hpp>
-#include <string>
+#include <string> // string, to_string()
#include <unordered_map>
#include <stdexcept> // std::invalid_argument
+
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <seqan/seq_io.h> // seqan::SeqFileIn
int dna::map(char base, bool rev_compl)
{
static bool init = false ;
static std::unordered_map<char,int> hash_map ;
static std::unordered_map<char,int> hash_map_rev ;
if(not init)
{ hash_map['A'] = 0 ;
hash_map['a'] = 0 ;
hash_map['C'] = 1 ;
hash_map['c'] = 1 ;
hash_map['G'] = 2 ;
hash_map['g'] = 2 ;
hash_map['T'] = 3 ;
hash_map['t'] = 3 ;
hash_map['N'] = 4 ;
hash_map['n'] = 4 ;
hash_map_rev['A'] = hash_map['T'] ;
hash_map_rev['a'] = hash_map['t'] ;
hash_map_rev['C'] = hash_map['G'] ;
hash_map_rev['c'] = hash_map['g'] ;
hash_map_rev['G'] = hash_map['C'] ;
hash_map_rev['g'] = hash_map['c'] ;
hash_map_rev['T'] = hash_map['A'] ;
hash_map_rev['t'] = hash_map['a'] ;
hash_map_rev['N'] = hash_map['N'] ;
hash_map_rev['n'] = hash_map['n'] ;
init = true ;
}
try
{ if(rev_compl)
{ return hash_map_rev.at(base) ; }
else
{ return hash_map.at(base) ; }
}
// key could not be found
catch(std::out_of_range& e)
{ char msg[256] ;
sprintf(msg, "Error! Invalid DNA base : %c", base) ;
throw std::invalid_argument(msg) ;
}
}
char dna::map(int base, bool rev_compl)
{
static bool init = false ;
static std::unordered_map<int,char> hash_map ;
static std::unordered_map<int,char> hash_map_rev ;
if(not init)
{ hash_map[0] = 'A' ;
hash_map[1] = 'C' ;
hash_map[2] = 'G' ;
hash_map[3] = 'T' ;
hash_map[4] = 'N' ;
hash_map_rev[4] = hash_map[4] ;
hash_map_rev[3] = hash_map[0] ;
hash_map_rev[2] = hash_map[1] ;
hash_map_rev[1] = hash_map[2] ;
hash_map_rev[0] = hash_map[3] ;
init = true ;
}
try
{ if(rev_compl)
{ return hash_map_rev.at(base) ; }
else
{ return hash_map.at(base) ; }
}
// key could not be found
catch(std::out_of_range& e)
{ char msg[256] ;
sprintf(msg, "Error! Invalid DNA code : %i", base) ;
throw std::invalid_argument(msg) ;
}
}
int dna::char_to_int(char c, bool rev_compl)
{ return dna::map(c, rev_compl) ; }
Matrix2D<int> dna::char_to_int(const Matrix2D<int>& matrix)
{
size_t n_row = matrix.get_nrow() ;
size_t n_col = matrix.get_ncol() ;
Matrix2D<int> data_int(n_row, n_col) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ data_int(i,j) = dna::char_to_int(matrix(i,j)) ; }
}
return data_int ;
}
char dna::int_to_char(int n, bool rev_compl)
{ return dna::map(n, rev_compl) ; }
std::vector<double> dna::base_composition(const Matrix2D<int>& sequences, bool both_strands)
{
double total = 0. ;
std::vector<double> base_comp(4,0.) ;
int base_N = dna::map('N') ;
for(size_t i=0; i<sequences.get_nrow(); i++)
{ for(size_t j=0; j<sequences.get_ncol(); j++)
{ // forward strand
int base = sequences(i,j) ;
// do not account for N's
if(base == base_N)
{ continue ; }
else
{ base_comp[base] += 1;
total += 1. ;
}
// reverse complement strand
if(both_strands)
{ // size_t c_hash_rev = dna::hash(c, true) ;
base_comp[4-base-1] += 1. ;
total += 1. ;
}
}
}
// normalize
for(auto& i : base_comp)
{ i /= total ; }
return base_comp ;
}
std::vector<double> dna::base_composition(const Matrix3D<double>& consensus_sequences, bool both_strands)
{
if(consensus_sequences.get_dim()[2] != 4)
{ char msg[4096] ;
sprintf(msg, "Error! consensus sequences 3rd dimension not equal to 4 (%zu)",
consensus_sequences.get_dim()[2]) ;
throw std::invalid_argument(msg) ;
}
double total = 0. ;
std::vector<double> base_comp(4,0.) ;
for(size_t i=0; i<consensus_sequences.get_dim()[0]; i++)
{ for(size_t j=0; j<consensus_sequences.get_dim()[1]; j++)
{ for(size_t k=0; k<4; k++)
{ // forward strand
{ base_comp[k] += consensus_sequences(i,j,k) ;
total += consensus_sequences(i,j,k) ;
}
// revers strand
if(both_strands)
{ size_t k_comp = 4 - k - 1 ;
base_comp[k_comp] += consensus_sequences(i,j,k) ;
total += consensus_sequences(i,j,k) ;
}
}
}
}
// normalize
for(auto& i : base_comp)
{ i /= total ; }
return base_comp ;
}
+
+std::string dna::consensus_to_kmer(const Matrix3D<double>& consensus_sequences,
+ size_t row,
+ size_t from,
+ size_t to)
+{ // dna letter codes
+ static int n_code = dna::char_to_int('N') ;
+
+ // some dimensions
+ size_t length = to - from ;
+ size_t n_dim3 = consensus_sequences.get_dim()[2] ;
+
+ // kmer
+ std::string kmer(length, n_code) ;
+
+
+ for(size_t i_mat=from, i_kmer=0; i_mat<to; i_mat++, i_kmer++)
+ { // get majority base
+ int max_j = 5 ;
+ double max_n = -1. ;
+ for(size_t j=0; j<n_dim3; j++)
+ { if(consensus_sequences(row,i_mat,j) > max_n)
+ { max_n = consensus_sequences(row,i_mat,j) ;
+ max_j = j ;
+ }
+ }
+ kmer[i_kmer] = std::to_string(max_j)[0] ;
+ }
+
+ return kmer ;
+}
diff --git a/src/Utility/dna_utility.hpp b/src/Utility/dna_utility.hpp
index 03bde59..d89d770 100644
--- a/src/Utility/dna_utility.hpp
+++ b/src/Utility/dna_utility.hpp
@@ -1,106 +1,133 @@
#ifndef DNA_UTILITY_HPP
#define DNA_UTILITY_HPP
+#include <string>
+
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
namespace dna
{
/*!
* \brief Contains the mapping to convert
* DNA characters to integer codes.
* Lower and capital characters are accepted.
* \param base the character of interest.
* \param rev_compl whether the reverse
* complement of the character is of interest.
* \return the corresponding DNA code.
*/
int map(char base, bool rev_compl=false) ;
/*!
* \brief Contains the mapping to convert
* DNA code to characters.
* Only capital characters are returned.
* \param base the code of interest.
* \param rev_compl whether the reverse
* complement of the code is of interest.
* \return the corresponding DNA character.
*/
char map(int base, bool rev_compl=false) ;
/*!
* \brief Converts a DNA character (A, C,
* G, T) to an integer.
* \param c the DNA character of interest.
* \return the character integer code.
* \throw std::invalid_argument if the
* given character is not a valid DNA
* character.
*/
int char_to_int(char c, bool rev_compl= false) ;
/*!
* \brief Converts a DNA character matrix (A, C,
* G, T) to an integer matrix.
* The DNA characters are converted using
* SequenceLayer::char_to_int(char).
* param file_address the address of the file to load.
* \return the corresponding int matrix.
*/
Matrix2D<int> char_to_int(const Matrix2D<int>& matrix) ;
/*!
* \brief Converts an int DNA code to
* a regular DNA character (A, C, G, T).
* This method is the reverse method of
* char_to_int(char).
* \param n the DNA code of interest.
* \return the DNA character.
* \throw std::invalid_argument if the
* given int is not a valid DNA
* code.
*/
char int_to_char(int n, bool rev_compl=false) ;
/*!
* \brief Computes the base composition of a set of
* sequences, in integer format, contained in a matrix.
* \param sequences a matrix containing the sequences
* of interest.
* \param both_strands also accounts for the reverse
* complement of the sequences.
* \throw std::invalid_argument if a non-supported
* character is found in the matrix.
* \return a vector of 4 values corresponding to the
* frequencies of A,C,G and T
* respectively.
*/
std::vector<double> base_composition(const Matrix2D<int>& sequences,
bool both_strands) ;
/*!
* \brief Computes the base composition of a set of
* consensus sequences, contained in a probability
* matrix with the following dimensions:
* 1st the number of sequences
* 2nd the length of the sequences
* 3rd 4 for A,C,G,T
* The sum over the 1st and 2nd dimension should be 1.
* The overall sum of the matrix values should be equal
* to the 1st dimension.
* \param consensus_sequences a matrix containing the
* consensus sequences of interest.
* \param both_strands also accounts for the reverse
* complement of the sequences.
* \throw std::invalid_argument if the 3rd dimension
* of the consensus sequence matrix is not equal to 4.
* \return a vector of 4 values corresponding to the
* frequencies of A,C,G and T
* respectively.
*/
std::vector<double> base_composition(const Matrix3D<double>& consensus_sequences,
bool both_strands) ;
+
+ /*!
+ * \brief Constructs a kmer from a piece of consensus
+ * sequence piece contained in a probability matrix with
+ * the following dimensions:
+ * 1st the number of sequences
+ * 2nd the length of the sequences
+ * 3rd 4 for A,C,G,T
+ * The kmer of a consensus sequence is defined as the
+ * kmer composed of the most important base at each
+ * position.
+ * \param consensus_sequences the matrix containing the
+ * consensus sequence "kmer".
+ * \param row the index of the row containing the
+ * consensus sequence piece.
+ * \param from the starting position of the consensus
+ * sequence piece.
+ * \param to the past last ending position of the
+ * consensus sequence piece.
+ * \return the kmer.
+ */
+ std::string consensus_to_kmer(const Matrix3D<double>& consensus_sequences,
+ size_t row,
+ size_t from,
+ size_t to) ;
}
#endif // DNA_UTILITY_HPP
diff --git a/src/Utility/sorting_utility.hpp b/src/Utility/sorting_utility.hpp
new file mode 100755
index 0000000..8ce0fbe
--- /dev/null
+++ b/src/Utility/sorting_utility.hpp
@@ -0,0 +1,36 @@
+#ifndef SORTING_HPP
+#define SORTING_HPP
+
+#include <vector>
+#include <algorithm>
+
+/*! \brief Given a vector of data, it returns a vector of index
+ * corresponding to the index of each element after data sorting
+ * in ascending (by default) order.
+ * \param data a vector of interest.
+ * \param ascending specifies that the index should be returned in
+ * ascending order
+ * \return the index after sorting.
+ */
+template<class T>
+std::vector<std::size_t> order(const std::vector<T>& data, bool ascending=true)
+{
+ std::vector<std::size_t> index(data.size(), 0) ;
+
+ for(std::size_t i=0; i<index.size(); i++)
+ { index[i] = i ; }
+
+ sort(index.begin(), index.end(),
+ [&](const int& a, const int& b)
+ { return (data[a] < data[b]) ; }
+ ) ;
+
+ if(not ascending)
+ { std::reverse(index.begin(), index.end()) ; }
+
+ return index ;
+}
+
+
+
+#endif // SORTING_HPP
diff --git a/src/main_test.cpp b/src/main_test.cpp
index 7e2a898..a827475 100644
--- a/src/main_test.cpp
+++ b/src/main_test.cpp
@@ -1,453 +1,713 @@
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <cmath>
+#include <unordered_map>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ConsensusSequenceModelComputer.hpp>
#include <EMConsensusSequence.hpp>
#include <SequenceModelComputer.hpp>
#include <EMSequence.hpp>
#include <dna_utility.hpp>
+#include <Statistics.hpp>
+#include <sorting_utility.hpp>
+#include <boost/math/distributions.hpp>
+
+#include <KmersStatistics.hpp>
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
+template<class T, class U>
+std::ostream& operator << (std::ostream& stream, const std::unordered_map<T,U>& m)
+{ for(const auto& bucket : m)
+ { stream << bucket.first << " " << bucket.second << std::endl ; }
+ return stream ;
+}
+
Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
const Matrix4D<double>& prob,
size_t class_k)
{
size_t n_seq = seq.get_nrow() ;
size_t l_seq = seq.get_ncol() ;
// size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
bool flip = n_flip == 2 ;
size_t l_model = l_seq - n_shift + 1 ;
Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;
// aggregate
for(size_t i=0; i<n_seq; i++)
{ std::cerr << "i " << i << std::endl ;
for(size_t s=0; s<n_shift; s++)
{ //std::cerr << "s " << s << std::endl ;
double tot = 0. ;
// aggregate
for(size_t j=0; j<l_model; j++)
{ // std::cerr << "j " << j << std::endl ;
// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
int base = seq(i,s+j) ;
// N should participate to A,C,G,T equivalently -> same as doing nothing
if(base == dna::char_to_int('N'))
{ continue ; }
int base_rv = 4 - base - 1 ;
// --------- forward ---------
// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
cons_seq(i,j,base) += prob(i,class_k,s,0) ;
tot += prob(i,class_k,s,0) ;
// --------- reverse ---------
if(flip)
{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ;
tot += prob(i,class_k,s,1) ;
}
}
}
}
// normalize
for(size_t i=0; i<n_seq; i++)
{ for(size_t j=0; j<l_model; j++)
{ double tot = 0. ;
for(size_t k=0; k<4; k++)
{ tot += cons_seq(i,j,k) ; }
for(size_t k=0; k<4; k++)
{ cons_seq(i,j,k) /= tot ; }
}
}
return cons_seq ;
}
Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
Matrix2D<int> iseq(n_row, n_col, 0.) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ iseq(i,j) = dna::char_to_int(seq(i,j)) ; }
}
return iseq ;
}
Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
size_t n_dim3 = 4 ; // A, C, G, T
int base_n = dna::char_to_int('N') ;
Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;
double prob = 1. - 3*(1e-100) ; // p(base) - p(!base)
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ int base = seq(i,j) ;
// N
if(base == base_n)
{ cseq(i,j,0) = 0.25 ;
cseq(i,j,1) = 0.25 ;
cseq(i,j,2) = 0.25 ;
cseq(i,j,3) = 0.25 ;
}
// A C G T
else
{ cseq(i,j, base) = prob ; }
}
}
return cseq ;
}
double score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t start,
const Matrix2D<double>& model)
{ size_t ncol = cons_seq.get_dim()[1] ;
// size_t dim1 = cons_seq.get_dim()[0] ;
// size_t dim2 = cons_seq.get_dim()[1] ;
size_t dim3 = cons_seq.get_dim()[2] ;
if(start > ncol - model.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, ncol - model.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_nrow() > ncol)
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model.get_nrow(), ncol) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(dim3 != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
dim3) ;
throw std::invalid_argument(msg) ;
}
/*
double ll = 0 ;
for(size_t j_seq=from, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ for(size_t k=0; k<dim3; k++)
{ std::cerr << ll
<< " *= "
<< "("
<< model(j_mod,k)
<< " * "
<< cons_seq(row,j_seq,k)
<< ")"
<< std::endl ;
ll *= model(j_mod,k) * cons_seq(row,j_seq,k) ;
}
}*/
size_t from = start ;
size_t to = from + model.get_nrow() ; // will score [from, to)
double ll = 0. ;
for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ double sum = 0. ;
for(size_t k=0; k<dim3; k++)
{ std::cerr << sum
<< "+= ("
<< cons_seq(row, j_seq, k)
<< " * "
<< model(j_mod, k)
<< ")"
<< std::endl ;
sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ;
}
std::cerr << sum << std::endl ;
std::cerr << "---------------" << std::endl ;
ll += log(sum) ;
}
return ll ;
}
Matrix2D<double> test_consensus(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
// partition
EMConsensusSequence em(cseq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
// compute models
ConsensusSequenceModelComputer mc(std::move(cseq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_sequence(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
// partition
EMSequence em(seq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
// compute models
SequenceModelComputer mc(std::move(seq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_aggregate(const std::string& file)
{ Matrix2D<int> seq(file) ;
size_t nrow = seq.get_nrow() ;
size_t ncol = seq.get_ncol() ;
Matrix2D<double> model_final(4, ncol+1, 1e-100) ;
for(size_t i=0; i<4; i++)
{ model_final(i,0) = 1. ; }
std::vector<double> sum(ncol+1, 0.) ;
for(size_t i=0; i<nrow; i++)
{ for(size_t j=0; j<ncol; j++)
{ int base = seq(i,j) ;
if(base == dna::char_to_int('N'))
{ continue ; }
model_final(base, j+1) += 1. ;
sum[j+1] += 1. ;
}
}
for(size_t i=0; i<model_final.get_nrow(); i++)
{ for(size_t j=1; j<model_final.get_ncol()+1; j++)
{ model_final(i,j) /= sum[j] ; }
}
return model_final ;
}
+
+std::unordered_map<std::string, int> get_kmer_count(const Matrix3D<double>& consensus_sequence,
+ size_t length)
+{ std::unordered_map<std::string, int> kmer_count ;
+
+ size_t n_row = consensus_sequence.get_dim()[0] ;
+ size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
+
+ for(size_t i=0; i<n_row; i++)
+ { for(size_t s=0; s<n_shift; s++)
+ { // get kmer
+ std::string kmer = dna::consensus_to_kmer(consensus_sequence,
+ i,
+ s,
+ s+length) ;
+ // insert
+ auto iter = kmer_count.find(kmer) ;
+ if(iter == kmer_count.end())
+ { kmer_count.emplace(kmer, 1) ; }
+ // update
+ else
+ { iter->second += 1 ; }
+ }
+ }
+ return kmer_count ;
+}
+
+Matrix2D<int> compute_mononucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts)
+{
+ Matrix2D<int> comp(k, 4, 0) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ {
+ std::string kmer = kmers[i] ;
+ int count = counts[i] ;
+
+ for (size_t j=0; j<k ; j++)
+ { int base = int(kmer[j]) - int('0') ;
+ comp(j, base) += count ;
+ }
+ }
+ return(comp);
+}
+
+Matrix3D<int> compute_dinucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts)
+{
+ Matrix3D<int> comp2(4, k-1, 4) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ { std::string kmer = kmers[i] ;
+ int count = counts[i] ;
+ for (size_t j=0; j<k-1; j++)
+ { int base1 = int(kmer[j]) - int('0') ;
+ int base2 = int(kmer[j+1]) - int('0') ;
+ comp2(base1, j, base2) += count ;
+ }
+ }
+ return(comp2);
+}
+
+std::vector<double> compute_exp_values(size_t k,
+ const std::vector<std::string>& kmers,
+ const Matrix2D<int>& comp1,
+ const Matrix3D<int>& comp2)
+{ std::vector<double> expected(kmers.size()) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ { std::string kmer = kmers[i] ;
+ double exp = comp1(0, int(kmer[0]) - int('0')) ;
+
+ for(size_t j=0; j<k-1; j++)
+ { int base1 = int(kmer[j]) - int('0') ;
+ int base2 = int(kmer[j+1]) - int('0') ;
+ exp *= ((double)comp2(base1, j, base2) /
+ (double)comp1(j+1, base2)) ;
+ }
+
+ expected[i] = exp ;
+ }
+ return expected;
+}
+
+std::vector<double> compute_pvalues(const std::vector<int>& counts,
+ const std::vector<double>& expected)
+{
+ std::vector<double> pvalues(counts.size()) ;
+
+ for(size_t i=0; i<counts.size(); i++)
+ {
+ pvalues[i] = poisson_cdf((double)counts[i],
+ expected[i],
+ false) ;
+ }
+ return pvalues ;
+}
+
+std::pair<std::vector<std::string>, std::vector<double>>
+compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
+ size_t k)
+{
+
+ // get kmer count
+ std::unordered_map<std::string, int> kmer_count = get_kmer_count(consensus_sequences,
+ k) ;
+
+ /*
+ consensus_sequences.get_dim() ;
+ std::unordered_map<std::string, int> kmer_count ;
+ // ANN
+ kmer_count["000"] = 1 ;
+ kmer_count["001"] = 2 ;
+ kmer_count["002"] = 3 ;
+ kmer_count["003"] = 4 ;
+ kmer_count["010"] = 5 ;
+ kmer_count["011"] = 6 ;
+ kmer_count["012"] = 7 ;
+ kmer_count["013"] = 8 ;
+ kmer_count["020"] = 9 ;
+ kmer_count["021"] = 10 ;
+ kmer_count["022"] = 11 ;
+ kmer_count["023"] = 12 ;
+ kmer_count["030"] = 13 ;
+ kmer_count["031"] = 14 ;
+ kmer_count["032"] = 15 ;
+ kmer_count["033"] = 16 ;
+ // CNN
+ kmer_count["100"] = 1 ;
+ kmer_count["101"] = 2 ;
+ kmer_count["102"] = 3 ;
+ kmer_count["103"] = 4 ;
+ kmer_count["110"] = 5 ;
+ kmer_count["111"] = 6 ;
+ kmer_count["112"] = 7 ;
+ kmer_count["113"] = 8 ;
+ kmer_count["120"] = 9 ;
+ kmer_count["121"] = 10 ;
+ kmer_count["122"] = 11 ;
+ kmer_count["123"] = 12 ;
+ kmer_count["130"] = 13 ;
+ kmer_count["131"] = 14 ;
+ kmer_count["132"] = 15 ;
+ kmer_count["133"] = 16 ;
+ // GNN
+ kmer_count["200"] = 1 ;
+ kmer_count["201"] = 2 ;
+ kmer_count["202"] = 3 ;
+ kmer_count["203"] = 4 ;
+ kmer_count["210"] = 5 ;
+ kmer_count["211"] = 6 ;
+ kmer_count["212"] = 7 ;
+ kmer_count["213"] = 8 ;
+ kmer_count["220"] = 9 ;
+ kmer_count["221"] = 10 ;
+ kmer_count["222"] = 11 ;
+ kmer_count["223"] = 12 ;
+ kmer_count["230"] = 13 ;
+ kmer_count["231"] = 14 ;
+ kmer_count["232"] = 15 ;
+ kmer_count["233"] = 16 ;
+ // TNN
+ kmer_count["300"] = 1 ;
+ kmer_count["301"] = 2 ;
+ kmer_count["302"] = 3 ;
+ kmer_count["303"] = 4 ;
+ kmer_count["310"] = 5 ;
+ kmer_count["311"] = 6 ;
+ kmer_count["312"] = 7 ;
+ kmer_count["313"] = 8 ;
+ kmer_count["320"] = 9 ;
+ kmer_count["321"] = 10 ;
+ kmer_count["322"] = 11 ;
+ kmer_count["323"] = 12 ;
+ kmer_count["330"] = 13 ;
+ kmer_count["331"] = 14 ;
+ kmer_count["332"] = 15 ;
+ kmer_count["333"] = 16 ;
+ */
+
+ // turn to vectors
+ std::vector<std::string> kmers(kmer_count.size()) ;
+ size_t i=0 ;
+ for(const auto& iter : kmer_count)
+ { kmers[i] = iter.first ;
+ i++ ;
+ }
+ std::sort(kmers.begin(), kmers.end()) ;
+ std::vector<int> counts(kmer_count.size()) ;
+ i=0 ;
+ for(const auto& kmer : kmers)
+ { counts[i] = kmer_count[kmer] ;
+ i++ ;
+ }
+ kmer_count.clear() ;
+
+ // get mononucleotide composition in kmer
+ // this is a probability matrix modeling the kmer
+ Matrix2D<int> comp1 = compute_mononucleotide(k, kmers, counts) ;
+
+ // get dinucleotide composition in kmer
+ // this is a probability matrix modeling the kmer
+ Matrix3D<int> comp2 = compute_dinucleotide(k, kmers, counts) ;
+
+ // get expected number of occurence for each kmer
+ std::vector<double> expected = compute_exp_values(k,
+ kmers,
+ comp1,
+ comp2) ;
+
+ // compute the pvalue associated to each kmer
+ std::vector<double> pvalues = compute_pvalues(counts,
+ expected) ;
+
+ return std::make_pair(kmers, pvalues) ;
+}
+
int main()
{
+ // Matrix3D<double> mat(1,5,4,0.) ;
+ // mat(0,0,0) = 1. ; mat(0,1,1) = 1. ; mat(0,2,2) = 1. ; mat(0,3,3) = 1. ; mat(0,4,2) = 1. ;
+
+
+
+ std::string file = "/local/groux/scATAC-seq/data/"
+ "10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
+ Matrix3D<double> mat = sequence_to_consensus(Matrix2D<int>(file)) ;
+
+
+ size_t k = 12 ;
+ auto kmers_pvalues = kmers::compute_kmer_pvalue(mat, k) ;
+
+ // for(size_t i=0; i<kmers_pvalues.first.size(); i++)
+ // { std::cerr << kmers_pvalues.first[i] << " " << kmers_pvalues.second[i] << std::endl ; }
+
+ std::vector<size_t> index = order(kmers_pvalues.second, true) ;
+
+ for(size_t i=0 ; i<10; i++)
+ { size_t idx = index[i] ;
+ std::cout << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ;
+ }
+
+ /*
size_t n_class = 2 ;
- size_t n_iter = 50 ;
+ size_t n_iter = 50 ;sequence_to_consensus
size_t n_shift = 1 ;
bool flip = true ;
bool bckg_class = false ;
std::string seed = "" ;
size_t n_thread = 1 ;
std::string file = "/local/groux/scATAC-seq/data/"
"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
+ */
// std::string file = "/local/groux/scATAC-seq/test.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
/*
Matrix2D<double> model_final = test_consensus(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
*/
+ /*
Matrix2D<double> model_final = test_sequence(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
+ */
/*
Matrix2D<double> model_final = test_aggregate(file) ;
*/
- std::cout << model_final << std::endl ;
+ // std::cout << model_final << std::endl ;
/*
double p0 = 0 ;
// double l0 = log(p0) ;
double p1 = 1. - (3.*p0) ;
// double l1 = log(p1) ;
// Matrix3D<double> cseq(1, 4, 4) ;
// cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
// cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
// cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
// cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;
Matrix2D<int> seq(1, 4, -1) ;
seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
std::cout << cseq << std::endl << std::endl ;
Matrix2D<double> mod(4, 4, -1) ;
mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;
std::cout << mod << std::endl ;
std::cout << score_consensus_subseq(cseq, 0,0,mod)
<< std::endl ;
*/
+
return EXIT_SUCCESS ;
}

Event Timeline